From 594c11d0e1d445f580898a2b8c850f2e3f099368 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 27 Jan 2026 07:22:35 -0600 Subject: ipmi: Fix use-after-free and list corruption on sender error The analysis from Breno: When the SMI sender returns an error, smi_work() delivers an error response but then jumps back to restart without cleaning up properly: 1. intf->curr_msg is not cleared, so no new message is pulled 2. newmsg still points to the message, causing sender() to be called again with the same message 3. If sender() fails again, deliver_err_response() is called with the same recv_msg that was already queued for delivery This causes list_add corruption ("list_add double add") because the recv_msg is added to the user_msgs list twice. Subsequently, the corrupted list leads to use-after-free when the memory is freed and reused, and eventually a NULL pointer dereference when accessing recv_msg->done. The buggy sequence: sender() fails -> deliver_err_response(recv_msg) // recv_msg queued for delivery -> goto restart // curr_msg not cleared! sender() fails again (same message!) -> deliver_err_response(recv_msg) // tries to queue same recv_msg -> LIST CORRUPTION Fix this by freeing the message and setting it to NULL on a send error. Also, always free the newmsg on a send error, otherwise it will leak. Reported-by: Breno Leitao Closes: https://lore.kernel.org/lkml/20260127-ipmi-v1-0-ba5cc90f516f@debian.org/ Fixes: 9cf93a8fa9513 ("ipmi: Allow an SMI sender to return an error") Cc: stable@vger.kernel.org # 4.18 Reviewed-by: Breno Leitao Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 3f48fc6ab596..a590a67294e2 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -4852,8 +4852,15 @@ restart: if (newmsg->recv_msg) deliver_err_response(intf, newmsg->recv_msg, cc); - else - ipmi_free_smi_msg(newmsg); + if (!run_to_completion) + spin_lock_irqsave(&intf->xmit_msgs_lock, + flags); + intf->curr_msg = NULL; + if (!run_to_completion) + spin_unlock_irqrestore(&intf->xmit_msgs_lock, + flags); + ipmi_free_smi_msg(newmsg); + newmsg = NULL; goto restart; } } -- cgit v1.2.3 From 1d90e6c1a56f6ab83e5c9d30ded19e7ac8155713 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 27 Jan 2026 07:35:02 -0600 Subject: ipmi: Consolidate the run to completion checking for xmit msgs lock It made things hard to read, move the check to a function. 
Signed-off-by: Corey Minyard Reviewed-by: Breno Leitao --- drivers/char/ipmi/ipmi_msghandler.c | 42 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index a590a67294e2..a042b1596933 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -602,6 +602,22 @@ static int __ipmi_bmc_register(struct ipmi_smi *intf, static int __scan_channels(struct ipmi_smi *intf, struct ipmi_device_id *id, bool rescan); +static void ipmi_lock_xmit_msgs(struct ipmi_smi *intf, int run_to_completion, + unsigned long *flags) +{ + if (run_to_completion) + return; + spin_lock_irqsave(&intf->xmit_msgs_lock, *flags); +} + +static void ipmi_unlock_xmit_msgs(struct ipmi_smi *intf, int run_to_completion, + unsigned long *flags) +{ + if (run_to_completion) + return; + spin_unlock_irqrestore(&intf->xmit_msgs_lock, *flags); +} + static void free_ipmi_user(struct kref *ref) { struct ipmi_user *user = container_of(ref, struct ipmi_user, refcount); @@ -1878,11 +1894,9 @@ static void smi_send(struct ipmi_smi *intf, int run_to_completion = READ_ONCE(intf->run_to_completion); unsigned long flags = 0; - if (!run_to_completion) - spin_lock_irqsave(&intf->xmit_msgs_lock, flags); + ipmi_lock_xmit_msgs(intf, run_to_completion, &flags); smi_msg = smi_add_send_msg(intf, smi_msg, priority); - if (!run_to_completion) - spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags); + ipmi_unlock_xmit_msgs(intf, run_to_completion, &flags); if (smi_msg) handlers->sender(intf->send_info, smi_msg); @@ -4826,8 +4840,7 @@ static void smi_work(struct work_struct *t) * message delivery. */ restart: - if (!run_to_completion) - spin_lock_irqsave(&intf->xmit_msgs_lock, flags); + ipmi_lock_xmit_msgs(intf, run_to_completion, &flags); if (intf->curr_msg == NULL && !intf->in_shutdown) { struct list_head *entry = NULL; @@ -4843,8 +4856,7 @@ restart: intf->curr_msg = newmsg; } } - if (!run_to_completion) - spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags); + ipmi_unlock_xmit_msgs(intf, run_to_completion, &flags); if (newmsg) { cc = intf->handlers->sender(intf->send_info, newmsg); @@ -4852,13 +4864,9 @@ restart: if (newmsg->recv_msg) deliver_err_response(intf, newmsg->recv_msg, cc); - if (!run_to_completion) - spin_lock_irqsave(&intf->xmit_msgs_lock, - flags); + ipmi_lock_xmit_msgs(intf, run_to_completion, &flags); intf->curr_msg = NULL; - if (!run_to_completion) - spin_unlock_irqrestore(&intf->xmit_msgs_lock, - flags); + ipmi_unlock_xmit_msgs(intf, run_to_completion, &flags); ipmi_free_smi_msg(newmsg); newmsg = NULL; goto restart; @@ -4928,16 +4936,14 @@ void ipmi_smi_msg_received(struct ipmi_smi *intf, spin_unlock_irqrestore(&intf->waiting_rcv_msgs_lock, flags); - if (!run_to_completion) - spin_lock_irqsave(&intf->xmit_msgs_lock, flags); + ipmi_lock_xmit_msgs(intf, run_to_completion, &flags); /* * We can get an asynchronous event or receive message in addition * to commands we send. */ if (msg == intf->curr_msg) intf->curr_msg = NULL; - if (!run_to_completion) - spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags); + ipmi_unlock_xmit_msgs(intf, run_to_completion, &flags); if (run_to_completion) smi_work(&intf->smi_work); -- cgit v1.2.3 From 9f235ccecd03c436cb1683eac16b12f119e54aa9 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Tue, 13 Jan 2026 17:41:34 +0800 Subject: ipmi: ipmb: initialise event handler read bytes IPMB doesn't use i2c reads, but the handler needs to set a value. 
Otherwise an i2c read will return an uninitialised value from the bus driver. Fixes: 63c4eb347164 ("ipmi:ipmb: Add initial support for IPMI over IPMB") Signed-off-by: Matt Johnston Message-ID: <20260113-ipmb-read-init-v1-1-a9cbce7b94e3@codeconstruct.com.au> Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_ipmb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/char/ipmi/ipmi_ipmb.c b/drivers/char/ipmi/ipmi_ipmb.c index 3a51e58b2487..28818952a7a4 100644 --- a/drivers/char/ipmi/ipmi_ipmb.c +++ b/drivers/char/ipmi/ipmi_ipmb.c @@ -202,11 +202,16 @@ static int ipmi_ipmb_slave_cb(struct i2c_client *client, break; case I2C_SLAVE_READ_REQUESTED: + *val = 0xff; + ipmi_ipmb_check_msg_done(iidev); + break; + case I2C_SLAVE_STOP: ipmi_ipmb_check_msg_done(iidev); break; case I2C_SLAVE_READ_PROCESSED: + *val = 0xff; break; } -- cgit v1.2.3 From 6b157b408d0c7d125e4d7c62e11e7d9376a5d150 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 16 Jan 2026 17:22:01 -0600 Subject: ipmi:ls2k: Make ipmi_ls2k_platform_driver static No need for it to be global. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202601170753.3zDBerGP-lkp@intel.com/ Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_ls2k.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_si_ls2k.c b/drivers/char/ipmi/ipmi_si_ls2k.c index 45442c257efd..4c1da80f256c 100644 --- a/drivers/char/ipmi/ipmi_si_ls2k.c +++ b/drivers/char/ipmi/ipmi_si_ls2k.c @@ -168,7 +168,7 @@ static void ipmi_ls2k_remove(struct platform_device *pdev) ipmi_si_remove_by_dev(&pdev->dev); } -struct platform_driver ipmi_ls2k_platform_driver = { +static struct platform_driver ipmi_ls2k_platform_driver = { .driver = { .name = "ls2k-ipmi-si", }, -- cgit v1.2.3 From 211ecfaaef186ee5230a77d054cdec7fbfc6724a Mon Sep 17 00:00:00 2001 From: Brad Spengler Date: Wed, 7 Jan 2026 12:12:36 -0500 Subject: drm/vmwgfx: Fix invalid kref_put callback in vmw_bo_dirty_release The kref_put() call uses (void *)kvfree as the release callback, which is incorrect. kref_put() expects a function with signature void (*release)(struct kref *), but kvfree has signature void (*)(const void *). Calling through an incompatible function pointer is undefined behavior. The code only worked by accident because ref_count is the first member of vmw_bo_dirty, making the kref pointer equal to the struct pointer. Fix this by adding a proper release callback that uses container_of() to retrieve the containing structure before freeing. 
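For illustration only (not part of the patch), the "worked by accident" point comes down to struct layout: with ref_count at offset zero, the kref pointer and the containing struct pointer are numerically equal, so the miscast kvfree() happened to receive a usable address, even though an indirect call through a mismatched function type is still undefined behaviour (and trips control-flow-integrity checking where that is enabled). The proper shape of a kref release callback, sketched with a stand-in struct:

struct dirty_example {
	struct kref ref_count;	/* offset 0: &d->ref_count == (void *)d */
	/* ... tracking state ... */
};

static void dirty_example_release(struct kref *kref)
{
	struct dirty_example *d =
		container_of(kref, struct dirty_example, ref_count);

	kvfree(d);
}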
Fixes: c1962742ffff ("drm/vmwgfx: Use kref in vmw_bo_dirty") Signed-off-by: Brad Spengler Signed-off-by: Zack Rusin Cc: Ian Forbes Link: https://patch.msgid.link/20260107171236.3573118-1-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c index fd4e76486f2d..45561bc1c9ef 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c @@ -260,6 +260,13 @@ out_no_dirty: return ret; } +static void vmw_bo_dirty_free(struct kref *kref) +{ + struct vmw_bo_dirty *dirty = container_of(kref, struct vmw_bo_dirty, ref_count); + + kvfree(dirty); +} + /** * vmw_bo_dirty_release - Release a dirty-tracking user from a buffer object * @vbo: The buffer object @@ -274,7 +281,7 @@ void vmw_bo_dirty_release(struct vmw_bo *vbo) { struct vmw_bo_dirty *dirty = vbo->dirty; - if (dirty && kref_put(&dirty->ref_count, (void *)kvfree)) + if (dirty && kref_put(&dirty->ref_count, vmw_bo_dirty_free)) vbo->dirty = NULL; } -- cgit v1.2.3 From 922f9dec5d19df4cfbb7070275e5c131d10c80f3 Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Fri, 9 Jan 2026 09:51:39 -0600 Subject: drm/vmwgfx: Set a unique ID for each submitted command buffer These IDs are logged by the Hypervisor when debug logging is enabled. Having the IDs in the log makes it much easier to see when command buffers start and finish. They can also be used by logging/tracing in the Guest to help correlate between Guest and Hypervisor logs. Signed-off-by: Ian Forbes Signed-off-by: Zack Rusin Link: https://patch.msgid.link/20260109155139.3259493-1-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c index 94e8982f5616..1ee37690b940 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf.c @@ -105,6 +105,7 @@ struct vmw_cmdbuf_context { * @handle: DMA address handle for the command buffer space if @using_mob is * false. Immutable. * @size: The size of the command buffer space. Immutable. + * @id: Monotonically increasing ID of the last cmdbuf submitted. * @num_contexts: Number of contexts actually enabled. */ struct vmw_cmdbuf_man { @@ -132,6 +133,7 @@ struct vmw_cmdbuf_man { bool has_pool; dma_addr_t handle; size_t size; + u64 id; u32 num_contexts; }; @@ -303,6 +305,8 @@ static int vmw_cmdbuf_header_submit(struct vmw_cmdbuf_header *header) struct vmw_cmdbuf_man *man = header->man; u32 val; + header->cb_header->id = man->id++; + val = upper_32_bits(header->handle); vmw_write(man->dev_priv, SVGA_REG_COMMAND_HIGH, val); -- cgit v1.2.3 From 5023ca80f9589295cb60735016e39fc5cc714243 Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Tue, 13 Jan 2026 11:53:57 -0600 Subject: drm/vmwgfx: Return the correct value in vmw_translate_ptr functions Before the referenced fixes these functions used a lookup function that returned a pointer. This was changed to another lookup function that returned an error code with the pointer becoming an out parameter. The error path when the lookup failed was not changed to reflect this change and the code continued to return the PTR_ERR of the now uninitialized pointer. This could cause the vmw_translate_ptr functions to return success when they actually failed causing further uninitialized and OOB accesses. 
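A minimal sketch of the two lookup conventions the message contrasts (lookup_old() and lookup_new() are hypothetical stand-ins, not the vmwgfx API); the bug was keeping the first convention's error path after converting to the second:

/* Old convention: the returned pointer itself carries the error. */
vmw_bo = lookup_old(handle);
if (IS_ERR(vmw_bo))
	return PTR_ERR(vmw_bo);

/* New convention: int return code, out-parameter only valid on success. */
ret = lookup_new(filp, handle, &vmw_bo);
if (ret != 0)
	return ret;	/* vmw_bo is still uninitialised here, so
			 * PTR_ERR(vmw_bo) would return stack garbage
			 * and can even look like success. */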
Reported-by: Kuzey Arda Bulut Fixes: a309c7194e8a ("drm/vmwgfx: Remove rcu locks from user resources") Signed-off-by: Ian Forbes Reviewed-by: Zack Rusin Signed-off-by: Zack Rusin Link: https://patch.msgid.link/20260113175357.129285-1-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index 3057f8baa7d2..e1f18020170a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -1143,7 +1143,7 @@ static int vmw_translate_mob_ptr(struct vmw_private *dev_priv, ret = vmw_user_bo_lookup(sw_context->filp, handle, &vmw_bo); if (ret != 0) { drm_dbg(&dev_priv->drm, "Could not find or use MOB buffer.\n"); - return PTR_ERR(vmw_bo); + return ret; } vmw_bo_placement_set(vmw_bo, VMW_BO_DOMAIN_MOB, VMW_BO_DOMAIN_MOB); ret = vmw_validation_add_bo(sw_context->ctx, vmw_bo); @@ -1199,7 +1199,7 @@ static int vmw_translate_guest_ptr(struct vmw_private *dev_priv, ret = vmw_user_bo_lookup(sw_context->filp, handle, &vmw_bo); if (ret != 0) { drm_dbg(&dev_priv->drm, "Could not find or use GMR region.\n"); - return PTR_ERR(vmw_bo); + return ret; } vmw_bo_placement_set(vmw_bo, VMW_BO_DOMAIN_GMR | VMW_BO_DOMAIN_VRAM, VMW_BO_DOMAIN_GMR | VMW_BO_DOMAIN_VRAM); -- cgit v1.2.3 From 52c9ee202edd21d0599ac3b5a6fe1da2a2f053e5 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 6 Feb 2026 09:59:32 -0600 Subject: ipmi:si: Handle waiting messages when BMC failure detected If a BMC failure is detected, the current message is returned with an error. However, if there was a waiting message, it would not be handled. Add a check for the waiting message after handling the current message. Suggested-by: Guenter Roeck Reported-by: Rafael J. Wysocki Closes: https://lore.kernel.org/linux-acpi/CAK8fFZ58fidGUCHi5WFX0uoTPzveUUDzT=k=AAm4yWo3bAuCFg@mail.gmail.com/ Fixes: bc3a9d217755 ("ipmi:si: Gracefully handle if the BMC is non-functional") Cc: stable@vger.kernel.org # 4.18 Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 5459ffdde8dc..ff159b1162b9 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -809,6 +809,12 @@ restart: */ return_hosed_msg(smi_info, IPMI_BUS_ERR); } + if (smi_info->waiting_msg != NULL) { + /* Also handle if there was a message waiting. */ + smi_info->curr_msg = smi_info->waiting_msg; + smi_info->waiting_msg = NULL; + return_hosed_msg(smi_info, IPMI_BUS_ERR); + } smi_mod_timer(smi_info, jiffies + SI_TIMEOUT_HOSED); goto out; } -- cgit v1.2.3 From c3bb3295637cc9bf514f690941ca9a385bf30113 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 6 Feb 2026 10:33:52 -0600 Subject: ipmi:si: Use a long timeout when the BMC is misbehaving If the driver goes into HOSED state, don't reset the timeout to the short timeout in the timeout handler. 
Reported-by: Igor Raits Closes: https://lore.kernel.org/linux-acpi/CAK8fFZ58fidGUCHi5WFX0uoTPzveUUDzT=k=AAm4yWo3bAuCFg@mail.gmail.com/ Fixes: bc3a9d217755 ("ipmi:si: Gracefully handle if the BMC is non-functional") Cc: stable@vger.kernel.org # 4.18 Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index ff159b1162b9..0049e3792ba1 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -1119,7 +1119,9 @@ static void smi_timeout(struct timer_list *t) * SI_USEC_PER_JIFFY); smi_result = smi_event_handler(smi_info, time_diff); - if ((smi_info->io.irq) && (!smi_info->interrupt_disabled)) { + if (smi_info->si_state == SI_HOSED) { + timeout = jiffies + SI_TIMEOUT_HOSED; + } else if ((smi_info->io.irq) && (!smi_info->interrupt_disabled)) { /* Running with interrupts, only do long timeouts. */ timeout = jiffies + SI_TIMEOUT_JIFFIES; smi_inc_stat(smi_info, long_timeouts); -- cgit v1.2.3 From fd3634312a04f336dcbfb481060219f0cd320738 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 7 Feb 2026 14:27:05 +0100 Subject: debugobject: Make it work with deferred page initialization - again debugobjects uses __GFP_HIGH for allocations as it might be invoked within locked regions. That worked perfectly fine until v6.18. It still works correctly when deferred page initialization is disabled and works by chance when no page allocation is required before deferred page initialization has completed. Since v6.18 allocations w/o a reclaim flag cause new_slab() to end up in alloc_frozen_pages_nolock_noprof(), which returns early when deferred page initialization has not yet completed. As the deferred page initialization takes quite a while the debugobject pool is depleted and debugobjects are disabled. This can be worked around when PREEMPT_COUNT is enabled as that allows debugobjects to add __GFP_KSWAPD_RECLAIM to the GFP flags when the context is preemtible. When PREEMPT_COUNT is disabled the context is unknown and the reclaim bit can't be set because the caller might hold locks which might deadlock in the allocator. In preemptible context the reclaim bit is harmless and not a performance issue as that's usually invoked from slow path initialization context. That makes debugobjects depend on PREEMPT_COUNT || !DEFERRED_STRUCT_PAGE_INIT. 
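The new Kconfig dependency in the diff below follows directly from how preemptible() is defined: without CONFIG_PREEMPT_COUNT the preempt counter is not maintained, the macro is hard-wired to false, and the reclaim bit could then never be added once early boot is over. Roughly, from include/linux/preempt.h:

#ifdef CONFIG_PREEMPT_COUNT
#define preemptible()	(preempt_count() == 0 && !irqs_disabled())
#else
#define preemptible()	0	/* context unknown: must be assumed atomic */
#endif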
Fixes: af92793e52c3 ("slab: Introduce kmalloc_nolock() and kfree_nolock().") Signed-off-by: Thomas Gleixner Tested-by: Sebastian Andrzej Siewior Acked-by: Alexei Starovoitov Acked-by: Vlastimil Babka Link: https://patch.msgid.link/87pl6gznti.ffs@tglx --- lib/Kconfig.debug | 1 + lib/debugobjects.c | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ba36939fda79..a874146ad828 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -723,6 +723,7 @@ source "mm/Kconfig.debug" config DEBUG_OBJECTS bool "Debug object operations" + depends on PREEMPT_COUNT || !DEFERRED_STRUCT_PAGE_INIT depends on DEBUG_KERNEL help If you say Y here, additional code will be inserted into the diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 89a1d6745dc2..12f50de85b62 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -398,9 +398,26 @@ static void fill_pool(void) atomic_inc(&cpus_allocating); while (pool_should_refill(&pool_global)) { + gfp_t gfp = __GFP_HIGH | __GFP_NOWARN; HLIST_HEAD(head); - if (!kmem_alloc_batch(&head, obj_cache, __GFP_HIGH | __GFP_NOWARN)) + /* + * Allow reclaim only in preemptible context and during + * early boot. If not preemptible, the caller might hold + * locks causing a deadlock in the allocator. + * + * If the reclaim flag is not set during early boot then + * allocations, which happen before deferred page + * initialization has completed, will fail. + * + * In preemptible context the flag is harmless and not a + * performance issue as that's usually invoked from slow + * path initialization context. + */ + if (preemptible() || system_state < SYSTEM_SCHEDULING) + gfp |= __GFP_KSWAPD_RECLAIM; + + if (!kmem_alloc_batch(&head, obj_cache, gfp)) break; guard(raw_spinlock_irqsave)(&pool_lock); -- cgit v1.2.3 From fef0e649f8b42bdffe4a916dd46e1b1e9ad2f207 Mon Sep 17 00:00:00 2001 From: Felix Gu Date: Fri, 30 Jan 2026 00:21:19 +0800 Subject: drm/logicvc: Fix device node reference leak in logicvc_drm_config_parse() The logicvc_drm_config_parse() function calls of_get_child_by_name() to find the "layers" node but fails to release the reference, leading to a device node reference leak. Fix this by using the __free(device_node) cleanup attribute to automatic release the reference when the variable goes out of scope. 
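For comparison, the manual pattern that the scope-based cleanup replaces (parse_layers() is a hypothetical stand-in for whatever consumes the node); with __free(device_node), of_node_put() runs automatically when layers_node leaves scope, so every early return is covered without extra bookkeeping:

struct device_node *layers_node = of_get_child_by_name(of_node, "layers");

if (!layers_node) {
	drm_err(drm_dev, "Missing non-optional layers node\n");
	return -EINVAL;
}
ret = parse_layers(logicvc, layers_node);	/* hypothetical helper */
of_node_put(layers_node);			/* easy to miss on new error paths */
return ret;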
Fixes: efeeaefe9be5 ("drm: Add support for the LogiCVC display controller") Signed-off-by: Felix Gu Reviewed-by: Luca Ceresoli Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20260130-logicvc_drm-v1-1-04366463750c@gmail.com Signed-off-by: Luca Ceresoli --- drivers/gpu/drm/logicvc/logicvc_drm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/logicvc/logicvc_drm.c b/drivers/gpu/drm/logicvc/logicvc_drm.c index 204b0fee55d0..bbebf4fc7f51 100644 --- a/drivers/gpu/drm/logicvc/logicvc_drm.c +++ b/drivers/gpu/drm/logicvc/logicvc_drm.c @@ -92,7 +92,6 @@ static int logicvc_drm_config_parse(struct logicvc_drm *logicvc) struct device *dev = drm_dev->dev; struct device_node *of_node = dev->of_node; struct logicvc_drm_config *config = &logicvc->config; - struct device_node *layers_node; int ret; logicvc_of_property_parse_bool(of_node, LOGICVC_OF_PROPERTY_DITHERING, @@ -128,7 +127,8 @@ static int logicvc_drm_config_parse(struct logicvc_drm *logicvc) if (ret) return ret; - layers_node = of_get_child_by_name(of_node, "layers"); + struct device_node *layers_node __free(device_node) = + of_get_child_by_name(of_node, "layers"); if (!layers_node) { drm_err(drm_dev, "Missing non-optional layers node\n"); return -EINVAL; -- cgit v1.2.3 From b777b5e09eabeefc6ba80f4296366a4742701103 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Feb 2026 17:02:25 +0000 Subject: time/jiffies: Inline jiffies_to_msecs() and jiffies_to_usecs() For common cases (HZ=100, 250 or 1000), these helpers are at most one multiply, so there is no point calling a tiny function. Keep them out of line for HZ=300 and others. This saves cycles in TCP fast path, among other things. $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/8 grow/shrink: 25/89 up/down: 530/-3474 (-2944) ... nla_put_msecs 193 - -193 message_stats_print 2131 920 -1211 Total: Before=25365208, After=25362264, chg -0.01% Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260210170226.57209-1-edumazet@google.com --- include/linux/jiffies.h | 40 ++++++++++++++++++++++++++++++++++++++-- kernel/time/time.c | 19 +++++++------------ 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fdef2c155c27..d1c3d4941854 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -434,8 +434,44 @@ extern unsigned long preset_lpj; /* * Convert various time units to each other: */ -extern unsigned int jiffies_to_msecs(const unsigned long j); -extern unsigned int jiffies_to_usecs(const unsigned long j); + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) +/** + * jiffies_to_msecs - Convert jiffies to milliseconds + * @j: jiffies value + * + * This inline version takes care of HZ in {100,250,1000}. + * + * Return: milliseconds value + */ +static inline unsigned int jiffies_to_msecs(const unsigned long j) +{ + return (MSEC_PER_SEC / HZ) * j; +} +#else +unsigned int jiffies_to_msecs(const unsigned long j); +#endif + +#if !(USEC_PER_SEC % HZ) +/** + * jiffies_to_usecs - Convert jiffies to microseconds + * @j: jiffies value + * + * Return: microseconds value + */ +static inline unsigned int jiffies_to_usecs(const unsigned long j) +{ + /* + * Hz usually doesn't go much further MSEC_PER_SEC. + * jiffies_to_usecs() and usecs_to_jiffies() depend on that. 
+ */ + BUILD_BUG_ON(HZ > USEC_PER_SEC); + + return (USEC_PER_SEC / HZ) * j; +} +#else +unsigned int jiffies_to_usecs(const unsigned long j); +#endif /** * jiffies_to_nsecs - Convert jiffies to nanoseconds diff --git a/kernel/time/time.c b/kernel/time/time.c index 0ba8e3c50d62..36fd2313ae7e 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -365,20 +365,16 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) } #endif +#if HZ > MSEC_PER_SEC || (MSEC_PER_SEC % HZ) /** * jiffies_to_msecs - Convert jiffies to milliseconds * @j: jiffies value * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases. - * * Return: milliseconds value */ unsigned int jiffies_to_msecs(const unsigned long j) { -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) +#if HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 @@ -390,7 +386,9 @@ unsigned int jiffies_to_msecs(const unsigned long j) #endif } EXPORT_SYMBOL(jiffies_to_msecs); +#endif +#if (USEC_PER_SEC % HZ) /** * jiffies_to_usecs - Convert jiffies to microseconds * @j: jiffies value @@ -405,17 +403,14 @@ unsigned int jiffies_to_usecs(const unsigned long j) */ BUILD_BUG_ON(HZ > USEC_PER_SEC); -#if !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#else -# if BITS_PER_LONG == 32 +#if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; -# else +#else return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; -# endif #endif } EXPORT_SYMBOL(jiffies_to_usecs); +#endif /** * mktime64 - Converts date to seconds. -- cgit v1.2.3 From 5ee01f1a7343d6a3547b6802ca2d4cdce0edacb1 Mon Sep 17 00:00:00 2001 From: Qingye Zhao Date: Wed, 11 Feb 2026 09:24:04 +0000 Subject: cgroup: fix race between task migration and iteration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a task is migrated out of a css_set, cgroup_migrate_add_task() first moves it from cset->tasks to cset->mg_tasks via: list_move_tail(&task->cg_list, &cset->mg_tasks); If a css_task_iter currently has it->task_pos pointing to this task, css_set_move_task() calls css_task_iter_skip() to keep the iterator valid. However, since the task has already been moved to ->mg_tasks, the iterator is advanced relative to the mg_tasks list instead of the original tasks list. As a result, remaining tasks on cset->tasks, as well as tasks queued on cset->mg_tasks, can be skipped by iteration. Fix this by calling css_set_skip_task_iters() before unlinking task->cg_list from cset->tasks. This advances all active iterators to the next task on cset->tasks, so iteration continues correctly even when a task is concurrently being migrated. This race is hard to hit in practice without instrumentation, but it can be reproduced by artificially slowing down cgroup_procs_show(). For example, on an Android device a temporary /sys/kernel/cgroup/cgroup_test knob can be added to inject a delay into cgroup_procs_show(), and then: 1) Spawn three long-running tasks (PIDs 101, 102, 103). 2) Create a test cgroup and move the tasks into it. 3) Enable a large delay via /sys/kernel/cgroup/cgroup_test. 4) In one shell, read cgroup.procs from the test cgroup. 5) Within the delay window, in another shell migrate PID 102 by writing it to a different cgroup.procs file. Under this setup, cgroup.procs can intermittently show only PID 101 while skipping PID 103. 
Once the migration completes, reading the file again shows all tasks as expected. Note that this change does not allow removing the existing css_set_skip_task_iters() call in css_set_move_task(). The new call in cgroup_migrate_add_task() only handles iterators that are racing with migration while the task is still on cset->tasks. Iterators may also start after the task has been moved to cset->mg_tasks. If we dropped css_set_skip_task_iters() from css_set_move_task(), such iterators could keep task_pos pointing to a migrating task, causing css_task_iter_advance() to malfunction on the destination css_set, up to and including crashes or infinite loops. The race window between migration and iteration is very small, and css_task_iter is not on a hot path. In the worst case, when an iterator is positioned on the first thread of the migrating process, cgroup_migrate_add_task() may have to skip multiple tasks via css_set_skip_task_iters(). However, this only happens when migration and iteration actually race, so the performance impact is negligible compared to the correctness fix provided here. Fixes: b636fd38dc40 ("cgroup: Implement css_task_iter_skip()") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Qingye Zhao Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8af4351536cf..49da0874a023 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2608,6 +2608,7 @@ static void cgroup_migrate_add_task(struct task_struct *task, mgctx->tset.nr_tasks++; + css_set_skip_task_iters(cset, task); list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, -- cgit v1.2.3 From f66857bafd4f151c5cc6856e47be2e12c1721e43 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 13 Feb 2026 14:38:12 +0000 Subject: KVM: arm64: Hide S1POE from guests when not supported by the host When CONFIG_ARM64_POE is disabled, KVM does not save/restore POR_EL1. However, ID_AA64MMFR3_EL1 sanitisation currently exposes the feature to guests whenever the hardware supports it, ignoring the host kernel configuration. If a guest detects this feature and attempts to use it, the host will fail to context-switch POR_EL1, potentially leading to state corruption. Fix this by masking ID_AA64MMFR3_EL1.S1POE in the sanitised system registers, preventing KVM from advertising the feature when the host does not support it (i.e. system_supports_poe() is false). 
Fixes: 70ed7238297f ("KVM: arm64: Sanitise ID_AA64MMFR3_EL1") Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260213143815.1732675-2-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a7cd0badc20c..1b4cacb6e918 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1816,6 +1816,9 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, ID_AA64MMFR3_EL1_SCTLRX | ID_AA64MMFR3_EL1_S1POE | ID_AA64MMFR3_EL1_S1PIE; + + if (!system_supports_poe()) + val &= ~ID_AA64MMFR3_EL1_S1POE; break; case SYS_ID_MMFR4_EL1: val &= ~ID_MMFR4_EL1_CCIDX; -- cgit v1.2.3 From 9cb0468d0b335ccf769bd8e161cc96195e82d8b1 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 13 Feb 2026 14:38:13 +0000 Subject: KVM: arm64: Optimise away S1POE handling when not supported by host Although ID register sanitisation prevents guests from seeing the feature, adding this check to the helper allows the compiler to entirely eliminate S1POE-specific code paths (such as context switching POR_EL1) when the host kernel is compiled without support (CONFIG_ARM64_POE is disabled). This aligns with the pattern used for other optional features like SVE (kvm_has_sve()) and FPMR (kvm_has_fpmr()), ensuring no POE logic if the host lacks support, regardless of the guest configuration state. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260213143815.1732675-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5d5a3bbdb95e..2ca264b3db5f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1616,7 +1616,8 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1PIE, IMP)) #define kvm_has_s1poe(k) \ - (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) + (system_supports_poe() && \ + kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) #define kvm_has_ras(k) \ (kvm_has_feat((k), ID_AA64PFR0_EL1, RAS, IMP)) -- cgit v1.2.3 From 7e7c2cf0024d89443a7af52e09e47b1fe634ab17 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 13 Feb 2026 14:38:14 +0000 Subject: KVM: arm64: Fix ID register initialization for non-protected pKVM guests In protected mode, the hypervisor maintains a separate instance of the `kvm` structure for each VM. For non-protected VMs, this structure is initialized from the host's `kvm` state. Currently, `pkvm_init_features_from_host()` copies the `KVM_ARCH_FLAG_ID_REGS_INITIALIZED` flag from the host without the underlying `id_regs` data being initialized. This results in the hypervisor seeing the flag as set while the ID registers remain zeroed. Consequently, `kvm_has_feat()` checks at EL2 fail (return 0) for non-protected VMs. This breaks logic that relies on feature detection, such as `ctxt_has_tcrx()` for TCR2_EL1 support. As a result, certain system registers (e.g., TCR2_EL1, PIR_EL1, POR_EL1) are not saved/restored during the world switch, which could lead to state corruption. Fix this by explicitly copying the ID registers from the host `kvm` to the hypervisor `kvm` for non-protected VMs during initialization, since we trust the host with its non-protected guests' features. 
Also ensure `KVM_ARCH_FLAG_ID_REGS_INITIALIZED` is cleared initially in `pkvm_init_features_from_host` so that `vm_copy_id_regs` can properly initialize them and set the flag once done. Fixes: 41d6028e28bd ("KVM: arm64: Convert the SVE guest vcpu flag to a vm flag") Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260213143815.1732675-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 8e29d7734a15..f3c1c6955163 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -342,6 +342,7 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc /* No restrictions for non-protected VMs. */ if (!kvm_vm_is_protected(kvm)) { hyp_vm->kvm.arch.flags = host_arch_flags; + hyp_vm->kvm.arch.flags &= ~BIT_ULL(KVM_ARCH_FLAG_ID_REGS_INITIALIZED); bitmap_copy(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, @@ -471,6 +472,35 @@ err: return ret; } +static int vm_copy_id_regs(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + const struct kvm *host_kvm = hyp_vm->host_kvm; + struct kvm *kvm = &hyp_vm->kvm; + + if (!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &host_kvm->arch.flags)) + return -EINVAL; + + if (test_and_set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return 0; + + memcpy(kvm->arch.id_regs, host_kvm->arch.id_regs, sizeof(kvm->arch.id_regs)); + + return 0; +} + +static int pkvm_vcpu_init_sysregs(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + int ret = 0; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + else + ret = vm_copy_id_regs(hyp_vcpu); + + return ret; +} + static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, struct pkvm_hyp_vm *hyp_vm, struct kvm_vcpu *host_vcpu) @@ -490,8 +520,9 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; - if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) - kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + ret = pkvm_vcpu_init_sysregs(hyp_vcpu); + if (ret) + goto done; ret = pkvm_vcpu_init_traps(hyp_vcpu); if (ret) -- cgit v1.2.3 From 02471a78a052b631204aed051ab718e4d14ae687 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 13 Feb 2026 14:38:15 +0000 Subject: KVM: arm64: Remove redundant kern_hyp_va() in unpin_host_sve_state() The `sve_state` pointer in `hyp_vcpu->vcpu.arch` is initialized as a hypervisor virtual address during vCPU initialization in `pkvm_vcpu_init_sve()`. `unpin_host_sve_state()` calls `kern_hyp_va()` on this address. Since `kern_hyp_va()` is idempotent, it's not a bug. However, it is unnecessary and potentially confusing. Remove the redundant conversion. 
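A simplified sketch of the convention the cleanup relies on (the assignment shown is illustrative, not quoted from pkvm_vcpu_init_sve()): the host pointer is translated to a hypervisor VA exactly once when the hyp view of the vCPU is built, so later users read the stored value as-is:

/* init time: translate the host kernel VA once */
hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state);

/* later (e.g. unpinning): already a hyp VA, no further conversion */
sve_state = hyp_vcpu->vcpu.arch.sve_state;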
Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260213143815.1732675-5-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index f3c1c6955163..2f029bfe4755 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -392,7 +392,7 @@ static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu) if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE)) return; - sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state); + sve_state = hyp_vcpu->vcpu.arch.sve_state; hyp_unpin_shared_mem(sve_state, sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu)); } -- cgit v1.2.3 From ee5c38a8d31e5dea52299c43c2ec3213351ab6e1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Feb 2026 14:30:23 -0800 Subject: KVM: arm64: vgic: Handle const qualifier from gic_kvm_info allocation type In preparation for making the kmalloc family of allocators type aware, we need to make sure that the returned type from the allocation matches the type of the variable being assigned. (Before, the allocator would always return "void *", which can be implicitly cast to any pointer type.) The assigned type is "struct gic_kvm_info", but the returned type, while matching, is const qualified. To get them exactly matching, just use the dereferenced pointer for the sizeof(). Signed-off-by: Kees Cook Link: https://patch.msgid.link/20260206223022.it.052-kees@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 86c149537493..a53f93546aa0 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -654,7 +654,7 @@ static struct gic_kvm_info *gic_kvm_info; void __init vgic_set_kvm_info(const struct gic_kvm_info *info) { BUG_ON(gic_kvm_info != NULL); - gic_kvm_info = kmalloc(sizeof(*info), GFP_KERNEL); + gic_kvm_info = kmalloc(sizeof(*gic_kvm_info), GFP_KERNEL); if (gic_kvm_info) *gic_kvm_info = *info; } -- cgit v1.2.3 From 0b87d51690dd5131cbe9fbd23746b037aab89815 Mon Sep 17 00:00:00 2001 From: Franz Schnyder Date: Fri, 6 Feb 2026 13:37:36 +0100 Subject: drm/bridge: ti-sn65dsi86: Enable HPD polling if IRQ is not used Fallback to polling to detect hotplug events on systems without interrupts. On systems where the interrupt line of the bridge is not connected, the bridge cannot notify hotplug events. Only add the DRM_BRIDGE_OP_HPD flag if an interrupt has been registered otherwise remain in polling mode. 
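What "remain in polling mode" relies on is the DRM core's bridge-connector fallback (an approximation of the helper's behaviour, not a verbatim quote): a bridge that advertises DRM_BRIDGE_OP_DETECT but not DRM_BRIDGE_OP_HPD gets a connector marked for periodic polling, roughly:

if (bridge->ops & DRM_BRIDGE_OP_HPD)
	connector->polled = DRM_CONNECTOR_POLL_HPD;
else if (bridge->ops & DRM_BRIDGE_OP_DETECT)
	connector->polled = DRM_CONNECTOR_POLL_CONNECT |
			    DRM_CONNECTOR_POLL_DISCONNECT;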
Fixes: 55e8ff842051 ("drm/bridge: ti-sn65dsi86: Add HPD for DisplayPort connector type") Cc: stable@vger.kernel.org # 6.16: 9133bc3f0564: drm/bridge: ti-sn65dsi86: Add Signed-off-by: Franz Schnyder Reviewed-by: Douglas Anderson [dianders: Adjusted Fixes/stable line based on discussion] Signed-off-by: Douglas Anderson Link: https://patch.msgid.link/20260206123758.374555-1-fra.schnyder@gmail.com --- drivers/gpu/drm/bridge/ti-sn65dsi86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c index 276d05d25ad8..98d64ad791d0 100644 --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c @@ -1415,6 +1415,7 @@ static int ti_sn_bridge_probe(struct auxiliary_device *adev, { struct ti_sn65dsi86 *pdata = dev_get_drvdata(adev->dev.parent); struct device_node *np = pdata->dev->of_node; + const struct i2c_client *client = to_i2c_client(pdata->dev); int ret; pdata->next_bridge = devm_drm_of_get_bridge(&adev->dev, np, 1, 0); @@ -1433,8 +1434,9 @@ static int ti_sn_bridge_probe(struct auxiliary_device *adev, ? DRM_MODE_CONNECTOR_DisplayPort : DRM_MODE_CONNECTOR_eDP; if (pdata->bridge.type == DRM_MODE_CONNECTOR_DisplayPort) { - pdata->bridge.ops = DRM_BRIDGE_OP_EDID | DRM_BRIDGE_OP_DETECT | - DRM_BRIDGE_OP_HPD; + pdata->bridge.ops = DRM_BRIDGE_OP_EDID | DRM_BRIDGE_OP_DETECT; + if (client->irq) + pdata->bridge.ops |= DRM_BRIDGE_OP_HPD; /* * If comms were already enabled they would have been enabled * with the wrong value of HPD_DISABLE. Update it now. Comms -- cgit v1.2.3 From e9e0b48cd15b46dcb2bbc165f6b0fee698b855d6 Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Sun, 8 Feb 2026 22:47:26 +0000 Subject: drm/fourcc: fix plane order for 10/12/16-bit YCbCr formats The short comments had the correct order, but the long comments had the planes reversed. Fixes: 2271e0a20ef7 ("drm: drm_fourcc: add 10/12/16bit software decoder YCbCr formats") Signed-off-by: Simon Ser Reviewed-by: Daniel Stone Reviewed-by: Robert Mader Link: https://patch.msgid.link/20260208224718.57199-1-contact@emersion.fr --- include/uapi/drm/drm_fourcc.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index e527b24bd824..c89aede3cb12 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -401,8 +401,8 @@ extern "C" { * implementation can multiply the values by 2^6=64. For that reason the padding * must only contain zeros. * index 0 = Y plane, [15:0] z:Y [6:10] little endian - * index 1 = Cr plane, [15:0] z:Cr [6:10] little endian - * index 2 = Cb plane, [15:0] z:Cb [6:10] little endian + * index 1 = Cb plane, [15:0] z:Cb [6:10] little endian + * index 2 = Cr plane, [15:0] z:Cr [6:10] little endian */ #define DRM_FORMAT_S010 fourcc_code('S', '0', '1', '0') /* 2x2 subsampled Cb (1) and Cr (2) planes 10 bits per channel */ #define DRM_FORMAT_S210 fourcc_code('S', '2', '1', '0') /* 2x1 subsampled Cb (1) and Cr (2) planes 10 bits per channel */ @@ -414,8 +414,8 @@ extern "C" { * implementation can multiply the values by 2^4=16. For that reason the padding * must only contain zeros. 
* index 0 = Y plane, [15:0] z:Y [4:12] little endian - * index 1 = Cr plane, [15:0] z:Cr [4:12] little endian - * index 2 = Cb plane, [15:0] z:Cb [4:12] little endian + * index 1 = Cb plane, [15:0] z:Cb [4:12] little endian + * index 2 = Cr plane, [15:0] z:Cr [4:12] little endian */ #define DRM_FORMAT_S012 fourcc_code('S', '0', '1', '2') /* 2x2 subsampled Cb (1) and Cr (2) planes 12 bits per channel */ #define DRM_FORMAT_S212 fourcc_code('S', '2', '1', '2') /* 2x1 subsampled Cb (1) and Cr (2) planes 12 bits per channel */ @@ -424,8 +424,8 @@ extern "C" { /* * 3 plane YCbCr * index 0 = Y plane, [15:0] Y little endian - * index 1 = Cr plane, [15:0] Cr little endian - * index 2 = Cb plane, [15:0] Cb little endian + * index 1 = Cb plane, [15:0] Cb little endian + * index 2 = Cr plane, [15:0] Cr little endian */ #define DRM_FORMAT_S016 fourcc_code('S', '0', '1', '6') /* 2x2 subsampled Cb (1) and Cr (2) planes 16 bits per channel */ #define DRM_FORMAT_S216 fourcc_code('S', '2', '1', '6') /* 2x1 subsampled Cb (1) and Cr (2) planes 16 bits per channel */ -- cgit v1.2.3 From 1cb968a2013ffa8112d52ebe605009ea1c6a582c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 24 Jan 2026 04:18:40 +0000 Subject: nfsd: Fix cred ref leak in nfsd_nl_threads_set_doit(). syzbot reported memory leak of struct cred. [0] nfsd_nl_threads_set_doit() passes get_current_cred() to nfsd_svc(), but put_cred() is not called after that. The cred is finally passed down to _svc_xprt_create(), which calls get_cred() with the cred for struct svc_xprt. The ownership of the refcount by get_current_cred() is not transferred to anywhere and is just leaked. nfsd_svc() is also called from write_threads(), but it does not bump file->f_cred there. nfsd_nl_threads_set_doit() is called from sendmsg() and current->cred does not go away. Let's use current_cred() in nfsd_nl_threads_set_doit(). [0]: BUG: memory leak unreferenced object 0xffff888108b89480 (size 184): comm "syz-executor", pid 5994, jiffies 4294943386 hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
backtrace (crc 369454a7): kmemleak_alloc_recursive include/linux/kmemleak.h:44 [inline] slab_post_alloc_hook mm/slub.c:4958 [inline] slab_alloc_node mm/slub.c:5263 [inline] kmem_cache_alloc_noprof+0x412/0x580 mm/slub.c:5270 prepare_creds+0x22/0x600 kernel/cred.c:185 copy_creds+0x44/0x290 kernel/cred.c:286 copy_process+0x7a7/0x2870 kernel/fork.c:2086 kernel_clone+0xac/0x6e0 kernel/fork.c:2651 __do_sys_clone+0x7f/0xb0 kernel/fork.c:2792 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xa4/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 924f4fb003ba ("NFSD: convert write_threads to netlink command") Cc: stable@vger.kernel.org Reported-by: syzbot+dd3b43aa0204089217ee@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69744674.a00a0220.33ccc7.0000.GAE@google.com/ Tested-by: syzbot+dd3b43aa0204089217ee@syzkaller.appspotmail.com Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 89fe2c0e8d44..43ced6b0e197 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1647,7 +1647,7 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) if (attr) nn->min_threads = nla_get_u32(attr); - ret = nfsd_svc(nrpools, nthreads, net, get_current_cred(), scope); + ret = nfsd_svc(nrpools, nthreads, net, current_cred(), scope); if (ret > 0) ret = 0; out_unlock: -- cgit v1.2.3 From 92978c83bb4eef55d02a6c990c01c423131eefa7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 24 Jan 2026 04:18:41 +0000 Subject: nfsd: Fix cred ref leak in nfsd_nl_listener_set_doit(). nfsd_nl_listener_set_doit() uses get_current_cred() without put_cred(). As we can see from other callers, svc_xprt_create_from_sa() does not require the extra refcount. nfsd_nl_listener_set_doit() is always in the process context, sendmsg(), and current->cred does not go away. Let's use current_cred() in nfsd_nl_listener_set_doit(). Fixes: 16a471177496 ("NFSD: add listener-{set,get} netlink command") Cc: stable@vger.kernel.org Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 43ced6b0e197..b06adb5d5a2e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2000,7 +2000,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) } ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, 0, - get_current_cred()); + current_cred()); /* always save the latest error */ if (ret < 0) err = ret; -- cgit v1.2.3 From 9478c166c46934160135e197b049b5a05753f2ad Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 21 Nov 2024 11:46:01 +1000 Subject: nouveau/gsp: drop WARN_ON in ACPI probes These WARN_ONs seem to trigger a lot, and we don't seem to have a plan to fix them, so just drop them, as they are most likely harmless. 
Cc: stable@vger.kernel.org Fixes: 176fdcbddfd2 ("drm/nouveau/gsp/r535: add support for booting GSP-RM") Signed-off-by: Dave Airlie Link: https://patch.msgid.link/20241121014601.229391-1-airlied@gmail.com Signed-off-by: Danilo Krummrich --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/gsp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/gsp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/gsp.c index 7fb13434c051..a575a8dbf727 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/gsp.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/gsp.c @@ -737,8 +737,8 @@ r535_gsp_acpi_caps(acpi_handle handle, CAPS_METHOD_DATA *caps) if (!obj) goto done; - if (WARN_ON(obj->type != ACPI_TYPE_BUFFER) || - WARN_ON(obj->buffer.length != 4)) + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length != 4) goto done; caps->status = 0; @@ -773,8 +773,8 @@ r535_gsp_acpi_jt(acpi_handle handle, JT_METHOD_DATA *jt) if (!obj) goto done; - if (WARN_ON(obj->type != ACPI_TYPE_BUFFER) || - WARN_ON(obj->buffer.length != 4)) + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length != 4) goto done; jt->status = 0; @@ -861,8 +861,8 @@ r535_gsp_acpi_dod(acpi_handle handle, DOD_METHOD_DATA *dod) _DOD = output.pointer; - if (WARN_ON(_DOD->type != ACPI_TYPE_PACKAGE) || - WARN_ON(_DOD->package.count > ARRAY_SIZE(dod->acpiIdList))) + if (_DOD->type != ACPI_TYPE_PACKAGE || + _DOD->package.count > ARRAY_SIZE(dod->acpiIdList)) return; for (int i = 0; i < _DOD->package.count; i++) { -- cgit v1.2.3 From 46120745bb4e7e1f09959624716b4c5d6e2c2e9e Mon Sep 17 00:00:00 2001 From: Ethan Tidmore Date: Sun, 15 Feb 2026 22:04:38 -0600 Subject: drm/tiny: sharp-memory: fix pointer error dereference The function devm_drm_dev_alloc() returns a pointer error upon failure not NULL. Change null check to pointer error check. Detected by Smatch: drivers/gpu/drm/tiny/sharp-memory.c:549 sharp_memory_probe() error: 'smd' dereferencing possible ERR_PTR() Fixes: b8f9f21716fec ("drm/tiny: Add driver for Sharp Memory LCD") Signed-off-by: Ethan Tidmore Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patch.msgid.link/20260216040438.43702-1-ethantidmore06@gmail.com --- drivers/gpu/drm/tiny/sharp-memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/tiny/sharp-memory.c b/drivers/gpu/drm/tiny/sharp-memory.c index 64272cd0f6e2..cbf69460ebf3 100644 --- a/drivers/gpu/drm/tiny/sharp-memory.c +++ b/drivers/gpu/drm/tiny/sharp-memory.c @@ -541,8 +541,8 @@ static int sharp_memory_probe(struct spi_device *spi) smd = devm_drm_dev_alloc(dev, &sharp_memory_drm_driver, struct sharp_memory_device, drm); - if (!smd) - return -ENOMEM; + if (IS_ERR(smd)) + return PTR_ERR(smd); spi_set_drvdata(spi, smd); -- cgit v1.2.3 From 1072020685f4b81f6efad3b412cdae0bd62bb043 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 12 Feb 2026 12:41:25 +0100 Subject: irqchip/sifive-plic: Fix frozen interrupt due to affinity setting PLIC ignores interrupt completion message for disabled interrupt, explained by the specification: The PLIC signals it has completed executing an interrupt handler by writing the interrupt ID it received from the claim to the claim/complete register. The PLIC does not check whether the completion ID is the same as the last claim ID for that target. If the completion ID does not match an interrupt source that is currently enabled for the target, the completion is silently ignored. 
This caused problems in the past, because an interrupt can be disabled while still being handled and plic_irq_eoi() had no effect. That was fixed by checking if the interrupt is disabled, and if so enable it, before sending the completion message. That check is done with irqd_irq_disabled(). However, that is not sufficient because the enable bit for the handling hart can be zero despite irqd_irq_disabled(d) being false. This can happen when affinity setting is changed while a hart is still handling the interrupt. This problem is easily reproducible by dumping a large file to uart (which generates lots of interrupts) and at the same time keep changing the uart interrupt's affinity setting. The uart port becomes frozen almost instantaneously. Fix this by checking PLIC's enable bit instead of irqd_irq_disabled(). Fixes: cc9f04f9a84f ("irqchip/sifive-plic: Implement irq_set_affinity() for SMP host") Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260212114125.3148067-1-namcao@linutronix.de --- drivers/irqchip/irq-sifive-plic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index 60fd8f91762b..70058871d2fb 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -172,8 +172,13 @@ static void plic_irq_disable(struct irq_data *d) static void plic_irq_eoi(struct irq_data *d) { struct plic_handler *handler = this_cpu_ptr(&plic_handlers); + u32 __iomem *reg; + bool enabled; + + reg = handler->enable_base + (d->hwirq / 32) * sizeof(u32); + enabled = readl(reg) & BIT(d->hwirq % 32); - if (unlikely(irqd_irq_disabled(d))) { + if (unlikely(!enabled)) { plic_toggle(handler, d->hwirq, 1); writel(d->hwirq, handler->hart_base + CONTEXT_CLAIM); plic_toggle(handler, d->hwirq, 0); -- cgit v1.2.3 From ce9e40a9a5e5cff0b1b0d2fa582b3d71a8ce68e8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 6 Feb 2026 15:48:16 +0000 Subject: irqchip/gic-v3-its: Limit number of per-device MSIs to the range the ITS supports The ITS driver blindly assumes that EventIDs are in abundant supply, to the point where it never checks how many the hardware actually supports. It turns out that some pretty esoteric integrations make it so that only a few bits are available, all the way down to a single bit. Enforce the advertised limitation at the point of allocating the device structure, and hope that the endpoint driver can deal with such limitation. Fixes: 84a6a2e7fc18d ("irqchip: GICv3: ITS: device allocation and configuration") Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Reviewed-by: Robin Murphy Reviewed-by: Zenghui Yu Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260206154816.3582887-1-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 4 ++++ include/linux/irqchip/arm-gic-v3.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 2988def30972..a51e8e6a8181 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3475,6 +3475,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, int lpi_base; int nr_lpis; int nr_ites; + int id_bits; int sz; if (!its_alloc_device_table(its, dev_id)) @@ -3486,7 +3487,10 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, /* * Even if the device wants a single LPI, the ITT must be * sized as a power of two (and you need at least one bit...). 
+ * Also honor the ITS's own EID limit. */ + id_bits = FIELD_GET(GITS_TYPER_IDBITS, its->typer) + 1; + nvecs = min_t(unsigned int, nvecs, BIT(id_bits)); nr_ites = max(2, nvecs); sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1); sz = max(sz, ITS_ITT_ALIGN); diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 70c0948f978e..0225121f3013 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -394,6 +394,7 @@ #define GITS_TYPER_VLPIS (1UL << 1) #define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 #define GITS_TYPER_ITT_ENTRY_SIZE GENMASK_ULL(7, 4) +#define GITS_TYPER_IDBITS GENMASK_ULL(12, 8) #define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS GENMASK_ULL(17, 13) -- cgit v1.2.3 From f0617176be5e497b67b3c87bac35b26ebccac499 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 16 Feb 2026 12:04:50 +0100 Subject: irqchip/mmp: Make icu_irq_chip variable static const File-scope 'icu_irq_chip' is not used outside of this unit and is not modified anywhere, so make it static const to silence sparse warning: irq-mmp.c:139:17: warning: symbol 'icu_irq_chip' was not declared. Should it be static? Signed-off-by: Krzysztof Kozlowski Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260216110449.160277-2-krzysztof.kozlowski@oss.qualcomm.com --- drivers/irqchip/irq-mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-mmp.c b/drivers/irqchip/irq-mmp.c index 09e640430208..8e210cb451be 100644 --- a/drivers/irqchip/irq-mmp.c +++ b/drivers/irqchip/irq-mmp.c @@ -136,7 +136,7 @@ static void icu_unmask_irq(struct irq_data *d) } } -struct irq_chip icu_irq_chip = { +static const struct irq_chip icu_irq_chip = { .name = "icu_irq", .irq_mask = icu_mask_irq, .irq_mask_ack = icu_mask_ack_irq, -- cgit v1.2.3 From bffda93a51b40afd67c11bf558dc5aae83ca0943 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 12 Feb 2026 11:23:27 -0800 Subject: scsi: lpfc: Properly set WC for DPP mapping Using set_memory_wc() to enable write-combining for the DPP portion of the MMIO mapping is wrong as set_memory_*() is meant to operate on RAM only, not MMIO mappings. In fact, as used currently triggers a BUG_ON() with enabled CONFIG_DEBUG_VIRTUAL. Simply map the DPP region separately and in addition to the already existing mappings, avoiding any possible negative side effects for these. Fixes: 1351e69fc6db ("scsi: lpfc: Add push-to-adapter support to sli4") Signed-off-by: Mathias Krause Signed-off-by: Justin Tee Reviewed-by: Mathias Krause Link: https://patch.msgid.link/20260212192327.141104-1-justintee8345@gmail.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/lpfc/lpfc_init.c | 2 ++ drivers/scsi/lpfc/lpfc_sli.c | 36 ++++++++++++++++++++++++++++++------ drivers/scsi/lpfc/lpfc_sli4.h | 3 +++ 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index a116a16c4a6f..b5e53c7d33e7 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -12039,6 +12039,8 @@ lpfc_sli4_pci_mem_unset(struct lpfc_hba *phba) iounmap(phba->sli4_hba.conf_regs_memmap_p); if (phba->sli4_hba.dpp_regs_memmap_p) iounmap(phba->sli4_hba.dpp_regs_memmap_p); + if (phba->sli4_hba.dpp_regs_memmap_wc_p) + iounmap(phba->sli4_hba.dpp_regs_memmap_wc_p); break; case LPFC_SLI_INTF_IF_TYPE_1: break; diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 734af3d039f8..690763e0aa55 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -15981,6 +15981,32 @@ lpfc_dual_chute_pci_bar_map(struct lpfc_hba *phba, uint16_t pci_barset) return NULL; } +static __maybe_unused void __iomem * +lpfc_dpp_wc_map(struct lpfc_hba *phba, uint8_t dpp_barset) +{ + + /* DPP region is supposed to cover 64-bit BAR2 */ + if (dpp_barset != WQ_PCI_BAR_4_AND_5) { + lpfc_log_msg(phba, KERN_WARNING, LOG_INIT, + "3273 dpp_barset x%x != WQ_PCI_BAR_4_AND_5\n", + dpp_barset); + return NULL; + } + + if (!phba->sli4_hba.dpp_regs_memmap_wc_p) { + void __iomem *dpp_map; + + dpp_map = ioremap_wc(phba->pci_bar2_map, + pci_resource_len(phba->pcidev, + PCI_64BIT_BAR4)); + + if (dpp_map) + phba->sli4_hba.dpp_regs_memmap_wc_p = dpp_map; + } + + return phba->sli4_hba.dpp_regs_memmap_wc_p; +} + /** * lpfc_modify_hba_eq_delay - Modify Delay Multiplier on EQs * @phba: HBA structure that EQs are on. @@ -16944,9 +16970,6 @@ lpfc_wq_create(struct lpfc_hba *phba, struct lpfc_queue *wq, uint8_t dpp_barset; uint32_t dpp_offset; uint8_t wq_create_version; -#ifdef CONFIG_X86 - unsigned long pg_addr; -#endif /* sanity check on queue memory */ if (!wq || !cq) @@ -17132,14 +17155,15 @@ lpfc_wq_create(struct lpfc_hba *phba, struct lpfc_queue *wq, #ifdef CONFIG_X86 /* Enable combined writes for DPP aperture */ - pg_addr = (unsigned long)(wq->dpp_regaddr) & PAGE_MASK; - rc = set_memory_wc(pg_addr, 1); - if (rc) { + bar_memmap_p = lpfc_dpp_wc_map(phba, dpp_barset); + if (!bar_memmap_p) { lpfc_printf_log(phba, KERN_ERR, LOG_INIT, "3272 Cannot setup Combined " "Write on WQ[%d] - disable DPP\n", wq->queue_id); phba->cfg_enable_dpp = 0; + } else { + wq->dpp_regaddr = bar_memmap_p + dpp_offset; } #else phba->cfg_enable_dpp = 0; diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index ee58383492b2..b6d90604bb61 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -785,6 +785,9 @@ struct lpfc_sli4_hba { void __iomem *dpp_regs_memmap_p; /* Kernel memory mapped address for * dpp registers */ + void __iomem *dpp_regs_memmap_wc_p;/* Kernel memory mapped address for + * dpp registers with write combining + */ union { struct { /* IF Type 0, BAR 0 PCI cfg space reg mem map */ -- cgit v1.2.3 From 57297736c08233987e5d29ce6584c6ca2a831b12 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 29 Jan 2026 15:30:39 +0100 Subject: scsi: storvsc: Fix scheduling while atomic on PREEMPT_RT This resolves the follow splat and lock-up when running with PREEMPT_RT enabled on Hyper-V: [ 415.140818] BUG: scheduling while atomic: stress-ng-iomix/1048/0x00000002 [ 415.140822] INFO: lockdep is turned off. 
[ 415.140823] Modules linked in: intel_rapl_msr intel_rapl_common intel_uncore_frequency_common intel_pmc_core pmt_telemetry pmt_discovery pmt_class intel_pmc_ssram_telemetry intel_vsec ghash_clmulni_intel aesni_intel rapl binfmt_misc nls_ascii nls_cp437 vfat fat snd_pcm hyperv_drm snd_timer drm_client_lib drm_shmem_helper snd sg soundcore drm_kms_helper pcspkr hv_balloon hv_utils evdev joydev drm configfs efi_pstore nfnetlink vsock_loopback vmw_vsock_virtio_transport_common hv_sock vmw_vsock_vmci_transport vsock vmw_vmci efivarfs autofs4 ext4 crc16 mbcache jbd2 sr_mod sd_mod cdrom hv_storvsc serio_raw hid_generic scsi_transport_fc hid_hyperv scsi_mod hid hv_netvsc hyperv_keyboard scsi_common [ 415.140846] Preemption disabled at: [ 415.140847] [] storvsc_queuecommand+0x2e1/0xbe0 [hv_storvsc] [ 415.140854] CPU: 8 UID: 0 PID: 1048 Comm: stress-ng-iomix Not tainted 6.19.0-rc7 #30 PREEMPT_{RT,(full)} [ 415.140856] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/04/2024 [ 415.140857] Call Trace: [ 415.140861] [ 415.140861] ? storvsc_queuecommand+0x2e1/0xbe0 [hv_storvsc] [ 415.140863] dump_stack_lvl+0x91/0xb0 [ 415.140870] __schedule_bug+0x9c/0xc0 [ 415.140875] __schedule+0xdf6/0x1300 [ 415.140877] ? rtlock_slowlock_locked+0x56c/0x1980 [ 415.140879] ? rcu_is_watching+0x12/0x60 [ 415.140883] schedule_rtlock+0x21/0x40 [ 415.140885] rtlock_slowlock_locked+0x502/0x1980 [ 415.140891] rt_spin_lock+0x89/0x1e0 [ 415.140893] hv_ringbuffer_write+0x87/0x2a0 [ 415.140899] vmbus_sendpacket_mpb_desc+0xb6/0xe0 [ 415.140900] ? rcu_is_watching+0x12/0x60 [ 415.140902] storvsc_queuecommand+0x669/0xbe0 [hv_storvsc] [ 415.140904] ? HARDIRQ_verbose+0x10/0x10 [ 415.140908] ? __rq_qos_issue+0x28/0x40 [ 415.140911] scsi_queue_rq+0x760/0xd80 [scsi_mod] [ 415.140926] __blk_mq_issue_directly+0x4a/0xc0 [ 415.140928] blk_mq_issue_direct+0x87/0x2b0 [ 415.140931] blk_mq_dispatch_queue_requests+0x120/0x440 [ 415.140933] blk_mq_flush_plug_list+0x7a/0x1a0 [ 415.140935] __blk_flush_plug+0xf4/0x150 [ 415.140940] __submit_bio+0x2b2/0x5c0 [ 415.140944] ? submit_bio_noacct_nocheck+0x272/0x360 [ 415.140946] submit_bio_noacct_nocheck+0x272/0x360 [ 415.140951] ext4_read_bh_lock+0x3e/0x60 [ext4] [ 415.140995] ext4_block_write_begin+0x396/0x650 [ext4] [ 415.141018] ? __pfx_ext4_da_get_block_prep+0x10/0x10 [ext4] [ 415.141038] ext4_da_write_begin+0x1c4/0x350 [ext4] [ 415.141060] generic_perform_write+0x14e/0x2c0 [ 415.141065] ext4_buffered_write_iter+0x6b/0x120 [ext4] [ 415.141083] vfs_write+0x2ca/0x570 [ 415.141087] ksys_write+0x76/0xf0 [ 415.141089] do_syscall_64+0x99/0x1490 [ 415.141093] ? rcu_is_watching+0x12/0x60 [ 415.141095] ? finish_task_switch.isra.0+0xdf/0x3d0 [ 415.141097] ? rcu_is_watching+0x12/0x60 [ 415.141098] ? lock_release+0x1f0/0x2a0 [ 415.141100] ? rcu_is_watching+0x12/0x60 [ 415.141101] ? finish_task_switch.isra.0+0xe4/0x3d0 [ 415.141103] ? rcu_is_watching+0x12/0x60 [ 415.141104] ? __schedule+0xb34/0x1300 [ 415.141106] ? hrtimer_try_to_cancel+0x1d/0x170 [ 415.141109] ? do_nanosleep+0x8b/0x160 [ 415.141111] ? hrtimer_nanosleep+0x89/0x100 [ 415.141114] ? __pfx_hrtimer_wakeup+0x10/0x10 [ 415.141116] ? xfd_validate_state+0x26/0x90 [ 415.141118] ? rcu_is_watching+0x12/0x60 [ 415.141120] ? do_syscall_64+0x1e0/0x1490 [ 415.141121] ? do_syscall_64+0x1e0/0x1490 [ 415.141123] ? rcu_is_watching+0x12/0x60 [ 415.141124] ? do_syscall_64+0x1e0/0x1490 [ 415.141125] ? do_syscall_64+0x1e0/0x1490 [ 415.141127] ? 
irqentry_exit+0x140/0x7e0 [ 415.141129] entry_SYSCALL_64_after_hwframe+0x76/0x7e get_cpu() disables preemption while the spinlock hv_ringbuffer_write is using is converted to an rt-mutex under PREEMPT_RT. Signed-off-by: Jan Kiszka Tested-by: Florian Bezdeka Reviewed-by: Michael Kelley Tested-by: Michael Kelley Link: https://patch.msgid.link/0c7fb5cd-fb21-4760-8593-e04bade84744@siemens.com Signed-off-by: Martin K. Petersen --- drivers/scsi/storvsc_drv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 19e18939d3a5..19ff0004e259 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1855,8 +1855,9 @@ static enum scsi_qc_status storvsc_queuecommand(struct Scsi_Host *host, cmd_request->payload_sz = payload_sz; /* Invokes the vsc to start an IO */ - ret = storvsc_do_io(dev, cmd_request, get_cpu()); - put_cpu(); + migrate_disable(); + ret = storvsc_do_io(dev, cmd_request, smp_processor_id()); + migrate_enable(); if (ret) scsi_dma_unmap(scmnd); -- cgit v1.2.3 From 2e6b5cd6a4b37a95b78cf8c39a979b58c915c8ed Mon Sep 17 00:00:00 2001 From: Alexey Charkov Date: Mon, 9 Feb 2026 19:17:34 +0400 Subject: scsi: ufs: core: Fix RPMB region size detection for UFS 2.2 Older UFS spec devices (2.2 and earlier) do not expose per-region RPMB sizes, as only one RPMB region is supported. In such cases, the size of the single RPMB region can be deduced from the Logical Block Count and Logical Block Size fields in the RPMB Unit Descriptor. Add a fallback mechanism to calculate the RPMB region size from these fields if the device implements an older spec, so that the RPMB driver can work with such devices - otherwise it silently skips the whole RPMB. Section 14.1.4.6 (RPMB Unit Descriptor) Link: https://www.jedec.org/system/files/docs/JESD220C-2_2.pdf Cc: stable@vger.kernel.org Fixes: b06b8c421485 ("scsi: ufs: core: Add OP-TEE based RPMB driver for UFS devices") Reviewed-by: Bean Huo Signed-off-by: Alexey Charkov Link: https://patch.msgid.link/20260209-ufs-rpmb-v3-1-b1804e71bd38@flipper.net Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 8349fe2090db..12f1bdc70587 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -5248,6 +5249,25 @@ static void ufshcd_lu_init(struct ufs_hba *hba, struct scsi_device *sdev) hba->dev_info.rpmb_region_size[1] = desc_buf[RPMB_UNIT_DESC_PARAM_REGION1_SIZE]; hba->dev_info.rpmb_region_size[2] = desc_buf[RPMB_UNIT_DESC_PARAM_REGION2_SIZE]; hba->dev_info.rpmb_region_size[3] = desc_buf[RPMB_UNIT_DESC_PARAM_REGION3_SIZE]; + + if (hba->dev_info.wspecversion <= 0x0220) { + /* + * These older spec chips have only one RPMB region, + * sized between 128 kB minimum and 16 MB maximum. 
+ * No per region size fields are provided (respective + * REGIONX_SIZE fields always contain zeros), so get + * it from the logical block count and size fields for + * compatibility + * + * (See JESD220C-2_2 Section 14.1.4.6 + * RPMB Unit Descriptor,* offset 13h, 4 bytes) + */ + hba->dev_info.rpmb_region_size[0] = + (get_unaligned_be64(desc_buf + + RPMB_UNIT_DESC_PARAM_LOGICAL_BLK_COUNT) + << desc_buf[RPMB_UNIT_DESC_PARAM_LOGICAL_BLK_SIZE]) + / SZ_128K; + } } -- cgit v1.2.3 From 70ca8caa96ce473647054f5c7b9dab5423902402 Mon Sep 17 00:00:00 2001 From: Tomas Henzl Date: Tue, 10 Feb 2026 20:18:50 +0100 Subject: scsi: ses: Fix devices attaching to different hosts On a multipath SAS system some devices don't end up with correct symlinks from the SCSI device to its enclosure. Some devices even have enclosure links pointing to enclosures attached to different SCSI hosts. ses_match_to_enclosure() calls enclosure_for_each_device() which iterates over all enclosures on the system, not just enclosures attached to the current SCSI host. Replace the iteration with a direct call to ses_enclosure_find_by_addr(). Reviewed-by: David Jeffery Signed-off-by: Tomas Henzl Link: https://patch.msgid.link/20260210191850.36784-1-thenzl@redhat.com Signed-off-by: Martin K. Petersen --- drivers/scsi/ses.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index 789b170da652..98a7367d2f20 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -528,9 +528,8 @@ struct efd { }; static int ses_enclosure_find_by_addr(struct enclosure_device *edev, - void *data) + struct efd *efd) { - struct efd *efd = data; int i; struct ses_component *scomp; @@ -683,7 +682,7 @@ static void ses_match_to_enclosure(struct enclosure_device *edev, if (efd.addr) { efd.dev = &sdev->sdev_gendev; - enclosure_for_each_device(ses_enclosure_find_by_addr, &efd); + ses_enclosure_find_by_addr(edev, &efd); } } -- cgit v1.2.3 From 5b313760059c9df7d60aba7832279bcb81b4aec0 Mon Sep 17 00:00:00 2001 From: Won Jung Date: Wed, 11 Feb 2026 15:01:05 +0900 Subject: scsi: ufs: core: Reset urgent_bkops_lvl to allow runtime PM power mode Ensure that UFS Runtime PM can achieve power saving after System PM suspend by resetting hba->urgent_bkops_lvl. Also modify ufshcd_bkops_exception_event_handler() to avoid setting urgent_bkops_lvl when the status is 0, which helps maintain optimal power management. On UFS devices supporting UFSHCD_CAP_AUTO_BKOPS_SUSPEND, a BKOPS exception event can lead to a situation where UFS Runtime PM can't enter low-power mode states even after the BKOPS exception has been resolved. When a BKOPS exception with bkops status 0 occurs, the driver logs: "ufshcd_bkops_exception_event_handler: device raised urgent BKOPS exception for bkops status 0" When a BKOPS exception occurs, ufshcd_bkops_exception_event_handler() reads the BKOPS status and sets hba->urgent_bkops_lvl to BKOPS_STATUS_NO_OP(0). This allows the device to perform Runtime PM without changing the UFS power mode. (__ufshcd_wl_suspend(hba, UFS_RUNTIME_PM)) During system PM suspend, ufshcd_disable_auto_bkops() is called, disabling auto bkops. After UFS System PM Resume, when runtime PM attempts to suspend again, ufshcd_urgent_bkops() is invoked. Since hba->urgent_bkops_lvl remains at BKOPS_STATUS_NO_OP(0), ufshcd_enable_auto_bkops() is triggered. However, in ufshcd_bkops_ctrl(), the driver compares the current BKOPS status with hba->urgent_bkops_lvl, and only enables auto bkops if curr_status >= hba->urgent_bkops_lvl. 
Since both values are 0, the condition is met. As a result, __ufshcd_wl_suspend(hba, UFS_RUNTIME_PM) skips power mode transitions and remains in an active state, preventing power saving even though no urgent BKOPS condition exists. Signed-off-by: Won Jung Reviewed-by: Peter Wang Link: https://patch.msgid.link/1891546521.01770806581968.JavaMail.epsvc@epcpadp2new Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 12f1bdc70587..3bbc2b51c87b 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -5982,6 +5982,7 @@ static int ufshcd_disable_auto_bkops(struct ufs_hba *hba) hba->auto_bkops_enabled = false; trace_ufshcd_auto_bkops_state(hba, "Disabled"); + hba->urgent_bkops_lvl = BKOPS_STATUS_PERF_IMPACT; hba->is_urgent_bkops_lvl_checked = false; out: return err; @@ -6085,7 +6086,7 @@ static void ufshcd_bkops_exception_event_handler(struct ufs_hba *hba) * impacted or critical. Handle these device by determining their urgent * bkops status at runtime. */ - if (curr_status < BKOPS_STATUS_PERF_IMPACT) { + if ((curr_status > BKOPS_STATUS_NO_OP) && (curr_status < BKOPS_STATUS_PERF_IMPACT)) { dev_err(hba->dev, "%s: device raised urgent BKOPS exception for bkops status %d\n", __func__, curr_status); /* update the current status as the urgent bkops level */ -- cgit v1.2.3 From fa96392ebebc8fade2b878acb14cce0f71016503 Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Thu, 12 Feb 2026 12:30:26 +0530 Subject: scsi: mpi3mr: Add NULL checks when resetting request and reply queues The driver encountered a crash during resource cleanup when the reply and request queues were NULL due to freed memory. This issue occurred when the creation of reply or request queues failed, and the driver freed the memory first, but attempted to memset the content of the freed memory, leading to a system crash. Add NULL pointer checks for reply and request queues before accessing the reply/request memory during cleanup. Signed-off-by: Ranjan Kumar Link: https://patch.msgid.link/20260212070026.30263-1-ranjan.kumar@broadcom.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index 1cfbdb773353..04d4a2aea7d7 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -4806,21 +4806,25 @@ void mpi3mr_memset_buffers(struct mpi3mr_ioc *mrioc) } for (i = 0; i < mrioc->num_queues; i++) { - mrioc->op_reply_qinfo[i].qid = 0; - mrioc->op_reply_qinfo[i].ci = 0; - mrioc->op_reply_qinfo[i].num_replies = 0; - mrioc->op_reply_qinfo[i].ephase = 0; - atomic_set(&mrioc->op_reply_qinfo[i].pend_ios, 0); - atomic_set(&mrioc->op_reply_qinfo[i].in_use, 0); - mpi3mr_memset_op_reply_q_buffers(mrioc, i); - - mrioc->req_qinfo[i].ci = 0; - mrioc->req_qinfo[i].pi = 0; - mrioc->req_qinfo[i].num_requests = 0; - mrioc->req_qinfo[i].qid = 0; - mrioc->req_qinfo[i].reply_qid = 0; - spin_lock_init(&mrioc->req_qinfo[i].q_lock); - mpi3mr_memset_op_req_q_buffers(mrioc, i); + if (mrioc->op_reply_qinfo) { + mrioc->op_reply_qinfo[i].qid = 0; + mrioc->op_reply_qinfo[i].ci = 0; + mrioc->op_reply_qinfo[i].num_replies = 0; + mrioc->op_reply_qinfo[i].ephase = 0; + atomic_set(&mrioc->op_reply_qinfo[i].pend_ios, 0); + atomic_set(&mrioc->op_reply_qinfo[i].in_use, 0); + mpi3mr_memset_op_reply_q_buffers(mrioc, i); + } + + if (mrioc->req_qinfo) { + mrioc->req_qinfo[i].ci = 0; + mrioc->req_qinfo[i].pi = 0; + mrioc->req_qinfo[i].num_requests = 0; + mrioc->req_qinfo[i].qid = 0; + mrioc->req_qinfo[i].reply_qid = 0; + spin_lock_init(&mrioc->req_qinfo[i].q_lock); + mpi3mr_memset_op_req_q_buffers(mrioc, i); + } } atomic_set(&mrioc->pend_large_data_sz, 0); -- cgit v1.2.3 From 38353c26db28efd984f51d426eac2396d299cca7 Mon Sep 17 00:00:00 2001 From: Salomon Dushimirimana Date: Fri, 13 Feb 2026 19:28:06 +0000 Subject: scsi: pm8001: Fix use-after-free in pm8001_queue_command() Commit e29c47fe8946 ("scsi: pm8001: Simplify pm8001_task_exec()") refactors pm8001_queue_command(), however it introduces a potential cause of a double free scenario when it changes the function to return -ENODEV in case of phy down/device gone state. In this path, pm8001_queue_command() updates task status and calls task_done to indicate to upper layer that the task has been handled. However, this also frees the underlying SAS task. A -ENODEV is then returned to the caller. When libsas sas_ata_qc_issue() receives this error value, it assumes the task wasn't handled/queued by LLDD and proceeds to clean up and free the task again, resulting in a double free. Since pm8001_queue_command() handles the SAS task in this case, it should return 0 to the caller indicating that the task has been handled. Fixes: e29c47fe8946 ("scsi: pm8001: Simplify pm8001_task_exec()") Signed-off-by: Salomon Dushimirimana Reviewed-by: Damien Le Moal Link: https://patch.msgid.link/20260213192806.439432-1-salomondush@google.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/pm8001/pm8001_sas.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index 6a8d35aea93a..645524f3fe2d 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -525,8 +525,9 @@ int pm8001_queue_command(struct sas_task *task, gfp_t gfp_flags) } else { task->task_done(task); } - rc = -ENODEV; - goto err_out; + spin_unlock_irqrestore(&pm8001_ha->lock, flags); + pm8001_dbg(pm8001_ha, IO, "pm8001_task_exec device gone\n"); + return 0; } ccb = pm8001_ccb_alloc(pm8001_ha, pm8001_dev, task); -- cgit v1.2.3 From af3973e7b4fd91e6884b312526168869fb013d68 Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Mon, 16 Feb 2026 15:10:55 +0100 Subject: scsi: snic: Remove unused linkstatus The (struct vnic_dev).linkstatus buffer is freed in svnic_dev_unregister() and referenced in svnic_dev_link_status() but never alloc'd. This means (struct vnic_dev).linkstatus is always NULL, and both the dealloc and the reference in svnic_dev_link_status() are dead code. Signed-off-by: Thomas Fourier Acked-by: Karan Tilak Kumar Link: https://patch.msgid.link/20260216141056.59429-2-fourier.thomas@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/snic/vnic_dev.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/scsi/snic/vnic_dev.c b/drivers/scsi/snic/vnic_dev.c index 760f3f22095c..c4df0b17c86c 100644 --- a/drivers/scsi/snic/vnic_dev.c +++ b/drivers/scsi/snic/vnic_dev.c @@ -42,8 +42,6 @@ struct vnic_dev { struct vnic_devcmd_notify *notify; struct vnic_devcmd_notify notify_copy; dma_addr_t notify_pa; - u32 *linkstatus; - dma_addr_t linkstatus_pa; struct vnic_stats *stats; dma_addr_t stats_pa; struct vnic_devcmd_fw_info *fw_info; @@ -650,8 +648,6 @@ int svnic_dev_init(struct vnic_dev *vdev, int arg) int svnic_dev_link_status(struct vnic_dev *vdev) { - if (vdev->linkstatus) - return *vdev->linkstatus; if (!vnic_dev_notify_ready(vdev)) return 0; @@ -686,11 +682,6 @@ void svnic_dev_unregister(struct vnic_dev *vdev) sizeof(struct vnic_devcmd_notify), vdev->notify, vdev->notify_pa); - if (vdev->linkstatus) - dma_free_coherent(&vdev->pdev->dev, - sizeof(u32), - vdev->linkstatus, - vdev->linkstatus_pa); if (vdev->stats) dma_free_coherent(&vdev->pdev->dev, sizeof(struct vnic_stats), -- cgit v1.2.3 From 97af85787c1964a7e7b146c5d4e021f089af47ea Mon Sep 17 00:00:00 2001 From: Karan Tilak Kumar Date: Tue, 17 Feb 2026 12:46:58 -0800 Subject: scsi: snic: MAINTAINERS: Update snic maintainers Update snic maintainers. Signed-off-by: Karan Tilak Kumar Link: https://patch.msgid.link/20260217204658.5465-1-kartilak@cisco.com Signed-off-by: Martin K. Petersen --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 70292bd3ed71..0900af786cce 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6115,6 +6115,7 @@ F: drivers/scsi/fnic/ CISCO SCSI HBA DRIVER M: Karan Tilak Kumar +M: Narsimhulu Musini M: Sesidhar Baddela L: linux-scsi@vger.kernel.org S: Supported -- cgit v1.2.3 From 7be41fb00e2c2a823f271a8318b453ca11812f1e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 29 Oct 2025 08:30:11 +0300 Subject: accel: ethosu: Fix shift overflow in cmd_to_addr() The "((cmd[0] & 0xff0000) << 16)" shift always evaluates to zero because the masked bits are shifted out of the 32-bit type. This was intended to be (((u64)cmd[0] & 0xff0000) << 16). Move the cast to the correct location. 
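As an aside, a small stand-alone C sketch (an editor's illustration, not part of the patch; the value 0x00ab0000 is an arbitrary example) shows why the cast placement matters:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t cmd0 = 0x00ab0000;	/* bits 16..23 carry the high address byte */

		/* Buggy: the shift happens in 32-bit arithmetic, the masked bits fall off the top. */
		uint64_t wrong = (uint64_t)((cmd0 & 0xff0000) << 16);
		/* Fixed: widen to 64 bits first, then shift. */
		uint64_t fixed = ((uint64_t)cmd0 & 0xff0000) << 16;

		/* Prints wrong=0 fixed=ab00000000 */
		printf("wrong=%llx fixed=%llx\n",
		       (unsigned long long)wrong, (unsigned long long)fixed);
		return 0;
	}
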
Fixes: 5a5e9c0228e6 ("accel: Add Arm Ethos-U NPU driver") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/aQGmY64tWcwOGFP4@stanley.mountain Signed-off-by: Rob Herring (Arm) --- drivers/accel/ethosu/ethosu_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ethosu/ethosu_gem.c b/drivers/accel/ethosu/ethosu_gem.c index 473b5f5d7514..7b073116314b 100644 --- a/drivers/accel/ethosu/ethosu_gem.c +++ b/drivers/accel/ethosu/ethosu_gem.c @@ -154,7 +154,7 @@ static void cmd_state_init(struct cmd_state *st) static u64 cmd_to_addr(u32 *cmd) { - return ((u64)((cmd[0] & 0xff0000) << 16)) | cmd[1]; + return (((u64)cmd[0] & 0xff0000) << 16) | cmd[1]; } static u64 dma_length(struct ethosu_validated_cmdstream_info *info, -- cgit v1.2.3 From 023cd6d90f8aa2ef7b72d84be84a18e61ecebd64 Mon Sep 17 00:00:00 2001 From: Piotr Mazek Date: Thu, 5 Feb 2026 23:05:02 +0100 Subject: ACPI: PM: Save NVS memory on Lenovo G70-35 Commit 821d6f0359b0614792ab8e2fb93b503e25a65079 prevented machines produced later than 2012 from saving the NVS region to accelerate S3. Despite being made after 2012, the Lenovo G70-35 still needs NVS memory saving during S3. A quirk is introduced for this platform. Signed-off-by: Piotr Mazek [ rjw: Subject adjustment ] Link: https://patch.msgid.link/GV2PPF3CD5B63CC2442EE3F76F8443EAD90D499A@GV2PPF3CD5B63CC.EURP251.PROD.OUTLOOK.COM Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 66ec81e306d4..132a9df98471 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -386,6 +386,14 @@ static const struct dmi_system_id acpisleep_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "80E1"), }, }, + { + .callback = init_nvs_save_s3, + .ident = "Lenovo G70-35", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "80Q5"), + }, + }, /* * ThinkPad X1 Tablet(2016) cannot do suspend-to-idle using * the Low Power S0 Idle firmware interface (see -- cgit v1.2.3 From ab140365fb62c0bdab22b2f516aff563b2559e3b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 19 Feb 2026 15:20:12 +0100 Subject: drbd: fix "LOGIC BUG" in drbd_al_begin_io_nonblock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even though we check that we "should" be able to do lc_get_cumulative() while holding the device->al_lock spinlock, it may still fail if some other code path decided to do lc_try_lock() with bad timing. If that happened, we logged "LOGIC BUG for enr=...", but still did not return an error. The rest of the code now assumed that this request has references for the relevant activity log extents. The implications are that during an active resync, mutual exclusivity of resync versus application IO is not guaranteed. And a potential crash at this point may not realize that these extents could have been the target of in-flight IO and would need to be resynced just in case. Also, once the request completes, it will give up activity log references it does not even hold, which will trigger a BUG_ON(refcnt == 0) in lc_put(). Fix: Do not crash the kernel for a condition that is harmless during normal operation: also catch "e->refcnt == 0", not only "e == NULL" when being noisy about "al_complete_io() called on inactive extent %u\n". And do not try to be smart and "guess" whether something will work, then be surprised when it does not. Deal with the fact that it may or may not work. 
If it does not, remember a possible "partially in activity log" state (only possible for requests that cross extent boundaries), and return an error code from drbd_al_begin_io_nonblock(). A latter call for the same request will then resume from where we left off. Cc: stable@vger.kernel.org Signed-off-by: Lars Ellenberg Signed-off-by: Christoph Böhmwalder Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 53 +++++++++++++++++--------------------- drivers/block/drbd/drbd_interval.h | 5 +++- 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 742b2908ff68..b3dbf6c76e98 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -483,38 +483,20 @@ void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i) int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i) { - struct lru_cache *al = device->act_log; /* for bios crossing activity log extent boundaries, * we may need to activate two extents in one go */ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); - unsigned nr_al_extents; - unsigned available_update_slots; unsigned enr; - D_ASSERT(device, first <= last); - - nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */ - available_update_slots = min(al->nr_elements - al->used, - al->max_pending_changes - al->pending_changes); - - /* We want all necessary updates for a given request within the same transaction - * We could first check how many updates are *actually* needed, - * and use that instead of the worst-case nr_al_extents */ - if (available_update_slots < nr_al_extents) { - /* Too many activity log extents are currently "hot". - * - * If we have accumulated pending changes already, - * we made progress. - * - * If we cannot get even a single pending change through, - * stop the fast path until we made some progress, - * or requests to "cold" extents could be starved. */ - if (!al->pending_changes) - __set_bit(__LC_STARVING, &device->act_log->flags); - return -ENOBUFS; + if (i->partially_in_al_next_enr) { + D_ASSERT(device, first < i->partially_in_al_next_enr); + D_ASSERT(device, last >= i->partially_in_al_next_enr); + first = i->partially_in_al_next_enr; } + D_ASSERT(device, first <= last); + /* Is resync active in this area? */ for (enr = first; enr <= last; enr++) { struct lc_element *tmp; @@ -529,14 +511,21 @@ int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval * } } - /* Checkout the refcounts. - * Given that we checked for available elements and update slots above, - * this has to be successful. */ + /* Try to checkout the refcounts. */ for (enr = first; enr <= last; enr++) { struct lc_element *al_ext; al_ext = lc_get_cumulative(device->act_log, enr); - if (!al_ext) - drbd_info(device, "LOGIC BUG for enr=%u\n", enr); + + if (!al_ext) { + /* Did not work. We may have exhausted the possible + * changes per transaction. Or raced with someone + * "locking" it against changes. + * Remember where to continue from. + */ + if (enr > first) + i->partially_in_al_next_enr = enr; + return -ENOBUFS; + } } return 0; } @@ -556,7 +545,11 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) for (enr = first; enr <= last; enr++) { extent = lc_find(device->act_log, enr); - if (!extent) { + /* Yes, this masks a bug elsewhere. 
However, during normal + * operation this is harmless, so no need to crash the kernel + * by the BUG_ON(refcount == 0) in lc_put(). + */ + if (!extent || extent->refcnt == 0) { drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr); continue; } diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h index 366489b72fe9..5d3213b81eed 100644 --- a/drivers/block/drbd/drbd_interval.h +++ b/drivers/block/drbd/drbd_interval.h @@ -8,12 +8,15 @@ struct drbd_interval { struct rb_node rb; sector_t sector; /* start sector of the interval */ - unsigned int size; /* size in bytes */ sector_t end; /* highest interval end in subtree */ + unsigned int size; /* size in bytes */ unsigned int local:1 /* local or remote request? */; unsigned int waiting:1; /* someone is waiting for completion */ unsigned int completed:1; /* this has been completed already; * ignore for conflict detection */ + + /* to resume a partially successful drbd_al_begin_io_nonblock(); */ + unsigned int partially_in_al_next_enr; }; static inline void drbd_clear_interval(struct drbd_interval *i) -- cgit v1.2.3 From 81b1f046ff8a5ad5da2c970cff354b61dfa1d6b1 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 12 Jan 2026 18:04:12 +0100 Subject: drbd: Replace deprecated strcpy with strscpy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit strcpy() has been deprecated [1] because it performs no bounds checking on the destination buffer, which can lead to buffer overflows. Replace it with the safer strscpy(). No functional changes. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strcpy [1] Signed-off-by: Thorsten Blum Reviewed-by: Christoph Böhmwalder Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 14 +++++++++----- drivers/block/drbd/drbd_receiver.c | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1f6ac9202b66..ba00b5f21a49 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -732,9 +733,9 @@ int drbd_send_sync_param(struct drbd_peer_device *peer_device) } if (apv >= 88) - strcpy(p->verify_alg, nc->verify_alg); + strscpy(p->verify_alg, nc->verify_alg); if (apv >= 89) - strcpy(p->csums_alg, nc->csums_alg); + strscpy(p->csums_alg, nc->csums_alg); rcu_read_unlock(); return drbd_send_command(peer_device, sock, cmd, size, NULL, 0); @@ -745,6 +746,7 @@ int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cm struct drbd_socket *sock; struct p_protocol *p; struct net_conf *nc; + size_t integrity_alg_len; int size, cf; sock = &connection->data; @@ -762,8 +764,10 @@ int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cm } size = sizeof(*p); - if (connection->agreed_pro_version >= 87) - size += strlen(nc->integrity_alg) + 1; + if (connection->agreed_pro_version >= 87) { + integrity_alg_len = strlen(nc->integrity_alg) + 1; + size += integrity_alg_len; + } p->protocol = cpu_to_be32(nc->wire_protocol); p->after_sb_0p = cpu_to_be32(nc->after_sb_0p); @@ -778,7 +782,7 @@ int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cm p->conn_flags = cpu_to_be32(cf); if (connection->agreed_pro_version >= 87) - strcpy(p->integrity_alg, nc->integrity_alg); + strscpy(p->integrity_alg, nc->integrity_alg, integrity_alg_len); rcu_read_unlock(); return 
__conn_send_command(connection, sock, cmd, size, NULL, 0); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 3de919b6f0e1..3d0a061b6c40 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3801,14 +3801,14 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i *new_net_conf = *old_net_conf; if (verify_tfm) { - strcpy(new_net_conf->verify_alg, p->verify_alg); + strscpy(new_net_conf->verify_alg, p->verify_alg); new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; crypto_free_shash(peer_device->connection->verify_tfm); peer_device->connection->verify_tfm = verify_tfm; drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg); } if (csums_tfm) { - strcpy(new_net_conf->csums_alg, p->csums_alg); + strscpy(new_net_conf->csums_alg, p->csums_alg); new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; crypto_free_shash(peer_device->connection->csums_tfm); peer_device->connection->csums_tfm = csums_tfm; -- cgit v1.2.3 From c5f8658f97ec392eeaf355d4e9775ae1f23ca1d3 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 4 Feb 2026 17:06:29 +0800 Subject: drm/imx: parallel-display: check return value of devm_drm_bridge_add() in imx_pd_probe() Return the value of devm_drm_bridge_add() in order to propagate the error properly, if it fails due to resource allocation failure or bridge registration failure. This ensures that the probe function fails safely rather than proceeding with a potentially incomplete bridge setup. Fixes: bf7e97910b9f ("drm/imx: parallel-display: add the bridge before attaching it") Signed-off-by: Chen Ni Reviewed-by: Luca Ceresoli Link: https://patch.msgid.link/20260204090629.2209542-1-nichen@iscas.ac.cn Signed-off-by: Luca Ceresoli --- drivers/gpu/drm/imx/ipuv3/parallel-display.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/imx/ipuv3/parallel-display.c b/drivers/gpu/drm/imx/ipuv3/parallel-display.c index 6fbf505d2801..590120a33fa0 100644 --- a/drivers/gpu/drm/imx/ipuv3/parallel-display.c +++ b/drivers/gpu/drm/imx/ipuv3/parallel-display.c @@ -256,7 +256,9 @@ static int imx_pd_probe(struct platform_device *pdev) platform_set_drvdata(pdev, imxpd); - devm_drm_bridge_add(dev, &imxpd->bridge); + ret = devm_drm_bridge_add(dev, &imxpd->bridge); + if (ret) + return ret; return component_add(dev, &imx_pd_ops); } -- cgit v1.2.3 From 496daa2759260374bb9c9b2196a849aa3bc513a8 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Fri, 6 Feb 2026 12:06:21 +0800 Subject: drm/bridge: synopsys: dw-dp: Check return value of devm_drm_bridge_add() in dw_dp_bind() Return the value of devm_drm_bridge_add() in order to propagate the error properly, if it fails due to resource allocation failure or bridge registration failure. This ensures that the bind function fails safely rather than proceeding with a potentially incomplete bridge setup. 
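For reference, the check-and-propagate pattern both of these fixes introduce, shown in isolation (an editor's excerpt of the hunks above and below, with surrounding code elided):

	/* int-returning probe path, as in imx_pd_probe() */
	ret = devm_drm_bridge_add(dev, &imxpd->bridge);
	if (ret)
		return ret;

	/* pointer-returning bind path, as in dw_dp_bind() */
	ret = devm_drm_bridge_add(dev, bridge);
	if (ret)
		return ERR_PTR(ret);
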
Fixes: b726970486d8 ("drm/bridge: synopsys: dw-dp: add bridge before attaching") Signed-off-by: Chen Ni Reviewed-by: Andy Yan Reviewed-by: Luca Ceresoli Link: https://patch.msgid.link/20260206040621.4095517-1-nichen@iscas.ac.cn Signed-off-by: Luca Ceresoli --- drivers/gpu/drm/bridge/synopsys/dw-dp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/synopsys/dw-dp.c b/drivers/gpu/drm/bridge/synopsys/dw-dp.c index 432342452484..07f7a2e0d9f2 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-dp.c +++ b/drivers/gpu/drm/bridge/synopsys/dw-dp.c @@ -2049,7 +2049,9 @@ struct dw_dp *dw_dp_bind(struct device *dev, struct drm_encoder *encoder, bridge->type = DRM_MODE_CONNECTOR_DisplayPort; bridge->ycbcr_420_allowed = true; - devm_drm_bridge_add(dev, bridge); + ret = devm_drm_bridge_add(dev, bridge); + if (ret) + return ERR_PTR(ret); dp->aux.dev = dev; dp->aux.drm_dev = encoder->dev; -- cgit v1.2.3 From 803ec1faf7c1823e6e3b1f2aaa81be18528c9436 Mon Sep 17 00:00:00 2001 From: Osama Abdelkader Date: Mon, 9 Feb 2026 19:41:14 +0100 Subject: drm/bridge: samsung-dsim: Fix memory leak in error path In samsung_dsim_host_attach(), drm_bridge_add() is called to add the bridge. However, if samsung_dsim_register_te_irq() or pdata->host_ops->attach() fails afterwards, the function returns without removing the bridge, causing a memory leak. Fix this by adding proper error handling with goto labels to ensure drm_bridge_remove() is called in all error paths. Also ensure that samsung_dsim_unregister_te_irq() is called if the attach operation fails after the TE IRQ has been registered. samsung_dsim_unregister_te_irq() function is moved without changes to be before samsung_dsim_host_attach() to avoid forward declaration. Fixes: e7447128ca4a ("drm: bridge: Generalize Exynos-DSI driver into a Samsung DSIM bridge") Cc: stable@vger.kernel.org Signed-off-by: Osama Abdelkader Reviewed-by: Luca Ceresoli Link: https://patch.msgid.link/20260209184115.10937-1-osama.abdelkader@gmail.com Signed-off-by: Luca Ceresoli --- drivers/gpu/drm/bridge/samsung-dsim.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c b/drivers/gpu/drm/bridge/samsung-dsim.c index eabc4c32f6ab..ad8c6aa49d48 100644 --- a/drivers/gpu/drm/bridge/samsung-dsim.c +++ b/drivers/gpu/drm/bridge/samsung-dsim.c @@ -1881,6 +1881,14 @@ static int samsung_dsim_register_te_irq(struct samsung_dsim *dsi, struct device return 0; } +static void samsung_dsim_unregister_te_irq(struct samsung_dsim *dsi) +{ + if (dsi->te_gpio) { + free_irq(gpiod_to_irq(dsi->te_gpio), dsi); + gpiod_put(dsi->te_gpio); + } +} + static int samsung_dsim_host_attach(struct mipi_dsi_host *host, struct mipi_dsi_device *device) { @@ -1955,13 +1963,13 @@ of_find_panel_or_bridge: if (!(device->mode_flags & MIPI_DSI_MODE_VIDEO)) { ret = samsung_dsim_register_te_irq(dsi, &device->dev); if (ret) - return ret; + goto err_remove_bridge; } if (pdata->host_ops && pdata->host_ops->attach) { ret = pdata->host_ops->attach(dsi, device); if (ret) - return ret; + goto err_unregister_te_irq; } dsi->lanes = device->lanes; @@ -1969,14 +1977,13 @@ of_find_panel_or_bridge: dsi->mode_flags = device->mode_flags; return 0; -} -static void samsung_dsim_unregister_te_irq(struct samsung_dsim *dsi) -{ - if (dsi->te_gpio) { - free_irq(gpiod_to_irq(dsi->te_gpio), dsi); - gpiod_put(dsi->te_gpio); - } +err_unregister_te_irq: + if (!(device->mode_flags & MIPI_DSI_MODE_VIDEO)) + samsung_dsim_unregister_te_irq(dsi); 
+err_remove_bridge: + drm_bridge_remove(&dsi->bridge); + return ret; } static int samsung_dsim_host_detach(struct mipi_dsi_host *host, -- cgit v1.2.3 From 0d195d3b205ca90db30d70d09d7bb6909aac178f Mon Sep 17 00:00:00 2001 From: Christoph Böhmwalder Date: Fri, 20 Feb 2026 12:39:37 +0100 Subject: drbd: fix null-pointer dereference on local read error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In drbd_request_endio(), READ_COMPLETED_WITH_ERROR is passed to __req_mod() with a NULL peer_device: __req_mod(req, what, NULL, &m); The READ_COMPLETED_WITH_ERROR handler then unconditionally passes this NULL peer_device to drbd_set_out_of_sync(), which dereferences it, causing a null-pointer dereference. Fix this by obtaining the peer_device via first_peer_device(device), matching how drbd_req_destroy() handles the same situation. Cc: stable@vger.kernel.org Reported-by: Tuo Li Link: https://lore.kernel.org/linux-block/20260104165355.151864-1-islituo@gmail.com Signed-off-by: Christoph Böhmwalder Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d15826f6ee81..70f75ef07945 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -621,7 +621,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case READ_COMPLETED_WITH_ERROR: - drbd_set_out_of_sync(peer_device, req->i.sector, req->i.size); + drbd_set_out_of_sync(first_peer_device(device), + req->i.sector, req->i.size); drbd_report_io_error(device, req); __drbd_chk_io_error(device, DRBD_READ_ERROR); fallthrough; -- cgit v1.2.3 From cfc83a3c71517b59c1047db57da31e26a9dc2f33 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 16 Feb 2026 11:20:29 +0100 Subject: batman-adv: Avoid double-rtnl_lock ELP metric worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit batadv_v_elp_get_throughput() might be called when the RTNL lock is already held. This could be problematic when the work queue item is cancelled via cancel_delayed_work_sync() in batadv_v_elp_iface_disable(). In this case, an rtnl_lock() would cause a deadlock. To avoid this, rtnl_trylock() was used in this function to skip the retrieval of the ethtool information in case the RTNL lock was already held. But for cfg80211 interfaces, batadv_get_real_netdev() was called - which also uses rtnl_lock(). The approach for __ethtool_get_link_ksettings() must also be used instead and the lockless version __batadv_get_real_netdev() has to be called. Cc: stable@vger.kernel.org Fixes: 8c8ecc98f5c6 ("batman-adv: Drop unmanaged ELP metric worker") Reported-by: Christian Schmidbauer Signed-off-by: Sven Eckelmann Tested-by: Sören Skaarup Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 10 +++++++++- net/batman-adv/hard-interface.c | 8 ++++---- net/batman-adv/hard-interface.h | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index cb16c1ed2a58..fe832093d421 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -111,7 +111,15 @@ static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, /* unsupported WiFi driver version */ goto default_throughput; - real_netdev = batadv_get_real_netdev(hard_iface->net_dev); + /* only use rtnl_trylock because the elp worker will be cancelled while + * the rntl_lock is held. 
the cancel_delayed_work_sync() would otherwise + * wait forever when the elp work_item was started and it is then also + * trying to rtnl_lock + */ + if (!rtnl_trylock()) + return false; + real_netdev = __batadv_get_real_netdev(hard_iface->net_dev); + rtnl_unlock(); if (!real_netdev) goto default_throughput; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 5113f879736b..1c488049d554 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -204,7 +204,7 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev) } /** - * batadv_get_real_netdevice() - check if the given netdev struct is a virtual + * __batadv_get_real_netdev() - check if the given netdev struct is a virtual * interface on top of another 'real' interface * @netdev: the device to check * @@ -214,7 +214,7 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev) * Return: the 'real' net device or the original net device and NULL in case * of an error. */ -static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) +struct net_device *__batadv_get_real_netdev(struct net_device *netdev) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *real_netdev = NULL; @@ -267,7 +267,7 @@ struct net_device *batadv_get_real_netdev(struct net_device *net_device) struct net_device *real_netdev; rtnl_lock(); - real_netdev = batadv_get_real_netdevice(net_device); + real_netdev = __batadv_get_real_netdev(net_device); rtnl_unlock(); return real_netdev; @@ -336,7 +336,7 @@ static u32 batadv_wifi_flags_evaluate(struct net_device *net_device) if (batadv_is_cfg80211_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; - real_netdev = batadv_get_real_netdevice(net_device); + real_netdev = __batadv_get_real_netdev(net_device); if (!real_netdev) return wifi_flags; diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 9db8a310961e..9ba8fb2bdceb 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -67,6 +67,7 @@ enum batadv_hard_if_bcast { extern struct notifier_block batadv_hard_if_notifier; +struct net_device *__batadv_get_real_netdev(struct net_device *net_device); struct net_device *batadv_get_real_netdev(struct net_device *net_device); bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface); bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface); -- cgit v1.2.3 From 08f97454b7fa39bfcf82524955c771d2d693d6fe Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 22 Feb 2026 13:35:13 +0000 Subject: KVM: arm64: Fix protected mode handling of pages larger than 4kB Since 3669ddd8fa8b5 ("KVM: arm64: Add a range to pkvm_mappings"), pKVM tracks the memory that has been mapped into a guest in a side data structure. Crucially, it uses it to find out whether a page has already been mapped, and therefore refuses to map it twice. So far, so good. However, this very patch completely breaks non-4kB page support, with guests being unable to boot. The most obvious symptom is that we take the same fault repeatedly, and not making forward progress. A quick investigation shows that this is because of the above rejection code. 
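As a concrete sketch of the alignment eventually applied by the fix described below (an editor's illustration with arbitrary example values, assuming a 64kB page mapping; ALIGN_DOWN is redefined locally as a stand-in for the kernel macro):

	#include <stdint.h>
	#include <stdio.h>

	/* Local stand-in for the kernel's ALIGN_DOWN() on power-of-two sizes. */
	#define ALIGN_DOWN(x, a)	((x) & ~((uint64_t)(a) - 1))

	int main(void)
	{
		uint64_t vma_pagesize = 0x10000;	/* 64kB page mapping */
		uint64_t fault_ipa = 0x8000a000;	/* reported fault address, sub-64kB bits still set */

		/* Prints 80000000, the base of the 64kB page actually being mapped. */
		printf("%llx\n", (unsigned long long)ALIGN_DOWN(fault_ipa, vma_pagesize));
		return 0;
	}
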
As it turns out, there are multiple issues at play: - while the HPFAR_EL2 register gives you the faulting IPA minus the bottom 12 bits, it will still give you the extra bits that are part of the page offset for anything larger than 4kB, even for a level-3 mapping - pkvm_pgtable_stage2_map() assumes that the address passed as a parameter is aligned to the size of the intended mapping - the faulting address is only aligned for a non-page mapping When the planets are suitably aligned (pun intended), the guest faults on a page by accessing it past the bottom 4kB, and extra bits get set in the HPFAR_EL2 register. If this results in a page mapping (which is likely with large granule sizes), nothing aligns it further down, and pkvm_mapping_iter_first() finds an intersection that doesn't really exist. We assume this is a spurious fault and return -EAGAIN. And again... This doesn't hit outside of the protected code, as the page table code always aligns the IPA down to a page boundary, hiding the issue for everyone else. Fix it by always forcing the alignment to vma_pagesize, irrespective of its value. Fixes: 3669ddd8fa8b5 ("KVM: arm64: Add a range to pkvm_mappings") Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://patch.msgid.link/20260222141000.3084258-1-maz@kernel.org Cc: stable@vger.kernel.org --- arch/arm64/kvm/mmu.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 8c5d259810b2..3952415c4f83 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1753,14 +1753,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } /* - * Both the canonical IPA and fault IPA must be hugepage-aligned to - * ensure we find the right PFN and lay down the mapping in the right - * place. + * Both the canonical IPA and fault IPA must be aligned to the + * mapping size to ensure we find the right PFN and lay down the + * mapping in the right place. */ - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { - fault_ipa &= ~(vma_pagesize - 1); - ipa &= ~(vma_pagesize - 1); - } + fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize); + ipa = ALIGN_DOWN(ipa, vma_pagesize); gfn = ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); -- cgit v1.2.3 From 663c28469d3274d6456f206a6671c91493d85ff1 Mon Sep 17 00:00:00 2001 From: Henrique Carvalho Date: Sat, 21 Feb 2026 01:59:44 -0300 Subject: smb: client: fix cifs_pick_channel when channels are equally loaded cifs_pick_channel uses (start % chan_count) when channels are equally loaded, but that can return a channel that failed the eligibility checks. Drop the fallback and return the scan-selected channel instead. If none is eligible, keep the existing behavior of using the primary channel. Signed-off-by: Henrique Carvalho Acked-by: Paulo Alcantara (Red Hat) Acked-by: Meetakshi Setiya Reviewed-by: Shyam Prasad N Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/transport.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 75697f6d2566..05f8099047e1 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -807,16 +807,21 @@ cifs_cancelled_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid) } /* - * Return a channel (master if none) of @ses that can be used to send - * regular requests. 
+ * cifs_pick_channel - pick an eligible channel for network operations * - * If we are currently binding a new channel (negprot/sess.setup), - * return the new incomplete channel. + * @ses: session reference + * + * Select an eligible channel (not terminating and not marked as needing + * reconnect), preferring the least loaded one. If no eligible channel is + * found, fall back to the primary channel (index 0). + * + * Return: TCP_Server_Info pointer for the chosen channel, or NULL if @ses is + * NULL. */ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) { uint index = 0; - unsigned int min_in_flight = UINT_MAX, max_in_flight = 0; + unsigned int min_in_flight = UINT_MAX; struct TCP_Server_Info *server = NULL; int i, start, cur; @@ -846,14 +851,8 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) min_in_flight = server->in_flight; index = cur; } - if (server->in_flight > max_in_flight) - max_in_flight = server->in_flight; } - /* if all channels are equally loaded, fall back to round-robin */ - if (min_in_flight == max_in_flight) - index = (uint)start % ses->chan_count; - server = ses->chans[index].server; spin_unlock(&ses->chan_lock); -- cgit v1.2.3 From 2692c614f8f05929d692b3dbfd3faef1f00fbaf0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 10 Feb 2026 14:58:22 +0100 Subject: device property: Allow secondary lookup in fwnode_get_next_child_node() When device_get_child_node_count() got split to the fwnode and device respective APIs, the fwnode didn't inherit the ability to traverse over the secondary fwnode. Hence any user, that switches from device to fwnode API misses this feature. In particular, this was revealed by the commit 1490cbb9dbfd ("device property: Split fwnode_get_child_node_count()") that effectively broke the GPIO enumeration on Intel Galileo boards. Fix this by moving the secondary lookup from device to fwnode API. Note, in general no device_*() API should go into the depth of the fwnode implementation. Fixes: 114dbb4fa7c4 ("drivers property: When no children in primary, try secondary") Cc: stable@vger.kernel.org Signed-off-by: Andy Shevchenko Reviewed-by: Rafael J. 
Wysocki (Intel) Reviewed-by: Sakari Ailus Link: https://patch.msgid.link/20260210135822.47335-1-andriy.shevchenko@linux.intel.com Signed-off-by: Danilo Krummrich --- drivers/base/property.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index 6a63860579dd..8d9a34be57fb 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -797,7 +797,18 @@ struct fwnode_handle * fwnode_get_next_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { - return fwnode_call_ptr_op(fwnode, get_next_child_node, child); + struct fwnode_handle *next; + + if (IS_ERR_OR_NULL(fwnode)) + return NULL; + + /* Try to find a child in primary fwnode */ + next = fwnode_call_ptr_op(fwnode, get_next_child_node, child); + if (next) + return next; + + /* When no more children in primary, continue with secondary */ + return fwnode_call_ptr_op(fwnode->secondary, get_next_child_node, child); } EXPORT_SYMBOL_GPL(fwnode_get_next_child_node); @@ -841,19 +852,7 @@ EXPORT_SYMBOL_GPL(fwnode_get_next_available_child_node); struct fwnode_handle *device_get_next_child_node(const struct device *dev, struct fwnode_handle *child) { - const struct fwnode_handle *fwnode = dev_fwnode(dev); - struct fwnode_handle *next; - - if (IS_ERR_OR_NULL(fwnode)) - return NULL; - - /* Try to find a child in primary fwnode */ - next = fwnode_get_next_child_node(fwnode, child); - if (next) - return next; - - /* When no more children in primary, continue with secondary */ - return fwnode_get_next_child_node(fwnode->secondary, child); + return fwnode_get_next_child_node(dev_fwnode(dev), child); } EXPORT_SYMBOL_GPL(device_get_next_child_node); -- cgit v1.2.3 From be704107e79696e855aa41e901a926039b6d2410 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Thu, 19 Feb 2026 16:55:30 -0600 Subject: regulator: dt-bindings: mt6359: make regulator names unique Update the example devicetree with unique regulator names for all regulators. This reflects the same change made to the actual .dtsi file. Signed-off-by: David Lechner Acked-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20260219-mtk-mt6359-fix-regulator-names-v1-2-ee0fcebfe1d9@baylibre.com Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/regulator/mt6359-regulator.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/regulator/mt6359-regulator.yaml b/Documentation/devicetree/bindings/regulator/mt6359-regulator.yaml index d6b3b5a5c0b3..fe4ac9350ba0 100644 --- a/Documentation/devicetree/bindings/regulator/mt6359-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/mt6359-regulator.yaml @@ -287,7 +287,7 @@ examples: regulator-max-microvolt = <1700000>; }; mt6359_vrfck_1_ldo_reg: ldo_vrfck_1 { - regulator-name = "vrfck"; + regulator-name = "vrfck_1"; regulator-min-microvolt = <1240000>; regulator-max-microvolt = <1600000>; }; @@ -309,7 +309,7 @@ examples: regulator-max-microvolt = <3300000>; }; mt6359_vemc_1_ldo_reg: ldo_vemc_1 { - regulator-name = "vemc"; + regulator-name = "vemc_1"; regulator-min-microvolt = <2500000>; regulator-max-microvolt = <3300000>; }; -- cgit v1.2.3 From bfcaf4e8ae9b4f52b97fb04ce03be3a01d41cdd0 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 16 Feb 2026 14:14:33 +0100 Subject: rust: io: macro_export io_define_read!() and io_define_write!() Currently, the define_read!() and define_write!() I/O macros are crate public. 
The only user outside of the I/O module is PCI (for the configurations space I/O backend). Consequently, when CONFIG_PCI=n this causes a compile time warning [1]. In order to fix this, rename the macros to io_define_read!() and io_define_write!() and use #[macro_export] to export them. This is better than making the crate public visibility conditional, as eventually subsystems will have their own crate. Also, I/O backends are valid to be implemented by drivers as well. For instance, there are devices (such as GPUs) that run firmware which allows to program other devices only accessible through the primary device through indirect I/O. Since the macros are now public, also add the corresponding documentation. Fixes: 121d87b28e1d ("rust: io: separate generic I/O helpers from MMIO implementation") Reported-by: Miguel Ojeda Closes: https://lore.kernel.org/driver-core/CANiq72khOYkt6t5zwMvSiyZvWWHMZuNCMERXu=7K=_5tT-8Pgg@mail.gmail.com/ [1] Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Link: https://patch.msgid.link/20260216131534.65008-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- rust/kernel/io.rs | 131 ++++++++++++++++++++++++++++++++++---------------- rust/kernel/pci/io.rs | 24 ++++----- 2 files changed, 101 insertions(+), 54 deletions(-) diff --git a/rust/kernel/io.rs b/rust/kernel/io.rs index c1cca7b438c3..e5fba6bf6db0 100644 --- a/rust/kernel/io.rs +++ b/rust/kernel/io.rs @@ -139,9 +139,9 @@ pub struct Mmio(MmioRaw); /// Internal helper macros used to invoke C MMIO read functions. /// -/// This macro is intended to be used by higher-level MMIO access macros (define_read) and provides -/// a unified expansion for infallible vs. fallible read semantics. It emits a direct call into the -/// corresponding C helper and performs the required cast to the Rust return type. +/// This macro is intended to be used by higher-level MMIO access macros (io_define_read) and +/// provides a unified expansion for infallible vs. fallible read semantics. It emits a direct call +/// into the corresponding C helper and performs the required cast to the Rust return type. /// /// # Parameters /// @@ -166,9 +166,9 @@ macro_rules! call_mmio_read { /// Internal helper macros used to invoke C MMIO write functions. /// -/// This macro is intended to be used by higher-level MMIO access macros (define_write) and provides -/// a unified expansion for infallible vs. fallible write semantics. It emits a direct call into the -/// corresponding C helper and performs the required cast to the Rust return type. +/// This macro is intended to be used by higher-level MMIO access macros (io_define_write) and +/// provides a unified expansion for infallible vs. fallible write semantics. It emits a direct call +/// into the corresponding C helper and performs the required cast to the Rust return type. /// /// # Parameters /// @@ -193,7 +193,30 @@ macro_rules! call_mmio_write { }}; } -macro_rules! define_read { +/// Generates an accessor method for reading from an I/O backend. +/// +/// This macro reduces boilerplate by automatically generating either compile-time bounds-checked +/// (infallible) or runtime bounds-checked (fallible) read methods. It abstracts the address +/// calculation and bounds checking, and delegates the actual I/O read operation to a specified +/// helper macro, making it generic over different I/O backends. +/// +/// # Parameters +/// +/// * `infallible` / `fallible` - Determines the bounds-checking strategy. 
`infallible` relies on +/// `IoKnownSize` for compile-time checks and returns the value directly. `fallible` performs +/// runtime checks against `maxsize()` and returns a `Result`. +/// * `$(#[$attr:meta])*` - Optional attributes to apply to the generated method (e.g., +/// `#[cfg(CONFIG_64BIT)]` or inline directives). +/// * `$vis:vis` - The visibility of the generated method (e.g., `pub`). +/// * `$name:ident` / `$try_name:ident` - The name of the generated method (e.g., `read32`, +/// `try_read8`). +/// * `$call_macro:ident` - The backend-specific helper macro used to emit the actual I/O call +/// (e.g., `call_mmio_read`). +/// * `$c_fn:ident` - The backend-specific C function or identifier to be passed into the +/// `$call_macro`. +/// * `$type_name:ty` - The Rust type of the value being read (e.g., `u8`, `u32`). +#[macro_export] +macro_rules! io_define_read { (infallible, $(#[$attr:meta])* $vis:vis $name:ident, $call_macro:ident($c_fn:ident) -> $type_name:ty) => { /// Read IO data from a given offset known at compile time. @@ -226,9 +249,33 @@ macro_rules! define_read { } }; } -pub(crate) use define_read; +pub use io_define_read; -macro_rules! define_write { +/// Generates an accessor method for writing to an I/O backend. +/// +/// This macro reduces boilerplate by automatically generating either compile-time bounds-checked +/// (infallible) or runtime bounds-checked (fallible) write methods. It abstracts the address +/// calculation and bounds checking, and delegates the actual I/O write operation to a specified +/// helper macro, making it generic over different I/O backends. +/// +/// # Parameters +/// +/// * `infallible` / `fallible` - Determines the bounds-checking strategy. `infallible` relies on +/// `IoKnownSize` for compile-time checks and returns `()`. `fallible` performs runtime checks +/// against `maxsize()` and returns a `Result`. +/// * `$(#[$attr:meta])*` - Optional attributes to apply to the generated method (e.g., +/// `#[cfg(CONFIG_64BIT)]` or inline directives). +/// * `$vis:vis` - The visibility of the generated method (e.g., `pub`). +/// * `$name:ident` / `$try_name:ident` - The name of the generated method (e.g., `write32`, +/// `try_write8`). +/// * `$call_macro:ident` - The backend-specific helper macro used to emit the actual I/O call +/// (e.g., `call_mmio_write`). +/// * `$c_fn:ident` - The backend-specific C function or identifier to be passed into the +/// `$call_macro`. +/// * `$type_name:ty` - The Rust type of the value being written (e.g., `u8`, `u32`). Note the use +/// of `<-` before the type to denote a write operation. +#[macro_export] +macro_rules! io_define_write { (infallible, $(#[$attr:meta])* $vis:vis $name:ident, $call_macro:ident($c_fn:ident) <- $type_name:ty) => { /// Write IO data from a given offset known at compile time. @@ -259,7 +306,7 @@ macro_rules! define_write { } }; } -pub(crate) use define_write; +pub use io_define_write; /// Checks whether an access of type `U` at the given `offset` /// is valid within this region. 
@@ -509,40 +556,40 @@ impl Io for Mmio { self.0.maxsize() } - define_read!(fallible, try_read8, call_mmio_read(readb) -> u8); - define_read!(fallible, try_read16, call_mmio_read(readw) -> u16); - define_read!(fallible, try_read32, call_mmio_read(readl) -> u32); - define_read!( + io_define_read!(fallible, try_read8, call_mmio_read(readb) -> u8); + io_define_read!(fallible, try_read16, call_mmio_read(readw) -> u16); + io_define_read!(fallible, try_read32, call_mmio_read(readl) -> u32); + io_define_read!( fallible, #[cfg(CONFIG_64BIT)] try_read64, call_mmio_read(readq) -> u64 ); - define_write!(fallible, try_write8, call_mmio_write(writeb) <- u8); - define_write!(fallible, try_write16, call_mmio_write(writew) <- u16); - define_write!(fallible, try_write32, call_mmio_write(writel) <- u32); - define_write!( + io_define_write!(fallible, try_write8, call_mmio_write(writeb) <- u8); + io_define_write!(fallible, try_write16, call_mmio_write(writew) <- u16); + io_define_write!(fallible, try_write32, call_mmio_write(writel) <- u32); + io_define_write!( fallible, #[cfg(CONFIG_64BIT)] try_write64, call_mmio_write(writeq) <- u64 ); - define_read!(infallible, read8, call_mmio_read(readb) -> u8); - define_read!(infallible, read16, call_mmio_read(readw) -> u16); - define_read!(infallible, read32, call_mmio_read(readl) -> u32); - define_read!( + io_define_read!(infallible, read8, call_mmio_read(readb) -> u8); + io_define_read!(infallible, read16, call_mmio_read(readw) -> u16); + io_define_read!(infallible, read32, call_mmio_read(readl) -> u32); + io_define_read!( infallible, #[cfg(CONFIG_64BIT)] read64, call_mmio_read(readq) -> u64 ); - define_write!(infallible, write8, call_mmio_write(writeb) <- u8); - define_write!(infallible, write16, call_mmio_write(writew) <- u16); - define_write!(infallible, write32, call_mmio_write(writel) <- u32); - define_write!( + io_define_write!(infallible, write8, call_mmio_write(writeb) <- u8); + io_define_write!(infallible, write16, call_mmio_write(writew) <- u16); + io_define_write!(infallible, write32, call_mmio_write(writel) <- u32); + io_define_write!( infallible, #[cfg(CONFIG_64BIT)] write64, @@ -566,40 +613,40 @@ impl Mmio { unsafe { &*core::ptr::from_ref(raw).cast() } } - define_read!(infallible, pub read8_relaxed, call_mmio_read(readb_relaxed) -> u8); - define_read!(infallible, pub read16_relaxed, call_mmio_read(readw_relaxed) -> u16); - define_read!(infallible, pub read32_relaxed, call_mmio_read(readl_relaxed) -> u32); - define_read!( + io_define_read!(infallible, pub read8_relaxed, call_mmio_read(readb_relaxed) -> u8); + io_define_read!(infallible, pub read16_relaxed, call_mmio_read(readw_relaxed) -> u16); + io_define_read!(infallible, pub read32_relaxed, call_mmio_read(readl_relaxed) -> u32); + io_define_read!( infallible, #[cfg(CONFIG_64BIT)] pub read64_relaxed, call_mmio_read(readq_relaxed) -> u64 ); - define_read!(fallible, pub try_read8_relaxed, call_mmio_read(readb_relaxed) -> u8); - define_read!(fallible, pub try_read16_relaxed, call_mmio_read(readw_relaxed) -> u16); - define_read!(fallible, pub try_read32_relaxed, call_mmio_read(readl_relaxed) -> u32); - define_read!( + io_define_read!(fallible, pub try_read8_relaxed, call_mmio_read(readb_relaxed) -> u8); + io_define_read!(fallible, pub try_read16_relaxed, call_mmio_read(readw_relaxed) -> u16); + io_define_read!(fallible, pub try_read32_relaxed, call_mmio_read(readl_relaxed) -> u32); + io_define_read!( fallible, #[cfg(CONFIG_64BIT)] pub try_read64_relaxed, call_mmio_read(readq_relaxed) -> u64 ); - 
define_write!(infallible, pub write8_relaxed, call_mmio_write(writeb_relaxed) <- u8); - define_write!(infallible, pub write16_relaxed, call_mmio_write(writew_relaxed) <- u16); - define_write!(infallible, pub write32_relaxed, call_mmio_write(writel_relaxed) <- u32); - define_write!( + io_define_write!(infallible, pub write8_relaxed, call_mmio_write(writeb_relaxed) <- u8); + io_define_write!(infallible, pub write16_relaxed, call_mmio_write(writew_relaxed) <- u16); + io_define_write!(infallible, pub write32_relaxed, call_mmio_write(writel_relaxed) <- u32); + io_define_write!( infallible, #[cfg(CONFIG_64BIT)] pub write64_relaxed, call_mmio_write(writeq_relaxed) <- u64 ); - define_write!(fallible, pub try_write8_relaxed, call_mmio_write(writeb_relaxed) <- u8); - define_write!(fallible, pub try_write16_relaxed, call_mmio_write(writew_relaxed) <- u16); - define_write!(fallible, pub try_write32_relaxed, call_mmio_write(writel_relaxed) <- u32); - define_write!( + io_define_write!(fallible, pub try_write8_relaxed, call_mmio_write(writeb_relaxed) <- u8); + io_define_write!(fallible, pub try_write16_relaxed, call_mmio_write(writew_relaxed) <- u16); + io_define_write!(fallible, pub try_write32_relaxed, call_mmio_write(writel_relaxed) <- u32); + io_define_write!( fallible, #[cfg(CONFIG_64BIT)] pub try_write64_relaxed, diff --git a/rust/kernel/pci/io.rs b/rust/kernel/pci/io.rs index 6ca4cf75594c..fb6edab2aea7 100644 --- a/rust/kernel/pci/io.rs +++ b/rust/kernel/pci/io.rs @@ -8,8 +8,8 @@ use crate::{ device, devres::Devres, io::{ - define_read, - define_write, + io_define_read, + io_define_write, Io, IoCapable, IoKnownSize, @@ -88,7 +88,7 @@ pub struct ConfigSpace<'a, S: ConfigSpaceKind = Extended> { /// Internal helper macros used to invoke C PCI configuration space read functions. /// /// This macro is intended to be used by higher-level PCI configuration space access macros -/// (define_read) and provides a unified expansion for infallible vs. fallible read semantics. It +/// (io_define_read) and provides a unified expansion for infallible vs. fallible read semantics. It /// emits a direct call into the corresponding C helper and performs the required cast to the Rust /// return type. /// @@ -117,9 +117,9 @@ macro_rules! call_config_read { /// Internal helper macros used to invoke C PCI configuration space write functions. /// /// This macro is intended to be used by higher-level PCI configuration space access macros -/// (define_write) and provides a unified expansion for infallible vs. fallible read semantics. It -/// emits a direct call into the corresponding C helper and performs the required cast to the Rust -/// return type. +/// (io_define_write) and provides a unified expansion for infallible vs. fallible read semantics. +/// It emits a direct call into the corresponding C helper and performs the required cast to the +/// Rust return type. /// /// # Parameters /// @@ -163,13 +163,13 @@ impl<'a, S: ConfigSpaceKind> Io for ConfigSpace<'a, S> { // PCI configuration space does not support fallible operations. // The default implementations from the Io trait are not used. 
- define_read!(infallible, read8, call_config_read(pci_read_config_byte) -> u8); - define_read!(infallible, read16, call_config_read(pci_read_config_word) -> u16); - define_read!(infallible, read32, call_config_read(pci_read_config_dword) -> u32); + io_define_read!(infallible, read8, call_config_read(pci_read_config_byte) -> u8); + io_define_read!(infallible, read16, call_config_read(pci_read_config_word) -> u16); + io_define_read!(infallible, read32, call_config_read(pci_read_config_dword) -> u32); - define_write!(infallible, write8, call_config_write(pci_write_config_byte) <- u8); - define_write!(infallible, write16, call_config_write(pci_write_config_word) <- u16); - define_write!(infallible, write32, call_config_write(pci_write_config_dword) <- u32); + io_define_write!(infallible, write8, call_config_write(pci_write_config_byte) <- u8); + io_define_write!(infallible, write16, call_config_write(pci_write_config_word) <- u16); + io_define_write!(infallible, write32, call_config_write(pci_write_config_dword) <- u32); } impl<'a, S: ConfigSpaceKind> IoKnownSize for ConfigSpace<'a, S> { -- cgit v1.2.3 From c5794709bc9105935dbedef8b9cf9c06f2b559fa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 17 Feb 2026 20:28:29 -0800 Subject: ksmbd: Compare MACs in constant time To prevent timing attacks, MAC comparisons need to be constant-time. Replace the memcmp() with the correct function, crypto_memneq(). Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/Kconfig | 1 + fs/smb/server/auth.c | 4 +++- fs/smb/server/smb2pdu.c | 5 +++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig index 2775162c535c..12594879cb64 100644 --- a/fs/smb/server/Kconfig +++ b/fs/smb/server/Kconfig @@ -13,6 +13,7 @@ config SMB_SERVER select CRYPTO_LIB_MD5 select CRYPTO_LIB_SHA256 select CRYPTO_LIB_SHA512 + select CRYPTO_LIB_UTILS select CRYPTO_CMAC select CRYPTO_AEAD2 select CRYPTO_CCM diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 580c4d303dc3..5fe8c667c6b1 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -165,7 +166,8 @@ int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess, ntlmv2_rsp, CIFS_HMAC_MD5_HASH_SIZE, sess->sess_key); - if (memcmp(ntlmv2->ntlmv2_hash, ntlmv2_rsp, CIFS_HMAC_MD5_HASH_SIZE) != 0) + if (crypto_memneq(ntlmv2->ntlmv2_hash, ntlmv2_rsp, + CIFS_HMAC_MD5_HASH_SIZE)) return -EINVAL; return 0; } diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 95901a78951c..743c629fe7ec 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4,6 +4,7 @@ * Copyright (C) 2018 Samsung Electronics Co., Ltd. 
*/ +#include #include #include @@ -8880,7 +8881,7 @@ int smb2_check_sign_req(struct ksmbd_work *work) ksmbd_sign_smb2_pdu(work->conn, work->sess->sess_key, iov, 1, signature); - if (memcmp(signature, signature_req, SMB2_SIGNATURE_SIZE)) { + if (crypto_memneq(signature, signature_req, SMB2_SIGNATURE_SIZE)) { pr_err("bad smb2 signature\n"); return 0; } @@ -8968,7 +8969,7 @@ int smb3_check_sign_req(struct ksmbd_work *work) if (ksmbd_sign_smb3_pdu(conn, signing_key, iov, 1, signature)) return 0; - if (memcmp(signature, signature_req, SMB2_SIGNATURE_SIZE)) { + if (crypto_memneq(signature, signature_req, SMB2_SIGNATURE_SIZE)) { pr_err("bad smb2 signature\n"); return 0; } -- cgit v1.2.3 From 6b4f875aac344cdd52a1f34cc70ed2f874a65757 Mon Sep 17 00:00:00 2001 From: Nicholas Carlini Date: Thu, 19 Feb 2026 20:58:57 +0900 Subject: ksmbd: fix signedness bug in smb_direct_prepare_negotiation() smb_direct_prepare_negotiation() casts an unsigned __u32 value from sp->max_recv_size and req->preferred_send_size to a signed int before computing min_t(int, ...). A maliciously provided preferred_send_size of 0x80000000 will compare as smaller than max_recv_size, and then be used to set the maximum allowed receive size for the next message. By sending a second message with a large value (>1420 bytes), the attacker can then achieve a heap buffer overflow. This fix replaces min_t(int, ...) with min_t(u32, ...). Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Signed-off-by: Nicholas Carlini Reviewed-by: Stefan Metzmacher Acked-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 7c53b78b818e..188572491d53 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2540,9 +2540,9 @@ static int smb_direct_prepare(struct ksmbd_transport *t) goto put; req = (struct smbdirect_negotiate_req *)recvmsg->packet; - sp->max_recv_size = min_t(int, sp->max_recv_size, + sp->max_recv_size = min_t(u32, sp->max_recv_size, le32_to_cpu(req->preferred_send_size)); - sp->max_send_size = min_t(int, sp->max_send_size, + sp->max_send_size = min_t(u32, sp->max_send_size, le32_to_cpu(req->max_receive_size)); sp->max_fragmented_send_size = le32_to_cpu(req->max_fragmented_size); -- cgit v1.2.3 From 47322c469d4a63ac45b705ca83680671ff71c975 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 9 Feb 2026 16:38:05 +0100 Subject: dma-mapping: avoid random addr value print out on error path dma_addr is uninitialized in dma_direct_map_phys() when swiotlb is forced and DMA_ATTR_MMIO is set, which leads to a random value being printed in the warning. Fix that by just returning DMA_MAPPING_ERROR.
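As an aside, the signedness trap described in the ksmbd patch above is easy to reproduce in isolation. The sketch below is a standalone user-space program, not kernel code; min_int() and min_u32() are made-up stand-ins for min_t(int, ...) and min_t(u32, ...):

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for min_t(int, ...) and min_t(u32, ...). */
static int32_t min_int(int32_t a, int32_t b) { return a < b ? a : b; }
static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }

int main(void)
{
    uint32_t max_recv_size = 8192;             /* the server's own limit */
    uint32_t preferred_send_size = 0x80000000; /* attacker-controlled */

    /*
     * Converted to a signed 32-bit int, 0x80000000 becomes a negative
     * value (implementation-defined, in practice INT32_MIN), so it
     * "wins" the comparison and replaces the server's limit.
     */
    printf("min_t(int, ...) picks %u\n",
           (uint32_t)min_int((int32_t)max_recv_size,
                             (int32_t)preferred_send_size));

    /* The unsigned comparison keeps the server's limit, as intended. */
    printf("min_t(u32, ...) picks %u\n",
           min_u32(max_recv_size, preferred_send_size));

    return 0;
}

Printing 2147483648 for the signed case and 8192 for the unsigned one shows why the cast to int let a hostile peer raise the negotiated receive size far past what was actually allocated.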
Fixes: e53d29f957b3 ("dma-mapping: convert dma_direct_*map_page to be phys_addr_t based") Signed-off-by: Jiri Pirko Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260209153809.250835-2-jiri@resnulli.us --- kernel/dma/direct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index f476c63b668c..e89f175e9c2d 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -85,7 +85,7 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev, if (is_swiotlb_force_bounce(dev)) { if (attrs & DMA_ATTR_MMIO) - goto err_overflow; + return DMA_MAPPING_ERROR; return swiotlb_map(dev, phys, size, dir, attrs); } -- cgit v1.2.3 From d5b5e8149af0f5efed58653cbebf1cb3258ce49a Mon Sep 17 00:00:00 2001 From: Stian Halseth Date: Wed, 18 Feb 2026 13:00:24 +0100 Subject: sparc: Fix page alignment in dma mapping 'phys' may include an offset within the page, while previously used 'base_paddr' was already page-aligned. This caused incorrect DMA mapping in dma_4u_map_phys and dma_4v_map_phys. Fix both functions by masking 'phys' with IO_PAGE_MASK, covering both generic SPARC code and sun4v. Fixes: 38c0d0ebf520 ("sparc: Use physical address DMA mapping") Reported-by: Stian Halseth Closes: https://github.com/sparclinux/issues/issues/75 Suggested-by: Marek Szyprowski Signed-off-by: Stian Halseth Tested-by: Nathaniel Roach Tested-by: Han Gao # on SPARC Enterprise T5220 [mszyprow: adjusted commit description a bit] Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260218120056.3366-2-stian@itx.no --- arch/sparc/kernel/iommu.c | 2 ++ arch/sparc/kernel/pci_sun4v.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c index 46ef88bc9c26..7613ab0ffb89 100644 --- a/arch/sparc/kernel/iommu.c +++ b/arch/sparc/kernel/iommu.c @@ -312,6 +312,8 @@ static dma_addr_t dma_4u_map_phys(struct device *dev, phys_addr_t phys, if (direction != DMA_TO_DEVICE) iopte_protection |= IOPTE_WRITE; + phys &= IO_PAGE_MASK; + for (i = 0; i < npages; i++, base++, phys += IO_PAGE_SIZE) iopte_val(*base) = iopte_protection | phys; diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index 440284cc804e..61f14b4c8f90 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -410,6 +410,8 @@ static dma_addr_t dma_4v_map_phys(struct device *dev, phys_addr_t phys, iommu_batch_start(dev, prot, entry); + phys &= IO_PAGE_MASK; + for (i = 0; i < npages; i++, phys += IO_PAGE_SIZE) { long err = iommu_batch_add(phys, mask); if (unlikely(err < 0L)) -- cgit v1.2.3 From 1cb3c20688fc8380c9b365d03aea7e84faf6a9fd Mon Sep 17 00:00:00 2001 From: Sean Rhodes Date: Thu, 19 Feb 2026 20:14:26 +0000 Subject: ALSA: hda/realtek: Fix speaker pop on Star Labs StarFighter On Star Labs StarFighter (Realtek ALC233/235), the internal speakers can emit an audible pop when entering or leaving runtime suspend. Mute the speaker output paths via snd_hda_gen_shutup_speakers() in the Realtek shutup callback before the codec is powered down. This is enough to avoid the pop without special EAPD handling. 
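Returning briefly to the sparc change above: the arithmetic it relies on is simply splitting the physical address into a page-aligned base (what the IOPTE loop must program) and an in-page offset (what the returned bus address must keep). A self-contained sketch of that split, with an assumed 8 KiB IOMMU page size and a made-up bus window base rather than the driver's real values:

#include <stdint.h>
#include <stdio.h>

#define IO_PAGE_SHIFT 13                        /* assumption: 8 KiB IOMMU pages */
#define IO_PAGE_SIZE  (1ULL << IO_PAGE_SHIFT)
#define IO_PAGE_MASK  (~(IO_PAGE_SIZE - 1))

int main(void)
{
    uint64_t phys = 0x12345678ULL;              /* buffer start, not page aligned */
    uint64_t offset = phys & ~IO_PAGE_MASK;     /* kept in the returned address */
    uint64_t page = phys & IO_PAGE_MASK;        /* what each IOPTE must point at */
    uint64_t bus_base = 0xfff00000000ULL;       /* made-up IOMMU window base */

    /* Program page-aligned entries, as the fixed loop does. */
    for (int i = 0; i < 2; i++, page += IO_PAGE_SIZE)
        printf("iopte[%d] -> %#llx\n", i, (unsigned long long)page);

    /* The device still sees the original in-page offset. */
    printf("bus address = %#llx\n", (unsigned long long)(bus_base + offset));
    return 0;
}

Without the masking, each IOPTE would have carried the stray offset bits, which is exactly the incorrect mapping the patch describes.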
Test results: - runtime PM pop fixed - still reaches D3 (PCI 0000:00:1f.3 power_state=D3hot) - does not address pops on cold boot (G3 exit) or around display manager start/shutdown journalctl -k (boot): - snd_hda_codec_alc269 hdaudioC0D0: ALC233: picked fixup for PCI SSID 7017:2014 - snd_hda_codec_alc269 hdaudioC0D0: autoconfig for ALC233: line_outs=1 (0x1b/0x0/0x0/0x0/0x0) type:speaker Suggested-by: Takashi Iwai Tested-by: Sean Rhodes Signed-off-by: Sean Rhodes Link: https://patch.msgid.link/4d5fb71b132bb283fd41c622b8413770b2065242.1771532060.git.sean@starlabs.systems Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 36053042ca77..9f64bb97c3f9 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -1017,6 +1017,24 @@ static int alc269_resume(struct hda_codec *codec) return 0; } +#define STARLABS_STARFIGHTER_SHUTUP_DELAY_MS 30 + +static void starlabs_starfighter_shutup(struct hda_codec *codec) +{ + if (snd_hda_gen_shutup_speakers(codec)) + msleep(STARLABS_STARFIGHTER_SHUTUP_DELAY_MS); +} + +static void alc233_fixup_starlabs_starfighter(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + struct alc_spec *spec = codec->spec; + + if (action == HDA_FIXUP_ACT_PRE_PROBE) + spec->shutup = starlabs_starfighter_shutup; +} + static void alc269_fixup_pincfg_no_hp_to_lineout(struct hda_codec *codec, const struct hda_fixup *fix, int action) { @@ -4040,6 +4058,7 @@ enum { ALC245_FIXUP_CLEVO_NOISY_MIC, ALC269_FIXUP_VAIO_VJFH52_MIC_NO_PRESENCE, ALC233_FIXUP_MEDION_MTL_SPK, + ALC233_FIXUP_STARLABS_STARFIGHTER, ALC294_FIXUP_BASS_SPEAKER_15, ALC283_FIXUP_DELL_HP_RESUME, ALC294_FIXUP_ASUS_CS35L41_SPI_2, @@ -6499,6 +6518,10 @@ static const struct hda_fixup alc269_fixups[] = { { } }, }, + [ALC233_FIXUP_STARLABS_STARFIGHTER] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc233_fixup_starlabs_starfighter, + }, [ALC294_FIXUP_BASS_SPEAKER_15] = { .type = HDA_FIXUP_FUNC, .v.func = alc294_fixup_bass_speaker_15, @@ -7651,6 +7674,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x2782, 0x1705, "MEDION E15433", ALC269VC_FIXUP_INFINIX_Y4_MAX), SND_PCI_QUIRK(0x2782, 0x1707, "Vaio VJFE-ADL", ALC298_FIXUP_SPK_VOLUME), SND_PCI_QUIRK(0x2782, 0x4900, "MEDION E15443", ALC233_FIXUP_MEDION_MTL_SPK), + SND_PCI_QUIRK(0x7017, 0x2014, "Star Labs StarFighter", ALC233_FIXUP_STARLABS_STARFIGHTER), SND_PCI_QUIRK(0x8086, 0x2074, "Intel NUC 8", ALC233_FIXUP_INTEL_NUC8_DMIC), SND_PCI_QUIRK(0x8086, 0x2080, "Intel NUC 8 Rugged", ALC256_FIXUP_INTEL_NUC8_RUGGED), SND_PCI_QUIRK(0x8086, 0x2081, "Intel NUC 10", ALC256_FIXUP_INTEL_NUC10), @@ -7747,6 +7771,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC298_FIXUP_TPT470_DOCK_FIX, .name = "tpt470-dock-fix"}, {.id = ALC298_FIXUP_TPT470_DOCK, .name = "tpt470-dock"}, {.id = ALC233_FIXUP_LENOVO_MULTI_CODECS, .name = "dual-codecs"}, + {.id = ALC233_FIXUP_STARLABS_STARFIGHTER, .name = "starlabs-starfighter"}, {.id = ALC700_FIXUP_INTEL_REFERENCE, .name = "alc700-ref"}, {.id = ALC269_FIXUP_SONY_VAIO, .name = "vaio"}, {.id = ALC269_FIXUP_DELL_M101Z, .name = "dell-m101z"}, -- cgit v1.2.3 From 1d241483368f2fd87fbaba64d6aec6bad3a1e12e Mon Sep 17 00:00:00 2001 From: "Geoffrey D. 
Bennett" Date: Fri, 20 Feb 2026 21:58:48 +1030 Subject: ALSA: scarlett2: Fix DSP filter control array handling scarlett2_add_dsp_ctls() was incorrectly storing the precomp and PEQ filter coefficient control pointers into the precomp_flt_switch_ctls and peq_flt_switch_ctls arrays instead of the intended targets precomp_flt_ctls and peq_flt_ctls. Pass NULL instead, as the filter coefficient control pointers are not used, and remove the unused precomp_flt_ctls and peq_flt_ctls arrays from struct scarlett2_data. Additionally, scarlett2_update_filter_values() was reading dsp_input_count * peq_flt_count values for SCARLETT2_CONFIG_PEQ_FLT_SWITCH, but the peq_flt_switch array is indexed only by dsp_input_count (one switch per DSP input, not per filter). Fix the read count. Fixes: b64678eb4e70 ("ALSA: scarlett2: Add DSP controls") Signed-off-by: Geoffrey D. Bennett Link: https://patch.msgid.link/86497b71db060677d97c38a6ce5f89bb3b25361b.1771581197.git.g@b4.vu Signed-off-by: Takashi Iwai --- sound/usb/mixer_scarlett2.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sound/usb/mixer_scarlett2.c b/sound/usb/mixer_scarlett2.c index 85a0316889d4..ef3150581eab 100644 --- a/sound/usb/mixer_scarlett2.c +++ b/sound/usb/mixer_scarlett2.c @@ -1328,8 +1328,6 @@ struct scarlett2_data { struct snd_kcontrol *mux_ctls[SCARLETT2_MUX_MAX]; struct snd_kcontrol *mix_ctls[SCARLETT2_MIX_MAX]; struct snd_kcontrol *compressor_ctls[SCARLETT2_COMPRESSOR_CTLS_MAX]; - struct snd_kcontrol *precomp_flt_ctls[SCARLETT2_PRECOMP_FLT_CTLS_MAX]; - struct snd_kcontrol *peq_flt_ctls[SCARLETT2_PEQ_FLT_CTLS_MAX]; struct snd_kcontrol *precomp_flt_switch_ctls[SCARLETT2_DSP_SWITCH_MAX]; struct snd_kcontrol *peq_flt_switch_ctls[SCARLETT2_DSP_SWITCH_MAX]; struct snd_kcontrol *direct_monitor_ctl; @@ -3447,7 +3445,6 @@ static int scarlett2_update_autogain(struct usb_mixer_interface *mixer) private->autogain_status[i] = private->num_autogain_status_texts - 1; - for (i = 0; i < SCARLETT2_AG_TARGET_COUNT; i++) if (scarlett2_has_config_item(private, scarlett2_ag_target_configs[i])) { @@ -5372,8 +5369,7 @@ static int scarlett2_update_filter_values(struct usb_mixer_interface *mixer) err = scarlett2_usb_get_config( mixer, SCARLETT2_CONFIG_PEQ_FLT_SWITCH, - info->dsp_input_count * info->peq_flt_count, - private->peq_flt_switch); + info->dsp_input_count, private->peq_flt_switch); if (err < 0) return err; @@ -6546,7 +6542,7 @@ static int scarlett2_add_dsp_ctls(struct usb_mixer_interface *mixer, int i) err = scarlett2_add_new_ctl( mixer, &scarlett2_precomp_flt_ctl, i * info->precomp_flt_count + j, - 1, s, &private->precomp_flt_switch_ctls[j]); + 1, s, NULL); if (err < 0) return err; } @@ -6556,7 +6552,7 @@ static int scarlett2_add_dsp_ctls(struct usb_mixer_interface *mixer, int i) err = scarlett2_add_new_ctl( mixer, &scarlett2_peq_flt_ctl, i * info->peq_flt_count + j, - 1, s, &private->peq_flt_switch_ctls[j]); + 1, s, NULL); if (err < 0) return err; } -- cgit v1.2.3 From cbddd303416456db5ceeedaf9e262096f079e861 Mon Sep 17 00:00:00 2001 From: Panagiotis Foliadis Date: Sat, 21 Feb 2026 19:40:58 +0000 Subject: ALSA: hda/realtek: Add quirk for Acer Aspire V3-572G The Acer Aspire V3-572G has a combo jack (ALC283) but the BIOS sets pin 0x19 to 0x411111f0 (not connected), so the headset mic is not detected. Add a quirk to override pin 0x19 as a headset mic and enable headset mode. 
Cc: stable@vger.kernel.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221075 Suggested-by: Charalampos Mitrodimas Signed-off-by: Panagiotis Foliadis Reviewed-by: Charalampos Mitrodimas Link: https://patch.msgid.link/20260221-fix-detect-mic-v1-1-b6e427b5275d@posteo.net Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 9f64bb97c3f9..bba173645b1f 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -6614,6 +6614,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x079b, "Acer Aspire V5-573G", ALC282_FIXUP_ASPIRE_V5_PINS), SND_PCI_QUIRK(0x1025, 0x080d, "Acer Aspire V5-122P", ALC269_FIXUP_ASPIRE_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x0840, "Acer Aspire E1", ALC269VB_FIXUP_ASPIRE_E1_COEF), + SND_PCI_QUIRK(0x1025, 0x0943, "Acer Aspire V3-572G", ALC269_FIXUP_ASPIRE_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x100c, "Acer Aspire E5-574G", ALC255_FIXUP_ACER_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1025, 0x101c, "Acer Veriton N2510G", ALC269_FIXUP_LIFEBOOK), SND_PCI_QUIRK(0x1025, 0x102b, "Acer Aspire C24-860", ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE), -- cgit v1.2.3 From 43a44fb7f2fa163926b23149805e989ba2395db1 Mon Sep 17 00:00:00 2001 From: Juhyung Park Date: Sun, 22 Feb 2026 21:26:08 +0900 Subject: ALSA: hda/realtek: fix model name typo for Samsung Galaxy Book Flex (NT950QCG-X716) There's no product named "Samsung Galaxy Flex Book". Use the correct "Samsung Galaxy Book Flex" name. Link: https://www.samsung.com/sec/support/model/NT950QCG-X716 Link: https://www.samsung.com/us/computing/galaxy-books/galaxy-book-flex/galaxy-book-flex-15-6-qled-512gb-storage-s-pen-included-np950qcg-k01us Cc: Signed-off-by: Juhyung Park Link: https://patch.msgid.link/20260222122609.281191-1-qkrwngud825@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index bba173645b1f..7166cbeb0925 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7335,7 +7335,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x144d, 0xc109, "Samsung Ativ book 9 (NP900X3G)", ALC269_FIXUP_INV_DMIC), SND_PCI_QUIRK(0x144d, 0xc169, "Samsung Notebook 9 Pen (NP930SBE-K01US)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc176, "Samsung Notebook 9 Pro (NP930MBE-K04US)", ALC298_FIXUP_SAMSUNG_AMP), - SND_PCI_QUIRK(0x144d, 0xc189, "Samsung Galaxy Flex Book (NT950QCG-X716)", ALC298_FIXUP_SAMSUNG_AMP), + SND_PCI_QUIRK(0x144d, 0xc189, "Samsung Galaxy Book Flex (NT950QCG-X716)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc18a, "Samsung Galaxy Book Ion (NP930XCJ-K01US)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc1a3, "Samsung Galaxy Book Pro (NP935XDB-KC1SE)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc1a4, "Samsung Galaxy Book Pro 360 (NT935QBD)", ALC298_FIXUP_SAMSUNG_AMP), -- cgit v1.2.3 From 9fb16a5c5ff93058851099a2b80a899b0c53fe3f Mon Sep 17 00:00:00 2001 From: Juhyung Park Date: Sun, 22 Feb 2026 21:26:09 +0900 Subject: ALSA: hda/realtek: add quirk for Samsung Galaxy Book Flex (NT950QCT-A38A) Similar to other Samsung laptops, NT950QCT also requires the ALC298_FIXUP_SAMSUNG_AMP quirk applied. 
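The Acer and Samsung entries in this series all take effect the same way: the driver looks up the codec's PCI subsystem vendor/device ID in a static quirk table and applies the matching fixup. A simplified, self-contained model of that lookup follows; the struct and enum names are invented for illustration and only mimic the shape of the real quirk table:

#include <stdint.h>
#include <stdio.h>

struct quirk {
    uint16_t subvendor;
    uint16_t subdevice;
    const char *name;
    int fixup;
};

enum { FIXUP_NONE, FIXUP_SAMSUNG_AMP, FIXUP_HEADSET_MIC };

static const struct quirk quirk_tbl[] = {
    { 0x144d, 0xc189, "Samsung Galaxy Book Flex (NT950QCG-X716)", FIXUP_SAMSUNG_AMP },
    { 0x1025, 0x0943, "Acer Aspire V3-572G", FIXUP_HEADSET_MIC },
    { 0 }   /* terminator */
};

static int lookup_fixup(uint16_t subvendor, uint16_t subdevice)
{
    for (const struct quirk *q = quirk_tbl; q->name; q++)
        if (q->subvendor == subvendor && q->subdevice == subdevice) {
            printf("matched %s\n", q->name);
            return q->fixup;
        }
    return FIXUP_NONE;
}

int main(void)
{
    return lookup_fixup(0x1025, 0x0943) == FIXUP_HEADSET_MIC ? 0 : 1;
}

This is why most of these fixes are one-line table additions: the fixup code already exists, and each new entry only ties a specific subsystem ID to it.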
Cc: Signed-off-by: Juhyung Park Link: https://patch.msgid.link/20260222122609.281191-2-qkrwngud825@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 7166cbeb0925..43ecfc63ef87 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7335,6 +7335,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x144d, 0xc109, "Samsung Ativ book 9 (NP900X3G)", ALC269_FIXUP_INV_DMIC), SND_PCI_QUIRK(0x144d, 0xc169, "Samsung Notebook 9 Pen (NP930SBE-K01US)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc176, "Samsung Notebook 9 Pro (NP930MBE-K04US)", ALC298_FIXUP_SAMSUNG_AMP), + SND_PCI_QUIRK(0x144d, 0xc188, "Samsung Galaxy Book Flex (NT950QCT-A38A)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc189, "Samsung Galaxy Book Flex (NT950QCG-X716)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc18a, "Samsung Galaxy Book Ion (NP930XCJ-K01US)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc1a3, "Samsung Galaxy Book Pro (NP935XDB-KC1SE)", ALC298_FIXUP_SAMSUNG_AMP), -- cgit v1.2.3 From 24d2d3c5f94007a5a0554065ab7349bb69e28bcb Mon Sep 17 00:00:00 2001 From: "Geoffrey D. Bennett" Date: Sat, 21 Feb 2026 02:33:45 +1030 Subject: ALSA: usb-audio: Improve Focusrite sample rate filtering Replace the bLength == 10 max_rate check in focusrite_valid_sample_rate() with filtering that also examines the bmControls VAL_ALT_SETTINGS bit. When VAL_ALT_SETTINGS is readable, the device uses strict per-altsetting rate filtering (only the highest rate pair for that altsetting is valid). When it is not readable, all rates up to max_rate are valid. For devices without the bLength == 10 Format Type descriptor extension but with VAL_ALT_SETTINGS readable and multiple altsettings (only seen in Scarlett 18i8 3rd Gen playback), fall back to the Focusrite convention: alt 1 = 48kHz, alt 2 = 96kHz, alt 3 = 192kHz. This produces correct rate tables for all tested Focusrite devices (all Scarlett 2nd, 3rd, and 4th Gen, Clarett+, and Vocaster) using only USB descriptors, allowing QUIRK_FLAG_VALIDATE_RATES to be removed for Focusrite in the next commit. Signed-off-by: Geoffrey D. Bennett Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/7e18c1f393a6ecb6fc75dd867a2c4dbe135e3e22.1771594828.git.g@b4.vu --- sound/usb/format.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/sound/usb/format.c b/sound/usb/format.c index 64cfe4a9d8cd..1207c507882a 100644 --- a/sound/usb/format.c +++ b/sound/usb/format.c @@ -305,17 +305,48 @@ static bool s1810c_valid_sample_rate(struct audioformat *fp, } /* - * Many Focusrite devices supports a limited set of sampling rates per - * altsetting. Maximum rate is exposed in the last 4 bytes of Format Type - * descriptor which has a non-standard bLength = 10. + * Focusrite devices use rate pairs: 44100/48000, 88200/96000, and + * 176400/192000. Return true if rate is in the pair for max_rate. + */ +static bool focusrite_rate_pair(unsigned int rate, + unsigned int max_rate) +{ + switch (max_rate) { + case 48000: return rate == 44100 || rate == 48000; + case 96000: return rate == 88200 || rate == 96000; + case 192000: return rate == 176400 || rate == 192000; + default: return true; + } +} + +/* + * Focusrite devices report all supported rates in a single clock + * source but only a subset is valid per altsetting. 
+ * + * Detection uses two descriptor features: + * + * 1. Format Type descriptor bLength == 10: non-standard extension + * with max sample rate in bytes 6..9. + * + * 2. bmControls VAL_ALT_SETTINGS readable bit: when set, the device + * only supports the highest rate pair for that altsetting, and when + * clear, all rates up to max_rate are valid. + * + * For devices without the bLength == 10 extension but with + * VAL_ALT_SETTINGS readable and multiple altsettings (only seen in + * Scarlett 18i8 3rd Gen playback), fall back to the Focusrite + * convention: alt 1 = 48kHz, alt 2 = 96kHz, alt 3 = 192kHz. */ static bool focusrite_valid_sample_rate(struct snd_usb_audio *chip, struct audioformat *fp, unsigned int rate) { + struct usb_interface *iface; struct usb_host_interface *alts; + struct uac2_as_header_descriptor *as; unsigned char *fmt; unsigned int max_rate; + bool val_alt; alts = snd_usb_get_host_interface(chip, fp->iface, fp->altsetting); if (!alts) @@ -326,9 +357,21 @@ static bool focusrite_valid_sample_rate(struct snd_usb_audio *chip, if (!fmt) return true; + as = snd_usb_find_csint_desc(alts->extra, alts->extralen, + NULL, UAC_AS_GENERAL); + if (!as) + return true; + + val_alt = uac_v2v3_control_is_readable(as->bmControls, + UAC2_AS_VAL_ALT_SETTINGS); + if (fmt[0] == 10) { /* bLength */ max_rate = combine_quad(&fmt[6]); + if (val_alt) + return focusrite_rate_pair(rate, max_rate); + + /* No val_alt: rates fall through from higher */ switch (max_rate) { case 192000: if (rate == 176400 || rate == 192000) @@ -344,12 +387,29 @@ static bool focusrite_valid_sample_rate(struct snd_usb_audio *chip, usb_audio_info(chip, "%u:%d : unexpected max rate: %u\n", fp->iface, fp->altsetting, max_rate); - return true; } } - return true; + if (!val_alt) + return true; + + /* Multi-altsetting device with val_alt but no max_rate + * in the format descriptor. Use Focusrite convention: + * alt 1 = 48kHz, alt 2 = 96kHz, alt 3 = 192kHz. + */ + iface = usb_ifnum_to_if(chip->dev, fp->iface); + if (!iface || iface->num_altsetting <= 2) + return true; + + switch (fp->altsetting) { + case 1: max_rate = 48000; break; + case 2: max_rate = 96000; break; + case 3: max_rate = 192000; break; + default: return true; + } + + return focusrite_rate_pair(rate, max_rate); } /* -- cgit v1.2.3 From a8cc55bf81a45772cad44c83ea7bb0e98431094a Mon Sep 17 00:00:00 2001 From: "Geoffrey D. Bennett" Date: Sat, 21 Feb 2026 02:34:48 +1030 Subject: ALSA: usb-audio: Remove VALIDATE_RATES quirk for Focusrite devices Remove QUIRK_FLAG_VALIDATE_RATES for Focusrite. With the previous commit, focusrite_valid_sample_rate() produces correct rate tables without USB probing. QUIRK_FLAG_VALIDATE_RATES sends SET_CUR requests for each rate (~25ms each) and leaves the device at 192kHz. This is a problem because that rate: 1) disables the internal mixer, so outputs are silent until an application opens the PCM and sets a lower rate, and 2) the Air and Safe modes get disabled. Fixes: 5963e5262180 ("ALSA: usb-audio: Enable rate validation for Scarlett devices") Signed-off-by: Geoffrey D. 
Bennett Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/09b9c012024c998c4ca14bd876ef0dce0d0b6101.1771594828.git.g@b4.vu --- sound/usb/quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 4cac0dfb0094..3164c9431d29 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2424,7 +2424,7 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { VENDOR_FLG(0x07fd, /* MOTU */ QUIRK_FLAG_VALIDATE_RATES), VENDOR_FLG(0x1235, /* Focusrite Novation */ - QUIRK_FLAG_VALIDATE_RATES), + 0), VENDOR_FLG(0x1511, /* AURALiC */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x152a, /* Thesycon devices */ -- cgit v1.2.3 From 38c322068a26a01d7ff64da92179e68cdde9860b Mon Sep 17 00:00:00 2001 From: "Geoffrey D. Bennett" Date: Sat, 21 Feb 2026 02:36:35 +1030 Subject: ALSA: usb-audio: Add QUIRK_FLAG_SKIP_IFACE_SETUP Add a quirk flag to skip the usb_set_interface(), snd_usb_init_pitch(), and snd_usb_init_sample_rate() calls in __snd_usb_parse_audio_interface(). These are redundant with snd_usb_endpoint_prepare() at stream-open time. Enable the quirk for Focusrite devices, as init_sample_rate(rate_max) sets 192kHz during probing, which disables the internal mixer and Air and Safe modes. Fixes: 16f1f838442d ("Revert "ALSA: usb-audio: Drop superfluous interface setup at parsing"") Signed-off-by: Geoffrey D. Bennett Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/65a7909b15f9feb76c2a6f4f8814c240ddc50737.1771594828.git.g@b4.vu --- sound/usb/quirks.c | 3 ++- sound/usb/stream.c | 3 +++ sound/usb/usbaudio.h | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 3164c9431d29..b13d0f4d25ac 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2424,7 +2424,7 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { VENDOR_FLG(0x07fd, /* MOTU */ QUIRK_FLAG_VALIDATE_RATES), VENDOR_FLG(0x1235, /* Focusrite Novation */ - 0), + QUIRK_FLAG_SKIP_IFACE_SETUP), VENDOR_FLG(0x1511, /* AURALiC */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x152a, /* Thesycon devices */ @@ -2506,6 +2506,7 @@ static const char *const snd_usb_audio_quirk_flag_names[] = { QUIRK_STRING_ENTRY(MIC_RES_384), QUIRK_STRING_ENTRY(MIXER_PLAYBACK_MIN_MUTE), QUIRK_STRING_ENTRY(MIXER_CAPTURE_MIN_MUTE), + QUIRK_STRING_ENTRY(SKIP_IFACE_SETUP), NULL }; diff --git a/sound/usb/stream.c b/sound/usb/stream.c index ac4d92065dd9..d38c39e28f38 100644 --- a/sound/usb/stream.c +++ b/sound/usb/stream.c @@ -1259,6 +1259,9 @@ static int __snd_usb_parse_audio_interface(struct snd_usb_audio *chip, set_iface_first = true; /* try to set the interface... 
*/ + if (chip->quirk_flags & QUIRK_FLAG_SKIP_IFACE_SETUP) + continue; + usb_set_interface(chip->dev, iface_no, 0); if (set_iface_first) usb_set_interface(chip->dev, iface_no, altno); diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index 79978cae9799..085530cf62d9 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -224,6 +224,10 @@ extern bool snd_usb_skip_validation; * playback value represents muted state instead of minimum audible volume * QUIRK_FLAG_MIXER_CAPTURE_MIN_MUTE * Similar to QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE, but for capture streams + * QUIRK_FLAG_SKIP_IFACE_SETUP + * Skip the probe-time interface setup (usb_set_interface, + * init_pitch, init_sample_rate); redundant with + * snd_usb_endpoint_prepare() at stream-open time */ enum { @@ -253,6 +257,7 @@ enum { QUIRK_TYPE_MIC_RES_384 = 23, QUIRK_TYPE_MIXER_PLAYBACK_MIN_MUTE = 24, QUIRK_TYPE_MIXER_CAPTURE_MIN_MUTE = 25, + QUIRK_TYPE_SKIP_IFACE_SETUP = 26, /* Please also edit snd_usb_audio_quirk_flag_names */ }; @@ -284,5 +289,6 @@ enum { #define QUIRK_FLAG_MIC_RES_384 QUIRK_FLAG(MIC_RES_384) #define QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE QUIRK_FLAG(MIXER_PLAYBACK_MIN_MUTE) #define QUIRK_FLAG_MIXER_CAPTURE_MIN_MUTE QUIRK_FLAG(MIXER_CAPTURE_MIN_MUTE) +#define QUIRK_FLAG_SKIP_IFACE_SETUP QUIRK_FLAG(SKIP_IFACE_SETUP) #endif /* __USBAUDIO_H */ -- cgit v1.2.3 From 0d58273be0b9c3cec3be5488ca37f6ddbaf13cf0 Mon Sep 17 00:00:00 2001 From: "Geoffrey D. Bennett" Date: Sat, 21 Feb 2026 02:36:56 +1030 Subject: ALSA: usb-audio: Skip clock selector for Focusrite devices Add QUIRK_FLAG_SKIP_CLOCK_SELECTOR for Focusrite devices. During interface parsing, snd_usb_clock_find_source() reads the clock selector value then writes it back unchanged. On Focusrite devices this redundant write results in a ~300ms delay per altsetting, adding ~1.8s to probe time on a typical device with 6 altsettings. Enabling SKIP_CLOCK_SELECTOR skips the redundant write-back. Signed-off-by: Geoffrey D. Bennett Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/00e53ae0a508b41516b41833daa17823381a649c.1771594828.git.g@b4.vu --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index b13d0f4d25ac..fbceed8e8d36 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2424,6 +2424,7 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { VENDOR_FLG(0x07fd, /* MOTU */ QUIRK_FLAG_VALIDATE_RATES), VENDOR_FLG(0x1235, /* Focusrite Novation */ + QUIRK_FLAG_SKIP_CLOCK_SELECTOR | QUIRK_FLAG_SKIP_IFACE_SETUP), VENDOR_FLG(0x1511, /* AURALiC */ QUIRK_FLAG_DSD_RAW), -- cgit v1.2.3 From 1f96b84835eafb3e6f366dc3a66c0e69504cec9d Mon Sep 17 00:00:00 2001 From: Florian Eckert Date: Thu, 5 Feb 2026 13:55:45 +0100 Subject: pinctrl: equilibrium: rename irq_chip function callbacks Renaming of the irq_chip callback functions to improve clarity. 
Signed-off-by: Florian Eckert Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-equilibrium.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/pinctrl/pinctrl-equilibrium.c b/drivers/pinctrl/pinctrl-equilibrium.c index 48b55c5bf8d4..49c8232b525a 100644 --- a/drivers/pinctrl/pinctrl-equilibrium.c +++ b/drivers/pinctrl/pinctrl-equilibrium.c @@ -23,7 +23,7 @@ #define PIN_NAME_LEN 10 #define PAD_REG_OFF 0x100 -static void eqbr_gpio_disable_irq(struct irq_data *d) +static void eqbr_irq_mask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct eqbr_gpio_ctrl *gctrl = gpiochip_get_data(gc); @@ -36,7 +36,7 @@ static void eqbr_gpio_disable_irq(struct irq_data *d) gpiochip_disable_irq(gc, offset); } -static void eqbr_gpio_enable_irq(struct irq_data *d) +static void eqbr_irq_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct eqbr_gpio_ctrl *gctrl = gpiochip_get_data(gc); @@ -50,7 +50,7 @@ static void eqbr_gpio_enable_irq(struct irq_data *d) raw_spin_unlock_irqrestore(&gctrl->lock, flags); } -static void eqbr_gpio_ack_irq(struct irq_data *d) +static void eqbr_irq_ack(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct eqbr_gpio_ctrl *gctrl = gpiochip_get_data(gc); @@ -62,10 +62,10 @@ static void eqbr_gpio_ack_irq(struct irq_data *d) raw_spin_unlock_irqrestore(&gctrl->lock, flags); } -static void eqbr_gpio_mask_ack_irq(struct irq_data *d) +static void eqbr_irq_mask_ack(struct irq_data *d) { - eqbr_gpio_disable_irq(d); - eqbr_gpio_ack_irq(d); + eqbr_irq_mask(d); + eqbr_irq_ack(d); } static inline void eqbr_cfg_bit(void __iomem *addr, @@ -92,7 +92,7 @@ static int eqbr_irq_type_cfg(struct gpio_irq_type *type, return 0; } -static int eqbr_gpio_set_irq_type(struct irq_data *d, unsigned int type) +static int eqbr_irq_set_type(struct irq_data *d, unsigned int type) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct eqbr_gpio_ctrl *gctrl = gpiochip_get_data(gc); @@ -166,11 +166,11 @@ static void eqbr_irq_handler(struct irq_desc *desc) static const struct irq_chip eqbr_irq_chip = { .name = "gpio_irq", - .irq_mask = eqbr_gpio_disable_irq, - .irq_unmask = eqbr_gpio_enable_irq, - .irq_ack = eqbr_gpio_ack_irq, - .irq_mask_ack = eqbr_gpio_mask_ack_irq, - .irq_set_type = eqbr_gpio_set_irq_type, + .irq_ack = eqbr_irq_ack, + .irq_mask = eqbr_irq_mask, + .irq_mask_ack = eqbr_irq_mask_ack, + .irq_unmask = eqbr_irq_unmask, + .irq_set_type = eqbr_irq_set_type, .flags = IRQCHIP_IMMUTABLE, GPIOCHIP_IRQ_RESOURCE_HELPERS, }; -- cgit v1.2.3 From 3e00b1b332e54ba50cca6691f628b9c06574024f Mon Sep 17 00:00:00 2001 From: Florian Eckert Date: Thu, 5 Feb 2026 13:55:46 +0100 Subject: pinctrl: equilibrium: fix warning trace on load The callback functions 'eqbr_irq_mask()' and 'eqbr_irq_ack()' are also called in the callback function 'eqbr_irq_mask_ack()'. This is done to avoid source code duplication. The problem is that the function 'eqbr_irq_mask()' also calls the gpiolib function 'gpiochip_disable_irq()'. This generates the following warning trace in the log for every GPIO on load.
[ 6.088111] ------------[ cut here ]------------ [ 6.092440] WARNING: CPU: 3 PID: 1 at drivers/gpio/gpiolib.c:3810 gpiochip_disable_irq+0x39/0x50 [ 6.097847] Modules linked in: [ 6.097847] CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Tainted: G W 6.12.59+ #0 [ 6.097847] Tainted: [W]=WARN [ 6.097847] RIP: 0010:gpiochip_disable_irq+0x39/0x50 [ 6.097847] Code: 39 c6 48 19 c0 21 c6 48 c1 e6 05 48 03 b2 38 03 00 00 48 81 fe 00 f0 ff ff 77 11 48 8b 46 08 f6 c4 02 74 06 f0 80 66 09 fb c3 <0f> 0b 90 0f 1f 40 00 c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 [ 6.097847] RSP: 0000:ffffc9000000b830 EFLAGS: 00010046 [ 6.097847] RAX: 0000000000000045 RBX: ffff888001be02a0 RCX: 0000000000000008 [ 6.097847] RDX: ffff888001be9000 RSI: ffff888001b2dd00 RDI: ffff888001be02a0 [ 6.097847] RBP: ffffc9000000b860 R08: 0000000000000000 R09: 0000000000000000 [ 6.097847] R10: 0000000000000001 R11: ffff888001b2a154 R12: ffff888001be0514 [ 6.097847] R13: ffff888001be02a0 R14: 0000000000000008 R15: 0000000000000000 [ 6.097847] FS: 0000000000000000(0000) GS:ffff888041d80000(0000) knlGS:0000000000000000 [ 6.097847] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6.097847] CR2: 0000000000000000 CR3: 0000000003030000 CR4: 00000000001026b0 [ 6.097847] Call Trace: [ 6.097847] [ 6.097847] ? eqbr_irq_mask+0x63/0x70 [ 6.097847] ? no_action+0x10/0x10 [ 6.097847] eqbr_irq_mask_ack+0x11/0x60 In an other driver (drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c) the interrupt is not disabled here. To fix this, do not call the 'eqbr_irq_mask()' and 'eqbr_irq_ack()' function. Implement instead this directly without disabling the interrupts. Fixes: 52066a53bd11 ("pinctrl: equilibrium: Convert to immutable irq_chip") Signed-off-by: Florian Eckert Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-equilibrium.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-equilibrium.c b/drivers/pinctrl/pinctrl-equilibrium.c index 49c8232b525a..ba1c867b7b89 100644 --- a/drivers/pinctrl/pinctrl-equilibrium.c +++ b/drivers/pinctrl/pinctrl-equilibrium.c @@ -64,8 +64,15 @@ static void eqbr_irq_ack(struct irq_data *d) static void eqbr_irq_mask_ack(struct irq_data *d) { - eqbr_irq_mask(d); - eqbr_irq_ack(d); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct eqbr_gpio_ctrl *gctrl = gpiochip_get_data(gc); + unsigned int offset = irqd_to_hwirq(d); + unsigned long flags; + + raw_spin_lock_irqsave(&gctrl->lock, flags); + writel(BIT(offset), gctrl->membase + GPIO_IRNENCLR); + writel(BIT(offset), gctrl->membase + GPIO_IRNCR); + raw_spin_unlock_irqrestore(&gctrl->lock, flags); } static inline void eqbr_cfg_bit(void __iomem *addr, -- cgit v1.2.3 From 09a30b7a035f9f4ac918c8a9af89d70e43462152 Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Mon, 9 Feb 2026 09:33:44 +0530 Subject: pinctrl: qcom: qcs615: Add missing dual edge GPIO IRQ errata flag Wakeup capable GPIOs uses PDC as parent IRQ chip and PDC on qcs615 do not support dual edge IRQs. Add missing wakeirq_dual_edge_errata configuration to enable workaround for dual edge GPIO IRQs. 
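To make the dual-edge errata above a bit more concrete: when the parent interrupt controller can only latch a single edge, the usual workaround is to re-arm the trigger with the polarity opposite to the current pin level after every interrupt, retrying if the line changes underneath. A self-contained sketch of that idea follows; the real workaround lives in the shared Qualcomm pinctrl code, and gpio_read_level() and set_trigger_polarity() here are made-up stand-ins for the register accessors:

#include <stdbool.h>
#include <stdio.h>

static bool line_level;    /* pretend hardware: current input level */

static bool gpio_read_level(void)
{
    return line_level;
}

static void set_trigger_polarity(bool rising)
{
    printf("armed %s-edge trigger\n", rising ? "rising" : "falling");
}

/*
 * Emulate a dual-edge trigger on single-edge hardware: always wait for
 * the edge opposite to the current level, and re-check the level after
 * programming so a quick toggle cannot slip through unlatched.
 */
static void rearm_dual_edge(void)
{
    bool level;

    do {
        level = gpio_read_level();
        set_trigger_polarity(!level);   /* high -> falling, low -> rising */
    } while (gpio_read_level() != level);
}

int main(void)
{
    line_level = true;
    rearm_dual_edge();
    return 0;
}

Setting wakeirq_dual_edge_errata tells the driver that wake-capable GPIOs routed to the PDC need the driver-side workaround rather than relying on the parent to latch both edges.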
Fixes: b698f36a9d40 ("pinctrl: qcom: add the tlmm driver for QCS615 platform") Signed-off-by: Maulik Shah Reviewed-by: Dmitry Baryshkov Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-qcs615.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/qcom/pinctrl-qcs615.c b/drivers/pinctrl/qcom/pinctrl-qcs615.c index 4dfa820d4e77..f1c827ddbfbf 100644 --- a/drivers/pinctrl/qcom/pinctrl-qcs615.c +++ b/drivers/pinctrl/qcom/pinctrl-qcs615.c @@ -1067,6 +1067,7 @@ static const struct msm_pinctrl_soc_data qcs615_tlmm = { .ntiles = ARRAY_SIZE(qcs615_tiles), .wakeirq_map = qcs615_pdc_map, .nwakeirq_map = ARRAY_SIZE(qcs615_pdc_map), + .wakeirq_dual_edge_errata = true, }; static const struct of_device_id qcs615_tlmm_of_match[] = { -- cgit v1.2.3 From 32e0a7ad9c841f46549ccac0f1cca347a40d8685 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 20 Feb 2026 17:34:51 +0800 Subject: gpio: shared: fix memory leaks On a Snapdragon X1 Elite laptop (Lenovo Yoga Slim 7x), kmemleak reports three sets of: unreferenced object 0xffff00080187f400 (size 1024): comm "swapper/0", pid 1, jiffies 4294667327 hex dump (first 32 bytes): 58 bd 70 01 08 00 ff ff 58 bd 70 01 08 00 ff ff X.p.....X.p..... 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ backtrace (crc 1665d1f8): kmemleak_alloc+0xf4/0x12c __kmalloc_cache_noprof+0x370/0x49c gpio_shared_make_ref+0x70/0x16c gpio_shared_of_traverse+0x4e8/0x5f4 gpio_shared_of_traverse+0x200/0x5f4 gpio_shared_of_traverse+0x200/0x5f4 gpio_shared_of_traverse+0x200/0x5f4 gpio_shared_of_traverse+0x200/0x5f4 gpio_shared_init+0x34/0x1c4 do_one_initcall+0x50/0x280 kernel_init_freeable+0x290/0x33c kernel_init+0x28/0x14c ret_from_fork+0x10/0x20 unreferenced object 0xffff00080170c140 (size 8): comm "swapper/0", pid 1, jiffies 4294667327 hex dump (first 8 bytes): 72 65 73 65 74 00 00 00 reset... backtrace (crc fc24536): kmemleak_alloc+0xf4/0x12c __kmalloc_node_track_caller_noprof+0x3c4/0x584 kstrdup+0x4c/0xcc gpio_shared_make_ref+0x8c/0x16c gpio_shared_of_traverse+0x4e8/0x5f4 gpio_shared_of_traverse+0x200/0x5f4 gpio_shared_of_traverse+0x200/0x5f4 gpio_shared_of_traverse+0x200/0x5f4 gpio_shared_of_traverse+0x200/0x5f4 gpio_shared_init+0x34/0x1c4 do_one_initcall+0x50/0x280 kernel_init_freeable+0x290/0x33c kernel_init+0x28/0x14c ret_from_fork+0x10/0x20 Fix this by decrementing the reference count of each list entry rather than only the first. Fix verified on the same laptop. 
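The gpiolib fix above comes down to iterating the whole refs list instead of dropping only its first entry, using the _safe list iterator because every node is freed while walking. The same pattern in standalone form, using a plain singly linked list instead of the kernel's list_head purely for illustration:

#include <stdio.h>
#include <stdlib.h>

struct ref {
    struct ref *next;
    int id;
};

static void drop_ref(struct ref *r)
{
    printf("dropping ref %d\n", r->id);
    free(r);
}

int main(void)
{
    struct ref *head = NULL;

    /* Build a short list of three refs. */
    for (int i = 0; i < 3; i++) {
        struct ref *r = malloc(sizeof(*r));
        if (!r)
            return 1;
        r->id = i;
        r->next = head;
        head = r;
    }

    /*
     * Dropping only the first entry (the old behaviour) leaks the
     * rest. Walk them all, caching ->next before each free -- the
     * same reason the fix uses list_for_each_entry_safe().
     */
    for (struct ref *r = head, *next; r; r = next) {
        next = r->next;
        drop_ref(r);
    }

    return 0;
}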
Fixes: a060b8c511abb gpiolib: implement low-level, shared GPIO support Signed-off-by: Daniel J Blueman Link: https://patch.msgid.link/20260220093452.101655-1-daniel@quora.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-shared.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpiolib-shared.c b/drivers/gpio/gpiolib-shared.c index d2614ace4de1..17a7128b6bd9 100644 --- a/drivers/gpio/gpiolib-shared.c +++ b/drivers/gpio/gpiolib-shared.c @@ -748,14 +748,14 @@ static bool gpio_shared_entry_is_really_shared(struct gpio_shared_entry *entry) static void gpio_shared_free_exclusive(void) { struct gpio_shared_entry *entry, *epos; + struct gpio_shared_ref *ref, *rpos; list_for_each_entry_safe(entry, epos, &gpio_shared_list, list) { if (gpio_shared_entry_is_really_shared(entry)) continue; - gpio_shared_drop_ref(list_first_entry(&entry->refs, - struct gpio_shared_ref, - list)); + list_for_each_entry_safe(ref, rpos, &entry->refs, list) + gpio_shared_drop_ref(ref); gpio_shared_drop_entry(entry); } } -- cgit v1.2.3 From aa280a08e7d8fae58557acc345b36b3dc329d595 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Tue, 6 Jan 2026 13:15:04 +0000 Subject: x86/fred: Correct speculative safety in fred_extint() array_index_nospec() is no use if the result gets spilled to the stack, as it makes the believed safe-under-speculation value subject to memory predictions. For all practical purposes, this means array_index_nospec() must be used in the expression that accesses the array. As the code currently stands, it's the wrong side of irqentry_enter(), and 'index' is put into %ebp across the function call. Remove the index variable and reposition array_index_nospec(), so it's calculated immediately before the array access. Fixes: 14619d912b65 ("x86/fred: FRED entry/exit and dispatch code") Signed-off-by: Andrew Cooper Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260106131504.679932-1-andrew.cooper3@citrix.com --- arch/x86/entry/entry_fred.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index a9b72997103d..88c757ac8ccd 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -160,8 +160,6 @@ void __init fred_complete_exception_setup(void) static noinstr void fred_extint(struct pt_regs *regs) { unsigned int vector = regs->fred_ss.vector; - unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR, - NR_SYSTEM_VECTORS); if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR)) return; @@ -170,7 +168,8 @@ static noinstr void fred_extint(struct pt_regs *regs) irqentry_state_t state = irqentry_enter(regs); instrumentation_begin(); - sysvec_table[index](regs); + sysvec_table[array_index_nospec(vector - FIRST_SYSTEM_VECTOR, + NR_SYSTEM_VECTORS)](regs); instrumentation_end(); irqentry_exit(regs, state); } else { -- cgit v1.2.3 From a0cb371b521dde44f32cfe954b6ef6f82b407393 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Sat, 10 Jan 2026 11:47:37 +0800 Subject: x86/bug: Handle __WARN_printf() trap in early_fixup_exception() The commit 5b472b6e5bd9 ("x86_64/bug: Implement __WARN_printf()") implemented __WARN_printf(), which changed the mechanism to use UD1 instead of UD2. However, it only handles the trap in the runtime IDT handler, while the early booting IDT handler lacks this handling. As a result, the usage of WARN() before the runtime IDT setup can lead to kernel crashes. 
Since KMSAN is enabled after the runtime IDT setup, it is safe to use handle_bug() directly in early_fixup_exception() to address this issue. Fixes: 5b472b6e5bd9 ("x86_64/bug: Implement __WARN_printf()") Signed-off-by: Hou Wenlong Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/c4fb3645f60d3a78629d9870e8fcc8535281c24f.1768016713.git.houwenlong.hwl@antgroup.com --- arch/x86/include/asm/traps.h | 2 ++ arch/x86/kernel/traps.c | 2 +- arch/x86/mm/extable.c | 7 ++----- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 869b88061801..3f24cc472ce9 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -25,6 +25,8 @@ extern int ibt_selftest_noendbr(void); void handle_invalid_op(struct pt_regs *regs); #endif +noinstr bool handle_bug(struct pt_regs *regs); + static inline int get_si_code(unsigned long condition) { if (condition & DR_STEP) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5a6a772e0a6c..4dbff8ef9b1c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -397,7 +397,7 @@ static inline void handle_invalid_op(struct pt_regs *regs) ILL_ILLOPN, error_get_trap_addr(regs)); } -static noinstr bool handle_bug(struct pt_regs *regs) +noinstr bool handle_bug(struct pt_regs *regs) { unsigned long addr = regs->ip; bool handled = false; diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 2fdc1f1f5adb..6b9ff1c6cafa 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -411,14 +411,11 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) return; if (trapnr == X86_TRAP_UD) { - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { - /* Skip the ud2. */ - regs->ip += LEN_UD2; + if (handle_bug(regs)) return; - } /* - * If this was a BUG and report_bug returns or if this + * If this was a BUG and handle_bug returns or if this * was just a normal #UD, we want to continue onward and * crash. */ -- cgit v1.2.3 From 24c8147abb39618d74fcc36e325765e8fe7bdd7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2026 13:59:43 +0100 Subject: x86/cfi: Fix CFI rewrite for odd alignments Rustam reported his clang builds did not boot properly; turns out his .config has: CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B=y set. Fix up the FineIBT code to deal with this unusual alignment. 
Fixes: 931ab63664f0 ("x86/ibt: Implement FineIBT") Reported-by: Rustam Kovhaev Signed-off-by: Peter Zijlstra (Intel) Tested-by: Rustam Kovhaev --- arch/x86/include/asm/cfi.h | 12 ++++++++---- arch/x86/include/asm/linkage.h | 4 ++-- arch/x86/kernel/alternative.c | 29 ++++++++++++++++++++++------- arch/x86/net/bpf_jit_comp.c | 13 ++----------- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index c40b9ebc1fb4..ab3fbbd947ed 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -111,6 +111,12 @@ extern bhi_thunk __bhi_args_end[]; struct pt_regs; +#ifdef CONFIG_CALL_PADDING +#define CFI_OFFSET (CONFIG_FUNCTION_PADDING_CFI+5) +#else +#define CFI_OFFSET 5 +#endif + #ifdef CONFIG_CFI enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); #define __bpfcall @@ -119,11 +125,9 @@ static inline int cfi_get_offset(void) { switch (cfi_mode) { case CFI_FINEIBT: - return 16; + return /* fineibt_prefix_size */ 16; case CFI_KCFI: - if (IS_ENABLED(CONFIG_CALL_PADDING)) - return 16; - return 5; + return CFI_OFFSET; default: return 0; } diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 9d38ae744a2e..a7294656ad90 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -68,7 +68,7 @@ * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the * CFI symbol layout changes. * - * Without CALL_THUNKS: + * Without CALL_PADDING: * * .align FUNCTION_ALIGNMENT * __cfi_##name: @@ -77,7 +77,7 @@ * .long __kcfi_typeid_##name * name: * - * With CALL_THUNKS: + * With CALL_PADDING: * * .align FUNCTION_ALIGNMENT * __cfi_##name: diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a888ae0f01fb..e87da25d1236 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1182,7 +1182,7 @@ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) poison_endbr(addr); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16); + poison_cfi(addr - CFI_OFFSET); } } @@ -1389,6 +1389,8 @@ extern u8 fineibt_preamble_end[]; #define fineibt_preamble_ud 0x13 #define fineibt_preamble_hash 5 +#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE) + /* * : * 0: b8 78 56 34 12 mov $0x12345678, %eax @@ -1634,7 +1636,7 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) * have determined there are no indirect calls to it and we * don't need no CFI either. */ - if (!is_endbr(addr + 16)) + if (!is_endbr(addr + CFI_OFFSET)) continue; hash = decode_preamble_hash(addr, &arity); @@ -1642,6 +1644,15 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) addr, addr, 5, addr)) return -EINVAL; + /* + * FineIBT relies on being at func-16, so if the preamble is + * actually larger than that, place it the tail end. + * + * NOTE: this is possible with things like DEBUG_CALL_THUNKS + * and DEBUG_FORCE_FUNCTION_ALIGN_64B. 
+ */ + addr += CFI_OFFSET - fineibt_prefix_size; + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); text_poke_early(addr + fineibt_preamble_hash, &hash, 4); @@ -1664,10 +1675,10 @@ static void cfi_rewrite_endbr(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - if (!exact_endbr(addr + 16)) + if (!exact_endbr(addr + CFI_OFFSET)) continue; - poison_endbr(addr + 16); + poison_endbr(addr + CFI_OFFSET); } } @@ -1772,7 +1783,8 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, if (FINEIBT_WARN(fineibt_preamble_size, 20) || FINEIBT_WARN(fineibt_preamble_bhi + fineibt_bhi1_size, 20) || FINEIBT_WARN(fineibt_caller_size, 14) || - FINEIBT_WARN(fineibt_paranoid_size, 20)) + FINEIBT_WARN(fineibt_paranoid_size, 20) || + WARN_ON_ONCE(CFI_OFFSET < fineibt_prefix_size)) return; if (cfi_mode == CFI_AUTO) { @@ -1885,6 +1897,11 @@ static void poison_cfi(void *addr) */ switch (cfi_mode) { case CFI_FINEIBT: + /* + * FineIBT preamble is at func-16. + */ + addr += CFI_OFFSET - fineibt_prefix_size; + /* * FineIBT prefix should start with an ENDBR. */ @@ -1923,8 +1940,6 @@ static void poison_cfi(void *addr) } } -#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE) - /* * When regs->ip points to a 0xD6 byte in the FineIBT preamble, * return true and fill out target and type. diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8f10080e6fe3..e9b78040d703 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -438,17 +438,8 @@ static void emit_kcfi(u8 **pprog, u32 hash) EMIT1_off32(0xb8, hash); /* movl $hash, %eax */ #ifdef CONFIG_CALL_PADDING - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); + for (int i = 0; i < CONFIG_FUNCTION_PADDING_CFI; i++) + EMIT1(0x90); #endif EMIT_ENDBR(); -- cgit v1.2.3 From 237dc6a054f6787c2a8f61c59086030267e5e1c5 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 18 Dec 2025 19:20:29 +0100 Subject: x86/headers: Replace __ASSEMBLY__ stragglers with __ASSEMBLER__ After converting the __ASSEMBLY__ statements to __ASSEMBLER__ in commit 24a295e4ef1ca ("x86/headers: Replace __ASSEMBLY__ with __ASSEMBLER__ in non-UAPI headers"), some new code has been added that uses __ASSEMBLY__ again. Convert these stragglers, too. This is a mechanical patch, done with a simple "sed -i" command. 
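For readers unfamiliar with the convention being converted above: these guards exist so one header can be included from both C and assembly sources, with the C-only parts (structs, prototypes, inline functions) compiled out of the assembler pass. __ASSEMBLER__ is predefined by gcc and clang when preprocessing assembly, whereas the older __ASSEMBLY__ had to be defined by the kernel's own assembly build rules. A generic illustration of the pattern, not one of the headers touched by this patch:

/* example.h -- shared between .c and .S files */
#ifndef EXAMPLE_H
#define EXAMPLE_H

#define EXAMPLE_FLAG    0x1    /* visible to both C and assembly */

#ifndef __ASSEMBLER__
/* C-only declarations: the assembler would choke on these. */
struct example_state {
    unsigned long flags;
};

static inline int example_flag_set(const struct example_state *s)
{
    return s->flags & EXAMPLE_FLAG;
}
#endif /* __ASSEMBLER__ */

#endif /* EXAMPLE_H */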
Signed-off-by: Thomas Huth Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251218182029.166993-1-thuth@redhat.com --- arch/x86/include/asm/bug.h | 6 +++--- arch/x86/include/asm/irqflags.h | 4 ++-- arch/x86/include/asm/percpu.h | 2 +- arch/x86/include/asm/runtime-const.h | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 9b4e04690e1a..80c1696d8d59 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -7,7 +7,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct bug_entry; extern void __WARN_trap(struct bug_entry *bug, ...); #endif @@ -137,7 +137,7 @@ do { \ #ifdef HAVE_ARCH_BUG_FORMAT_ARGS -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include DECLARE_STATIC_CALL(WARN_trap, __WARN_trap); @@ -153,7 +153,7 @@ struct arch_va_list { struct sysv_va_list args; }; extern void *__warn_args(struct arch_va_list *args, struct pt_regs *regs); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define __WARN_bug_entry(flags, format) ({ \ struct bug_entry *bug; \ diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index a1193e9d65f2..462754b0bf8a 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -77,7 +77,7 @@ static __always_inline void native_local_irq_restore(unsigned long flags) #endif #ifndef CONFIG_PARAVIRT -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Used in the idle loop; sti takes one instruction cycle * to complete: @@ -95,7 +95,7 @@ static __always_inline void halt(void) { native_halt(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_PARAVIRT */ #ifdef CONFIG_PARAVIRT_XXL diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index c55058f3d75e..409981468cba 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -20,7 +20,7 @@ #define PER_CPU_VAR(var) __percpu(var)__percpu_rel -#else /* !__ASSEMBLY__: */ +#else /* !__ASSEMBLER__: */ #include #include diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h index e5a13dc8816e..4cd94fdcb45e 100644 --- a/arch/x86/include/asm/runtime-const.h +++ b/arch/x86/include/asm/runtime-const.h @@ -6,7 +6,7 @@ #error "Cannot use runtime-const infrastructure from modules" #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro RUNTIME_CONST_PTR sym reg movq $0x0123456789abcdef, %\reg @@ -16,7 +16,7 @@ .popsection .endm -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define runtime_const_ptr(sym) ({ \ typeof(sym) __ret; \ @@ -74,5 +74,5 @@ static inline void runtime_const_fixup(void (*fn)(void *, unsigned long), } } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif -- cgit v1.2.3 From b3d99f43c72b56cf7a104a364e7fb34b0702828b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Feb 2026 15:28:16 +0100 Subject: sched/fair: Fix zero_vruntime tracking It turns out that zero_vruntime tracking is broken when there is but a single task running. Current update paths are through __{en,de}queue_entity(), and when there is but a single task, pick_next_task() will always return that one task, and put_prev_set_next_task() will end up in neither function. This can cause entity_key() to grow indefinitely large and cause overflows, leading to much pain and suffering. 
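A rough, standalone illustration of why a stuck zero_vruntime eventually hurts; this is a toy model in plain C, not the scheduler code, and the per-iteration growth is exaggerated so the wrap shows up quickly:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint64_t vruntime = 0, zero_vruntime = 0;   /* zero_vruntime never advances */

          for (int tick = 0; tick < 5; tick++) {
                  vruntime += UINT64_C(1) << 62;
                  int64_t key = (int64_t)(vruntime - zero_vruntime);   /* entity_key() analogue */
                  printf("tick %d: key=%lld\n", tick, (long long)key);
          }
          /* The key climbs, then wraps negative; any ordering or eligibility
           * decision based on it silently inverts at that point. */
          return 0;
  }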
Furthermore, doing update_zero_vruntime() from __{de,en}queue_entity(), which are called from {set_next,put_prev}_entity() has problems because: - set_next_entity() calls __dequeue_entity() before it does cfs_rq->curr = se. This means the avg_vruntime() will see the removal but not current, missing the entity for accounting. - put_prev_entity() calls __enqueue_entity() before it does cfs_rq->curr = NULL. This means the avg_vruntime() will see the addition *and* current, leading to double accounting. Both cases are incorrect/inconsistent. Noting that avg_vruntime is already called on each {en,de}queue, remove the explicit avg_vruntime() calls (which removes an extra 64bit division for each {en,de}queue) and have avg_vruntime() update zero_vruntime itself. Additionally, have the tick call avg_vruntime() -- discarding the result, but for the side-effect of updating zero_vruntime. While there, optimize avg_vruntime() by noting that the average of one value is rather trivial to compute. Test case: # taskset -c -p 1 $$ # taskset -c 2 bash -c 'while :; do :; done&' # cat /sys/kernel/debug/sched/debug | awk '/^cpu#/ {P=0} /^cpu#2,/ {P=1} {if (P) print $0}' | grep -e zero_vruntime -e "^>" PRE: .zero_vruntime : 31316.407903 >R bash 487 50787.345112 E 50789.145972 2.800000 50780.298364 16 120 0.000000 0.000000 0.000000 / .zero_vruntime : 382548.253179 >R bash 487 427275.204288 E 427276.003584 2.800000 427268.157540 23 120 0.000000 0.000000 0.000000 / POST: .zero_vruntime : 17259.709467 >R bash 526 17259.709467 E 17262.509467 2.800000 16915.031624 9 120 0.000000 0.000000 0.000000 / .zero_vruntime : 18702.723356 >R bash 526 18702.723356 E 18705.523356 2.800000 18358.045513 9 120 0.000000 0.000000 0.000000 / Fixes: 79f3f9bedd14 ("sched/eevdf: Fix min_vruntime vs avg_vruntime") Reported-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20260219080624.438854780%40infradead.org --- kernel/sched/fair.c | 84 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 27 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eea99ec01a3f..56dddd4fd208 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -589,6 +589,21 @@ static inline bool entity_before(const struct sched_entity *a, return vruntime_cmp(a->deadline, "<", b->deadline); } +/* + * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale + * and this value should be no more than two lag bounds. Which puts it in the + * general order of: + * + * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT + * + * which is around 44 bits in size (on 64bit); that is 20 for + * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for + * however many msec the actual slice+tick ends up being. + * + * (disregarding the actual divide-by-weight part makes for the worst case + * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually + * being the zero-lag point).
+ */ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime); @@ -676,39 +691,61 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline -void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta) { /* - * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight + * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight */ cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta; + cfs_rq->zero_vruntime += delta; } /* - * Specifically: avg_runtime() + 0 must result in entity_eligible() := true + * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true * For this to be so, the result of this function must have a left bias. + * + * Called in: + * - place_entity() -- before enqueue + * - update_entity_lag() -- before dequeue + * - entity_tick() + * + * This means it is one entry 'behind' but that puts it close enough to where + * the bound on entity_key() is at most two lag bounds. */ u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->sum_w_vruntime; - long load = cfs_rq->sum_weight; + long weight = cfs_rq->sum_weight; + s64 delta = 0; - if (curr && curr->on_rq) { - unsigned long weight = scale_load_down(curr->load.weight); + if (curr && !curr->on_rq) + curr = NULL; - avg += entity_key(cfs_rq, curr) * weight; - load += weight; - } + if (weight) { + s64 runtime = cfs_rq->sum_w_vruntime; + + if (curr) { + unsigned long w = scale_load_down(curr->load.weight); + + runtime += entity_key(cfs_rq, curr) * w; + weight += w; + } - if (load) { /* sign flips effective floor / ceiling */ - if (avg < 0) - avg -= (load - 1); - avg = div_s64(avg, load); + if (runtime < 0) + runtime -= (weight - 1); + + delta = div_s64(runtime, weight); + } else if (curr) { + /* + * When there is but one element, it is the average. + */ + delta = curr->vruntime - cfs_rq->zero_vruntime; } - return cfs_rq->zero_vruntime + avg; + update_zero_vruntime(cfs_rq, delta); + + return cfs_rq->zero_vruntime; } /* @@ -777,16 +814,6 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) return vruntime_eligible(cfs_rq, se->vruntime); } -static void update_zero_vruntime(struct cfs_rq *cfs_rq) -{ - u64 vruntime = avg_vruntime(cfs_rq); - s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); - - sum_w_vruntime_update(cfs_rq, delta); - - cfs_rq->zero_vruntime = vruntime; -} - static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) { struct sched_entity *root = __pick_root_entity(cfs_rq); @@ -856,7 +883,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { sum_w_vruntime_add(cfs_rq, se); - update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, @@ -868,7 +894,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); sum_w_vruntime_sub(cfs_rq, se); - update_zero_vruntime(cfs_rq); } struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) @@ -5524,6 +5549,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_load_avg(cfs_rq, curr, UPDATE_TG); update_cfs_group(curr); + /* + * Pulls along cfs_rq::zero_vruntime. 
+ */ + avg_vruntime(cfs_rq); + #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother -- cgit v1.2.3 From bcd74b2ffdd0a2233adbf26b65c62fc69a809c8e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Jan 2026 16:49:09 +0100 Subject: sched/fair: Only set slice protection at pick time We should not (re)set slice protection in the sched_change pattern which calls put_prev_task() / set_next_task(). Fixes: 63304558ba5d ("sched/eevdf: Curb wakeup-preemption") Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: K Prateek Nayak Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20260219080624.561421378%40infradead.org --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56dddd4fd208..f2b46c33a8c5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5445,7 +5445,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } static void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first) { clear_buddies(cfs_rq, se); @@ -5460,7 +5460,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - set_protect_slice(cfs_rq, se); + if (first) + set_protect_slice(cfs_rq, se); } update_stats_curr_start(cfs_rq, se); @@ -8978,13 +8979,13 @@ again: pse = parent_entity(pse); } if (se_depth >= pse_depth) { - set_next_entity(cfs_rq_of(se), se); + set_next_entity(cfs_rq_of(se), se, true); se = parent_entity(se); } } put_prev_entity(cfs_rq, pse); - set_next_entity(cfs_rq, se); + set_next_entity(cfs_rq, se, true); __set_next_task_fair(rq, p, true); } @@ -13598,7 +13599,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - set_next_entity(cfs_rq, se); + set_next_entity(cfs_rq, se, first); /* ensure bandwidth has been allocated on our new cfs_rq */ account_cfs_rq_runtime(cfs_rq, 0); } -- cgit v1.2.3 From ff38424030f98976150e42ca35f4b00e6ab8fa23 Mon Sep 17 00:00:00 2001 From: Wang Tao Date: Tue, 20 Jan 2026 12:31:13 +0000 Subject: sched/eevdf: Update se->vprot in reweight_entity() In the EEVDF framework with Run-to-Parity protection, `se->vprot` is an independent variable defining the virtual protection timestamp. When `reweight_entity()` is called (e.g., via nice/renice), it performs the following actions to preserve Lag consistency: 1. Scales `se->vlag` based on the new weight. 2. Calls `place_entity()`, which recalculates `se->vruntime` based on the new weight and scaled lag. However, the current implementation fails to update `se->vprot`, leading to mismatches between the task's actual runtime and its expected duration. 
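The fix follows the same relative-then-rescale dance already used for the deadline. A simplified, standalone sketch of the idea (helper names are invented; this is not the kernel code):

  #include <stdint.h>

  /* A span of virtual time is inversely proportional to the weight, so a
   * span expressed against the old weight converts as span * old / new. */
  static int64_t rescale_vspan(int64_t vspan, uint64_t old_w, uint64_t new_w)
  {
          return vspan * (int64_t)old_w / (int64_t)new_w;
  }

  /* Mirror of the patch: capture vprot relative to vruntime before the
   * reweight, rescale it, then re-anchor it on the new vruntime. */
  static void example_reweight(uint64_t *vruntime, uint64_t *vprot,
                               uint64_t old_w, uint64_t new_w, uint64_t new_vruntime)
  {
          int64_t rel = (int64_t)(*vprot - *vruntime);    /* remaining protection */

          rel = rescale_vspan(rel, old_w, new_w);
          *vruntime = new_vruntime;                       /* as done by place_entity() */
          *vprot = *vruntime + rel;
  }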
Fixes: 63304558ba5d ("sched/eevdf: Curb wakeup-preemption") Suggested-by: Zhang Qiao Signed-off-by: Wang Tao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: K Prateek Nayak Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20260120123113.3518950-1-wangtao554@huawei.com --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f2b46c33a8c5..93fa5b8313e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3815,6 +3815,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; + bool rel_vprot = false; + u64 vprot; if (se->on_rq) { /* commit outstanding execution time */ @@ -3822,6 +3824,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_entity_lag(cfs_rq, se); se->deadline -= se->vruntime; se->rel_deadline = 1; + if (curr && protect_slice(se)) { + vprot = se->vprot - se->vruntime; + rel_vprot = true; + } + cfs_rq->nr_queued--; if (!curr) __dequeue_entity(cfs_rq, se); @@ -3837,6 +3844,9 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (se->rel_deadline) se->deadline = div_s64(se->deadline * se->load.weight, weight); + if (rel_vprot) + vprot = div_s64(vprot * se->load.weight, weight); + update_load_set(&se->load, weight); do { @@ -3848,6 +3858,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { place_entity(cfs_rq, se, 0); + if (rel_vprot) + se->vprot = se->vruntime + vprot; update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); -- cgit v1.2.3 From 6e3c0a4e1ad1e0455b7880fad02b3ee179f56c09 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Apr 2025 12:16:28 +0200 Subject: sched/fair: Fix lag clamp Vincent reported that he was seeing undue lag clamping in a mixed slice workload. Implement the max_slice tracking as per the todo comment. Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Reported-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Vincent Guittot Tested-by: K Prateek Nayak Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20250422101628.GA33555@noisy.programming.kicks-ass.net --- include/linux/sched.h | 1 + kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 074ad4ef3d81..a7b4a980eb2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -579,6 +579,7 @@ struct sched_entity { u64 deadline; u64 min_vruntime; u64 min_slice; + u64 max_slice; struct list_head group_node; unsigned char on_rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 93fa5b8313e4..f4446cbe8ffa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -748,6 +748,8 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) return cfs_rq->zero_vruntime; } +static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq); + /* * lag_i = S - s_i = w_i * (V - v_i) * @@ -761,17 +763,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) * EEVDF gives the following limit for a steady state system: * * -r_max < lag < max(r_max, q) - * - * XXX could add max_slice to the augmented data to track this. 
*/ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { + u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC; s64 vlag, limit; WARN_ON_ONCE(!se->on_rq); vlag = avg_vruntime(cfs_rq) - se->vruntime; - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + limit = calc_delta_fair(max_slice, se); se->vlag = clamp(vlag, -limit, limit); } @@ -829,6 +830,21 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) return min_slice; } +static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq) +{ + struct sched_entity *root = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 max_slice = 0ULL; + + if (curr && curr->on_rq) + max_slice = curr->slice; + + if (root) + max_slice = max(max_slice, root->max_slice); + + return max_slice; +} + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) { return entity_before(__node_2_se(a), __node_2_se(b)); @@ -853,6 +869,15 @@ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *n } } +static inline void __max_slice_update(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (rse->max_slice > se->max_slice) + se->max_slice = rse->max_slice; + } +} + /* * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ @@ -860,6 +885,7 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { u64 old_min_vruntime = se->min_vruntime; u64 old_min_slice = se->min_slice; + u64 old_max_slice = se->max_slice; struct rb_node *node = &se->run_node; se->min_vruntime = se->vruntime; @@ -870,8 +896,13 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) __min_slice_update(se, node->rb_right); __min_slice_update(se, node->rb_left); + se->max_slice = se->slice; + __max_slice_update(se, node->rb_right); + __max_slice_update(se, node->rb_left); + return se->min_vruntime == old_min_vruntime && - se->min_slice == old_min_slice; + se->min_slice == old_min_slice && + se->max_slice == old_max_slice; } RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, -- cgit v1.2.3 From 4c652a47722f69c6f2685f05b17490ea97f643a8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Feb 2026 08:41:13 +0100 Subject: rseq: Mark rseq_arm_slice_extension_timer() __always_inline objtool warns about this function being called inside of a uaccess section: kernel/entry/common.o: warning: objtool: irqentry_exit+0x1dc: call to rseq_arm_slice_extension_timer() with UACCESS enabled Interestingly, this happens with CONFIG_RSEQ_SLICE_EXTENSION disabled, so this is an empty function, as the normal implementation is already marked __always_inline. I could reproduce this multiple times with gcc-11 but not with gcc-15, so the compiler probably got better at identifying the trivial function. Mark all the empty helpers for !RSEQ_SLICE_EXTENSION as __always_inline for consistency, avoiding this warning. 
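A compact sketch of the stub pattern being enforced; the names are invented, and the point is simply that a plain static inline may still be emitted as an out-of-line call by older compilers, which objtool rejects if it happens inside a user-access (STAC) region:

  #ifdef CONFIG_EXAMPLE_FEATURE
  bool example_feature_hook(void);
  #else
  /* Must never become a real call: callers may sit between
   * user_access_begin() and user_access_end(). */
  static __always_inline bool example_feature_hook(void) { return false; }
  #endif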
Fixes: 0ac3b5c3dc45 ("rseq: Implement time slice extension enforcement timer") Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260206074122.709580-1-arnd@kernel.org --- include/linux/rseq_entry.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index cbc4a791618b..c6831c93cd6e 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -216,10 +216,10 @@ efault: } #else /* CONFIG_RSEQ_SLICE_EXTENSION */ -static inline bool rseq_slice_extension_enabled(void) { return false; } -static inline bool rseq_arm_slice_extension_timer(void) { return false; } -static inline void rseq_slice_clear_grant(struct task_struct *t) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static __always_inline bool rseq_slice_extension_enabled(void) { return false; } +static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; } +static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { } +static __always_inline bool rseq_grant_slice_extension(bool work_pending) { return false; } #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); -- cgit v1.2.3 From 5324953c06bd929c135d9e04be391ee2c11b5a19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Feb 2026 17:33:29 +0100 Subject: sched/core: Fix wakeup_preempt's next_class tracking Kernel test robot reported that tools/testing/selftests/kvm/hardware_disable_test was failing due to commit 704069649b5b ("sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*()") It turns out there were two related problems that could lead to a missed preemption: - when hitting newidle balance from the idle thread, it would elevate rb->next_class from &idle_sched_class to &fair_sched_class, causing later wakeup_preempt() calls to not hit the sched_class_above() case, and not issue resched_curr(). Notably, this modification pattern should only lower the next_class, and never raise it. Create two new helper functions to wrap this. - when doing schedule_idle(), it was possible to miss (re)setting rq->next_class to &idle_sched_class, leading to the very same problem. 
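A toy model of the invariant, with integers standing in for sched classes (smaller value means higher priority, mirroring sched_class_above(a, b) == (a < b)); illustrative only, not the kernel code:

  #include <stdio.h>

  enum { TOY_RT = 1, TOY_FAIR = 2, TOY_IDLE = 3 };

  static int next_class;

  /* Only ever lower next_class (move it toward lower priority), never raise it. */
  static void toy_rq_modified_begin(int class)
  {
          if (next_class < class)         /* next_class currently above class */
                  next_class = class;
  }

  static void toy_wakeup_preempt(int waking_class)
  {
          /* Preempt only if the waking class is strictly above what the last
           * pick saw; a wrongly *raised* next_class hides this case. */
          printf(waking_class < next_class ? "resched\n" : "no resched\n");
  }

  int main(void)
  {
          next_class = TOY_IDLE;          /* the idle task is running */
          /* newidle balance announces fair work; the helper keeps IDLE here,
           * whereas the old unconditional write raised it to FAIR. */
          toy_rq_modified_begin(TOY_FAIR);
          toy_wakeup_preempt(TOY_FAIR);   /* prints "resched"; with the old
                                             behaviour it would not */
          return 0;
  }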
Cc: Sean Christopherson Fixes: 704069649b5b ("sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*()") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202602122157.4e861298-lkp@intel.com Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260218163329.GQ1395416@noisy.programming.kicks-ass.net --- kernel/sched/core.c | 1 + kernel/sched/ext.c | 4 ++-- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 11 +++++++++++ 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 759777694c78..b7f77c165a6e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6830,6 +6830,7 @@ static void __sched notrace __schedule(int sched_mode) /* SCX must consult the BPF scheduler to tell if rq is empty */ if (!rq->nr_running && !scx_enabled()) { next = prev; + rq->next_class = &idle_sched_class; goto picked; } } else if (!preempt && prev_state) { diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 62b1f3ac5630..06cc0a4aec66 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2460,7 +2460,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) /* see kick_cpus_irq_workfn() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); - rq->next_class = &ext_sched_class; + rq_modified_begin(rq, &ext_sched_class); rq_unpin_lock(rq, rf); balance_one(rq, prev); @@ -2475,7 +2475,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) * If @force_scx is true, always try to pick a SCHED_EXT task, * regardless of any higher-priority sched classes activity. */ - if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class)) + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) return RETRY_TASK; keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f4446cbe8ffa..bf948db905ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12982,7 +12982,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t0 = sched_clock_cpu(this_cpu); __sched_balance_update_blocked_averages(this_rq); - this_rq->next_class = &fair_sched_class; + rq_modified_begin(this_rq, &fair_sched_class); raw_spin_rq_unlock(this_rq); for_each_domain(this_cpu, sd) { @@ -13049,7 +13049,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) pulled_task = 1; /* If a higher prio class was modified, restart the pick */ - if (sched_class_above(this_rq->next_class, &fair_sched_class)) + if (rq_modified_above(this_rq, &fair_sched_class)) pulled_task = -1; out: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b82fb70a9d54..43bbf0693cca 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2748,6 +2748,17 @@ static inline const struct sched_class *next_active_class(const struct sched_cla #define sched_class_above(_a, _b) ((_a) < (_b)) +static inline void rq_modified_begin(struct rq *rq, const struct sched_class *class) +{ + if (sched_class_above(rq->next_class, class)) + rq->next_class = class; +} + +static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class) +{ + return sched_class_above(rq->next_class, class); +} + static inline bool sched_stop_runnable(struct rq *rq) { return rq->stop && task_on_rq_queued(rq->stop); -- cgit v1.2.3 From 26d43a90be81fc90e26688a51d3ec83188602731 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 20 Feb 2026 15:06:40 -0500 Subject: rseq: Clarify rseq registration rseq_size bound check 
comment The rseq registration validates that the rseq_size argument is greater or equal to 32 (the original rseq size), but the comment associated with this check does not clearly state this. Clarify the comment to that effect. Fixes: ee3e3ac05c26 ("rseq: Introduce extensible rseq ABI") Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260220200642.1317826-2-mathieu.desnoyers@efficios.com --- kernel/rseq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rseq.c b/kernel/rseq.c index b0973d19f366..e349f86cc945 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -449,8 +449,9 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq * size, the required alignment is the original struct rseq alignment. * - * In order to be valid, rseq_len is either the original rseq size, or - * large enough to contain all supported fields, as communicated to + * The rseq_len is required to be greater or equal to the original rseq + * size. In order to be valid, rseq_len is either the original rseq size, + * or large enough to contain all supported fields, as communicated to * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. */ if (rseq_len < ORIG_RSEQ_SIZE || -- cgit v1.2.3 From 3b68df978133ac3d46d570af065a73debbb68248 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 20 Feb 2026 15:06:41 -0500 Subject: rseq: slice ext: Ensure rseq feature size differs from original rseq size Before rseq became extensible, its original size was 32 bytes even though the active rseq area was only 20 bytes. This had the following impact in terms of userspace ecosystem evolution: * The GNU libc between 2.35 and 2.39 expose a __rseq_size symbol set to 32, even though the size of the active rseq area is really 20. * The GNU libc 2.40 changes this __rseq_size to 20, thus making it express the active rseq area. * Starting from glibc 2.41, __rseq_size corresponds to the AT_RSEQ_FEATURE_SIZE from getauxval(3). This means that users of __rseq_size can always expect it to correspond to the active rseq area, except for the value 32, for which the active rseq area is 20 bytes. Exposing a 32 bytes feature size would make life needlessly painful for userspace. Therefore, add a reserved field at the end of the rseq area to bump the feature size to 33 bytes. This reserved field is expected to be replaced with whatever field will come next, expecting that this field will be larger than 1 byte. The effect of this change is to increase the size from 32 to 64 bytes before we actually have fields using that memory. Clarify the allocation size and alignment requirements in the struct rseq uapi comment. Change the value returned by getauxval(AT_RSEQ_ALIGN) to return the value of the active rseq area size rounded up to next power of 2, which guarantees that the rseq structure will always be aligned on the nearest power of two large enough to contain it, even as it grows. Change the alignment check in the rseq registration accordingly. This will minimize the amount of ABI corner-cases we need to document and require userspace to play games with. 
The rule stays simple when __rseq_size != 32: #define rseq_field_available(field) (__rseq_size >= offsetofend(struct rseq_abi, field)) Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260220200642.1317826-3-mathieu.desnoyers@efficios.com --- fs/binfmt_elf.c | 3 ++- include/linux/rseq.h | 12 ++++++++++++ include/uapi/linux/rseq.h | 26 ++++++++++++++++++++++---- kernel/rseq.c | 3 ++- 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8e89cc5b2820..fb857faaf0d6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -286,7 +287,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, } #ifdef CONFIG_RSEQ NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end)); - NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq)); + NEW_AUX_ENT(AT_RSEQ_ALIGN, rseq_alloc_align()); #endif #undef NEW_AUX_ENT /* AT_NULL is zero; clear the rest too */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 7a01a0760405..b9d62fc2140d 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -146,6 +146,18 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) t->rseq = current->rseq; } +/* + * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq + * registration. This is the active rseq area size rounded up to next + * power of 2, which guarantees that the rseq structure will always be + * aligned on the nearest power of two large enough to contain it, even + * as it grows. + */ +static inline unsigned int rseq_alloc_align(void) +{ + return 1U << get_count_order(offsetof(struct rseq, end)); +} + #else /* CONFIG_RSEQ */ static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 863c4a00a66b..f69344fe6c08 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -87,10 +87,17 @@ struct rseq_slice_ctrl { }; /* - * struct rseq is aligned on 4 * 8 bytes to ensure it is always - * contained within a single cache-line. + * The original size and alignment of the allocation for struct rseq is + * 32 bytes. * - * A single struct rseq per thread is allowed. + * The allocation size needs to be greater or equal to + * max(getauxval(AT_RSEQ_FEATURE_SIZE), 32), and the allocation needs to + * be aligned on max(getauxval(AT_RSEQ_ALIGN), 32). + * + * As an alternative, userspace is allowed to use both the original size + * and alignment of 32 bytes for backward compatibility. + * + * A single active struct rseq registration per thread is allowed. */ struct rseq { /* @@ -180,10 +187,21 @@ struct rseq { */ struct rseq_slice_ctrl slice_ctrl; + /* + * Before rseq became extensible, its original size was 32 bytes even + * though the active rseq area was only 20 bytes. + * Exposing a 32 bytes feature size would make life needlessly painful + * for userspace. Therefore, add a reserved byte after byte 32 + * to bump the rseq feature size from 32 to 33. + * The next field to be added to the rseq area will be larger + * than one byte, and will replace this reserved byte. + */ + __u8 __reserved; + /* * Flexible array member at end of structure, after last feature field. 
*/ char end[]; -} __attribute__((aligned(4 * sizeof(__u64)))); +} __attribute__((aligned(32))); #endif /* _UAPI_LINUX_RSEQ_H */ diff --git a/kernel/rseq.c b/kernel/rseq.c index e349f86cc945..38d3ef540760 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -456,7 +457,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 */ if (rseq_len < ORIG_RSEQ_SIZE || (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) -- cgit v1.2.3 From 486ff5ad49bc50315bcaf6d45f04a33ef0a45ced Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Jun 2025 21:51:05 -0700 Subject: perf/core: Fix invalid wait context in ctx_sched_in() Lockdep found a bug in the event scheduling when a pinned event was failed and wakes up the threads in the ring buffer like below. It seems it should not grab a wait-queue lock under perf-context lock. Let's do it with irq_work. [ 39.913691] ============================= [ 39.914157] [ BUG: Invalid wait context ] [ 39.914623] 6.15.0-next-20250530-next-2025053 #1 Not tainted [ 39.915271] ----------------------------- [ 39.915731] repro/837 is trying to lock: [ 39.916191] ffff88801acfabd8 (&event->waitq){....}-{3:3}, at: __wake_up+0x26/0x60 [ 39.917182] other info that might help us debug this: [ 39.917761] context-{5:5} [ 39.918079] 4 locks held by repro/837: [ 39.918530] #0: ffffffff8725cd00 (rcu_read_lock){....}-{1:3}, at: __perf_event_task_sched_in+0xd1/0xbc0 [ 39.919612] #1: ffff88806ca3c6f8 (&cpuctx_lock){....}-{2:2}, at: __perf_event_task_sched_in+0x1a7/0xbc0 [ 39.920748] #2: ffff88800d91fc18 (&ctx->lock){....}-{2:2}, at: __perf_event_task_sched_in+0x1f9/0xbc0 [ 39.921819] #3: ffffffff8725cd00 (rcu_read_lock){....}-{1:3}, at: perf_event_wakeup+0x6c/0x470 Fixes: f4b07fd62d4d ("perf/core: Use POLLHUP for a pinned event in error") Closes: https://lore.kernel.org/lkml/aD2w50VDvGIH95Pf@ly-workstation Reported-by: "Lai, Yi" Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Tested-by: "Lai, Yi" Link: https://patch.msgid.link/20250603045105.1731451-1-namhyung@kernel.org --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index ac70d68217b6..4f86d22f281a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4138,7 +4138,8 @@ static int merge_sched_in(struct perf_event *event, void *data) if (*perf_event_fasync(event)) event->pending_kill = POLL_ERR; - perf_event_wakeup(event); + event->pending_wakeup = 1; + irq_work_queue(&event->pending_irq); } else { struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); -- cgit v1.2.3 From 6a8a48644c4b804123e59dbfc5d6cd29a0194046 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Mon, 9 Feb 2026 16:52:25 -0800 Subject: perf/x86/intel/uncore: Add per-scheduler IMC CAS count events IMC on SPR and EMR does not support sub-channels. In contrast, CPUs that use gnr_uncores[] (e.g. Granite Rapids and Sierra Forest) implement two command schedulers (SCH0/SCH1) per memory channel, providing logically independent command and data paths. Do not reuse the spr_uncore_imc[] configuration for these CPUs. 
Instead, introduce a dedicated gnr_uncore_imc[] with per-scheduler events, so userspace can monitor SCH0 and SCH1 independently. On these CPUs, replace cas_count_{read,write} with cas_count_{read,write}_sch{0,1}. This may break existing userspace that relies on cas_count_{read,write}, prompting it to switch to the per-scheduler events, as the legacy event reports only partial traffic (SCH0). Fixes: 632c4bf6d007 ("perf/x86/intel/uncore: Support Granite Rapids") Fixes: cb4a6ccf3583 ("perf/x86/intel/uncore: Support Sierra Forest and Grand Ridge") Reported-by: Reinette Chatre Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260210005225.20311-1-zide.chen@intel.com --- arch/x86/events/intel/uncore_snbep.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 5ed6e0b7e715..0a1d08136cc1 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6497,6 +6497,32 @@ static struct intel_uncore_type gnr_uncore_ubox = { .attr_update = uncore_alias_groups, }; +static struct uncore_event_desc gnr_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x01,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0, "event=0x05,umask=0xcf"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1, "event=0x06,umask=0xcf"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0, "event=0x05,umask=0xf0"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1, "event=0x06,umask=0xf0"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type gnr_uncore_imc = { + SPR_UNCORE_MMIO_COMMON_FORMAT(), + .name = "imc", + .fixed_ctr_bits = 48, + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, + .event_descs = gnr_uncore_imc_events, +}; + static struct intel_uncore_type gnr_uncore_pciex8 = { SPR_UNCORE_PCI_COMMON_FORMAT(), .name = "pciex8", @@ -6544,7 +6570,7 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = { NULL, &spr_uncore_pcu, &gnr_uncore_ubox, - &spr_uncore_imc, + &gnr_uncore_imc, NULL, &gnr_uncore_upi, NULL, -- cgit v1.2.3 From 77de62ad3de3967818c3dbe656b7336ebee461d2 Mon Sep 17 00:00:00 2001 From: Haocheng Yu Date: Tue, 3 Feb 2026 00:20:56 +0800 Subject: perf/core: Fix refcount bug and potential UAF in perf_mmap Syzkaller reported a refcount_t: addition on 0; use-after-free warning in perf_mmap. The issue is caused by a race condition between a failing mmap() setup and a concurrent mmap() on a dependent event (e.g., using output redirection). In perf_mmap(), the ring_buffer (rb) is allocated and assigned to event->rb with the mmap_mutex held. The mutex is then released to perform map_range(). If map_range() fails, perf_mmap_close() is called to clean up. 
However, since the mutex was dropped, another thread attaching to this event (via inherited events or output redirection) can acquire the mutex, observe the valid event->rb pointer, and attempt to increment its reference count. If the cleanup path has already dropped the reference count to zero, this results in a use-after-free or refcount saturation warning. Fix this by extending the scope of mmap_mutex to cover the map_range() call. This ensures that the ring buffer initialization and mapping (or cleanup on failure) happens atomically effectively, preventing other threads from accessing a half-initialized or dying ring buffer. Closes: https://lore.kernel.org/oe-kbuild-all/202602020208.m7KIjdzW-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Haocheng Yu Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260202162057.7237-1-yuhaocheng035@gmail.com --- kernel/events/core.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4f86d22f281a..22a0f405585b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7465,28 +7465,28 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = perf_mmap_aux(vma, event, nr_pages); if (ret) return ret; - } - /* - * Since pinned accounting is per vm we cannot allow fork() to copy our - * vma. - */ - vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); - vma->vm_ops = &perf_mmap_vmops; + /* + * Since pinned accounting is per vm we cannot allow fork() to copy our + * vma. + */ + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); + vma->vm_ops = &perf_mmap_vmops; - mapped = get_mapped(event, event_mapped); - if (mapped) - mapped(event, vma->vm_mm); + mapped = get_mapped(event, event_mapped); + if (mapped) + mapped(event, vma->vm_mm); - /* - * Try to map it into the page table. On fail, invoke - * perf_mmap_close() to undo the above, as the callsite expects - * full cleanup in this case and therefore does not invoke - * vmops::close(). - */ - ret = map_range(event->rb, vma); - if (ret) - perf_mmap_close(vma); + /* + * Try to map it into the page table. On fail, invoke + * perf_mmap_close() to undo the above, as the callsite expects + * full cleanup in this case and therefore does not invoke + * vmops::close(). + */ + ret = map_range(event->rb, vma); + if (ret) + perf_mmap_close(vma); + } return ret; } -- cgit v1.2.3 From eb4a7139e97374f42b7242cc754e77f1623fbcd5 Mon Sep 17 00:00:00 2001 From: Jouni Högander Date: Thu, 12 Feb 2026 08:27:31 +0200 Subject: drm/i915/alpm: ALPM disable fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PORT_ALPM_CTL is supposed to be written only before link training. Remove writing it from ALPM disable. Also clearing ALPM_CTL_ALPM_AUX_LESS_ENABLE and is not about disabling ALPM but switching to AUX-Wake ALPM. Stop touching this bit on ALPM disable. 
Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/7153 Fixes: 1ccbf135862b ("drm/i915/psr: Enable ALPM on source side for eDP Panel replay") Cc: Animesh Manna Cc: Jani Nikula Cc: # v6.10+ Signed-off-by: Jouni Högander Reviewed-by: Michał Grzelak Link: https://patch.msgid.link/20260212062731.397801-1-jouni.hogander@intel.com (cherry picked from commit 008304c9ae75c772d3460040de56e12112cdf5e6) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_alpm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_alpm.c b/drivers/gpu/drm/i915/display/intel_alpm.c index 7ce8c674bb03..07ffee38974b 100644 --- a/drivers/gpu/drm/i915/display/intel_alpm.c +++ b/drivers/gpu/drm/i915/display/intel_alpm.c @@ -562,12 +562,7 @@ void intel_alpm_disable(struct intel_dp *intel_dp) mutex_lock(&intel_dp->alpm.lock); intel_de_rmw(display, ALPM_CTL(display, cpu_transcoder), - ALPM_CTL_ALPM_ENABLE | ALPM_CTL_LOBF_ENABLE | - ALPM_CTL_ALPM_AUX_LESS_ENABLE, 0); - - intel_de_rmw(display, - PORT_ALPM_CTL(cpu_transcoder), - PORT_ALPM_CTL_ALPM_AUX_LESS_ENABLE, 0); + ALPM_CTL_ALPM_ENABLE | ALPM_CTL_LOBF_ENABLE, 0); drm_dbg_kms(display->drm, "Disabling ALPM\n"); mutex_unlock(&intel_dp->alpm.lock); -- cgit v1.2.3 From ec2cceadfae72304ca19650f9cac4b2a97b8a2fc Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 19 Feb 2026 10:51:33 +0100 Subject: gpiolib: normalize the return value of gc->get() on behalf of buggy drivers Commit 86ef402d805d ("gpiolib: sanitize the return value of gpio_chip::get()") started checking the return value of the .get() callback in struct gpio_chip. Now - almost a year later - it turns out that there are quite a few drivers in tree that can break with this change. Partially revert it: normalize the return value in GPIO core but also emit a warning. Cc: stable@vger.kernel.org Fixes: 86ef402d805d ("gpiolib: sanitize the return value of gpio_chip::get()") Reported-by: Dmitry Torokhov Closes: https://lore.kernel.org/all/aZSkqGTqMp_57qC7@google.com/ Reviewed-by: Linus Walleij Reviewed-by: Dmitry Torokhov Link: https://patch.msgid.link/20260219-gpiolib-set-normalize-v2-1-f84630e45796@oss.qualcomm.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 86a171e96b0e..ada572aaebd6 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3267,8 +3267,12 @@ static int gpiochip_get(struct gpio_chip *gc, unsigned int offset) /* Make sure this is called after checking for gc->get(). */ ret = gc->get(gc, offset); - if (ret > 1) - ret = -EBADE; + if (ret > 1) { + gpiochip_warn(gc, + "invalid return value from gc->get(): %d, consider fixing the driver\n", + ret); + ret = !!ret; + } return ret; } -- cgit v1.2.3 From af12e64ae0661546e8b4f5d30d55c5f53a11efe7 Mon Sep 17 00:00:00 2001 From: Felix Gu Date: Tue, 20 Jan 2026 22:26:46 +0800 Subject: mmc: mmci: Fix device_node reference leak in of_get_dml_pipe_index() When calling of_parse_phandle_with_args(), the caller is responsible to call of_node_put() to release the reference of device node. In of_get_dml_pipe_index(), it does not release the reference. 
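For reference, the general shape of the pattern the fix restores: of_parse_phandle_with_args() takes a reference on the node it returns in args.np, and the caller must drop it with of_node_put() once the arguments have been consumed. A simplified sketch, not the driver's exact code:

  #include <linux/of.h>

  static int example_get_arg0(struct device_node *np, const char *name)
  {
          struct of_phandle_args args;
          int index;

          index = of_property_match_string(np, "dma-names", name);
          if (index < 0)
                  return -ENODEV;

          if (of_parse_phandle_with_args(np, "dmas", "#dma-cells", index, &args))
                  return -ENODEV;

          of_node_put(args.np);           /* drop the reference taken above */

          return args.args_count ? args.args[0] : -ENODEV;
  }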
Fixes: 9cb15142d0e3 ("mmc: mmci: Add qcom dml support to the driver.") Signed-off-by: Felix Gu Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/mmci_qcom_dml.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/mmci_qcom_dml.c b/drivers/mmc/host/mmci_qcom_dml.c index 3da6112fbe39..67371389cc33 100644 --- a/drivers/mmc/host/mmci_qcom_dml.c +++ b/drivers/mmc/host/mmci_qcom_dml.c @@ -109,6 +109,7 @@ static int of_get_dml_pipe_index(struct device_node *np, const char *name) &dma_spec)) return -ENODEV; + of_node_put(dma_spec.np); if (dma_spec.args_count) return dma_spec.args[0]; -- cgit v1.2.3 From 6465a8bbb0f6ad98aeb66dc9ea19c32c193a610b Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Fri, 16 Jan 2026 08:55:30 +0800 Subject: mmc: dw_mmc-rockchip: Fix runtime PM support for internal phase support RK3576 is the first platform to introduce internal phase support, and subsequent platforms are expected to adopt a similar design. In this architecture, runtime suspend powers off the attached power domain, which resets registers, including vendor-specific ones such as SDMMC_TIMING_CON0, SDMMC_TIMING_CON1, and SDMMC_MISC_CON. These registers must be saved and restored, a requirement that falls outside the scope of the dw_mmc core. Fixes: 59903441f5e4 ("mmc: dw_mmc-rockchip: Add internal phase support") Signed-off-by: Shawn Lin Tested-by: Marco Schirrmeister Reviewed-by: Heiko Stuebner Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc-rockchip.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/dw_mmc-rockchip.c b/drivers/mmc/host/dw_mmc-rockchip.c index 4e3423a19bdf..ac069d0c42b2 100644 --- a/drivers/mmc/host/dw_mmc-rockchip.c +++ b/drivers/mmc/host/dw_mmc-rockchip.c @@ -36,6 +36,8 @@ struct dw_mci_rockchip_priv_data { int default_sample_phase; int num_phases; bool internal_phase; + int sample_phase; + int drv_phase; }; /* @@ -573,9 +575,43 @@ static void dw_mci_rockchip_remove(struct platform_device *pdev) dw_mci_pltfm_remove(pdev); } +static int dw_mci_rockchip_runtime_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct dw_mci *host = platform_get_drvdata(pdev); + struct dw_mci_rockchip_priv_data *priv = host->priv; + + if (priv->internal_phase) { + priv->sample_phase = rockchip_mmc_get_phase(host, true); + priv->drv_phase = rockchip_mmc_get_phase(host, false); + } + + return dw_mci_runtime_suspend(dev); +} + +static int dw_mci_rockchip_runtime_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct dw_mci *host = platform_get_drvdata(pdev); + struct dw_mci_rockchip_priv_data *priv = host->priv; + int ret; + + ret = dw_mci_runtime_resume(dev); + if (ret) + return ret; + + if (priv->internal_phase) { + rockchip_mmc_set_phase(host, true, priv->sample_phase); + rockchip_mmc_set_phase(host, false, priv->drv_phase); + mci_writel(host, MISC_CON, MEM_CLK_AUTOGATE_ENABLE); + } + + return ret; +} + static const struct dev_pm_ops dw_mci_rockchip_dev_pm_ops = { SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) - RUNTIME_PM_OPS(dw_mci_runtime_suspend, dw_mci_runtime_resume, NULL) + RUNTIME_PM_OPS(dw_mci_rockchip_runtime_suspend, dw_mci_rockchip_runtime_resume, NULL) }; static struct platform_driver dw_mci_rockchip_pltfm_driver = { -- cgit v1.2.3 From 79ad471530e0baef0dce991816013df55e401d9c Mon Sep 17 00:00:00 2001 From: Kamal Dasu Date: Mon, 16 Feb 2026 14:15:43 
-0500 Subject: mmc: sdhci-brcmstb: use correct register offset for V1 pin_sel restore The restore path for SDIO_CFG_CORE_V1 was incorrectly using SDIO_CFG_SD_PIN_SEL (offset 0x44) instead of SDIO_CFG_V1_SD_PIN_SEL (offset 0x54), causing the wrong register to be written on resume. The save path already uses the correct V1-specific offset. This affects BCM7445 and BCM72116 platforms which use the V1 config core. Fixes: b7e614802e3f ("mmc: sdhci-brcmstb: save and restore registers during PM") Signed-off-by: Kamal Dasu Cc: stable@vger.kernel.org Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-brcmstb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-brcmstb.c b/drivers/mmc/host/sdhci-brcmstb.c index c9442499876c..57e45951644e 100644 --- a/drivers/mmc/host/sdhci-brcmstb.c +++ b/drivers/mmc/host/sdhci-brcmstb.c @@ -116,7 +116,7 @@ static void sdhci_brcmstb_restore_regs(struct mmc_host *mmc, enum cfg_core_ver v writel(sr->boot_main_ctl, priv->boot_regs + SDIO_BOOT_MAIN_CTL); if (ver == SDIO_CFG_CORE_V1) { - writel(sr->sd_pin_sel, cr + SDIO_CFG_SD_PIN_SEL); + writel(sr->sd_pin_sel, cr + SDIO_CFG_V1_SD_PIN_SEL); return; } -- cgit v1.2.3 From 6510e1324bcdc8caf21f6d17efe27604c48f0d64 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 23 Feb 2026 09:36:16 +0000 Subject: ASoC: cs42l43: Report insert for exotic peripherals For some exotic peripherals the type detect can return a reserved value of 0x4. This will currently return an error and not report anything to user-space, update this to report the insert normally. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260223093616.3800350-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43-jack.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/codecs/cs42l43-jack.c b/sound/soc/codecs/cs42l43-jack.c index b83bc4de1301..3e04e6897b14 100644 --- a/sound/soc/codecs/cs42l43-jack.c +++ b/sound/soc/codecs/cs42l43-jack.c @@ -699,6 +699,7 @@ static int cs42l43_run_type_detect(struct cs42l43_codec *priv) switch (type & CS42L43_HSDET_TYPE_STS_MASK) { case 0x0: // CTIA case 0x1: // OMTP + case 0x4: return cs42l43_run_load_detect(priv, true); case 0x2: // 3-pole return cs42l43_run_load_detect(priv, false); -- cgit v1.2.3 From 901084c51a0a8fb42a3f37d2e9c62083c495f824 Mon Sep 17 00:00:00 2001 From: Penghe Geng Date: Thu, 19 Feb 2026 15:29:54 -0500 Subject: mmc: core: Avoid bitfield RMW for claim/retune flags Move claimed and retune control flags out of the bitfield word to avoid unrelated RMW side effects in asynchronous contexts. The host->claimed bit shared a word with retune flags. Writes to claimed in __mmc_claim_host() or retune_now in mmc_mq_queue_rq() can overwrite other bits when concurrent updates happen in other contexts, triggering spurious WARN_ON(!host->claimed). Convert claimed, can_retune, retune_now and retune_paused to bool to remove shared-word coupling. 
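A toy illustration of the underlying hazard, not the mmc code: single-bit fields that share a storage word can only be written through a read-modify-write of that word, so concurrent writers of different bits can silently undo each other:

  #include <stdbool.h>

  struct flags_shared {
          unsigned int claimed:1;
          unsigned int retune_now:1;      /* shares the word with 'claimed' */
  };

  struct flags_split {
          bool claimed;                   /* own byte: no cross-field RMW */
          bool retune_now;
  };

  /* CPU A: s.claimed = 1     => tmp = word; tmp |= CLAIMED_BIT;  word = tmp;
   * CPU B: s.retune_now = 1  => tmp = word; tmp |= RETUNE_BIT;   word = tmp;
   * If both load the word before either stores it back, one update is lost,
   * e.g. 'claimed' flips back to 0 and WARN_ON(!host->claimed) fires. */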
Fixes: 6c0cedd1ef952 ("mmc: core: Introduce host claiming by context") Fixes: 1e8e55b67030c ("mmc: block: Add CQE support") Cc: stable@vger.kernel.org Suggested-by: Adrian Hunter Signed-off-by: Penghe Geng Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e0e2c265e5d1..ba84f02c2a10 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -486,14 +486,12 @@ struct mmc_host { struct mmc_ios ios; /* current io bus settings */ + bool claimed; /* host exclusively claimed */ + /* group bitfields together to minimize padding */ unsigned int use_spi_crc:1; - unsigned int claimed:1; /* host exclusively claimed */ unsigned int doing_init_tune:1; /* initial tuning in progress */ - unsigned int can_retune:1; /* re-tuning can be used */ unsigned int doing_retune:1; /* re-tuning in progress */ - unsigned int retune_now:1; /* do re-tuning at next req */ - unsigned int retune_paused:1; /* re-tuning is temporarily disabled */ unsigned int retune_crc_disable:1; /* don't trigger retune upon crc */ unsigned int can_dma_map_merge:1; /* merging can be used */ unsigned int vqmmc_enabled:1; /* vqmmc regulator is enabled */ @@ -508,6 +506,9 @@ struct mmc_host { int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ + bool can_retune; /* re-tuning can be used */ + bool retune_now; /* do re-tuning at next req */ + bool retune_paused; /* re-tuning is temporarily disabled */ int need_retune; /* re-tuning is needed */ int hold_retune; /* hold off re-tuning */ unsigned int retune_period; /* re-tuning period in secs */ -- cgit v1.2.3 From 7a73801fdaf8aee90d23ba77976082a48d156a21 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 22 Dec 2025 21:29:41 +0100 Subject: pmdomain: imx: gpcv2: Discard pm_runtime_put() return value Passing pm_runtime_put() return value to the callers is not particularly useful. Returning an error code from pm_runtime_put() merely means that it has not queued up a work item to check whether or not the device can be suspended and there are many perfectly valid situations in which that can happen, like after writing "on" to the devices' runtime PM "control" attribute in sysfs for one example. Accordingly, update imx_pgc_domain_suspend() to simply discard the return value of pm_runtime_put() and always return success to the caller. This will facilitate a planned change of the pm_runtime_put() return type to void in the future. Signed-off-by: Rafael J. Wysocki Acked-by: Peng Fan Acked-by: Ulf Hansson Link: https://patch.msgid.link/15658107.tv2OnDr8pf@rafael.j.wysocki --- drivers/pmdomain/imx/gpcv2.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pmdomain/imx/gpcv2.c b/drivers/pmdomain/imx/gpcv2.c index cff738e4d546..a829f8da5be7 100644 --- a/drivers/pmdomain/imx/gpcv2.c +++ b/drivers/pmdomain/imx/gpcv2.c @@ -1416,7 +1416,9 @@ static int imx_pgc_domain_suspend(struct device *dev) static int imx_pgc_domain_resume(struct device *dev) { - return pm_runtime_put(dev); + pm_runtime_put(dev); + + return 0; } #endif -- cgit v1.2.3 From 3afd8df024339c7da1a5a0302f3987866dd16e40 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 22 Dec 2025 21:36:25 +0100 Subject: PM: runtime: Change pm_runtime_put() return type to void The primary role of pm_runtime_put() is to decrement the runtime PM usage counter of the given device. 
It always does that regardless of the value returned by it later. In addition, if the runtime PM usage counter after decrementation turns out to be zero, a work item is queued up to check whether or not the device can be suspended. This is not guaranteed to succeed though and even if it is successful, the device may still not be suspended going forward. There are multiple valid reasons why pm_runtime_put() may not decide to queue up the work item mentioned above, including, but not limited to, the case when user space has written "on" to the device's runtime PM "control" file in sysfs. In all of those cases, pm_runtime_put() returns a negative error code (even though the device's runtime PM usage counter has been successfully decremented by it) which is very confusing. In fact, its return value should only be used for debug purposes and care should be taken when doing it even in that case. Accordingly, to avoid the confusion mentioned above, change the return type of pm_runtime_put() to void. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Reviewed-by: Brian Norris Link: https://patch.msgid.link/14387202.RDIVbhacDa@rafael.j.wysocki --- include/linux/pm_runtime.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 41037c513f06..64921b10ac74 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -545,22 +545,10 @@ static inline int pm_runtime_resume_and_get(struct device *dev) * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_idle(). - * - * Return: - * * 1: Success. Usage counter dropped to zero, but device was already suspended. - * * 0: Success. - * * -EINVAL: Runtime PM error. - * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status - * change ongoing. - * * -EBUSY: Runtime PM child_count non-zero. - * * -EPERM: Device PM QoS resume latency 0. - * * -EINPROGRESS: Suspend already in progress. - * * -ENOSYS: CONFIG_PM not enabled. */ -static inline int pm_runtime_put(struct device *dev) +static inline void pm_runtime_put(struct device *dev) { - return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); + __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } /** -- cgit v1.2.3 From 32fc4168fa56f6301d858c778a3d712774e9657e Mon Sep 17 00:00:00 2001 From: Azamat Almazbek uulu Date: Sat, 21 Feb 2026 12:48:13 +0100 Subject: ASoC: amd: yc: Add ASUS EXPERTBOOK BM1503CDA to quirk table The ASUS ExpertBook BM1503CDA (Ryzen 5 7535U, Barcelo-R) has an internal DMIC connected through the AMD ACP (Audio CoProcessor) but is missing from the DMI quirk table, so the acp6x machine driver probe returns -ENODEV and no DMIC capture device is created. Add the DMI entry so the internal microphone works out of the box. 
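For context, quirk tables like this are consulted once at probe time; a hedged sketch of that lookup follows (the surrounding function and return handling are illustrative, not copied from acp6x-mach.c):

  #include <linux/dmi.h>

  static int example_probe_match(void)
  {
          const struct dmi_system_id *dmi_id;

          dmi_id = dmi_first_match(yc_acp_quirk_table);
          if (!dmi_id)
                  return -ENODEV;         /* board not listed: no ACP DMIC registered */

          /* dmi_id->driver_data points at the matching card description. */
          return 0;
  }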
Signed-off-by: Azamat Almazbek uulu Reviewed-by: Vijendar Mukunda Link: https://patch.msgid.link/20260221114813.5610-1-almazbek1608@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index f1a63475100d..7af4daeb4c6f 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -703,6 +703,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Vivobook_ASUSLaptop M6501RR_M6501RR"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "ASUS EXPERTBOOK BM1503CDA"), + } + }, {} }; -- cgit v1.2.3 From 4b73231b2a61c4142a027613d277a19c484dfcc3 Mon Sep 17 00:00:00 2001 From: Yufan Chen Date: Sun, 22 Feb 2026 18:40:35 +0800 Subject: regulator: tps65185: check devm_kzalloc() result in probe tps65185_probe() dereferences the allocation result immediately by using data->regmap. If devm_kzalloc() returns NULL under memory pressure, this leads to a NULL pointer dereference. Add the missing allocation check and return -ENOMEM on failure. Signed-off-by: Yufan Chen Link: https://patch.msgid.link/20260222104035.90790-1-ericterminal@gmail.com Signed-off-by: Mark Brown --- drivers/regulator/tps65185.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/regulator/tps65185.c b/drivers/regulator/tps65185.c index 3286c9ab33d0..786622d8d598 100644 --- a/drivers/regulator/tps65185.c +++ b/drivers/regulator/tps65185.c @@ -332,6 +332,9 @@ static int tps65185_probe(struct i2c_client *client) int i; data = devm_kzalloc(&client->dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->regmap = devm_regmap_init_i2c(client, ®map_config); if (IS_ERR(data->regmap)) return dev_err_probe(&client->dev, PTR_ERR(data->regmap), -- cgit v1.2.3 From f895e5df80316a308c2f7d64d13a78494630ea05 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 12 Feb 2026 21:52:48 -0600 Subject: ipmi:si: Don't block module unload if the BMC is messed up If the BMC is in a bad state, don't bother waiting for queues messages since there can't be any. Otherwise the unload is blocked until the BMC is back in a good state. Reported-by: Rafael J. Wysocki Fixes: bc3a9d217755 ("ipmi:si: Gracefully handle if the BMC is non-functional") Cc: stable@vger.kernel.org # 4.18 Signed-off-by: Corey Minyard Reviewed-by: Rafael J. Wysocki (Intel) --- drivers/char/ipmi/ipmi_si_intf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 0049e3792ba1..3667033fcc51 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -2234,7 +2234,8 @@ static void wait_msg_processed(struct smi_info *smi_info) unsigned long jiffies_now; long time_diff; - while (smi_info->curr_msg || (smi_info->si_state != SI_NORMAL)) { + while (smi_info->si_state != SI_HOSED && + (smi_info->curr_msg || (smi_info->si_state != SI_NORMAL))) { jiffies_now = jiffies; time_diff = (((long)jiffies_now - (long)smi_info->last_timeout_jiffies) * SI_USEC_PER_JIFFY); -- cgit v1.2.3 From 62cd145453d577113f993efd025f258dd86aa183 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 12 Feb 2026 21:56:54 -0600 Subject: ipmi:msghandler: Handle error returns from the SMI sender It used to be, until recently, that the sender operation on the low level interfaces would not fail. 
That's not the case any more with recent changes. So check the return value from the sender operation, and propagate it back up from there and handle the errors in all places. Reported-by: Rafael J. Wysocki Fixes: bc3a9d217755 ("ipmi:si: Gracefully handle if the BMC is non-functional") Cc: stable@vger.kernel.org # 4.18 Signed-off-by: Corey Minyard Reviewed-by: Rafael J. Wysocki (Intel) --- drivers/char/ipmi/ipmi_msghandler.c | 100 ++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 32 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index a042b1596933..f8c3c1e44520 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -1887,19 +1887,32 @@ static struct ipmi_smi_msg *smi_add_send_msg(struct ipmi_smi *intf, return smi_msg; } -static void smi_send(struct ipmi_smi *intf, +static int smi_send(struct ipmi_smi *intf, const struct ipmi_smi_handlers *handlers, struct ipmi_smi_msg *smi_msg, int priority) { int run_to_completion = READ_ONCE(intf->run_to_completion); unsigned long flags = 0; + int rv = 0; ipmi_lock_xmit_msgs(intf, run_to_completion, &flags); smi_msg = smi_add_send_msg(intf, smi_msg, priority); ipmi_unlock_xmit_msgs(intf, run_to_completion, &flags); - if (smi_msg) - handlers->sender(intf->send_info, smi_msg); + if (smi_msg) { + rv = handlers->sender(intf->send_info, smi_msg); + if (rv) { + ipmi_lock_xmit_msgs(intf, run_to_completion, &flags); + intf->curr_msg = NULL; + ipmi_unlock_xmit_msgs(intf, run_to_completion, &flags); + /* + * Something may have been added to the transmit + * queue, so schedule a check for that. + */ + queue_work(system_wq, &intf->smi_work); + } + } + return rv; } static bool is_maintenance_mode_cmd(struct kernel_ipmi_msg *msg) @@ -2312,6 +2325,7 @@ static int i_ipmi_request(struct ipmi_user *user, struct ipmi_recv_msg *recv_msg; int run_to_completion = READ_ONCE(intf->run_to_completion); int rv = 0; + bool in_seq_table = false; if (supplied_recv) { recv_msg = supplied_recv; @@ -2365,33 +2379,50 @@ static int i_ipmi_request(struct ipmi_user *user, rv = i_ipmi_req_ipmb(intf, addr, msgid, msg, smi_msg, recv_msg, source_address, source_lun, retries, retry_time_ms); + in_seq_table = true; } else if (is_ipmb_direct_addr(addr)) { rv = i_ipmi_req_ipmb_direct(intf, addr, msgid, msg, smi_msg, recv_msg, source_lun); } else if (is_lan_addr(addr)) { rv = i_ipmi_req_lan(intf, addr, msgid, msg, smi_msg, recv_msg, source_lun, retries, retry_time_ms); + in_seq_table = true; } else { - /* Unknown address type. */ + /* Unknown address type. */ ipmi_inc_stat(intf, sent_invalid_commands); rv = -EINVAL; } - if (rv) { -out_err: - if (!supplied_smi) - ipmi_free_smi_msg(smi_msg); - if (!supplied_recv) - ipmi_free_recv_msg(recv_msg); - } else { + if (!rv) { dev_dbg(intf->si_dev, "Send: %*ph\n", smi_msg->data_size, smi_msg->data); - smi_send(intf, intf->handlers, smi_msg, priority); + rv = smi_send(intf, intf->handlers, smi_msg, priority); + if (rv != IPMI_CC_NO_ERROR) + /* smi_send() returns an IPMI err, return a Linux one. */ + rv = -EIO; + if (rv && in_seq_table) { + /* + * If it's in the sequence table, it will be + * retried later, so ignore errors. + */ + rv = 0; + /* But we need to fix the timeout. 
*/ + intf_start_seq_timer(intf, smi_msg->msgid); + ipmi_free_smi_msg(smi_msg); + smi_msg = NULL; + } } +out_err: if (!run_to_completion) mutex_unlock(&intf->users_mutex); + if (rv) { + if (!supplied_smi) + ipmi_free_smi_msg(smi_msg); + if (!supplied_recv) + ipmi_free_recv_msg(recv_msg); + } return rv; } @@ -3965,12 +3996,12 @@ static int handle_ipmb_get_msg_cmd(struct ipmi_smi *intf, dev_dbg(intf->si_dev, "Invalid command: %*ph\n", msg->data_size, msg->data); - smi_send(intf, intf->handlers, msg, 0); - /* - * We used the message, so return the value that - * causes it to not be freed or queued. - */ - rv = -1; + if (smi_send(intf, intf->handlers, msg, 0) == IPMI_CC_NO_ERROR) + /* + * We used the message, so return the value that + * causes it to not be freed or queued. + */ + rv = -1; } else if (!IS_ERR(recv_msg)) { /* Extract the source address from the data. */ ipmb_addr = (struct ipmi_ipmb_addr *) &recv_msg->addr; @@ -4044,12 +4075,12 @@ static int handle_ipmb_direct_rcv_cmd(struct ipmi_smi *intf, msg->data[4] = IPMI_INVALID_CMD_COMPLETION_CODE; msg->data_size = 5; - smi_send(intf, intf->handlers, msg, 0); - /* - * We used the message, so return the value that - * causes it to not be freed or queued. - */ - rv = -1; + if (smi_send(intf, intf->handlers, msg, 0) == IPMI_CC_NO_ERROR) + /* + * We used the message, so return the value that + * causes it to not be freed or queued. + */ + rv = -1; } else if (!IS_ERR(recv_msg)) { /* Extract the source address from the data. */ daddr = (struct ipmi_ipmb_direct_addr *)&recv_msg->addr; @@ -4189,7 +4220,7 @@ static int handle_lan_get_msg_cmd(struct ipmi_smi *intf, struct ipmi_smi_msg *msg) { struct cmd_rcvr *rcvr; - int rv = 0; + int rv = 0; /* Free by default */ unsigned char netfn; unsigned char cmd; unsigned char chan; @@ -4242,12 +4273,12 @@ static int handle_lan_get_msg_cmd(struct ipmi_smi *intf, dev_dbg(intf->si_dev, "Invalid command: %*ph\n", msg->data_size, msg->data); - smi_send(intf, intf->handlers, msg, 0); - /* - * We used the message, so return the value that - * causes it to not be freed or queued. - */ - rv = -1; + if (smi_send(intf, intf->handlers, msg, 0) == IPMI_CC_NO_ERROR) + /* + * We used the message, so return the value that + * causes it to not be freed or queued. + */ + rv = -1; } else if (!IS_ERR(recv_msg)) { /* Extract the source address from the data. */ lan_addr = (struct ipmi_lan_addr *) &recv_msg->addr; @@ -5056,7 +5087,12 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent, ipmi_inc_stat(intf, retransmitted_ipmb_commands); - smi_send(intf, intf->handlers, smi_msg, 0); + /* If this fails we'll retry later or timeout. */ + if (smi_send(intf, intf->handlers, smi_msg, 0) != IPMI_CC_NO_ERROR) { + /* But fix the timeout. */ + intf_start_seq_timer(intf, smi_msg->msgid); + ipmi_free_smi_msg(smi_msg); + } } else ipmi_free_smi_msg(smi_msg); -- cgit v1.2.3 From cae66f1a1dcd23e17da5a015ef9d731129f9d2dd Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 13 Feb 2026 00:15:04 -0600 Subject: ipmi:si: Fix check for a misbehaving BMC There is a race on checking the state in the sender, it needs to be checked under a lock. But you also need a check to avoid issues with a misbehaving BMC for run to completion mode. So leave the check at the beginning for run to completion, and add a check under the lock to avoid the race. Reported-by: Rafael J. 
Wysocki Fixes: bc3a9d217755 ("ipmi:si: Gracefully handle if the BMC is non-functional") Cc: stable@vger.kernel.org # 4.18 Signed-off-by: Corey Minyard Reviewed-by: Rafael J. Wysocki (Intel) --- drivers/char/ipmi/ipmi_si_intf.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 3667033fcc51..6eda61664aaa 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -924,9 +924,14 @@ static int sender(void *send_info, struct ipmi_smi_msg *msg) { struct smi_info *smi_info = send_info; unsigned long flags; + int rv = IPMI_CC_NO_ERROR; debug_timestamp(smi_info, "Enqueue"); + /* + * Check here for run to completion mode. A check under lock is + * later. + */ if (smi_info->si_state == SI_HOSED) return IPMI_BUS_ERR; @@ -940,18 +945,15 @@ static int sender(void *send_info, struct ipmi_smi_msg *msg) } spin_lock_irqsave(&smi_info->si_lock, flags); - /* - * The following two lines don't need to be under the lock for - * the lock's sake, but they do need SMP memory barriers to - * avoid getting things out of order. We are already claiming - * the lock, anyway, so just do it under the lock to avoid the - * ordering problem. - */ - BUG_ON(smi_info->waiting_msg); - smi_info->waiting_msg = msg; - check_start_timer_thread(smi_info); + if (smi_info->si_state == SI_HOSED) { + rv = IPMI_BUS_ERR; + } else { + BUG_ON(smi_info->waiting_msg); + smi_info->waiting_msg = msg; + check_start_timer_thread(smi_info); + } spin_unlock_irqrestore(&smi_info->si_lock, flags); - return IPMI_CC_NO_ERROR; + return rv; } static void set_run_to_completion(void *send_info, bool i_run_to_completion) -- cgit v1.2.3 From 318c58852e686c009825ae8c071080b9ccdd2af0 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Wed, 11 Feb 2026 14:22:27 -0500 Subject: cxl/memdev: fix deadlock in cxl_memdev_autoremove() on attach failure cxl_memdev_autoremove() takes device_lock(&cxlmd->dev) via guard(device) and then calls cxl_memdev_unregister() when the attach callback was provided but cxl_mem_probe() failed to bind. cxl_memdev_unregister() calls cdev_device_del() device_del() bus_remove_device() device_release_driver() This path is reached when a driver uses the @attach parameter to devm_cxl_add_memdev() and the CXL topology fails to enumerate (e.g. DVSEC range registers decode outside platform-defined CXL ranges, causing the endpoint port probe to fail). Add cxl_memdev_attach_failed() to set the scope of the check correctly. Reported-by: kreview-c94b85d6d2 Fixes: 29317f8dc6ed ("cxl/mem: Introduce cxl_memdev_attach for CXL-dependent operation") Signed-off-by: Gregory Price Reviewed-by: Dan Williams Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/20260211192228.2148713-1-gourry@gourry.net Signed-off-by: Dave Jiang --- drivers/cxl/core/memdev.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index f547d8ac34c7..273c22118d3d 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -1089,10 +1089,8 @@ static int cxlmd_add(struct cxl_memdev *cxlmd, struct cxl_dev_state *cxlds) DEFINE_FREE(put_cxlmd, struct cxl_memdev *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) -static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) +static bool cxl_memdev_attach_failed(struct cxl_memdev *cxlmd) { - int rc; - /* * If @attach is provided fail if the driver is not attached upon * return. 
Note that failure here could be the result of a race to @@ -1100,7 +1098,14 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) * succeeded and then cxl_mem unbound before the lock is acquired. */ guard(device)(&cxlmd->dev); - if (cxlmd->attach && !cxlmd->dev.driver) { + return (cxlmd->attach && !cxlmd->dev.driver); +} + +static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) +{ + int rc; + + if (cxl_memdev_attach_failed(cxlmd)) { cxl_memdev_unregister(cxlmd); return ERR_PTR(-ENXIO); } -- cgit v1.2.3 From ec197dca8735f7627e5cff7e3fa8839b53a28514 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Sun, 22 Feb 2026 08:33:52 +0000 Subject: KVM: arm64: Revert accidental drop of kvm_uninit_stage2_mmu() for non-NV VMs Commit 0c4762e26879 ("KVM: arm64: nv: Avoid NV stage-2 code when NV is not supported") added an early return to several functions in arch/arm64/kvm/nested.c to prevent a UBSAN shift-out-of-bounds error when accessing the pgt union for non-nested VMs. However, this early return was inadvertently applied to kvm_arch_flush_shadow_all() as well, causing it to skip the call to kvm_uninit_stage2_mmu(kvm) for all non-nested VMs. For pKVM, skipping this teardown means the host never unshares the guest's memory with the EL2 hypervisor. When the host kernel later recycles these leaked pages for a new VM, it attempts to re-share them. The hypervisor correctly rejects this with -EPERM, triggering a host WARN_ON and hanging the guest. Fix this by dropping the early return from kvm_arch_flush_shadow_all(). The for-loop guarding the nested MMU cleanup already bounds itself when nested_mmus_size == 0, allowing execution to proceed to kvm_uninit_stage2_mmu() as intended. Reported-by: Mark Brown Closes: https://lore.kernel.org/all/60916cb6-f460-4751-b910-f63c58700ad0@sirena.org.uk/ Fixes: 0c4762e26879 ("KVM: arm64: nv: Avoid NV stage-2 code when NV is not supported") Signed-off-by: Fuad Tabba Tested-by: Mark Brown Link: https://patch.msgid.link/20260222083352.89503-1-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index eeea5e692370..7f1ea85dc67a 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1154,9 +1154,6 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) { int i; - if (!kvm->arch.nested_mmus_size) - return; - for (i = 0; i < kvm->arch.nested_mmus_size; i++) { struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; -- cgit v1.2.3 From 822655e6751dde2df7ddaa828c5aba217726c5a2 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 23 Feb 2026 09:29:00 -0700 Subject: cxl/port: Introduce port_to_host() helper In CXL subsystem, a port has its own host device for the port creation and removal. The host of CXL root and all the first level ports is the platform firmware device, the host of other ports is their parent port's device. Create this new helper to much easier to get the host of a cxl port. Introduce port_to_host() and use it to replace all places where using open coded to get the host of a port. Remove endpoint_host() as its functionality can be replaced by port_to_host(). [dj: Squashed commit 1 and 3 in the series to commit 1. 
] Signed-off-by: Li Ming Tested-by: Alison Schofield Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260210-fix-port-enumeration-failure-v3-1-06acce0b9ead@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/core.h | 18 ++++++++++++++++++ drivers/cxl/core/port.c | 29 +++-------------------------- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 007b8aff0238..5b0570df0fd9 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -152,6 +152,24 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); +static inline struct device *port_to_host(struct cxl_port *port) +{ + struct cxl_port *parent = is_cxl_root(port) ? NULL : + to_cxl_port(port->dev.parent); + + /* + * The host of CXL root port and the first level of ports is + * the platform firmware device, the host of all other ports + * is their parent port. + */ + if (!parent) + return port->uport_dev; + else if (is_cxl_root(parent)) + return parent->uport_dev; + else + return &parent->dev; +} + static inline struct device *dport_to_host(struct cxl_dport *dport) { struct cxl_port *port = dport->port; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index b69c2529744c..6be150e645b6 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -615,22 +615,8 @@ struct cxl_port *parent_port_of(struct cxl_port *port) static void unregister_port(void *_port) { struct cxl_port *port = _port; - struct cxl_port *parent = parent_port_of(port); - struct device *lock_dev; - /* - * CXL root port's and the first level of ports are unregistered - * under the platform firmware device lock, all other ports are - * unregistered while holding their parent port lock. - */ - if (!parent) - lock_dev = port->uport_dev; - else if (is_cxl_root(parent)) - lock_dev = parent->uport_dev; - else - lock_dev = &parent->dev; - - device_lock_assert(lock_dev); + device_lock_assert(port_to_host(port)); port->dead = true; device_unregister(&port->dev); } @@ -1427,20 +1413,11 @@ static struct device *grandparent(struct device *dev) return NULL; } -static struct device *endpoint_host(struct cxl_port *endpoint) -{ - struct cxl_port *port = to_cxl_port(endpoint->dev.parent); - - if (is_cxl_root(port)) - return port->uport_dev; - return &port->dev; -} - static void delete_endpoint(void *data) { struct cxl_memdev *cxlmd = data; struct cxl_port *endpoint = cxlmd->endpoint; - struct device *host = endpoint_host(endpoint); + struct device *host = port_to_host(endpoint); scoped_guard(device, host) { if (host->driver && !endpoint->dead) { @@ -1456,7 +1433,7 @@ static void delete_endpoint(void *data) int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint) { - struct device *host = endpoint_host(endpoint); + struct device *host = port_to_host(endpoint); struct device *dev = &cxlmd->dev; get_device(host); -- cgit v1.2.3 From 0066688dbcdcf51680f499936faffe6d0e94194e Mon Sep 17 00:00:00 2001 From: Li Ming Date: Tue, 10 Feb 2026 19:46:57 +0800 Subject: cxl/port: Hold port host lock during dport adding. 
CXL testing environment can trigger following trace Oops: general protection fault, probably for non-canonical address 0xdffffc0000000092: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000490-0x0000000000000497] RIP: 0010:cxl_dpa_to_region+0x105/0x1f0 [cxl_core] Call Trace: cxl_event_trace_record+0xd1/0xa70 [cxl_core] __cxl_event_trace_record+0x12f/0x1e0 [cxl_core] cxl_mem_get_records_log+0x261/0x500 [cxl_core] cxl_mem_get_event_records+0x7c/0xc0 [cxl_core] cxl_mock_mem_probe+0xd38/0x1c60 [cxl_mock_mem] platform_probe+0x9d/0x130 really_probe+0x1c8/0x960 __driver_probe_device+0x187/0x3e0 driver_probe_device+0x45/0x120 __device_attach_driver+0x15d/0x280 When CXL subsystem adds a cxl port to a hierarchy, there is a small window where the new port becomes visible before it is bound to a driver. This happens because device_add() adds a device to bus device list before bus_probe_device() binds it to a driver. So if two cxl memdevs are trying to add a dport to a same port via devm_cxl_enumerate_ports(), the second cxl memdev may observe the port and attempt to add a dport, but fails because the port has not yet been attached to cxl port driver. That causes the memdev->endpoint can not be updated. The sequence is like: CPU 0 CPU 1 devm_cxl_enumerate_ports() # port not found, add it add_port_attach_ep() # hold the parent port lock # to add the new port devm_cxl_create_port() device_add() # Add dev to bus devs list bus_add_device() devm_cxl_enumerate_ports() # found the port find_cxl_port_by_uport() # hold port lock to add a dport device_lock(the port) find_or_add_dport() cxl_port_add_dport() return -ENXIO because port->dev.driver is NULL device_unlock(the port) bus_probe_device() # hold the port lock # for attaching device_lock(the port) attaching the new port device_unlock(the port) To fix this race, require that dport addition holds the host lock of the target port(the host of CXL root and all cxl host bridge ports is the platform firmware device, the host of all other ports is their parent port). The CXL subsystem already requires holding the host lock while attaching a new port. Therefore, successfully acquiring the host lock guarantees that port attaching has completed. Fixes: 4f06d81e7c6a ("cxl: Defer dport allocation for switch ports") Signed-off-by: Li Ming Reviewed-by: Dan Williams Tested-by: Alison Schofield Link: https://patch.msgid.link/20260210-fix-port-enumeration-failure-v3-2-06acce0b9ead@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 6be150e645b6..0c5957d1d329 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1767,7 +1767,16 @@ static struct cxl_dport *find_or_add_dport(struct cxl_port *port, { struct cxl_dport *dport; - device_lock_assert(&port->dev); + /* + * The port is already visible in CXL hierarchy, but it may still + * be in the process of binding to the CXL port driver at this point. + * + * port creation and driver binding are protected by the port's host + * lock, so acquire the host lock here to ensure the port has completed + * driver binding before proceeding with dport addition. + */ + guard(device)(port_to_host(port)); + guard(device)(&port->dev); dport = cxl_find_dport_by_dev(port, dport_dev); if (!dport) { dport = probe_dport(port, dport_dev); @@ -1834,13 +1843,11 @@ retry: * RP port enumerated by cxl_acpi without dport will * have the dport added here. 
*/ - scoped_guard(device, &port->dev) { - dport = find_or_add_dport(port, dport_dev); - if (IS_ERR(dport)) { - if (PTR_ERR(dport) == -EAGAIN) - goto retry; - return PTR_ERR(dport); - } + dport = find_or_add_dport(port, dport_dev); + if (IS_ERR(dport)) { + if (PTR_ERR(dport) == -EAGAIN) + goto retry; + return PTR_ERR(dport); } rc = cxl_add_ep(dport, &cxlmd->dev); -- cgit v1.2.3 From 9d851afa482680bdd7c158cc8720284dc9ecb5fe Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Sat, 21 Feb 2026 23:40:42 +0800 Subject: selftests/sched_ext: Abort test loop on signal The runner sets exit_req on SIGINT/SIGTERM but ignores it during the main loop. This prevents users from cleanly interrupting a test run. Check exit_req each iteration to safely break out on exit signals. Signed-off-by: Cheng-Yang Chou Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/runner.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c index 5748d2c69903..761c21f96404 100644 --- a/tools/testing/selftests/sched_ext/runner.c +++ b/tools/testing/selftests/sched_ext/runner.c @@ -166,6 +166,9 @@ int main(int argc, char **argv) enum scx_test_status status; struct scx_test *test = &__scx_tests[i]; + if (exit_req) + break; + if (list) { printf("%s\n", test->name); if (i == (__scx_num_tests - 1)) -- cgit v1.2.3 From 08fe1b5166fdc81b010d7bf39cd6440620e7931e Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Thu, 5 Feb 2026 22:02:37 -0800 Subject: accel/amdxdna: Remove buffer size check when creating command BO Large command buffers may be used, and they do not always need to be mapped or accessed by the driver. Performing a size check at command BO creation time unnecessarily rejects valid use cases. Remove the buffer size check from command BO creation, and defer vmap and size validation to the paths where the driver actually needs to map and access the command buffer. 
Fixes: ac49797c1815 ("accel/amdxdna: Add GEM buffer object management") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260206060237.4050492-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/amdxdna_gem.c | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c index 8c290ddd3251..d60db49ead71 100644 --- a/drivers/accel/amdxdna/amdxdna_gem.c +++ b/drivers/accel/amdxdna/amdxdna_gem.c @@ -21,8 +21,6 @@ #include "amdxdna_pci_drv.h" #include "amdxdna_ubuf.h" -#define XDNA_MAX_CMD_BO_SIZE SZ_32K - MODULE_IMPORT_NS("DMA_BUF"); static int @@ -745,12 +743,6 @@ amdxdna_drm_create_cmd_bo(struct drm_device *dev, { struct amdxdna_dev *xdna = to_xdna_dev(dev); struct amdxdna_gem_obj *abo; - int ret; - - if (args->size > XDNA_MAX_CMD_BO_SIZE) { - XDNA_ERR(xdna, "Command bo size 0x%llx too large", args->size); - return ERR_PTR(-EINVAL); - } if (args->size < sizeof(struct amdxdna_cmd)) { XDNA_DBG(xdna, "Command BO size 0x%llx too small", args->size); @@ -764,17 +756,7 @@ amdxdna_drm_create_cmd_bo(struct drm_device *dev, abo->type = AMDXDNA_BO_CMD; abo->client = filp->driver_priv; - ret = amdxdna_gem_obj_vmap(abo, &abo->mem.kva); - if (ret) { - XDNA_ERR(xdna, "Vmap cmd bo failed, ret %d", ret); - goto release_obj; - } - return abo; - -release_obj: - drm_gem_object_put(to_gobj(abo)); - return ERR_PTR(ret); } int amdxdna_drm_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) @@ -871,6 +853,7 @@ struct amdxdna_gem_obj *amdxdna_gem_get_obj(struct amdxdna_client *client, struct amdxdna_dev *xdna = client->xdna; struct amdxdna_gem_obj *abo; struct drm_gem_object *gobj; + int ret; gobj = drm_gem_object_lookup(client->filp, bo_hdl); if (!gobj) { @@ -879,9 +862,26 @@ struct amdxdna_gem_obj *amdxdna_gem_get_obj(struct amdxdna_client *client, } abo = to_xdna_obj(gobj); - if (bo_type == AMDXDNA_BO_INVALID || abo->type == bo_type) + if (bo_type != AMDXDNA_BO_INVALID && abo->type != bo_type) + goto put_obj; + + if (bo_type != AMDXDNA_BO_CMD || abo->mem.kva) return abo; + if (abo->mem.size > SZ_32K) { + XDNA_ERR(xdna, "Cmd bo is too big %ld", abo->mem.size); + goto put_obj; + } + + ret = amdxdna_gem_obj_vmap(abo, &abo->mem.kva); + if (ret) { + XDNA_ERR(xdna, "Vmap cmd bo failed, ret %d", ret); + goto put_obj; + } + + return abo; + +put_obj: drm_gem_object_put(gobj); return NULL; } -- cgit v1.2.3 From c68a6af400ca80596e8c37de0a1cb564aa9da8a4 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Thu, 5 Feb 2026 22:02:51 -0800 Subject: accel/amdxdna: Switch to always use chained command Preempt commands are only supported when submitted as chained commands. To ensure preempt support works consistently, always submit commands in chained command format. Set force_cmdlist to true so that single commands are filled using the chained command layout, enabling correct handling of preempt commands. 
Fixes: 3a0ff7b98af4 ("accel/amdxdna: Support preemption requests") Reviewed-by: Karol Wachowski Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260206060251.4050512-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/aie2_ctx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c index 4503c7c77a3e..7140c3f96362 100644 --- a/drivers/accel/amdxdna/aie2_ctx.c +++ b/drivers/accel/amdxdna/aie2_ctx.c @@ -23,9 +23,9 @@ #include "amdxdna_pci_drv.h" #include "amdxdna_pm.h" -static bool force_cmdlist; +static bool force_cmdlist = true; module_param(force_cmdlist, bool, 0600); -MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default false)"); +MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)"); #define HWCTX_MAX_TIMEOUT 60000 /* milliseconds */ -- cgit v1.2.3 From 8363c02863332992a1822688da41f881d88d1631 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Thu, 5 Feb 2026 22:03:06 -0800 Subject: accel/amdxdna: Fix crash when destroying a suspended hardware context If userspace issues an ioctl to destroy a hardware context that has already been automatically suspended, the driver may crash because the mailbox channel pointer is NULL for the suspended context. Fix this by checking the mailbox channel pointer in aie2_destroy_context() before accessing it. Fixes: 97f27573837e ("accel/amdxdna: Fix potential NULL pointer dereference in context cleanup") Reviewed-by: Karol Wachowski Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260206060306.4050531-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/aie2_message.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c index 7d7dcfeaf794..ab1178850c47 100644 --- a/drivers/accel/amdxdna/aie2_message.c +++ b/drivers/accel/amdxdna/aie2_message.c @@ -318,6 +318,9 @@ int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwc struct amdxdna_dev *xdna = ndev->xdna; int ret; + if (!hwctx->priv->mbox_chann) + return 0; + xdna_mailbox_stop_channel(hwctx->priv->mbox_chann); ret = aie2_destroy_context_req(ndev, hwctx->fw_ctx_id); xdna_mailbox_destroy_channel(hwctx->priv->mbox_chann); -- cgit v1.2.3 From 57aa3917a3b3bd805a3679371f97a1ceda3c5510 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 10 Feb 2026 10:42:51 -0600 Subject: accel/amdxdna: Reduce log noise during process termination During process termination, several error messages are logged that are not actual errors but expected conditions when a process is killed or interrupted. This creates unnecessary noise in the kernel log. The specific scenarios are: 1. HMM invalidation returns -ERESTARTSYS when the wait is interrupted by a signal during process cleanup. This is expected when a process is being terminated and should not be logged as an error. 2. Context destruction returns -ENODEV when the firmware or device has already stopped, which commonly occurs during cleanup if the device was already torn down. This is also an expected condition during orderly shutdown. Downgrade these expected error conditions from error level to debug level to reduce log noise while still keeping genuine errors visible. 
Fixes: 97f27573837e ("accel/amdxdna: Fix potential NULL pointer dereference in context cleanup") Reviewed-by: Lizhi Hou Signed-off-by: Mario Limonciello Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260210164521.1094274-3-mario.limonciello@amd.com --- drivers/accel/amdxdna/aie2_ctx.c | 6 ++++-- drivers/accel/amdxdna/aie2_message.c | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c index 7140c3f96362..e13be7608462 100644 --- a/drivers/accel/amdxdna/aie2_ctx.c +++ b/drivers/accel/amdxdna/aie2_ctx.c @@ -497,7 +497,7 @@ static void aie2_release_resource(struct amdxdna_hwctx *hwctx) if (AIE2_FEATURE_ON(xdna->dev_handle, AIE2_TEMPORAL_ONLY)) { ret = aie2_destroy_context(xdna->dev_handle, hwctx); - if (ret) + if (ret && ret != -ENODEV) XDNA_ERR(xdna, "Destroy temporal only context failed, ret %d", ret); } else { ret = xrs_release_resource(xdna->xrs_hdl, (uintptr_t)hwctx); @@ -1070,6 +1070,8 @@ void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo, ret = dma_resv_wait_timeout(gobj->resv, DMA_RESV_USAGE_BOOKKEEP, true, MAX_SCHEDULE_TIMEOUT); - if (!ret || ret == -ERESTARTSYS) + if (!ret) XDNA_ERR(xdna, "Failed to wait for bo, ret %ld", ret); + else if (ret == -ERESTARTSYS) + XDNA_DBG(xdna, "Wait for bo interrupted by signal"); } diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c index ab1178850c47..5d80c5837745 100644 --- a/drivers/accel/amdxdna/aie2_message.c +++ b/drivers/accel/amdxdna/aie2_message.c @@ -216,8 +216,10 @@ static int aie2_destroy_context_req(struct amdxdna_dev_hdl *ndev, u32 id) req.context_id = id; ret = aie2_send_mgmt_msg_wait(ndev, &msg); - if (ret) + if (ret && ret != -ENODEV) XDNA_WARN(xdna, "Destroy context failed, ret %d", ret); + else if (ret == -ENODEV) + XDNA_DBG(xdna, "Destroy context: device already stopped"); return ret; } -- cgit v1.2.3 From 1aa82181a3c285c7351523d587f7981ae4c015c8 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 11 Feb 2026 12:46:44 -0800 Subject: accel/amdxdna: Fix dead lock for suspend and resume When an application issues a query IOCTL while auto suspend is running, a deadlock can occur. The query path holds dev_lock and then calls pm_runtime_resume_and_get(), which waits for the ongoing suspend to complete. Meanwhile, the suspend callback attempts to acquire dev_lock and blocks, resulting in a deadlock. Fix this by releasing dev_lock before calling pm_runtime_resume_and_get() and reacquiring it after the call completes. Also acquire dev_lock in the resume callback to keep the locking consistent. 
Fixes: 063db451832b ("accel/amdxdna: Enhance runtime power management") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260211204644.722758-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/aie2_ctx.c | 4 ++-- drivers/accel/amdxdna/aie2_pci.c | 7 +++---- drivers/accel/amdxdna/aie2_pm.c | 2 +- drivers/accel/amdxdna/amdxdna_ctx.c | 19 +++++++------------ drivers/accel/amdxdna/amdxdna_pm.c | 2 ++ drivers/accel/amdxdna/amdxdna_pm.h | 11 +++++++++++ 6 files changed, 26 insertions(+), 19 deletions(-) diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c index e13be7608462..8d79fafd889a 100644 --- a/drivers/accel/amdxdna/aie2_ctx.c +++ b/drivers/accel/amdxdna/aie2_ctx.c @@ -629,7 +629,7 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) goto free_entity; } - ret = amdxdna_pm_resume_get(xdna); + ret = amdxdna_pm_resume_get_locked(xdna); if (ret) goto free_col_list; @@ -760,7 +760,7 @@ static int aie2_hwctx_cu_config(struct amdxdna_hwctx *hwctx, void *buf, u32 size if (!hwctx->cus) return -ENOMEM; - ret = amdxdna_pm_resume_get(xdna); + ret = amdxdna_pm_resume_get_locked(xdna); if (ret) goto free_cus; diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c index 2a51b2658bfc..07e369507818 100644 --- a/drivers/accel/amdxdna/aie2_pci.c +++ b/drivers/accel/amdxdna/aie2_pci.c @@ -451,7 +451,6 @@ static int aie2_hw_suspend(struct amdxdna_dev *xdna) { struct amdxdna_client *client; - guard(mutex)(&xdna->dev_lock); list_for_each_entry(client, &xdna->client_list, node) aie2_hwctx_suspend(client); @@ -951,7 +950,7 @@ static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_i if (!drm_dev_enter(&xdna->ddev, &idx)) return -ENODEV; - ret = amdxdna_pm_resume_get(xdna); + ret = amdxdna_pm_resume_get_locked(xdna); if (ret) goto dev_exit; @@ -1044,7 +1043,7 @@ static int aie2_get_array(struct amdxdna_client *client, if (!drm_dev_enter(&xdna->ddev, &idx)) return -ENODEV; - ret = amdxdna_pm_resume_get(xdna); + ret = amdxdna_pm_resume_get_locked(xdna); if (ret) goto dev_exit; @@ -1134,7 +1133,7 @@ static int aie2_set_state(struct amdxdna_client *client, if (!drm_dev_enter(&xdna->ddev, &idx)) return -ENODEV; - ret = amdxdna_pm_resume_get(xdna); + ret = amdxdna_pm_resume_get_locked(xdna); if (ret) goto dev_exit; diff --git a/drivers/accel/amdxdna/aie2_pm.c b/drivers/accel/amdxdna/aie2_pm.c index 579b8be13b18..29bd4403a94d 100644 --- a/drivers/accel/amdxdna/aie2_pm.c +++ b/drivers/accel/amdxdna/aie2_pm.c @@ -31,7 +31,7 @@ int aie2_pm_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level) { int ret; - ret = amdxdna_pm_resume_get(ndev->xdna); + ret = amdxdna_pm_resume_get_locked(ndev->xdna); if (ret) return ret; diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c index 59fa3800b9d3..5173456bbb61 100644 --- a/drivers/accel/amdxdna/amdxdna_ctx.c +++ b/drivers/accel/amdxdna/amdxdna_ctx.c @@ -266,9 +266,9 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct dr struct amdxdna_drm_config_hwctx *args = data; struct amdxdna_dev *xdna = to_xdna_dev(dev); struct amdxdna_hwctx *hwctx; - int ret, idx; u32 buf_size; void *buf; + int ret; u64 val; if (XDNA_MBZ_DBG(xdna, &args->pad, sizeof(args->pad))) @@ -310,20 +310,17 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct dr return -EINVAL; } - mutex_lock(&xdna->dev_lock); - idx = srcu_read_lock(&client->hwctx_srcu); + guard(mutex)(&xdna->dev_lock); hwctx = xa_load(&client->hwctx_xa, 
args->handle); if (!hwctx) { XDNA_DBG(xdna, "PID %d failed to get hwctx %d", client->pid, args->handle); ret = -EINVAL; - goto unlock_srcu; + goto free_buf; } ret = xdna->dev_info->ops->hwctx_config(hwctx, args->param_type, val, buf, buf_size); -unlock_srcu: - srcu_read_unlock(&client->hwctx_srcu, idx); - mutex_unlock(&xdna->dev_lock); +free_buf: kfree(buf); return ret; } @@ -334,7 +331,7 @@ int amdxdna_hwctx_sync_debug_bo(struct amdxdna_client *client, u32 debug_bo_hdl) struct amdxdna_hwctx *hwctx; struct amdxdna_gem_obj *abo; struct drm_gem_object *gobj; - int ret, idx; + int ret; if (!xdna->dev_info->ops->hwctx_sync_debug_bo) return -EOPNOTSUPP; @@ -345,17 +342,15 @@ int amdxdna_hwctx_sync_debug_bo(struct amdxdna_client *client, u32 debug_bo_hdl) abo = to_xdna_obj(gobj); guard(mutex)(&xdna->dev_lock); - idx = srcu_read_lock(&client->hwctx_srcu); hwctx = xa_load(&client->hwctx_xa, abo->assigned_hwctx); if (!hwctx) { ret = -EINVAL; - goto unlock_srcu; + goto put_obj; } ret = xdna->dev_info->ops->hwctx_sync_debug_bo(hwctx, debug_bo_hdl); -unlock_srcu: - srcu_read_unlock(&client->hwctx_srcu, idx); +put_obj: drm_gem_object_put(gobj); return ret; } diff --git a/drivers/accel/amdxdna/amdxdna_pm.c b/drivers/accel/amdxdna/amdxdna_pm.c index d024d480521c..b1fafddd7ad5 100644 --- a/drivers/accel/amdxdna/amdxdna_pm.c +++ b/drivers/accel/amdxdna/amdxdna_pm.c @@ -16,6 +16,7 @@ int amdxdna_pm_suspend(struct device *dev) struct amdxdna_dev *xdna = to_xdna_dev(dev_get_drvdata(dev)); int ret = -EOPNOTSUPP; + guard(mutex)(&xdna->dev_lock); if (xdna->dev_info->ops->suspend) ret = xdna->dev_info->ops->suspend(xdna); @@ -28,6 +29,7 @@ int amdxdna_pm_resume(struct device *dev) struct amdxdna_dev *xdna = to_xdna_dev(dev_get_drvdata(dev)); int ret = -EOPNOTSUPP; + guard(mutex)(&xdna->dev_lock); if (xdna->dev_info->ops->resume) ret = xdna->dev_info->ops->resume(xdna); diff --git a/drivers/accel/amdxdna/amdxdna_pm.h b/drivers/accel/amdxdna/amdxdna_pm.h index 77b2d6e45570..3d26b973e0e3 100644 --- a/drivers/accel/amdxdna/amdxdna_pm.h +++ b/drivers/accel/amdxdna/amdxdna_pm.h @@ -15,4 +15,15 @@ void amdxdna_pm_suspend_put(struct amdxdna_dev *xdna); void amdxdna_pm_init(struct amdxdna_dev *xdna); void amdxdna_pm_fini(struct amdxdna_dev *xdna); +static inline int amdxdna_pm_resume_get_locked(struct amdxdna_dev *xdna) +{ + int ret; + + mutex_unlock(&xdna->dev_lock); + ret = amdxdna_pm_resume_get(xdna); + mutex_lock(&xdna->dev_lock); + + return ret; +} + #endif /* _AMDXDNA_PM_H_ */ -- cgit v1.2.3 From fdb65acfe655f844ae1e88696b9656d3ef5bb8fb Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 11 Feb 2026 12:47:16 -0800 Subject: accel/amdxdna: Fix suspend failure after enabling turbo mode Enabling turbo mode disables hardware clock gating. Suspend requires hardware clock gating to be re-enabled, otherwise suspend will fail. Fix this by calling aie2_runtime_cfg() from aie2_hw_stop() to re-enable clock gating during suspend. Also ensure that firmware is initialized in aie2_hw_start() before modifying clock-gating settings during resume. 
Fixes: f4d7b8a6bc8c ("accel/amdxdna: Enhance power management settings") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260211204716.722788-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/aie2_pci.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c index 07e369507818..4b3e6bb97bd2 100644 --- a/drivers/accel/amdxdna/aie2_pci.c +++ b/drivers/accel/amdxdna/aie2_pci.c @@ -323,6 +323,7 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna) return; } + aie2_runtime_cfg(ndev, AIE2_RT_CFG_CLK_GATING, NULL); aie2_mgmt_fw_fini(ndev); xdna_mailbox_stop_channel(ndev->mgmt_chann); xdna_mailbox_destroy_channel(ndev->mgmt_chann); @@ -406,15 +407,15 @@ static int aie2_hw_start(struct amdxdna_dev *xdna) goto stop_psp; } - ret = aie2_pm_init(ndev); + ret = aie2_mgmt_fw_init(ndev); if (ret) { - XDNA_ERR(xdna, "failed to init pm, ret %d", ret); + XDNA_ERR(xdna, "initial mgmt firmware failed, ret %d", ret); goto destroy_mgmt_chann; } - ret = aie2_mgmt_fw_init(ndev); + ret = aie2_pm_init(ndev); if (ret) { - XDNA_ERR(xdna, "initial mgmt firmware failed, ret %d", ret); + XDNA_ERR(xdna, "failed to init pm, ret %d", ret); goto destroy_mgmt_chann; } -- cgit v1.2.3 From 07efce5a6611af6714ea3ef65694e0c8dd7e44f5 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 11 Feb 2026 12:53:41 -0800 Subject: accel/amdxdna: Fix command hang on suspended hardware context When a hardware context is suspended, the job scheduler is stopped. If a command is submitted while the context is suspended, the job is queued in the scheduler but aie2_sched_job_run() is never invoked to restart the hardware context. As a result, the command hangs. Fix this by modifying the hardware context suspend routine to keep the job scheduler running so that queued jobs can trigger context restart properly. 
Fixes: aac243092b70 ("accel/amdxdna: Add command execution") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260211205341.722982-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/aie2_ctx.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c index 8d79fafd889a..25845bd5e507 100644 --- a/drivers/accel/amdxdna/aie2_ctx.c +++ b/drivers/accel/amdxdna/aie2_ctx.c @@ -53,6 +53,7 @@ static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwct { drm_sched_stop(&hwctx->priv->sched, bad_job); aie2_destroy_context(xdna->dev_handle, hwctx); + drm_sched_start(&hwctx->priv->sched, 0); } static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx) @@ -80,7 +81,6 @@ static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hw } out: - drm_sched_start(&hwctx->priv->sched, 0); XDNA_DBG(xdna, "%s restarted, ret %d", hwctx->name, ret); return ret; } @@ -297,19 +297,23 @@ aie2_sched_job_run(struct drm_sched_job *sched_job) struct dma_fence *fence; int ret; - if (!hwctx->priv->mbox_chann) + ret = amdxdna_pm_resume_get(hwctx->client->xdna); + if (ret) + return NULL; + + if (!hwctx->priv->mbox_chann) { + amdxdna_pm_suspend_put(hwctx->client->xdna); return NULL; + } - if (!mmget_not_zero(job->mm)) + if (!mmget_not_zero(job->mm)) { + amdxdna_pm_suspend_put(hwctx->client->xdna); return ERR_PTR(-ESRCH); + } kref_get(&job->refcnt); fence = dma_fence_get(job->fence); - ret = amdxdna_pm_resume_get(hwctx->client->xdna); - if (ret) - goto out; - if (job->drv_cmd) { switch (job->drv_cmd->opcode) { case SYNC_DEBUG_BO: -- cgit v1.2.3 From 1110a949675ebd56b3f0286e664ea543f745801c Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Tue, 17 Feb 2026 10:54:15 -0800 Subject: accel/amdxdna: Fix out-of-bounds memset in command slot handling The remaining space in a command slot may be smaller than the size of the command header. Clearing the command header with memset() before verifying the available slot space can result in an out-of-bounds write and memory corruption. Fix this by moving the memset() call after the size validation. 
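The underlying rule, shown as a small hedged sketch (the header type and field names are made up, not the driver's): confirm the remaining slot space can hold the header and payload before clearing or writing anything.

    #include <linux/errno.h>
    #include <linux/string.h>
    #include <linux/types.h>

    struct example_hdr {
    	u32 cu_idx;
    	u32 arg_cnt;
    };

    static int example_fill_slot(void *slot, size_t remaining, size_t arg_sz)
    {
    	struct example_hdr *hdr = slot;

    	/*
    	 * Check first: a memset() placed before this test could scribble
    	 * past the end of the command slot when little space remains.
    	 */
    	if (remaining < sizeof(*hdr) + arg_sz)
    		return -EINVAL;

    	memset(hdr, 0, sizeof(*hdr));
    	/* ... copy arg_sz bytes of payload after the header ... */
    	return 0;
    }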
Fixes: 3d32eb7a5ecf ("accel/amdxdna: Fix cu_idx being cleared by memset() during command setup") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260217185415.1781908-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/aie2_message.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c index 5d80c5837745..277a27bce850 100644 --- a/drivers/accel/amdxdna/aie2_message.c +++ b/drivers/accel/amdxdna/aie2_message.c @@ -699,11 +699,11 @@ aie2_cmdlist_fill_npu_cf(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *siz u32 cmd_len; void *cmd; - memset(npu_slot, 0, sizeof(*npu_slot)); cmd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); if (*size < sizeof(*npu_slot) + cmd_len) return -EINVAL; + memset(npu_slot, 0, sizeof(*npu_slot)); npu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); if (npu_slot->cu_idx == INVALID_CU_IDX) return -EINVAL; @@ -724,7 +724,6 @@ aie2_cmdlist_fill_npu_dpu(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *si u32 cmd_len; u32 arg_sz; - memset(npu_slot, 0, sizeof(*npu_slot)); sn = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); arg_sz = cmd_len - sizeof(*sn); if (cmd_len < sizeof(*sn) || arg_sz > MAX_NPU_ARGS_SIZE) @@ -733,6 +732,7 @@ aie2_cmdlist_fill_npu_dpu(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *si if (*size < sizeof(*npu_slot) + arg_sz) return -EINVAL; + memset(npu_slot, 0, sizeof(*npu_slot)); npu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); if (npu_slot->cu_idx == INVALID_CU_IDX) return -EINVAL; @@ -756,7 +756,6 @@ aie2_cmdlist_fill_npu_preempt(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t u32 cmd_len; u32 arg_sz; - memset(npu_slot, 0, sizeof(*npu_slot)); pd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); arg_sz = cmd_len - sizeof(*pd); if (cmd_len < sizeof(*pd) || arg_sz > MAX_NPU_ARGS_SIZE) @@ -765,6 +764,7 @@ aie2_cmdlist_fill_npu_preempt(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t if (*size < sizeof(*npu_slot) + arg_sz) return -EINVAL; + memset(npu_slot, 0, sizeof(*npu_slot)); npu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); if (npu_slot->cu_idx == INVALID_CU_IDX) return -EINVAL; @@ -792,7 +792,6 @@ aie2_cmdlist_fill_npu_elf(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *si u32 cmd_len; u32 arg_sz; - memset(npu_slot, 0, sizeof(*npu_slot)); pd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); arg_sz = cmd_len - sizeof(*pd); if (cmd_len < sizeof(*pd) || arg_sz > MAX_NPU_ARGS_SIZE) @@ -801,6 +800,7 @@ aie2_cmdlist_fill_npu_elf(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *si if (*size < sizeof(*npu_slot) + arg_sz) return -EINVAL; + memset(npu_slot, 0, sizeof(*npu_slot)); npu_slot->type = EXEC_NPU_TYPE_ELF; npu_slot->inst_buf_addr = pd->inst_buf; npu_slot->save_buf_addr = pd->save_buf; -- cgit v1.2.3 From 03808abb1d868aed7478a11a82e5bb4b3f1ca6d6 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Tue, 17 Feb 2026 11:28:15 -0800 Subject: accel/amdxdna: Prevent ubuf size overflow The ubuf size calculation may overflow, resulting in an undersized allocation and possible memory corruption. Use check_add_overflow() helpers to validate the size calculation before allocation. 
Fixes: bd72d4acda10 ("accel/amdxdna: Support user space allocated buffer") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260217192815.1784689-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/amdxdna_ubuf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/accel/amdxdna/amdxdna_ubuf.c b/drivers/accel/amdxdna/amdxdna_ubuf.c index b509f10b155c..fb71d6e3f44d 100644 --- a/drivers/accel/amdxdna/amdxdna_ubuf.c +++ b/drivers/accel/amdxdna/amdxdna_ubuf.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -176,7 +177,10 @@ struct dma_buf *amdxdna_get_ubuf(struct drm_device *dev, goto free_ent; } - exp_info.size += va_ent[i].len; + if (check_add_overflow(exp_info.size, va_ent[i].len, &exp_info.size)) { + ret = -EINVAL; + goto free_ent; + } } ubuf->nr_pages = exp_info.size >> PAGE_SHIFT; -- cgit v1.2.3 From 901ec3470994006bc8dd02399e16b675566c3416 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Thu, 19 Feb 2026 13:19:46 -0800 Subject: accel/amdxdna: Validate command buffer payload count The count field in the command header is used to determine the valid payload size. Verify that the valid payload does not exceed the remaining buffer space. Fixes: aac243092b70 ("accel/amdxdna: Add command execution") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260219211946.1920485-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/amdxdna_ctx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c index 5173456bbb61..263d36072540 100644 --- a/drivers/accel/amdxdna/amdxdna_ctx.c +++ b/drivers/accel/amdxdna/amdxdna_ctx.c @@ -104,7 +104,10 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size) if (size) { count = FIELD_GET(AMDXDNA_CMD_COUNT, cmd->header); - if (unlikely(count <= num_masks)) { + if (unlikely(count <= num_masks || + count * sizeof(u32) + + offsetof(struct amdxdna_cmd, data[0]) > + abo->mem.size)) { *size = 0; return NULL; } -- cgit v1.2.3 From a7b4bc094fbaa7dc7b7b91ae33549bbd7eefaac1 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 7 Jan 2026 13:22:57 +0100 Subject: module: Remove duplicate freeing of lockdep classes In the error path of load_module(), under the free_module label, the code calls lockdep_free_key_range() to release lock classes associated with the MOD_DATA, MOD_RODATA and MOD_RO_AFTER_INIT module regions, and subsequently invokes module_deallocate(). Since commit ac3b43283923 ("module: replace module_layout with module_memory"), the module_deallocate() function calls free_mod_mem(), which releases the lock classes as well and considers all module regions. Attempting to free these classes twice is unnecessary. Remove the redundant code in load_module(). 
Fixes: ac3b43283923 ("module: replace module_layout with module_memory") Signed-off-by: Petr Pavlu Reviewed-by: Daniel Gomez Reviewed-by: Aaron Tomlin Acked-by: Song Liu Acked-by: Peter Zijlstra (Intel) Signed-off-by: Sami Tolvanen --- kernel/module/main.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/module/main.c b/kernel/module/main.c index 2bac4c7cd019..eec6c9cf8dbb 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3544,12 +3544,6 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_unlock(&module_mutex); free_module: mod_stat_bump_invalid(info, flags); - /* Free lock-classes; relies on the preceding sync_rcu() */ - for_class_mod_mem_type(type, core_data) { - lockdep_free_key_range(mod->mem[type].base, - mod->mem[type].size); - } - module_memory_restore_rox(mod); module_deallocate(mod, info); free_copy: -- cgit v1.2.3 From 8d597ba6ec18dae2eec143d4e1c9d81441ca0dda Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Thu, 5 Feb 2026 15:37:08 +0100 Subject: module: Fix the modversions and signing submenus The module Kconfig file contains a set of options related to "Module versioning support" (depends on MODVERSIONS) and "Module signature verification" (depends on MODULE_SIG). The Kconfig tool automatically creates submenus when an entry for a symbol is followed by consecutive items that all depend on the symbol. However, this functionality doesn't work for the mentioned module options. The MODVERSIONS options are interleaved with ASM_MODVERSIONS, which has no 'depends on MODVERSIONS' but instead uses 'default HAVE_ASM_MODVERSIONS && MODVERSIONS'. Similarly, the MODULE_SIG options are interleaved by a comment warning not to forget signing modules with scripts/sign-file, which uses the condition 'depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL'. The result is that the options are confusingly shown when using a menuconfig tool, as follows: [*] Module versioning support Module versioning implementation (genksyms (from source code)) ---> [ ] Extended Module Versioning Support [*] Basic Module Versioning Support [*] Source checksum for all modules [*] Module signature verification [ ] Require modules to be validly signed [ ] Automatically sign all modules Hash algorithm to sign modules (SHA-256) ---> Fix the issue by using if/endif to group related options together in kernel/module/Kconfig, similarly to how the MODULE_DEBUG options are already grouped. Note that the signing-related options depend on 'MODULE_SIG || IMA_APPRAISE_MODSIG', with the exception of MODULE_SIG_FORCE, which is valid only for MODULE_SIG and is therefore kept separately. For consistency, do the same for the MODULE_COMPRESS entries. 
The options are then properly placed into submenus, as follows: [*] Module versioning support Module versioning implementation (genksyms (from source code)) ---> [ ] Extended Module Versioning Support [*] Basic Module Versioning Support [*] Source checksum for all modules [*] Module signature verification [ ] Require modules to be validly signed [ ] Automatically sign all modules Hash algorithm to sign modules (SHA-256) ---> Signed-off-by: Petr Pavlu Reviewed-by: Daniel Gomez Signed-off-by: Sami Tolvanen --- kernel/module/Kconfig | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index be74917802ad..43b1bb01fd27 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -169,9 +169,10 @@ config MODVERSIONS make them incompatible with the kernel you are running. If unsure, say N. +if MODVERSIONS + choice prompt "Module versioning implementation" - depends on MODVERSIONS help Select the tool used to calculate symbol versions for modules. @@ -206,7 +207,7 @@ endchoice config ASM_MODVERSIONS bool - default HAVE_ASM_MODVERSIONS && MODVERSIONS + default HAVE_ASM_MODVERSIONS help This enables module versioning for exported symbols also from assembly. This can be enabled only when the target architecture @@ -214,7 +215,6 @@ config ASM_MODVERSIONS config EXTENDED_MODVERSIONS bool "Extended Module Versioning Support" - depends on MODVERSIONS help This enables extended MODVERSIONs support, allowing long symbol names to be versioned. @@ -224,7 +224,6 @@ config EXTENDED_MODVERSIONS config BASIC_MODVERSIONS bool "Basic Module Versioning Support" - depends on MODVERSIONS default y help This enables basic MODVERSIONS support, allowing older tools or @@ -237,6 +236,8 @@ config BASIC_MODVERSIONS This is enabled by default when MODVERSIONS are enabled. If unsure, say Y. +endif # MODVERSIONS + config MODULE_SRCVERSION_ALL bool "Source checksum for all modules" help @@ -277,10 +278,11 @@ config MODULE_SIG_FORCE Reject unsigned modules or signed modules for which we don't have a key. Without this, such modules will simply taint the kernel. +if MODULE_SIG || IMA_APPRAISE_MODSIG + config MODULE_SIG_ALL bool "Automatically sign all modules" default y - depends on MODULE_SIG || IMA_APPRAISE_MODSIG help Sign all modules during make modules_install. Without this option, modules must be signed manually, using the scripts/sign-file tool. @@ -290,7 +292,6 @@ comment "Do not forget to sign required modules with scripts/sign-file" choice prompt "Hash algorithm to sign modules" - depends on MODULE_SIG || IMA_APPRAISE_MODSIG default MODULE_SIG_SHA512 help This determines which sort of hashing algorithm will be used during @@ -327,7 +328,6 @@ endchoice config MODULE_SIG_HASH string - depends on MODULE_SIG || IMA_APPRAISE_MODSIG default "sha256" if MODULE_SIG_SHA256 default "sha384" if MODULE_SIG_SHA384 default "sha512" if MODULE_SIG_SHA512 @@ -335,6 +335,8 @@ config MODULE_SIG_HASH default "sha3-384" if MODULE_SIG_SHA3_384 default "sha3-512" if MODULE_SIG_SHA3_512 +endif # MODULE_SIG || IMA_APPRAISE_MODSIG + config MODULE_COMPRESS bool "Module compression" help @@ -350,9 +352,10 @@ config MODULE_COMPRESS If unsure, say N. +if MODULE_COMPRESS + choice prompt "Module compression type" - depends on MODULE_COMPRESS help Choose the supported algorithm for module compression. 
@@ -379,7 +382,6 @@ endchoice config MODULE_COMPRESS_ALL bool "Automatically compress all modules" default y - depends on MODULE_COMPRESS help Compress all modules during 'make modules_install'. @@ -389,7 +391,6 @@ config MODULE_COMPRESS_ALL config MODULE_DECOMPRESS bool "Support in-kernel module decompression" - depends on MODULE_COMPRESS select ZLIB_INFLATE if MODULE_COMPRESS_GZIP select XZ_DEC if MODULE_COMPRESS_XZ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD @@ -400,6 +401,8 @@ config MODULE_DECOMPRESS If unsure, say N. +endif # MODULE_COMPRESS + config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS bool "Allow loading of modules with missing namespace imports" help -- cgit v1.2.3 From 1f0638604f65dae9fc8b6ce937907daf5f0132d0 Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Sun, 22 Feb 2026 16:25:22 +0800 Subject: selftests/sched_ext: Fix unused-result warning for read() The read() call in run_test() triggers a warn_unused_result compiler warning, which breaks the build under -Werror. Check the return value of read() and exit the child process on failure to satisfy the compiler and handle pipe read errors. Reviewed-by: Andrea Righi Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/init_enable_count.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c index 82c71653977b..44577e30e764 100644 --- a/tools/testing/selftests/sched_ext/init_enable_count.c +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -57,7 +57,8 @@ static enum scx_test_status run_test(bool global) char buf; close(pipe_fds[1]); - read(pipe_fds[0], &buf, 1); + if (read(pipe_fds[0], &buf, 1) < 0) + exit(1); close(pipe_fds[0]); exit(0); } -- cgit v1.2.3 From 0c36a6f6f0eb071379b7995ef571c18c1235adf2 Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Sun, 22 Feb 2026 21:08:25 +0800 Subject: tools/sched_ext: scx_central: Remove unused '-p' option The '-p' option is defined in getopt() but not handled in the switch statement or documented in the help text. Providing '-p' currently triggers the default error path. Remove it to sync the optstring with the actual implementation. Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- tools/sched_ext/scx_central.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 2a805f1d6c8f..710fa03376e2 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -66,7 +66,7 @@ restart: assert(skel->rodata->nr_cpu_ids > 0); assert(skel->rodata->nr_cpu_ids <= INT32_MAX); - while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { + while ((opt = getopt(argc, argv, "s:c:vh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -- cgit v1.2.3 From cbb297323dbed4f6087b5f189585413801d1d45b Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Sun, 22 Feb 2026 21:08:26 +0800 Subject: tools/sched_ext: scx_sdt: Remove unused '-f' option The '-f' option is defined in getopt() but not handled in the switch statement or documented in the help text. Providing '-f' currently triggers the default error path. Remove it to sync the optstring with the actual implementation. 
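For readers unfamiliar with getopt(), a minimal standalone example of the invariant both of these cleanups restore (illustrative only, not part of either patch): every letter in the optstring has a matching case in the switch, and nothing more:

    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int opt, verbose = 0;

            /* "vh" lists exactly the options handled in the switch below */
            while ((opt = getopt(argc, argv, "vh")) != -1) {
                    switch (opt) {
                    case 'v':
                            verbose = 1;
                            break;
                    case 'h':
                    default:        /* unknown option: getopt returned '?' */
                            fprintf(stderr, "usage: %s [-v] [-h]\n", argv[0]);
                            return opt == 'h' ? 0 : 1;
                    }
            }
            printf("verbose=%d\n", verbose);
            return 0;
    }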
Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- tools/sched_ext/scx_sdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sched_ext/scx_sdt.c b/tools/sched_ext/scx_sdt.c index d8ca9aa316a5..a36405d8df30 100644 --- a/tools/sched_ext/scx_sdt.c +++ b/tools/sched_ext/scx_sdt.c @@ -54,7 +54,7 @@ restart: optind = 1; skel = SCX_OPS_OPEN(sdt_ops, scx_sdt); - while ((opt = getopt(argc, argv, "fvh")) != -1) { + while ((opt = getopt(argc, argv, "vh")) != -1) { switch (opt) { case 'v': verbose = true; -- cgit v1.2.3 From 075e70d13edfcb309c8fdc1f444fbd1f834ca1d4 Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Mon, 23 Feb 2026 02:47:09 +0800 Subject: selftests/sched_ext: Remove duplicated unistd.h include in rt_stall.c The header is included twice in rt_stall.c. Remove the redundant inclusion to clean up the code. Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/rt_stall.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c index ab772e336f86..81ea9b4883e5 100644 --- a/tools/testing/selftests/sched_ext/rt_stall.c +++ b/tools/testing/selftests/sched_ext/rt_stall.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "rt_stall.bpf.skel.h" #include "scx_test.h" #include "../kselftest.h" -- cgit v1.2.3 From e7e222ad73d93fe54d6e6e3a15253a0ecf081a1b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 11 Feb 2026 17:31:23 -0700 Subject: cxl: Move devm_cxl_add_nvdimm_bridge() to cxl_pmem.ko Move the symbol devm_cxl_add_nvdimm_bridge() to drivers/cxl/pmem.c (cxl_pmem.ko) so that cxl_pmem can export a symbol that gives cxl_acpi a dependency on the cxl_pmem kernel module. This is a preparatory patch to resolve a race on the nvdimm_bus object that is created during cxl_acpi_probe(). No functional changes besides moving code.
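As an aside for readers unfamiliar with the pattern (illustrative only; the function names, "consumer" module name and "FOO" namespace are placeholders, not real kernel symbols -- the real hunks follow below), the implementation stays in the core module but is exported only to one consumer module, which re-exports a thin wrapper; anyone linking against the wrapper then picks up a module dependency on that consumer:

    #include <linux/device.h>
    #include <linux/export.h>

    /* core.c -- implementation stays in the always-present core module */
    int __foo_add_bridge(struct device *host)
    {
            /* real work happens here */
            return 0;
    }
    /* visible only to consumer.ko, mirroring the patch below */
    EXPORT_SYMBOL_FOR_MODULES(__foo_add_bridge, "consumer");

    /* consumer.c -- built as consumer.ko */
    int foo_add_bridge(struct device *host)
    {
            return __foo_add_bridge(host);
    }
    EXPORT_SYMBOL_NS_GPL(foo_add_bridge, "FOO");

Callers such as the ACPI driver now resolve the wrapper from consumer.ko, so module loading pulls in the consumer module before the caller can probe.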
Suggested-by: Dan Williams Acked-by: Ira Weiny Tested-by: Alison Schofield Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260205001633.1813643-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pmem.c | 13 +++---------- drivers/cxl/cxl.h | 2 ++ drivers/cxl/pmem.c | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index 3c6e76721522..b652e3486038 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -115,15 +115,8 @@ static void unregister_nvb(void *_cxl_nvb) device_unregister(&cxl_nvb->dev); } -/** - * devm_cxl_add_nvdimm_bridge() - add the root of a LIBNVDIMM topology - * @host: platform firmware root device - * @port: CXL port at the root of a CXL topology - * - * Return: bridge device that can host cxl_nvdimm objects - */ -struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host, - struct cxl_port *port) +struct cxl_nvdimm_bridge *__devm_cxl_add_nvdimm_bridge(struct device *host, + struct cxl_port *port) { struct cxl_nvdimm_bridge *cxl_nvb; struct device *dev; @@ -155,7 +148,7 @@ err: put_device(dev); return ERR_PTR(rc); } -EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm_bridge, "CXL"); +EXPORT_SYMBOL_FOR_MODULES(__devm_cxl_add_nvdimm_bridge, "cxl_pmem"); static void cxl_nvdimm_release(struct device *dev) { diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 04c673e7cdb0..f5850800f400 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -920,6 +920,8 @@ void cxl_driver_unregister(struct cxl_driver *cxl_drv); struct cxl_nvdimm_bridge *to_cxl_nvdimm_bridge(struct device *dev); struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host, struct cxl_port *port); +struct cxl_nvdimm_bridge *__devm_cxl_add_nvdimm_bridge(struct device *host, + struct cxl_port *port); struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev); bool is_cxl_nvdimm(struct device *dev); int devm_cxl_add_nvdimm(struct device *host, struct cxl_port *port, diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index 6a97e4e490b6..c67b30b516bf 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -13,6 +13,20 @@ static __read_mostly DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); +/** + * __devm_cxl_add_nvdimm_bridge() - add the root of a LIBNVDIMM topology + * @host: platform firmware root device + * @port: CXL port at the root of a CXL topology + * + * Return: bridge device that can host cxl_nvdimm objects + */ +struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host, + struct cxl_port *port) +{ + return __devm_cxl_add_nvdimm_bridge(host, port); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm_bridge, "CXL"); + static void clear_exclusive(void *mds) { clear_exclusive_cxl_commands(mds, exclusive_cmds); -- cgit v1.2.3 From 43d37df67f7770d8d261fdcb64ecc8c314e91303 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Fri, 6 Feb 2026 14:30:59 -0800 Subject: drm/xe/wa: Steer RMW of MCR registers while building default LRC When generating the default LRC, if a register is not masked, we apply any save-restore programming necessary via a read-modify-write sequence that will ensure we only update the relevant bits/fields without clobbering the rest of the register. However some of the registers that need to be updated might be MCR registers which require steering to a non-terminated instance to ensure we can read back a valid, non-zero value. 
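In rough terms (an illustrative sketch only; steer_to(), read32() and write32() are placeholder helpers, not the driver's API), an MCR register has one physical copy per hardware unit: writes are replicated to every copy, but a read only returns valid data when it is steered at an instance that actually exists, otherwise the read-modify-write starts from a bogus zero value:

    /* pseudo-code sketch of a steered RMW of a multicast (MCR) register */
    steer_to(group, instance);              /* pick a non-terminated copy    */
    val = read32(mcr_reg);                  /* now returns that copy's data  */
    val = (val & ~clr_bits) | set_bits;     /* touch only the wanted fields  */
    write32(mcr_reg, val);                  /* the write hits every copy     */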
The steering of reads originating from a command streamer is controlled by register CS_MMIO_GROUP_INSTANCE_SELECT. Emit additional MI_LRI commands to update the steering before any RMW of an MCR register to ensure the reads are performed properly. Note that needing to perform a RMW of an MCR register while building the default LRC is pretty rare. Most of the MCR registers that are part of an engine's LRCs are also masked registers, so no MCR is necessary. Fixes: f2f90989ccff ("drm/xe: Avoid reading RMW registers in emit_wa_job") Cc: Michal Wajdeczko Reviewed-by: Balasubramani Vivekanandan Link: https://patch.msgid.link/20260206223058.387014-2-matthew.d.roper@intel.com Signed-off-by: Matt Roper (cherry picked from commit 6c2e331c915ba9e774aa847921262805feb00863) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/regs/xe_engine_regs.h | 6 +++ drivers/gpu/drm/xe/xe_gt.c | 66 ++++++++++++++++++++++++++------ 2 files changed, 60 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index 68172b0248a6..dc5a4fafa70c 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -96,6 +96,12 @@ #define ENABLE_SEMAPHORE_POLL_BIT REG_BIT(13) #define RING_CMD_CCTL(base) XE_REG((base) + 0xc4, XE_REG_OPTION_MASKED) + +#define CS_MMIO_GROUP_INSTANCE_SELECT(base) XE_REG((base) + 0xcc) +#define SELECTIVE_READ_ADDRESSING REG_BIT(30) +#define SELECTIVE_READ_GROUP REG_GENMASK(29, 23) +#define SELECTIVE_READ_INSTANCE REG_GENMASK(22, 16) + /* * CMD_CCTL read/write fields take a MOCS value and _not_ a table index. * The lsb of each can be considered a separate enabling bit for encryption. diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 9d090d0f2438..df6d04704823 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -210,11 +210,15 @@ static int emit_nop_job(struct xe_gt *gt, struct xe_exec_queue *q) return ret; } +/* Dwords required to emit a RMW of a register */ +#define EMIT_RMW_DW 20 + static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) { - struct xe_reg_sr *sr = &q->hwe->reg_lrc; + struct xe_hw_engine *hwe = q->hwe; + struct xe_reg_sr *sr = &hwe->reg_lrc; struct xe_reg_sr_entry *entry; - int count_rmw = 0, count = 0, ret; + int count_rmw = 0, count_rmw_mcr = 0, count = 0, ret; unsigned long idx; struct xe_bb *bb; size_t bb_len = 0; @@ -224,6 +228,8 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) xa_for_each(&sr->xa, idx, entry) { if (entry->reg.masked || entry->clr_bits == ~0) ++count; + else if (entry->reg.mcr) + ++count_rmw_mcr; else ++count_rmw; } @@ -231,17 +237,35 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) if (count) bb_len += count * 2 + 1; - if (count_rmw) - bb_len += count_rmw * 20 + 7; + /* + * RMW of MCR registers is the same as a normal RMW, except an + * additional LRI (3 dwords) is required per register to steer the read + * to a nom-terminated instance. + * + * We could probably shorten the batch slightly by eliding the + * steering for consecutive MCR registers that have the same + * group/instance target, but it's not worth the extra complexity to do + * so. + */ + bb_len += count_rmw * EMIT_RMW_DW; + bb_len += count_rmw_mcr * (EMIT_RMW_DW + 3); + + /* + * After doing all RMW, we need 7 trailing dwords to clean up, + * plus an additional 3 dwords to reset steering if any of the + * registers were MCR. 
+ */ + if (count_rmw || count_rmw_mcr) + bb_len += 7 + (count_rmw_mcr ? 3 : 0); - if (q->hwe->class == XE_ENGINE_CLASS_RENDER) + if (hwe->class == XE_ENGINE_CLASS_RENDER) /* * Big enough to emit all of the context's 3DSTATE via * xe_lrc_emit_hwe_state_instructions() */ - bb_len += xe_gt_lrc_size(gt, q->hwe->class) / sizeof(u32); + bb_len += xe_gt_lrc_size(gt, hwe->class) / sizeof(u32); - xe_gt_dbg(gt, "LRC %s WA job: %zu dwords\n", q->hwe->name, bb_len); + xe_gt_dbg(gt, "LRC %s WA job: %zu dwords\n", hwe->name, bb_len); bb = xe_bb_new(gt, bb_len, false); if (IS_ERR(bb)) @@ -276,13 +300,23 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) } } - if (count_rmw) { - /* Emit MI_MATH for each RMW reg: 20dw per reg + 7 trailing dw */ - + if (count_rmw || count_rmw_mcr) { xa_for_each(&sr->xa, idx, entry) { if (entry->reg.masked || entry->clr_bits == ~0) continue; + if (entry->reg.mcr) { + struct xe_reg_mcr reg = { .__reg.raw = entry->reg.raw }; + u8 group, instance; + + xe_gt_mcr_get_nonterminated_steering(gt, reg, &group, &instance); + *cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1); + *cs++ = CS_MMIO_GROUP_INSTANCE_SELECT(hwe->mmio_base).addr; + *cs++ = SELECTIVE_READ_ADDRESSING | + REG_FIELD_PREP(SELECTIVE_READ_GROUP, group) | + REG_FIELD_PREP(SELECTIVE_READ_INSTANCE, instance); + } + *cs++ = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO; *cs++ = entry->reg.addr; *cs++ = CS_GPR_REG(0, 0).addr; @@ -308,8 +342,9 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) *cs++ = CS_GPR_REG(0, 0).addr; *cs++ = entry->reg.addr; - xe_gt_dbg(gt, "REG[%#x] = ~%#x|%#x\n", - entry->reg.addr, entry->clr_bits, entry->set_bits); + xe_gt_dbg(gt, "REG[%#x] = ~%#x|%#x%s\n", + entry->reg.addr, entry->clr_bits, entry->set_bits, + entry->reg.mcr ? " (MCR)" : ""); } /* reset used GPR */ @@ -321,6 +356,13 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) *cs++ = 0; *cs++ = CS_GPR_REG(0, 2).addr; *cs++ = 0; + + /* reset steering */ + if (count_rmw_mcr) { + *cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1); + *cs++ = CS_MMIO_GROUP_INSTANCE_SELECT(q->hwe->mmio_base).addr; + *cs++ = 0; + } } cs = xe_lrc_emit_hwe_state_instructions(q, cs); -- cgit v1.2.3 From f9d69d5e7bde2295eb7488a56f094ac8f5383b92 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 30 Dec 2025 10:32:08 -0800 Subject: module: Fix kernel panic when a symbol st_shndx is out of bounds The module loader doesn't check for bounds of the ELF section index in simplify_symbols(): for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { const char *name = info->strtab + sym[i].st_name; switch (sym[i].st_shndx) { case SHN_COMMON: [...] default: /* Divert to percpu allocation if a percpu var. */ if (sym[i].st_shndx == info->index.pcpu) secbase = (unsigned long)mod_percpu(mod); else /** HERE --> **/ secbase = info->sechdrs[sym[i].st_shndx].sh_addr; sym[i].st_value += secbase; break; } } A symbol with an out-of-bounds st_shndx value, for example 0xffff (known as SHN_XINDEX or SHN_HIRESERVE), may cause a kernel panic: BUG: unable to handle page fault for address: ... RIP: 0010:simplify_symbols+0x2b2/0x480 ... Kernel panic - not syncing: Fatal exception This can happen when module ELF is legitimately using SHN_XINDEX or when it is corrupted. Add a bounds check in simplify_symbols() to validate that st_shndx is within the valid range before using it. This issue was discovered due to a bug in llvm-objcopy, see relevant discussion for details [1]. 
[1] https://lore.kernel.org/linux-modules/20251224005752.201911-1-ihor.solodrai@linux.dev/ Signed-off-by: Ihor Solodrai Reviewed-by: Daniel Gomez Reviewed-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- kernel/module/main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/module/main.c b/kernel/module/main.c index eec6c9cf8dbb..c3ce106c70af 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1568,6 +1568,13 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) break; default: + if (sym[i].st_shndx >= info->hdr->e_shnum) { + pr_err("%s: Symbol %s has an invalid section index %u (max %u)\n", + mod->name, name, sym[i].st_shndx, info->hdr->e_shnum - 1); + ret = -ENOEXEC; + break; + } + /* Divert to percpu allocation if a percpu var. */ if (sym[i].st_shndx == info->index.pcpu) secbase = (unsigned long)mod_percpu(mod); -- cgit v1.2.3 From 16cb1a64dce56708a1684bf755ad52fb52c41f87 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Feb 2026 13:12:00 +0100 Subject: RDMA/uverbs: select CONFIG_DMA_SHARED_BUFFER The addition of dmabuf support in uverbs means that it is no longer possible to build infiniband support if that is disabled: arm-linux-gnueabi-ld: drivers/infiniband/core/ib_core_uverbs.o: in function `rdma_user_mmap_entry_remove.part.0': ib_core_uverbs.c:(.text+0x508): undefined reference to `dma_buf_move_notify' (dma_buf_move_notify): Unknown destination type (ARM/Thumb) in drivers/infiniband/core/ib_core_uverbs.o ib_core_uverbs.c:(.text+0x518): undefined reference to `dma_resv_wait_timeout' (dma_resv_wait_timeout): Unknown destination type (ARM/Thumb) in drivers/infiniband/core/ib_core_uverbs.o Select this from Kconfig, as we do for the other users. Fixes: 0ac6f4056c4a ("RDMA/uverbs: Add DMABUF object type and operations") Signed-off-by: Arnd Bergmann Link: https://patch.msgid.link/20260216121213.2088910-1-arnd@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 794b9778816b..78ac2ff5befd 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -6,6 +6,7 @@ menuconfig INFINIBAND depends on INET depends on m || IPV6 != m depends on !ALPHA + select DMA_SHARED_BUFFER select IRQ_POLL select DIMLIB help -- cgit v1.2.3 From f9a1767ce3a34bc33c3d33473f65dc13a380e379 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 21 Feb 2026 13:54:11 -0500 Subject: cgroup/cpuset: Fix incorrect change to effective_xcpus in partition_xcpus_del() The effective_xcpus of a cpuset can contain offline CPUs. In partition_xcpus_del(), the xcpus parameter is incorrectly used as a temporary cpumask to mask out offline CPUs. As xcpus can be the effective_xcpus of a cpuset, this can result in unexpected changes in that cpumask. Fix this problem by not making any changes to the xcpus parameter. 
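A minimal sketch of the shape of the fix (simplified; merge_active() is a hypothetical helper, not the actual cpuset code): the mask handed in by the caller is treated as read-only, and the active-CPU masking is applied to the destination instead:

    #include <linux/cpumask.h>

    /* merge @src into @dst, keeping only active CPUs in @dst */
    static void merge_active(struct cpumask *dst, const struct cpumask *src)
    {
            /*
             * Buggy shape: cpumask_and(src, src, cpu_active_mask) -- that
             * scribbles on the caller's mask (effective_xcpus in this bug).
             */
            cpumask_or(dst, dst, src);
            cpumask_and(dst, dst, cpu_active_mask);
    }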
Fixes: 11e5f407b64a ("cgroup/cpuset: Keep track of CPUs in isolated partitions") Reviewed-by: Chen Ridong Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c43efef7df71..a366ef84f982 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1221,8 +1221,8 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent, isolated_cpus_update(old_prs, parent->partition_root_state, xcpus); - cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); + cpumask_and(parent->effective_cpus, parent->effective_cpus, cpu_active_mask); } /* -- cgit v1.2.3 From 68230aac8b9aad243626fbaf3ca170012c17fec5 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 21 Feb 2026 13:54:12 -0500 Subject: cgroup/cpuset: Fix incorrect use of cpuset_update_tasks_cpumask() in update_cpumasks_hier() Commit e2ffe502ba45 ("cgroup/cpuset: Add cpuset.cpus.exclusive for v2") incorrectly changed the 2nd parameter of cpuset_update_tasks_cpumask() from tmp->new_cpus to cp->effective_cpus. This second parameter is just a temporary cpumask for internal use. The cpuset_update_tasks_cpumask() function was originally called update_tasks_cpumask() before commit 381b53c3b549 ("cgroup/cpuset: rename functions shared between v1 and v2"). This mistake can incorrectly change the effective_cpus of the cpuset when it is the top_cpuset or in arm64 architecture where task_cpu_possible_mask() may differ from cpu_possible_mask. So far top_cpuset hasn't been passed to update_cpumasks_hier() yet, but arm64 arch can still be impacted. Fix it by reverting the incorrect change. Fixes: e2ffe502ba45 ("cgroup/cpuset: Add cpuset.cpus.exclusive for v2") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a366ef84f982..8dfffe2dc9ed 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2157,7 +2157,7 @@ get_css: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - cpuset_update_tasks_cpumask(cp, cp->effective_cpus); + cpuset_update_tasks_cpumask(cp, tmp->new_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE -- cgit v1.2.3 From 17b1860034c769c9f7669ae2612e91ef8fdde769 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 21 Feb 2026 13:54:13 -0500 Subject: cgroup/cpuset: Clarify exclusion rules for cpuset internal variables Clarify the locking rules associated with file level internal variables inside the cpuset code. There is no functional change. 
Reviewed-by: Chen Ridong Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 105 ++++++++++++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 44 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8dfffe2dc9ed..165e2967025b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -61,6 +61,58 @@ static const char * const perr_strings[] = { [PERR_REMOTE] = "Have remote partition underneath", }; +/* + * CPUSET Locking Convention + * ------------------------- + * + * Below are the three global locks guarding cpuset structures in lock + * acquisition order: + * - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock) + * - cpuset_mutex + * - callback_lock (raw spinlock) + * + * A task must hold all the three locks to modify externally visible or + * used fields of cpusets, though some of the internally used cpuset fields + * and internal variables can be modified without holding callback_lock. If only + * reliable read access of the externally used fields are needed, a task can + * hold either cpuset_mutex or callback_lock which are exposed to other + * external subsystems. + * + * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others, + * ensuring that it is the only task able to also acquire callback_lock and + * be able to modify cpusets. It can perform various checks on the cpuset + * structure first, knowing nothing will change. It can also allocate memory + * without holding callback_lock. While it is performing these checks, various + * callback routines can briefly acquire callback_lock to query cpusets. Once + * it is ready to make the changes, it takes callback_lock, blocking everyone + * else. + * + * Calls to the kernel memory allocator cannot be made while holding + * callback_lock which is a spinlock, as the memory allocator may sleep or + * call back into cpuset code and acquire callback_lock. + * + * Now, the task_struct fields mems_allowed and mempolicy may be changed + * by other task, we use alloc_lock in the task_struct fields to protect + * them. + * + * The cpuset_common_seq_show() handlers only hold callback_lock across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + */ + +static DEFINE_MUTEX(cpuset_mutex); + +/* + * File level internal variables below follow one of the following exclusion + * rules. + * + * RWCS: Read/write-able by holding either cpus_write_lock (and optionally + * cpuset_mutex) or both cpus_read_lock and cpuset_mutex. + * + * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable + * by holding both cpuset_mutex and callback_lock. + */ + /* * For local partitions, update to subpartitions_cpus & isolated_cpus is done * in update_parent_effective_cpumask(). For remote partitions, it is done in @@ -70,19 +122,18 @@ static const char * const perr_strings[] = { * Exclusive CPUs distributed out to local or remote sub-partitions of * top_cpuset */ -static cpumask_var_t subpartitions_cpus; +static cpumask_var_t subpartitions_cpus; /* RWCS */ /* - * Exclusive CPUs in isolated partitions + * Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated) */ -static cpumask_var_t isolated_cpus; +static cpumask_var_t isolated_cpus; /* CSCB */ /* - * isolated_cpus updating flag (protected by cpuset_mutex) - * Set if isolated_cpus is going to be updated in the current - * cpuset_mutex crtical section. + * Set if isolated_cpus is being updated in the current cpuset_mutex + * critical section. 
*/ -static bool isolated_cpus_updating; +static bool isolated_cpus_updating; /* RWCS */ /* * A flag to force sched domain rebuild at the end of an operation. @@ -98,7 +149,7 @@ static bool isolated_cpus_updating; * Note that update_relax_domain_level() in cpuset-v1.c can still call * rebuild_sched_domains_locked() directly without using this flag. */ -static bool force_sd_rebuild; +static bool force_sd_rebuild; /* RWCS */ /* * Partition root states: @@ -218,42 +269,6 @@ struct cpuset top_cpuset = { .partition_root_state = PRS_ROOT, }; -/* - * There are two global locks guarding cpuset structures - cpuset_mutex and - * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel - * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset - * structures. Note that cpuset_mutex needs to be a mutex as it is used in - * paths that rely on priority inheritance (e.g. scheduler - on RT) for - * correctness. - * - * A task must hold both locks to modify cpusets. If a task holds - * cpuset_mutex, it blocks others, ensuring that it is the only task able to - * also acquire callback_lock and be able to modify cpusets. It can perform - * various checks on the cpuset structure first, knowing nothing will change. - * It can also allocate memory while just holding cpuset_mutex. While it is - * performing these checks, various callback routines can briefly acquire - * callback_lock to query cpusets. Once it is ready to make the changes, it - * takes callback_lock, blocking everyone else. - * - * Calls to the kernel memory allocator can not be made while holding - * callback_lock, as that would risk double tripping on callback_lock - * from one of the callbacks into the cpuset code from within - * __alloc_pages(). - * - * If a task is only holding callback_lock, then it has read-only - * access to cpusets. - * - * Now, the task_struct fields mems_allowed and mempolicy may be changed - * by other task, we use alloc_lock in the task_struct fields to protect - * them. - * - * The cpuset_common_seq_show() handlers only hold callback_lock across - * small pieces of code, such as when reading out possibly multi-word - * cpumasks and nodemasks. - */ - -static DEFINE_MUTEX(cpuset_mutex); - /** * cpuset_lock - Acquire the global cpuset mutex * @@ -1163,6 +1178,8 @@ static void reset_partition_data(struct cpuset *cs) static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); + lockdep_assert_held(&callback_lock); + lockdep_assert_held(&cpuset_mutex); if (new_prs == PRS_ISOLATED) cpumask_or(isolated_cpus, isolated_cpus, xcpus); else -- cgit v1.2.3 From 14713ed9e9291813849018f32edbf5f6de362088 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 21 Feb 2026 13:54:14 -0500 Subject: cgroup/cpuset: Set isolated_cpus_updating only if isolated_cpus is changed As cpuset is updating HK_TYPE_DOMAIN housekeeping mask when there is a change in the set of isolated CPUs, making this change is now more costly than before. Right now, the isolated_cpus_updating flag can be set even if there is no real change in isolated_cpus. Put in additional checks to make sure that isolated_cpus_updating is set only if there is a real change in isolated_cpus. 
Reviewed-by: Chen Ridong Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 165e2967025b..edc67c6aa553 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1180,11 +1180,15 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus WARN_ON_ONCE(old_prs == new_prs); lockdep_assert_held(&callback_lock); lockdep_assert_held(&cpuset_mutex); - if (new_prs == PRS_ISOLATED) + if (new_prs == PRS_ISOLATED) { + if (cpumask_subset(xcpus, isolated_cpus)) + return; cpumask_or(isolated_cpus, isolated_cpus, xcpus); - else + } else { + if (!cpumask_intersects(xcpus, isolated_cpus)) + return; cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); - + } isolated_cpus_updating = true; } -- cgit v1.2.3 From 5e6aac573cc4cc8e62aebf880b824e219a5ff557 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 21 Feb 2026 13:54:15 -0500 Subject: kselftest/cgroup: Simplify test_cpuset_prs.sh by removing "S+" command The "S+" command is used in the test matrix to enable the cpuset controller. However this can be done automatically and we never use the "S-" command to disable cpuset controller. Simplify the test matrix and reduce clutter by removing the command and doing that automatically. There is no functional change to the test cases. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 213 +++++++++++----------- 1 file changed, 104 insertions(+), 109 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 5dff3ad53867..ef16f5610d0b 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -196,7 +196,6 @@ test_add_proc() # P = set cpus.partition (0:member, 1:root, 2:isolated) # C = add cpu-list to cpuset.cpus # X = add cpu-list to cpuset.cpus.exclusive -# S

= use prefix in subtree_control # T = put a task into cgroup # CX = add cpu-list to both cpuset.cpus and cpuset.cpus.exclusive # O= = Write to CPU online file of @@ -209,44 +208,43 @@ test_add_proc() # sched-debug matching which includes offline CPUs and single-CPU partitions # while the second one is for matching cpuset.cpus.isolated. # -SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1" +SETUP_A123_PARTITIONS="C1-3:P1 C2-3:P1 C3:P1" TEST_MATRIX=( # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- - " C0-1 . . C2-3 S+ C4-5 . . 0 A2:0-1" + " C0-1 . . C2-3 . C4-5 . . 0 A2:0-1" " C0-1 . . C2-3 P1 . . . 0 " - " C0-1 . . C2-3 P1:S+ C0-1:P1 . . 0 " - " C0-1 . . C2-3 P1:S+ C1:P1 . . 0 " - " C0-1:S+ . . C2-3 . . . P1 0 " - " C0-1:P1 . . C2-3 S+ C1 . . 0 " - " C0-1:P1 . . C2-3 S+ C1:P1 . . 0 " - " C0-1:P1 . . C2-3 S+ C1:P1 . P1 0 " + " C0-1 . . C2-3 P1 C0-1:P1 . . 0 " + " C0-1 . . C2-3 P1 C1:P1 . . 0 " + " C0-1 . . C2-3 . . . P1 0 " + " C0-1:P1 . . C2-3 . C1 . . 0 " + " C0-1:P1 . . C2-3 . C1:P1 . . 0 " + " C0-1:P1 . . C2-3 . C1:P1 . P1 0 " " C0-1:P1 . . C2-3 C4-5 . . . 0 A1:4-5" - " C0-1:P1 . . C2-3 S+:C4-5 . . . 0 A1:4-5" " C0-1 . . C2-3:P1 . . . C2 0 " " C0-1 . . C2-3:P1 . . . C4-5 0 B1:4-5" - "C0-3:P1:S+ C2-3:P1 . . . . . . 0 A1:0-1|A2:2-3|XA2:2-3" - "C0-3:P1:S+ C2-3:P1 . . C1-3 . . . 0 A1:1|A2:2-3|XA2:2-3" - "C2-3:P1:S+ C3:P1 . . C3 . . . 0 A1:|A2:3|XA2:3 A1:P1|A2:P1" - "C2-3:P1:S+ C3:P1 . . C3 P0 . . 0 A1:3|A2:3 A1:P1|A2:P0" - "C2-3:P1:S+ C2:P1 . . C2-4 . . . 0 A1:3-4|A2:2" - "C2-3:P1:S+ C3:P1 . . C3 . . C0-2 0 A1:|B1:0-2 A1:P1|A2:P1" + " C0-3:P1 C2-3:P1 . . . . . . 0 A1:0-1|A2:2-3|XA2:2-3" + " C0-3:P1 C2-3:P1 . . C1-3 . . . 0 A1:1|A2:2-3|XA2:2-3" + " C2-3:P1 C3:P1 . . C3 . . . 0 A1:|A2:3|XA2:3 A1:P1|A2:P1" + " C2-3:P1 C3:P1 . . C3 P0 . . 0 A1:3|A2:3 A1:P1|A2:P0" + " C2-3:P1 C2:P1 . . C2-4 . . . 0 A1:3-4|A2:2" + " C2-3:P1 C3:P1 . . C3 . . C0-2 0 A1:|B1:0-2 A1:P1|A2:P1" "$SETUP_A123_PARTITIONS . C2-3 . . . 0 A1:|A2:2|A3:3 A1:P1|A2:P1|A3:P1" # CPU offlining cases: - " C0-1 . . C2-3 S+ C4-5 . O2=0 0 A1:0-1|B1:3" - "C0-3:P1:S+ C2-3:P1 . . O2=0 . . . 0 A1:0-1|A2:3" - "C0-3:P1:S+ C2-3:P1 . . O2=0 O2=1 . . 0 A1:0-1|A2:2-3" - "C0-3:P1:S+ C2-3:P1 . . O1=0 . . . 0 A1:0|A2:2-3" - "C0-3:P1:S+ C2-3:P1 . . O1=0 O1=1 . . 0 A1:0-1|A2:2-3" - "C2-3:P1:S+ C3:P1 . . O3=0 O3=1 . . 0 A1:2|A2:3 A1:P1|A2:P1" - "C2-3:P1:S+ C3:P2 . . O3=0 O3=1 . . 0 A1:2|A2:3 A1:P1|A2:P2" - "C2-3:P1:S+ C3:P1 . . O2=0 O2=1 . . 0 A1:2|A2:3 A1:P1|A2:P1" - "C2-3:P1:S+ C3:P2 . . O2=0 O2=1 . . 0 A1:2|A2:3 A1:P1|A2:P2" - "C2-3:P1:S+ C3:P1 . . O2=0 . . . 0 A1:|A2:3 A1:P1|A2:P1" - "C2-3:P1:S+ C3:P1 . . O3=0 . . . 0 A1:2|A2: A1:P1|A2:P1" - "C2-3:P1:S+ C3:P1 . . T:O2=0 . . . 0 A1:3|A2:3 A1:P1|A2:P-1" - "C2-3:P1:S+ C3:P1 . . . T:O3=0 . . 0 A1:2|A2:2 A1:P1|A2:P-1" + " C0-1 . . C2-3 . C4-5 . O2=0 0 A1:0-1|B1:3" + " C0-3:P1 C2-3:P1 . . O2=0 . . . 0 A1:0-1|A2:3" + " C0-3:P1 C2-3:P1 . . O2=0 O2=1 . . 0 A1:0-1|A2:2-3" + " C0-3:P1 C2-3:P1 . . O1=0 . . . 0 A1:0|A2:2-3" + " C0-3:P1 C2-3:P1 . . O1=0 O1=1 . . 0 A1:0-1|A2:2-3" + " C2-3:P1 C3:P1 . . O3=0 O3=1 . . 0 A1:2|A2:3 A1:P1|A2:P1" + " C2-3:P1 C3:P2 . . O3=0 O3=1 . . 0 A1:2|A2:3 A1:P1|A2:P2" + " C2-3:P1 C3:P1 . . O2=0 O2=1 . . 0 A1:2|A2:3 A1:P1|A2:P1" + " C2-3:P1 C3:P2 . . O2=0 O2=1 . . 0 A1:2|A2:3 A1:P1|A2:P2" + " C2-3:P1 C3:P1 . . O2=0 . . . 0 A1:|A2:3 A1:P1|A2:P1" + " C2-3:P1 C3:P1 . . O3=0 . . . 0 A1:2|A2: A1:P1|A2:P1" + " C2-3:P1 C3:P1 . . T:O2=0 . . . 
0 A1:3|A2:3 A1:P1|A2:P-1" + " C2-3:P1 C3:P1 . . . T:O3=0 . . 0 A1:2|A2:2 A1:P1|A2:P-1" "$SETUP_A123_PARTITIONS . O1=0 . . . 0 A1:|A2:2|A3:3 A1:P1|A2:P1|A3:P1" "$SETUP_A123_PARTITIONS . O2=0 . . . 0 A1:1|A2:|A3:3 A1:P1|A2:P1|A3:P1" "$SETUP_A123_PARTITIONS . O3=0 . . . 0 A1:1|A2:2|A3: A1:P1|A2:P1|A3:P1" @@ -264,88 +262,87 @@ TEST_MATRIX=( # # Remote partition and cpuset.cpus.exclusive tests # - " C0-3:S+ C1-3:S+ C2-3 . X2-3 . . . 0 A1:0-3|A2:1-3|A3:2-3|XA1:2-3" - " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3:P2 . . 0 A1:0-1|A2:2-3|A3:2-3 A1:P0|A2:P2 2-3" - " C0-3:S+ C1-3:S+ C2-3 . X2-3 X3:P2 . . 0 A1:0-2|A2:3|A3:3 A1:P0|A2:P2 3" - " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" - " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" - " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-1|A2:1|A3:1|B1:2-3 A1:P0|A3:P0|B1:P2" - " C0-3:S+ C1-3:S+ C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5" - " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" - " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" - " C0-3:S+ C1-3:S+ C2-3 C4 X1-3 X1-3:P2 P2 . 0 A2:1|A3:2-3 A2:P2|A3:P2 1-3" - " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2:C4-5 0 A3:2-3|B1:4-5 A3:P2|B1:P2 2-5" - " C4:X0-3:S+ X1-3:S+ X2-3 . . P2 . . 0 A1:4|A2:1-3|A3:1-3 A2:P2 1-3" - " C4:X0-3:S+ X1-3:S+ X2-3 . . . P2 . 0 A1:4|A2:4|A3:2-3 A3:P2 2-3" + " C0-3 C1-3 C2-3 . X2-3 . . . 0 A1:0-3|A2:1-3|A3:2-3|XA1:2-3" + " C0-3 C1-3 C2-3 . X2-3 X2-3:P2 . . 0 A1:0-1|A2:2-3|A3:2-3 A1:P0|A2:P2 2-3" + " C0-3 C1-3 C2-3 . X2-3 X3:P2 . . 0 A1:0-2|A2:3|A3:3 A1:P0|A2:P2 3" + " C0-3 C1-3 C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" + " C0-3 C1-3 C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" + " C0-3 C1-3 C2-3 C2-3 . . . P2 0 A1:0-1|A2:1|A3:1|B1:2-3 A1:P0|A3:P0|B1:P2" + " C0-3 C1-3 C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5" + " C0-3 C1-3 C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" + " C0-3 C1-3 C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" + " C0-3 C1-3 C2-3 C4 X1-3 X1-3:P2 P2 . 0 A2:1|A3:2-3 A2:P2|A3:P2 1-3" + " C0-3 C1-3 C2-3 C4 X2-3 X2-3 X2-3:P2 P2:C4-5 0 A3:2-3|B1:4-5 A3:P2|B1:P2 2-5" + " C4:X0-3 X1-3 X2-3 . . P2 . . 0 A1:4|A2:1-3|A3:1-3 A2:P2 1-3" + " C4:X0-3 X1-3 X2-3 . . . P2 . 0 A1:4|A2:4|A3:2-3 A3:P2 2-3" # Nested remote/local partition tests - " C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:|A3:2-3|B1:4-5 \ + " C0-3 C1-3 C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:|A3:2-3|B1:4-5 \ A1:P0|A2:P1|A3:P2|B1:P1 2-3" - " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:|A3:2-3|B1:4 \ + " C0-3 C1-3 C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:|A3:2-3|B1:4 \ A1:P0|A2:P1|A3:P2|B1:P1 2-4|2-3" - " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 . P1 0 A1:0-1|A2:2-3|A3:2-3|B1:4 \ + " C0-3 C1-3 C2-3 C4 X2-3 X2-3:P1 . P1 0 A1:0-1|A2:2-3|A3:2-3|B1:4 \ A1:P0|A2:P1|A3:P0|B1:P1" - " C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:2|A3:3|B1:4 \ + " C0-3 C1-3 C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:2|A3:3|B1:4 \ A1:P0|A2:P1|A3:P2|B1:P1 2-4|3" - " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1|A2:2-3|A3:4 \ + " C0-4 C1-4 C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1|A2:2-3|A3:4 \ A1:P0|A2:P2|A3:P1 2-4|2-3" - " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X3-4:P1 . 0 A1:0-1|A2:2|A3:3-4 \ + " C0-4 C1-4 C2-4 . X2-4 X2-4:P2 X3-4:P1 . 0 A1:0-1|A2:2|A3:3-4 \ A1:P0|A2:P2|A3:P1 2" - " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ + " C0-4:X2-4 C1-4:X2-4:P2 C2-4:X4:P1 \ . . X5 . . 0 A1:0-4|A2:1-4|A3:2-4 \ A1:P0|A2:P-2|A3:P-1 ." 
- " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ + " C0-4:X2-4 C1-4:X2-4:P2 C2-4:X4:P1 \ . . . X1 . 0 A1:0-1|A2:2-4|A3:2-4 \ A1:P0|A2:P2|A3:P-1 2-4" # Remote partition offline tests - " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 . 0 A1:0-1|A2:1|A3:3 A1:P0|A3:P2 2-3" - " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 O2=1 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" - " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 P2:O3=0 . 0 A1:0-2|A2:1-2|A3: A1:P0|A3:P2 3" - " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2|A2:1-2|A3:1-2 A1:P0|A3:P-2 3|" + " C0-3 C1-3 C2-3 . X2-3 X2-3 X2-3:P2:O2=0 . 0 A1:0-1|A2:1|A3:3 A1:P0|A3:P2 2-3" + " C0-3 C1-3 C2-3 . X2-3 X2-3 X2-3:P2:O2=0 O2=1 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" + " C0-3 C1-3 C3 . X2-3 X2-3 P2:O3=0 . 0 A1:0-2|A2:1-2|A3: A1:P0|A3:P2 3" + " C0-3 C1-3 C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2|A2:1-2|A3:1-2 A1:P0|A3:P-2 3|" # An invalidated remote partition cannot self-recover from hotplug - " C0-3:S+ C1-3:S+ C2 . X2-3 X2-3 T:P2:O2=0 O2=1 0 A1:0-3|A2:1-3|A3:2 A1:P0|A3:P-2 ." + " C0-3 C1-3 C2 . X2-3 X2-3 T:P2:O2=0 O2=1 0 A1:0-3|A2:1-3|A3:2 A1:P0|A3:P-2 ." # cpus.exclusive.effective clearing test - " C0-3:S+ C1-3:S+ C2 . X2-3:X . . . 0 A1:0-3|A2:1-3|A3:2|XA1:" + " C0-3 C1-3 C2 . X2-3:X . . . 0 A1:0-3|A2:1-3|A3:2|XA1:" # Invalid to valid remote partition transition test - " C0-3:S+ C1-3 . . . X3:P2 . . 0 A1:0-3|A2:1-3|XA2: A2:P-2 ." - " C0-3:S+ C1-3:X3:P2 - . . X2-3 P2 . . 0 A1:0-2|A2:3|XA2:3 A2:P2 3" + " C0-3 C1-3 . . . X3:P2 . . 0 A1:0-3|A2:1-3|XA2: A2:P-2 ." + " C0-3 C1-3:X3:P2 . . X2-3 P2 . . 0 A1:0-2|A2:3|XA2:3 A2:P2 3" # Invalid to valid local partition direct transition tests - " C1-3:S+:P2 X4:P2 . . . . . . 0 A1:1-3|XA1:1-3|A2:1-3:XA2: A1:P2|A2:P-2 1-3" - " C1-3:S+:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2|XA1:1-3|A2:3:XA2:3 A1:P2|A2:P2 1-3" - " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:5-6 A1:P2|B1:P0" - " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3|B1:4-6 A1:P2|B1:P0 0-3" + " C1-3:P2 X4:P2 . . . . . . 0 A1:1-3|XA1:1-3|A2:1-3:XA2: A1:P2|A2:P-2 1-3" + " C1-3:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2|XA1:1-3|A2:3:XA2:3 A1:P2|A2:P2 1-3" + " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:5-6 A1:P2|B1:P0" + " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3|B1:4-6 A1:P2|B1:P0 0-3" # Local partition invalidation tests - " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ + " C0-3:X1-3:P2 C1-3:X2-3:P2 C2-3:X3:P2 \ . . . . . 0 A1:1|A2:2|A3:3 A1:P2|A2:P2|A3:P2 1-3" - " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ + " C0-3:X1-3:P2 C1-3:X2-3:P2 C2-3:X3:P2 \ . . X4 . . 0 A1:1-3|A2:1-3|A3:2-3|XA2:|XA3: A1:P2|A2:P-2|A3:P-2 1-3" - " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ + " C0-3:X1-3:P2 C1-3:X2-3:P2 C2-3:X3:P2 \ . . C4:X . . 0 A1:1-3|A2:1-3|A3:2-3|XA2:|XA3: A1:P2|A2:P-2|A3:P-2 1-3" # Local partition CPU change tests - " C0-5:S+:P2 C4-5:S+:P1 . . . C3-5 . . 0 A1:0-2|A2:3-5 A1:P2|A2:P1 0-2" - " C0-5:S+:P2 C4-5:S+:P1 . . C1-5 . . . 0 A1:1-3|A2:4-5 A1:P2|A2:P1 1-3" + " C0-5:P2 C4-5:P1 . . . C3-5 . . 0 A1:0-2|A2:3-5 A1:P2|A2:P1 0-2" + " C0-5:P2 C4-5:P1 . . C1-5 . . . 0 A1:1-3|A2:4-5 A1:P2|A2:P1 1-3" # cpus_allowed/exclusive_cpus update tests - " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ + " C0-3:X2-3 C1-3:X2-3 C2-3:X2-3 \ . X:C4 . P2 . 0 A1:4|A2:4|XA2:|XA3:|A3:4 \ A1:P0|A3:P-2 ." - " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ + " C0-3:X2-3 C1-3:X2-3 C2-3:X2-3 \ . X1 . P2 . 0 A1:0-3|A2:1-3|XA1:1|XA2:|XA3:|A3:2-3 \ A1:P0|A3:P-2 ." - " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ + " C0-3:X2-3 C1-3:X2-3 C2-3:X2-3 \ . . X3 P2 . 
0 A1:0-2|A2:1-2|XA2:3|XA3:3|A3:3 \ A1:P0|A3:P2 3" - " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ + " C0-3:X2-3 C1-3:X2-3 C2-3:X2-3:P2 \ . . X3 . . 0 A1:0-2|A2:1-2|XA2:3|XA3:3|A3:3|XA3:3 \ A1:P0|A3:P2 3" - " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ + " C0-3:X2-3 C1-3:X2-3 C2-3:X2-3:P2 \ . X4 . . . 0 A1:0-3|A2:1-3|A3:2-3|XA1:4|XA2:|XA3 \ A1:P0|A3:P-2" @@ -356,37 +353,37 @@ TEST_MATRIX=( # # Adding CPUs to partition root that are not in parent's # cpuset.cpus is allowed, but those extra CPUs are ignored. - "C2-3:P1:S+ C3:P1 . . . C2-4 . . 0 A1:|A2:2-3 A1:P1|A2:P1" + " C2-3:P1 C3:P1 . . . C2-4 . . 0 A1:|A2:2-3 A1:P1|A2:P1" # Taking away all CPUs from parent or itself if there are tasks # will make the partition invalid. - "C2-3:P1:S+ C3:P1 . . T C2-3 . . 0 A1:2-3|A2:2-3 A1:P1|A2:P-1" - " C3:P1:S+ C3 . . T P1 . . 0 A1:3|A2:3 A1:P1|A2:P-1" + " C2-3:P1 C3:P1 . . T C2-3 . . 0 A1:2-3|A2:2-3 A1:P1|A2:P-1" + " C3:P1 C3 . . T P1 . . 0 A1:3|A2:3 A1:P1|A2:P-1" "$SETUP_A123_PARTITIONS . T:C2-3 . . . 0 A1:2-3|A2:2-3|A3:3 A1:P1|A2:P-1|A3:P-1" "$SETUP_A123_PARTITIONS . T:C2-3:C1-3 . . . 0 A1:1|A2:2|A3:3 A1:P1|A2:P1|A3:P1" # Changing a partition root to member makes child partitions invalid - "C2-3:P1:S+ C3:P1 . . P0 . . . 0 A1:2-3|A2:3 A1:P0|A2:P-1" + " C2-3:P1 C3:P1 . . P0 . . . 0 A1:2-3|A2:3 A1:P0|A2:P-1" "$SETUP_A123_PARTITIONS . C2-3 P0 . . 0 A1:2-3|A2:2-3|A3:3 A1:P1|A2:P0|A3:P-1" # cpuset.cpus can contains cpus not in parent's cpuset.cpus as long # as they overlap. - "C2-3:P1:S+ . . . . C3-4:P1 . . 0 A1:2|A2:3 A1:P1|A2:P1" + " C2-3:P1 . . . . C3-4:P1 . . 0 A1:2|A2:3 A1:P1|A2:P1" # Deletion of CPUs distributed to child cgroup is allowed. - "C0-1:P1:S+ C1 . C2-3 C4-5 . . . 0 A1:4-5|A2:4-5" + " C0-1:P1 C1 . C2-3 C4-5 . . . 0 A1:4-5|A2:4-5" # To become a valid partition root, cpuset.cpus must overlap parent's # cpuset.cpus. - " C0-1:P1 . . C2-3 S+ C4-5:P1 . . 0 A1:0-1|A2:0-1 A1:P1|A2:P-1" + " C0-1:P1 . . C2-3 . C4-5:P1 . . 0 A1:0-1|A2:0-1 A1:P1|A2:P-1" # Enabling partition with child cpusets is allowed - " C0-1:S+ C1 . C2-3 P1 . . . 0 A1:0-1|A2:1 A1:P1" + " C0-1 C1 . C2-3 P1 . . . 0 A1:0-1|A2:1 A1:P1" # A partition root with non-partition root parent is invalid| but it # can be made valid if its parent becomes a partition root too. - " C0-1:S+ C1 . C2-3 . P2 . . 0 A1:0-1|A2:1 A1:P0|A2:P-2" - " C0-1:S+ C1:P2 . C2-3 P1 . . . 0 A1:0|A2:1 A1:P1|A2:P2 0-1|1" + " C0-1 C1 . C2-3 . P2 . . 0 A1:0-1|A2:1 A1:P0|A2:P-2" + " C0-1 C1:P2 . C2-3 P1 . . . 0 A1:0|A2:1 A1:P1|A2:P2 0-1|1" # A non-exclusive cpuset.cpus change will not invalidate its siblings partition. " C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2|B1:3 A1:P1|B1:P0" @@ -398,23 +395,23 @@ TEST_MATRIX=( # Child partition root that try to take all CPUs from parent partition # with tasks will remain invalid. - " C1-4:P1:S+ P1 . . . . . . 0 A1:1-4|A2:1-4 A1:P1|A2:P-1" - " C1-4:P1:S+ P1 . . . C1-4 . . 0 A1|A2:1-4 A1:P1|A2:P1" - " C1-4:P1:S+ P1 . . T C1-4 . . 0 A1:1-4|A2:1-4 A1:P1|A2:P-1" + " C1-4:P1 P1 . . . . . . 0 A1:1-4|A2:1-4 A1:P1|A2:P-1" + " C1-4:P1 P1 . . . C1-4 . . 0 A1|A2:1-4 A1:P1|A2:P1" + " C1-4:P1 P1 . . T C1-4 . . 0 A1:1-4|A2:1-4 A1:P1|A2:P-1" # Clearing of cpuset.cpus with a preset cpuset.cpus.exclusive shouldn't # affect cpuset.cpus.exclusive.effective. - " C1-4:X3:S+ C1:X3 . . . C . . 0 A2:1-4|XA2:3" + " C1-4:X3 C1:X3 . . . C . . 0 A2:1-4|XA2:3" # cpuset.cpus can contain CPUs that overlap a sibling cpuset with cpus.exclusive # but creating a local partition out of it is not allowed. 
Similarly and change # in cpuset.cpus of a local partition that overlaps sibling exclusive CPUs will # invalidate it. - " CX1-4:S+ CX2-4:P2 . C5-6 . . . P1 0 A1:1|A2:2-4|B1:5-6|XB1:5-6 \ + " CX1-4 CX2-4:P2 . C5-6 . . . P1 0 A1:1|A2:2-4|B1:5-6|XB1:5-6 \ A1:P0|A2:P2:B1:P1 2-4" - " CX1-4:S+ CX2-4:P2 . C3-6 . . . P1 0 A1:1|A2:2-4|B1:5-6 \ + " CX1-4 CX2-4:P2 . C3-6 . . . P1 0 A1:1|A2:2-4|B1:5-6 \ A1:P0|A2:P2:B1:P-1 2-4" - " CX1-4:S+ CX2-4:P2 . C5-6 . . . P1:C3-6 0 A1:1|A2:2-4|B1:5-6 \ + " CX1-4 CX2-4:P2 . C5-6 . . . P1:C3-6 0 A1:1|A2:2-4|B1:5-6 \ A1:P0|A2:P2:B1:P-1 2-4" # When multiple partitions with conflicting cpuset.cpus are created, the @@ -426,14 +423,14 @@ TEST_MATRIX=( " C1-3:X1-3 . . C4-5 . . . C1-2 0 A1:1-3|B1:1-2" # cpuset.cpus can become empty with task in it as it inherits parent's effective CPUs - " C1-3:S+ C2 . . . T:C . . 0 A1:1-3|A2:1-3" + " C1-3 C2 . . . T:C . . 0 A1:1-3|A2:1-3" # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: # A task cannot be added to a partition with no cpu - "C2-3:P1:S+ C3:P1 . . O2=0:T . . . 1 A1:|A2:3 A1:P1|A2:P1" + " C2-3:P1 C3:P1 . . O2=0:T . . . 1 A1:|A2:3 A1:P1|A2:P1" # Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected " C0-3 . . C4-5 X0-3 . . X3-5 1 A1:0-3|B1:4-5" @@ -465,31 +462,31 @@ REMOTE_TEST_MATRIX=( # old-p1 old-p2 old-c11 old-c12 old-c21 old-c22 # new-p1 new-p2 new-c11 new-c12 new-c21 new-c22 ECPUs Pstate ISOLCPUS # ------ ------ ------- ------- ------- ------- ----- ------ -------- - " X1-3:S+ X4-6:S+ X1-2 X3 X4-5 X6 \ + " X1-3 X4-6 X1-2 X3 X4-5 X6 \ . . P2 P2 P2 P2 c11:1-2|c12:3|c21:4-5|c22:6 \ c11:P2|c12:P2|c21:P2|c22:P2 1-6" - " CX1-4:S+ . X1-2:P2 C3 . . \ + " CX1-4 . X1-2:P2 C3 . . \ . . . C3-4 . . p1:3-4|c11:1-2|c12:3-4 \ p1:P0|c11:P2|c12:P0 1-2" - " CX1-4:S+ . X1-2:P2 . . . \ + " CX1-4 . X1-2:P2 . . . \ X2-4 . . . . . p1:1,3-4|c11:2 \ p1:P0|c11:P2 2" - " CX1-5:S+ . X1-2:P2 X3-5:P1 . . \ + " CX1-5 . X1-2:P2 X3-5:P1 . . \ X2-4 . . . . . p1:1,5|c11:2|c12:3-4 \ p1:P0|c11:P2|c12:P1 2" - " CX1-4:S+ . X1-2:P2 X3-4:P1 . . \ + " CX1-4 . X1-2:P2 X3-4:P1 . . \ . . X2 . . . p1:1|c11:2|c12:3-4 \ p1:P0|c11:P2|c12:P1 2" # p1 as member, will get its effective CPUs from its parent rtest - " CX1-4:S+ . X1-2:P2 X3-4:P1 . . \ + " CX1-4 . X1-2:P2 X3-4:P1 . . \ . . X1 CX2-4 . . p1:5-7|c11:1|c12:2-4 \ p1:P0|c11:P2|c12:P1 1" - " CX1-4:S+ X5-6:P1:S+ . . . . \ - . . X1-2:P2 X4-5:P1 . X1-7:P2 p1:3|c11:1-2|c12:4:c22:5-6 \ + " CX1-4 X5-6:P1 . . . . \ + . . X1-2:P2 X4-5:P1 . X1-7:P2 p1:3|c11:1-2|c12:4:c22:5-6 \ p1:P0|p2:P1|c11:P2|c12:P1|c22:P2 \ 1-2,4-6|1-2,5-6" # c12 whose cpuset.cpus CPUs are all granted to c11 will become invalid partition - " C1-5:P1:S+ . C1-4:P1 C2-3 . . \ + " C1-5:P1 . C1-4:P1 C2-3 . . \ . . . P1 . . p1:5|c11:1-4|c12:5 \ p1:P1|c11:P1|c12:P-1" ) @@ -530,7 +527,6 @@ set_ctrl_state() CGRP=$1 STATE=$2 SHOWERR=${3} - CTRL=${CTRL:=$CONTROLLER} HASERR=0 REDIRECT="2> $TMPMSG" [[ -z "$STATE" || "$STATE" = '.' 
]] && return 0 @@ -540,15 +536,16 @@ set_ctrl_state() for CMD in $(echo $STATE | sed -e "s/:/ /g") do TFILE=$CGRP/cgroup.procs - SFILE=$CGRP/cgroup.subtree_control PFILE=$CGRP/cpuset.cpus.partition CFILE=$CGRP/cpuset.cpus XFILE=$CGRP/cpuset.cpus.exclusive - case $CMD in - S*) PREFIX=${CMD#?} - COMM="echo ${PREFIX}${CTRL} > $SFILE" + + # Enable cpuset controller if not enabled yet + [[ -f $CFILE ]] || { + COMM="echo +cpuset > $CGRP/../cgroup.subtree_control" eval $COMM $REDIRECT - ;; + } + case $CMD in X*) CPUS=${CMD#?} COMM="echo $CPUS > $XFILE" @@ -947,7 +944,6 @@ check_test_results() run_state_test() { TEST=$1 - CONTROLLER=cpuset CGROUP_LIST=". A1 A1/A2 A1/A2/A3 B1" RESET_LIST="A1/A2/A3 A1/A2 A1 B1" I=0 @@ -1003,7 +999,6 @@ run_state_test() run_remote_state_test() { TEST=$1 - CONTROLLER=cpuset [[ -d rtest ]] || mkdir rtest cd rtest echo +cpuset > cgroup.subtree_control -- cgit v1.2.3 From 3bfe47967191f42d17510713b31a47d9284b8c5a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 21 Feb 2026 13:54:16 -0500 Subject: cgroup/cpuset: Move housekeeping_update()/rebuild_sched_domains() together With the latest changes in sched/isolation.c, rebuild_sched_domains*() requires the HK_TYPE_DOMAIN housekeeping cpumask to be properly updated first, if needed, before the sched domains can be rebuilt. So the two naturally fit together. Do that by creating a new update_hk_sched_domains() helper to house both actions. The name of the isolated_cpus_updating flag to control the call to housekeeping_update() is now outdated. So change it to update_housekeeping to better reflect its purpose. Also move the call to update_hk_sched_domains() to the end of cpuset and hotplug operations before releasing the cpuset_mutex. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 51 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index edc67c6aa553..14b07a283a2c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -130,10 +130,9 @@ static cpumask_var_t subpartitions_cpus; /* RWCS */ static cpumask_var_t isolated_cpus; /* CSCB */ /* - * Set if isolated_cpus is being updated in the current cpuset_mutex - * critical section. + * Set if housekeeping cpumasks are to be updated. */ -static bool isolated_cpus_updating; /* RWCS */ +static bool update_housekeeping; /* RWCS */ /* * A flag to force sched domain rebuild at the end of an operation. @@ -1189,7 +1188,7 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus return; cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); } - isolated_cpus_updating = true; + update_housekeeping = true; } /* @@ -1307,22 +1306,22 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) } /* - * update_isolation_cpumasks - Update external isolation related CPU masks + * update_hk_sched_domains - Update HK cpumasks & rebuild sched domains * - * The following external CPU masks will be updated if necessary: - * - workqueue unbound cpumask + * Update housekeeping cpumasks and rebuild sched domains if necessary. + * This should be called at the end of cpuset or hotplug actions. 
*/ -static void update_isolation_cpumasks(void) +static void update_hk_sched_domains(void) { - int ret; - - if (!isolated_cpus_updating) - return; - - ret = housekeeping_update(isolated_cpus); - WARN_ON_ONCE(ret < 0); - - isolated_cpus_updating = false; + if (update_housekeeping) { + /* Updating HK cpumasks implies rebuild sched domains */ + WARN_ON_ONCE(housekeeping_update(isolated_cpus)); + update_housekeeping = false; + force_sd_rebuild = true; + } + /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */ + if (force_sd_rebuild) + rebuild_sched_domains_locked(); } /** @@ -1473,7 +1472,6 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, cs->remote_partition = true; cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); cpuset_force_rebuild(); cs->prs_err = 0; @@ -1518,7 +1516,6 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); cpuset_force_rebuild(); /* @@ -1589,7 +1586,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); if (adding || deleting) cpuset_force_rebuild(); @@ -1933,7 +1929,6 @@ write_error: partition_xcpus_add(new_prs, parent, tmp->delmask); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); @@ -2901,7 +2896,6 @@ out: else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); @@ -3191,9 +3185,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, } free_cpuset(trialcs); - if (force_sd_rebuild) - rebuild_sched_domains_locked(); out_unlock: + update_hk_sched_domains(); cpuset_full_unlock(); if (of_cft(of)->private == FILE_MEMLIST) schedule_flush_migrate_mm(); @@ -3301,6 +3294,7 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); + update_hk_sched_domains(); cpuset_full_unlock(); return retval ?: nbytes; } @@ -3475,6 +3469,7 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css) /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); + update_hk_sched_domains(); cpuset_full_unlock(); } @@ -3882,10 +3877,12 @@ static void cpuset_handle_hotplug(void) rcu_read_unlock(); } - /* rebuild sched domains if necessary */ - if (force_sd_rebuild) - rebuild_sched_domains_cpuslocked(); + if (update_housekeeping || force_sd_rebuild) { + mutex_lock(&cpuset_mutex); + update_hk_sched_domains(); + mutex_unlock(&cpuset_mutex); + } free_tmpmasks(ptmp); } -- cgit v1.2.3 From 6df415aa46ec10d607da5063d88492a7c7762074 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 21 Feb 2026 13:54:17 -0500 Subject: cgroup/cpuset: Defer housekeeping_update() calls from CPU hotplug to workqueue The cpuset_handle_hotplug() may need to invoke housekeeping_update(), for instance, when an isolated partition is invalidated because its last active CPU has been put offline. 
As we are going to enable dynamic update to the nozh_full housekeeping cpumask (HK_TYPE_KERNEL_NOISE) soon with the help of CPU hotplug, allowing the CPU hotplug path to call into housekeeping_update() directly from update_isolation_cpumasks() will likely cause deadlock. So we have to defer any call to housekeeping_update() after the CPU hotplug operation has finished. This is now done via the workqueue where the update_hk_sched_domains() function will be invoked via the hk_sd_workfn(). An concurrent cpuset control file write may have executed the required update_hk_sched_domains() function before the work function is called. So the work function call may become a no-op when it is invoked. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 31 +++++++++++++++++++---- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 11 +++++++- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 14b07a283a2c..aa915e9b588f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1324,6 +1324,16 @@ static void update_hk_sched_domains(void) rebuild_sched_domains_locked(); } +/* + * Work function to invoke update_hk_sched_domains() + */ +static void hk_sd_workfn(struct work_struct *work) +{ + cpuset_full_lock(); + update_hk_sched_domains(); + cpuset_full_unlock(); +} + /** * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets * @parent: Parent cpuset containing all siblings @@ -3796,6 +3806,7 @@ unlock: */ static void cpuset_handle_hotplug(void) { + static DECLARE_WORK(hk_sd_work, hk_sd_workfn); static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; @@ -3878,11 +3889,21 @@ static void cpuset_handle_hotplug(void) } - if (update_housekeeping || force_sd_rebuild) { - mutex_lock(&cpuset_mutex); - update_hk_sched_domains(); - mutex_unlock(&cpuset_mutex); - } + /* + * Queue a work to call housekeeping_update() & rebuild_sched_domains() + * There will be a slight delay before the HK_TYPE_DOMAIN housekeeping + * cpumask can correctly reflect what is in isolated_cpus. + * + * We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that + * is still pending. Before the pending bit is cleared, the work data + * is copied out and work item dequeued. So it is possible to queue + * the work again before the hk_sd_workfn() is invoked to process the + * previously queued work. Since hk_sd_workfn() doesn't use the work + * item at all, this is not a problem. + */ + if (update_housekeeping || force_sd_rebuild) + queue_work(system_unbound_wq, &hk_sd_work); + free_tmpmasks(ptmp); } diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index ef16f5610d0b..a56f4153c64d 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -245,6 +245,9 @@ TEST_MATRIX=( " C2-3:P1 C3:P1 . . O3=0 . . . 0 A1:2|A2: A1:P1|A2:P1" " C2-3:P1 C3:P1 . . T:O2=0 . . . 0 A1:3|A2:3 A1:P1|A2:P-1" " C2-3:P1 C3:P1 . . . T:O3=0 . . 0 A1:2|A2:2 A1:P1|A2:P-1" + " C2-3:P1 C3:P2 . . T:O2=0 . . . 0 A1:3|A2:3 A1:P1|A2:P-2" + " C1-3:P1 C3:P2 . . . T:O3=0 . . 0 A1:1-2|A2:1-2 A1:P1|A2:P-2 3|" + " C1-3:P1 C3:P2 . . . T:O3=0 O3=1 . 0 A1:1-2|A2:3 A1:P1|A2:P2 3" "$SETUP_A123_PARTITIONS . O1=0 . . . 0 A1:|A2:2|A3:3 A1:P1|A2:P1|A3:P1" "$SETUP_A123_PARTITIONS . O2=0 . . . 0 A1:1|A2:|A3:3 A1:P1|A2:P1|A3:P1" "$SETUP_A123_PARTITIONS . O3=0 . . . 
0 A1:1|A2:2|A3: A1:P1|A2:P1|A3:P1" @@ -761,7 +764,7 @@ check_cgroup_states() # only CPUs in isolated partitions as well as those that are isolated at # boot time. # -# $1 - expected isolated cpu list(s) {,} +# $1 - expected isolated cpu list(s) {|} # - expected sched/domains value # - cpuset.cpus.isolated value = if not defined # @@ -770,6 +773,7 @@ check_isolcpus() EXPECTED_ISOLCPUS=$1 ISCPUS=${CGROUP2}/cpuset.cpus.isolated ISOLCPUS=$(cat $ISCPUS) + HKICPUS=$(cat /sys/devices/system/cpu/isolated) LASTISOLCPU= SCHED_DOMAINS=/sys/kernel/debug/sched/domains if [[ $EXPECTED_ISOLCPUS = . ]] @@ -807,6 +811,11 @@ check_isolcpus() ISOLCPUS= EXPECTED_ISOLCPUS=$EXPECTED_SDOMAIN + # + # The inverse of HK_TYPE_DOMAIN cpumask in $HKICPUS should match $ISOLCPUS + # + [[ "$ISOLCPUS" != "$HKICPUS" ]] && return 1 + # # Use the sched domain in debugfs to check isolated CPUs, if available # -- cgit v1.2.3 From a84097e625f2b9e7f273161c004f34b7be63b348 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 21 Feb 2026 13:54:18 -0500 Subject: cgroup/cpuset: Call housekeeping_update() without holding cpus_read_lock The current cpuset partition code is able to dynamically update the sched domains of a running system and the corresponding HK_TYPE_DOMAIN housekeeping cpumask to perform what is essentially the "isolcpus=domain,..." boot command line feature at run time. The housekeeping cpumask update requires flushing a number of different workqueues which may not be safe with cpus_read_lock() held as the workqueue flushing code may acquire cpus_read_lock() or acquiring locks which have locking dependency with cpus_read_lock() down the chain. Below is an example of such circular locking problem. ====================================================== WARNING: possible circular locking dependency detected 6.18.0-test+ #2 Tainted: G S ------------------------------------------------------ test_cpuset_prs/10971 is trying to acquire lock: ffff888112ba4958 ((wq_completion)sync_wq){+.+.}-{0:0}, at: touch_wq_lockdep_map+0x7a/0x180 but task is already holding lock: ffffffffae47f450 (cpuset_mutex){+.+.}-{4:4}, at: cpuset_partition_write+0x85/0x130 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #4 (cpuset_mutex){+.+.}-{4:4}: -> #3 (cpu_hotplug_lock){++++}-{0:0}: -> #2 (rtnl_mutex){+.+.}-{4:4}: -> #1 ((work_completion)(&arg.work)){+.+.}-{0:0}: -> #0 ((wq_completion)sync_wq){+.+.}-{0:0}: Chain exists of: (wq_completion)sync_wq --> cpu_hotplug_lock --> cpuset_mutex 5 locks held by test_cpuset_prs/10971: #0: ffff88816810e440 (sb_writers#7){.+.+}-{0:0}, at: ksys_write+0xf9/0x1d0 #1: ffff8891ab620890 (&of->mutex#2){+.+.}-{4:4}, at: kernfs_fop_write_iter+0x260/0x5f0 #2: ffff8890a78b83e8 (kn->active#187){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x2b6/0x5f0 #3: ffffffffadf32900 (cpu_hotplug_lock){++++}-{0:0}, at: cpuset_partition_write+0x77/0x130 #4: ffffffffae47f450 (cpuset_mutex){+.+.}-{4:4}, at: cpuset_partition_write+0x85/0x130 Call Trace: : touch_wq_lockdep_map+0x93/0x180 __flush_workqueue+0x111/0x10b0 housekeeping_update+0x12d/0x2d0 update_parent_effective_cpumask+0x595/0x2440 update_prstate+0x89d/0xce0 cpuset_partition_write+0xc5/0x130 cgroup_file_write+0x1a5/0x680 kernfs_fop_write_iter+0x3df/0x5f0 vfs_write+0x525/0xfd0 ksys_write+0xf9/0x1d0 do_syscall_64+0x95/0x520 entry_SYSCALL_64_after_hwframe+0x76/0x7e To avoid such a circular locking dependency problem, we have to call housekeeping_update() without holding the cpus_read_lock() and cpuset_mutex. 
The current set of wq's flushed by housekeeping_update() may not have work functions that call cpus_read_lock() directly, but we are likely to extend the list of wq's that are flushed in the future. Moreover, the current set of work functions may hold locks that may have cpu_hotplug_lock down the dependency chain. So housekeeping_update() is now called after releasing cpus_read_lock and cpuset_mutex at the end of a cpuset operation. These two locks are then re-acquired later before calling rebuild_sched_domains_locked(). To enable mutual exclusion between the housekeeping_update() call and other cpuset control file write actions, a new top level cpuset_top_mutex is introduced. This new mutex will be acquired first to allow sharing variables used by both code paths. However, cpuset update from CPU hotplug can still happen in parallel with the housekeeping_update() call, though that should be rare in production environment. As cpus_read_lock() is now no longer held when tmigr_isolated_exclude_cpumask() is called, it needs to acquire it directly. The lockdep_is_cpuset_held() is also updated to return true if either cpuset_top_mutex or cpuset_mutex is held. Fixes: 03ff73510169 ("cpuset: Update HK_TYPE_DOMAIN cpumask from cpuset") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 47 ++++++++++++++++++++++++++++++++++++++----- kernel/sched/isolation.c | 4 +--- kernel/time/timer_migration.c | 4 +--- 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index aa915e9b588f..aa45351c6f38 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -65,14 +65,28 @@ static const char * const perr_strings[] = { * CPUSET Locking Convention * ------------------------- * - * Below are the three global locks guarding cpuset structures in lock + * Below are the four global/local locks guarding cpuset structures in lock * acquisition order: + * - cpuset_top_mutex * - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock) * - cpuset_mutex * - callback_lock (raw spinlock) * - * A task must hold all the three locks to modify externally visible or - * used fields of cpusets, though some of the internally used cpuset fields + * As cpuset will now indirectly flush a number of different workqueues in + * housekeeping_update() to update housekeeping cpumasks when the set of + * isolated CPUs is going to be changed, it may be vulnerable to deadlock + * if we hold cpus_read_lock while calling into housekeeping_update(). + * + * The first cpuset_top_mutex will be held except when calling into + * cpuset_handle_hotplug() from the CPU hotplug code where cpus_write_lock + * and cpuset_mutex will be held instead. The main purpose of this mutex + * is to prevent regular cpuset control file write actions from interfering + * with the call to housekeeping_update(), though CPU hotplug operation can + * still happen in parallel. This mutex also provides protection for some + * internal variables. + * + * A task must hold all the remaining three locks to modify externally visible + * or used fields of cpusets, though some of the internally used cpuset fields * and internal variables can be modified without holding callback_lock. If only * reliable read access of the externally used fields are needed, a task can * hold either cpuset_mutex or callback_lock which are exposed to other @@ -100,6 +114,7 @@ static const char * const perr_strings[] = { * cpumasks and nodemasks. 
*/ +static DEFINE_MUTEX(cpuset_top_mutex); static DEFINE_MUTEX(cpuset_mutex); /* @@ -111,6 +126,8 @@ static DEFINE_MUTEX(cpuset_mutex); * * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable * by holding both cpuset_mutex and callback_lock. + * + * T: Read/write-able by holding the cpuset_top_mutex. */ /* @@ -134,6 +151,11 @@ static cpumask_var_t isolated_cpus; /* CSCB */ */ static bool update_housekeeping; /* RWCS */ +/* + * Copy of isolated_cpus to be passed to housekeeping_update() + */ +static cpumask_var_t isolated_hk_cpus; /* T */ + /* * A flag to force sched domain rebuild at the end of an operation. * It can be set in @@ -297,6 +319,7 @@ void lockdep_assert_cpuset_lock_held(void) */ void cpuset_full_lock(void) { + mutex_lock(&cpuset_top_mutex); cpus_read_lock(); mutex_lock(&cpuset_mutex); } @@ -305,12 +328,14 @@ void cpuset_full_unlock(void) { mutex_unlock(&cpuset_mutex); cpus_read_unlock(); + mutex_unlock(&cpuset_top_mutex); } #ifdef CONFIG_LOCKDEP bool lockdep_is_cpuset_held(void) { - return lockdep_is_held(&cpuset_mutex); + return lockdep_is_held(&cpuset_mutex) || + lockdep_is_held(&cpuset_top_mutex); } #endif @@ -1315,9 +1340,20 @@ static void update_hk_sched_domains(void) { if (update_housekeeping) { /* Updating HK cpumasks implies rebuild sched domains */ - WARN_ON_ONCE(housekeeping_update(isolated_cpus)); update_housekeeping = false; force_sd_rebuild = true; + cpumask_copy(isolated_hk_cpus, isolated_cpus); + + /* + * housekeeping_update() is now called without holding + * cpus_read_lock and cpuset_mutex. Only cpuset_top_mutex + * is still being held for mutual exclusion. + */ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus)); + cpus_read_lock(); + mutex_lock(&cpuset_mutex); } /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */ if (force_sd_rebuild) @@ -3635,6 +3671,7 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&isolated_hk_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 3b725d39c06e..ef152d401fe2 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -123,8 +123,6 @@ int housekeeping_update(struct cpumask *isol_mask) struct cpumask *trial, *old = NULL; int err; - lockdep_assert_cpus_held(); - trial = kmalloc(cpumask_size(), GFP_KERNEL); if (!trial) return -ENOMEM; @@ -136,7 +134,7 @@ int housekeeping_update(struct cpumask *isol_mask) } if (!housekeeping.flags) - static_branch_enable_cpuslocked(&housekeeping_overridden); + static_branch_enable(&housekeeping_overridden); if (housekeeping.flags & HK_FLAG_DOMAIN) old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 6da9cd562b20..83428aa03aef 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1559,8 +1559,6 @@ int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; int cpu; - lockdep_assert_cpus_held(); - if (!works) return -ENOMEM; if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) @@ -1570,6 +1568,7 @@ int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) * First set previously isolated CPUs 
as available (unisolate). * This cpumask contains only CPUs that switched to available now. */ + guard(cpus_read_lock)(); cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask); cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask); @@ -1626,7 +1625,6 @@ static int __init tmigr_init_isolation(void) cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); /* Protect against RCU torture hotplug testing */ - guard(cpus_read_lock)(); return tmigr_isolated_exclude_cpumask(cpumask); } late_initcall(tmigr_init_isolation); -- cgit v1.2.3 From 1bfd7575092420ba5a0b944953c95b74a5646ff8 Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Thu, 19 Feb 2026 23:35:18 +0000 Subject: drm/xe/sync: Cleanup partially initialized sync on parse failure xe_sync_entry_parse() can allocate references (syncobj, fence, chain fence, or user fence) before hitting a later failure path. Several of those paths returned directly, leaving partially initialized state and leaking refs. Route these error paths through a common free_sync label and call xe_sync_entry_cleanup(sync) before returning the error. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Matthew Brost Signed-off-by: Shuicheng Lin Reviewed-by: Matthew Brost Signed-off-by: Matthew Brost Link: https://patch.msgid.link/20260219233516.2938172-5-shuicheng.lin@intel.com (cherry picked from commit f939bdd9207a5d1fc55cced5459858480686ce22) Cc: stable@vger.kernel.org Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_sync.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c index eb136390dafd..ebf6c96d7a41 100644 --- a/drivers/gpu/drm/xe/xe_sync.c +++ b/drivers/gpu/drm/xe/xe_sync.c @@ -146,8 +146,10 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, if (!signal) { sync->fence = drm_syncobj_fence_get(sync->syncobj); - if (XE_IOCTL_DBG(xe, !sync->fence)) - return -EINVAL; + if (XE_IOCTL_DBG(xe, !sync->fence)) { + err = -EINVAL; + goto free_sync; + } } break; @@ -167,17 +169,21 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, if (signal) { sync->chain_fence = dma_fence_chain_alloc(); - if (!sync->chain_fence) - return -ENOMEM; + if (!sync->chain_fence) { + err = -ENOMEM; + goto free_sync; + } } else { sync->fence = drm_syncobj_fence_get(sync->syncobj); - if (XE_IOCTL_DBG(xe, !sync->fence)) - return -EINVAL; + if (XE_IOCTL_DBG(xe, !sync->fence)) { + err = -EINVAL; + goto free_sync; + } err = dma_fence_chain_find_seqno(&sync->fence, sync_in.timeline_value); if (err) - return err; + goto free_sync; } break; @@ -216,6 +222,10 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, sync->timeline_value = sync_in.timeline_value; return 0; + +free_sync: + xe_sync_entry_cleanup(sync); + return err; } ALLOW_ERROR_INJECTION(xe_sync_entry_parse, ERRNO); -- cgit v1.2.3 From 0879c3f04f67e2a1677c25dcc24669ce21eb6a6c Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Thu, 19 Feb 2026 23:35:19 +0000 Subject: drm/xe/sync: Fix user fence leak on alloc failure When dma_fence_chain_alloc() fails, properly release the user fence reference to prevent a memory leak. 
Fixes: 0995c2fc39b0 ("drm/xe: Enforce correct user fence signaling order using") Cc: Matthew Brost Signed-off-by: Shuicheng Lin Reviewed-by: Matthew Brost Signed-off-by: Matthew Brost Link: https://patch.msgid.link/20260219233516.2938172-6-shuicheng.lin@intel.com (cherry picked from commit a5d5634cde48a9fcd68c8504aa07f89f175074a0) Cc: stable@vger.kernel.org Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_sync.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c index ebf6c96d7a41..24d6d9af20d6 100644 --- a/drivers/gpu/drm/xe/xe_sync.c +++ b/drivers/gpu/drm/xe/xe_sync.c @@ -206,8 +206,10 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, if (XE_IOCTL_DBG(xe, IS_ERR(sync->ufence))) return PTR_ERR(sync->ufence); sync->ufence_chain_fence = dma_fence_chain_alloc(); - if (!sync->ufence_chain_fence) - return -ENOMEM; + if (!sync->ufence_chain_fence) { + err = -ENOMEM; + goto free_sync; + } sync->ufence_syncobj = ufence_syncobj; } -- cgit v1.2.3 From 96a7b71c4438d3b72d6c95e3efdc9e8e8aee6b78 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 23 Feb 2026 13:43:45 -0800 Subject: ubd: Use pointer-to-pointers for io_thread_req arrays Having an unbounded array for irq_req_buffer and io_req_buffer doesn't provide any bounds safety, and confuses the needed allocation type, which is returning a pointer to pointers. Instead of the implicit cast, switch the variable types. Reported-by: Nathan Chancellor Reported-by: Guenter Roeck Closes: https://lore.kernel.org/all/b04b6c13-7d0e-4a89-9e68-b572b6c686ac@roeck-us.net Fixes: 69050f8d6d07 ("treewide: Replace kmalloc with kmalloc_obj for non-scalar types") Acked-by: Richard Weinberger Link: https://patch.msgid.link/20260223214341.work.846-kees@kernel.org Signed-off-by: Kees Cook --- arch/um/drivers/ubd_kern.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 012b2bcaa8a0..20fc33300a95 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -69,11 +69,11 @@ struct io_thread_req { }; -static struct io_thread_req * (*irq_req_buffer)[]; +static struct io_thread_req **irq_req_buffer; static struct io_thread_req *irq_remainder; static int irq_remainder_size; -static struct io_thread_req * (*io_req_buffer)[]; +static struct io_thread_req **io_req_buffer; static struct io_thread_req *io_remainder; static int io_remainder_size; @@ -398,7 +398,7 @@ static int thread_fd = -1; static int bulk_req_safe_read( int fd, - struct io_thread_req * (*request_buffer)[], + struct io_thread_req **request_buffer, struct io_thread_req **remainder, int *remainder_size, int max_recs @@ -465,7 +465,7 @@ static irqreturn_t ubd_intr(int irq, void *dev) &irq_remainder, &irq_remainder_size, UBD_REQ_BUFFER_SIZE)) >= 0) { for (i = 0; i < len / sizeof(struct io_thread_req *); i++) - ubd_end_request((*irq_req_buffer)[i]); + ubd_end_request(irq_req_buffer[i]); } if (len < 0 && len != -EAGAIN) @@ -1512,7 +1512,7 @@ void *io_thread(void *arg) } for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { - struct io_thread_req *req = (*io_req_buffer)[count]; + struct io_thread_req *req = io_req_buffer[count]; int i; io_count++; -- cgit v1.2.3 From f709859f331b8fbd7fede5769d668008fc5ba789 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 23 Feb 2026 11:23:12 -0700 Subject: init/Kconfig: Adjust fixed clang version for __builtin_counted_by_ref Commit d39a1d7486d9 
("compiler_types: Disable __builtin_counted_by_ref for Clang") used 22.0.0 as the fixed version for a compiler crash but the fix was only merged in main (23.0.0) and release/22.x (22.1.0). With the current fixed version number, prerelease or Android LLVM 22 builds will still be able to hit the compiler crash when building the kernel. This can be particularly disruptive when bisecting LLVM. Use 21.1.0 as the fixed version number to ensure the fix for this crash is always present. Fixes: d39a1d7486d9 ("compiler_types: Disable __builtin_counted_by_ref for Clang") Signed-off-by: Nathan Chancellor Link: https://patch.msgid.link/20260223-fix-clang-version-builtin-counted-by-ref-v1-1-3ea478a24f0a@kernel.org Signed-off-by: Kees Cook --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index c25869cf59c1..b55deae9256c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -153,7 +153,7 @@ config CC_HAS_COUNTED_BY_PTR config CC_HAS_BROKEN_COUNTED_BY_REF bool # https://github.com/llvm/llvm-project/issues/182575 - default y if CC_IS_CLANG && CLANG_VERSION < 220000 + default y if CC_IS_CLANG && CLANG_VERSION < 220100 config CC_HAS_MULTIDIMENSIONAL_NONSTRING def_bool $(success,echo 'char tag[][4] __attribute__((__nonstring__)) = { };' | $(CC) $(CLANG_FLAGS) -x c - -c -o /dev/null -Werror) -- cgit v1.2.3 From 63c49efc987afefc6b9bb7de083eb8748e0b1789 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:17 -0800 Subject: selftests/bpf: Add simple strscpy() implementation Replace bpf_strlcpy() in bpf_util.h with a sized_strscpy(), which is a simplified sized_strscpy() from the kernel (lib/string.c [1]). It: * takes a count (destination size) parameter * guarantees NULL-termination * returns the number of characters copied or -E2BIG Re-define strscpy macro similar to in-kernel implementation [2]: allow the count parameter to be optional. Add #ifdef-s to tools/include/linux/args.h, as they may be defined in other system headers (for example, __CONCAT in sys/cdefs.h). Fixup the single existing bpf_strlcpy() call in cgroup_helpers.c [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/string.c?h=v6.19#n113 [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/string.h?h=v6.19#n91 Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-2-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/include/linux/args.h | 4 +++ tools/testing/selftests/bpf/bpf_util.h | 45 ++++++++++++++++++++-------- tools/testing/selftests/bpf/cgroup_helpers.c | 2 +- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/tools/include/linux/args.h b/tools/include/linux/args.h index 2e8e65d975c7..14b268f2389a 100644 --- a/tools/include/linux/args.h +++ b/tools/include/linux/args.h @@ -22,7 +22,11 @@ #define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) /* Concatenate two parameters, but allow them to be expanded beforehand. 
*/ +#ifndef __CONCAT #define __CONCAT(a, b) a ## b +#endif +#ifndef CONCATENATE #define CONCATENATE(a, b) __CONCAT(a, b) +#endif #endif /* _LINUX_ARGS_H */ diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index 4bc2d25f33e1..6cb56501a505 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -8,6 +8,7 @@ #include #include #include /* libbpf_num_possible_cpus */ +#include static inline unsigned int bpf_num_possible_cpus(void) { @@ -21,25 +22,43 @@ static inline unsigned int bpf_num_possible_cpus(void) return possible_cpus; } -/* Copy up to sz - 1 bytes from zero-terminated src string and ensure that dst - * is zero-terminated string no matter what (unless sz == 0, in which case - * it's a no-op). It's conceptually close to FreeBSD's strlcpy(), but differs - * in what is returned. Given this is internal helper, it's trivial to extend - * this, when necessary. Use this instead of strncpy inside libbpf source code. +/* + * Simplified strscpy() implementation. The kernel one is in lib/string.c */ -static inline void bpf_strlcpy(char *dst, const char *src, size_t sz) +static inline ssize_t sized_strscpy(char *dest, const char *src, size_t count) { - size_t i; + long res = 0; - if (sz == 0) - return; + if (count == 0) + return -E2BIG; - sz--; - for (i = 0; i < sz && src[i]; i++) - dst[i] = src[i]; - dst[i] = '\0'; + while (count > 1) { + char c; + + c = src[res]; + dest[res] = c; + if (!c) + return res; + res++; + count--; + } + + /* Force NUL-termination. */ + dest[res] = '\0'; + + /* Return E2BIG if the source didn't stop */ + return src[res] ? -E2BIG : res; } +#define __strscpy0(dst, src, ...) \ + sized_strscpy(dst, src, sizeof(dst)) +#define __strscpy1(dst, src, size) \ + sized_strscpy(dst, src, size) + +#undef strscpy /* Redefine the placeholder from tools/include/linux/string.h */ +#define strscpy(dst, src, ...) \ + CONCATENATE(__strscpy, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) + #define __bpf_percpu_val_align __attribute__((__aligned__(8))) #define BPF_DECLARE_PERCPU(type, name) \ diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 20cede4db3ce..45cd0b479fe3 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -86,7 +86,7 @@ static int __enable_controllers(const char *cgroup_path, const char *controllers enable[len] = 0; close(fd); } else { - bpf_strlcpy(enable, controllers, sizeof(enable)); + strscpy(enable, controllers); } snprintf(path, sizeof(path), "%s/cgroup.subtree_control", cgroup_path); -- cgit v1.2.3 From 6a087ac23f5b6619033ad08e60c355b00bb8ce51 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:18 -0800 Subject: selftests/bpf: Replace strcpy() calls with strscpy() strcpy() does not perform bounds checking and is considered deprecated [1]. Replace strcpy() calls with strscpy() defined in bpf_util.h. 
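As a side note, this is roughly how the optional-count strscpy() wrapper introduced in the previous patch expands (illustration only, assuming the COUNT_ARGS()/CONCATENATE() helpers from tools/include/linux/args.h and an array destination):

	char ifname[16];

	/* two arguments: COUNT_ARGS() is 0, dispatching to __strscpy0(), so the
	 * count defaults to sizeof() of the destination array */
	strscpy(ifname, "eth0");
	/*	-> sized_strscpy(ifname, "eth0", sizeof(ifname)) */

	/* three arguments: COUNT_ARGS() is 1, __strscpy1() passes the count through */
	strscpy(ifname, "eth0", sizeof(ifname) - 1);
	/*	-> sized_strscpy(ifname, "eth0", sizeof(ifname) - 1) */

The two-argument form is only safe when the destination is a real array; that is why the sun_path + 1 call in network_helpers.c below keeps an explicit size.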
[1] https://docs.kernel.org/process/deprecated.html#strcpy Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-3-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/network_helpers.c | 2 +- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 +- tools/testing/selftests/bpf/prog_tests/setget_sockopt.c | 2 +- tools/testing/selftests/bpf/prog_tests/sockopt_sk.c | 2 +- tools/testing/selftests/bpf/prog_tests/test_veristat.c | 4 ++-- tools/testing/selftests/bpf/xdp_features.c | 3 ++- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 0a6a5561bed3..5374b7e16d53 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -432,7 +432,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, memset(addr, 0, sizeof(*sun)); sun->sun_family = family; sun->sun_path[0] = 0; - strcpy(sun->sun_path + 1, addr_str); + strscpy(sun->sun_path + 1, addr_str, sizeof(sun->sun_path) - 1); if (len) *len = offsetof(struct sockaddr_un, sun_path) + 1 + strlen(addr_str); return 0; diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index b7d1b52309d0..f829b6f09bc9 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -281,7 +281,7 @@ static void test_dctcp_fallback(void) dctcp_skel = bpf_dctcp__open(); if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel")) return; - strcpy(dctcp_skel->rodata->fallback_cc, "cubic"); + strscpy(dctcp_skel->rodata->fallback_cc, "cubic"); if (!ASSERT_OK(bpf_dctcp__load(dctcp_skel), "bpf_dctcp__load")) goto done; diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c index e4dac529d424..77fe1bfb7504 100644 --- a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c @@ -212,7 +212,7 @@ void test_setget_sockopt(void) if (!ASSERT_OK_PTR(skel, "open skel")) goto done; - strcpy(skel->rodata->veth, "binddevtest1"); + strscpy(skel->rodata->veth, "binddevtest1"); skel->rodata->veth_ifindex = if_nametoindex("binddevtest1"); if (!ASSERT_GT(skel->rodata->veth_ifindex, 0, "if_nametoindex")) goto done; diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index ba6b3ec1156a..53637431ec5d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -142,7 +142,7 @@ static int getsetsockopt(void) /* TCP_CONGESTION can extend the string */ - strcpy(buf.cc, "nv"); + strscpy(buf.cc, "nv"); err = setsockopt(fd, SOL_TCP, TCP_CONGESTION, &buf, strlen("nv")); if (err) { log_err("Failed to call setsockopt(TCP_CONGESTION)"); diff --git a/tools/testing/selftests/bpf/prog_tests/test_veristat.c b/tools/testing/selftests/bpf/prog_tests/test_veristat.c index b38c16b4247f..9aff08ac55c0 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_veristat.c +++ b/tools/testing/selftests/bpf/prog_tests/test_veristat.c @@ -24,9 +24,9 @@ static struct fixture *init_fixture(void) /* for no_alu32 and cpuv4 veristat is in parent folder */ if (access("./veristat", F_OK) == 0) - strcpy(fix->veristat, "./veristat"); + strscpy(fix->veristat, "./veristat"); else if (access("../veristat", F_OK) == 0) - 
strcpy(fix->veristat, "../veristat"); + strscpy(fix->veristat, "../veristat"); else PRINT_FAIL("Can't find veristat binary"); diff --git a/tools/testing/selftests/bpf/xdp_features.c b/tools/testing/selftests/bpf/xdp_features.c index 595c79141cf3..a27ed663967c 100644 --- a/tools/testing/selftests/bpf/xdp_features.c +++ b/tools/testing/selftests/bpf/xdp_features.c @@ -16,6 +16,7 @@ #include +#include "bpf_util.h" #include "xdp_features.skel.h" #include "xdp_features.h" @@ -212,7 +213,7 @@ static void set_env_default(void) env.feature.drv_feature = NETDEV_XDP_ACT_NDO_XMIT; env.feature.action = -EINVAL; env.ifindex = -ENODEV; - strcpy(env.ifname, "unknown"); + strscpy(env.ifname, "unknown"); make_sockaddr(AF_INET6, "::ffff:127.0.0.1", DUT_CTRL_PORT, &env.dut_ctrl_addr, NULL); make_sockaddr(AF_INET6, "::ffff:127.0.0.1", DUT_ECHO_PORT, -- cgit v1.2.3 From 9af0feae8016ba58ad7ff784a903404986b395b1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 27 Jan 2026 10:38:39 +0100 Subject: RDMA/core: Fix stale RoCE GIDs during netdev events at registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RoCE GID entries become stale when netdev properties change during the IB device registration window. This is reproducible with a udev rule that sets a MAC address when a VF netdev appears: ACTION=="add", SUBSYSTEM=="net", KERNEL=="eth4", \ RUN+="/sbin/ip link set eth4 address 88:22:33:44:55:66" After VF creation, show_gids displays GIDs derived from the original random MAC rather than the configured one. The root cause is a race between netdev event processing and device registration: CPU 0 (driver) CPU 1 (udev/workqueue) ────────────── ────────────────────── ib_register_device() ib_cache_setup_one() gid_table_setup_one() _gid_table_setup_one() ← GID table allocated rdma_roce_rescan_device() ← GIDs populated with OLD MAC ip link set eth4 addr NEW_MAC NETDEV_CHANGEADDR queued netdevice_event_work_handler() ib_enum_all_roce_netdevs() ← Iterates DEVICE_REGISTERED ← Device NOT marked yet, SKIP! enable_device_and_get() xa_set_mark(DEVICE_REGISTERED) ← Too late, event was lost The netdev event handler uses ib_enum_all_roce_netdevs() which only iterates devices marked DEVICE_REGISTERED. However, this mark is set late in the registration process, after the GID cache is already populated. Events arriving in this window are silently dropped. Fix this by introducing a new xarray mark DEVICE_GID_UPDATES that is set immediately after the GID table is allocated and initialized. Use the new mark in ib_enum_all_roce_netdevs() function to iterate devices instead of DEVICE_REGISTERED. This is safe because: - After _gid_table_setup_one(), all required structures exist (port_data, immutable, cache.gid) - The GID table mutex serializes concurrent access between the initial rescan and event handlers - Event handlers correctly update stale GIDs even when racing with rescan - The mark is cleared in ib_cache_cleanup_one() before teardown This also fixes similar races for IP address events (inetaddr_event, inet6addr_event) which use the same enumeration path. 
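For context, a condensed sketch (not the driver code) of the xarray mark mechanism this fix builds on: marks are per-entry tag bits that can be set, cleared and iterated independently of the entry value, so a second mark can expose a device to GID-cache updates before DEVICE_REGISTERED is set.

#include <linux/xarray.h>

static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
#define DEVICE_GID_UPDATES XA_MARK_2	/* independent of DEVICE_REGISTERED (XA_MARK_1) */

static void gid_updates_sketch(unsigned long new_index)
{
	unsigned long index;
	void *dev;

	/* set once the GID table exists, before registration completes */
	xa_set_mark(&devices, new_index, DEVICE_GID_UPDATES);

	/* event handlers walk only the entries carrying the mark */
	xa_for_each_marked(&devices, index, dev, DEVICE_GID_UPDATES)
		;	/* update this device's GID cache */

	/* cleared before the GID table is torn down */
	xa_clear_mark(&devices, new_index, DEVICE_GID_UPDATES);
}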
Fixes: 0df91bb67334 ("RDMA/devices: Use xarray to store the client_data") Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20260127093839.126291-1-jiri@resnulli.us Reported-by: syzbot+881d65229ca4f9ae8c84@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=881d65229ca4f9ae8c84 Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cache.c | 13 +++++++++++++ drivers/infiniband/core/core_priv.h | 3 +++ drivers/infiniband/core/device.c | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index b415d4aad04f..ee4a2bc68fb2 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -926,6 +926,13 @@ static int gid_table_setup_one(struct ib_device *ib_dev) if (err) return err; + /* + * Mark the device as ready for GID cache updates. This allows netdev + * event handlers to update the GID cache even before the device is + * fully registered. + */ + ib_device_enable_gid_updates(ib_dev); + rdma_roce_rescan_device(ib_dev); return err; @@ -1637,6 +1644,12 @@ void ib_cache_release_one(struct ib_device *device) void ib_cache_cleanup_one(struct ib_device *device) { + /* + * Clear the GID updates mark first to prevent event handlers from + * accessing the device while it's being torn down. + */ + ib_device_disable_gid_updates(device); + /* The cleanup function waits for all in-progress workqueue * elements and cleans up the GID cache. This function should be * called after the device was removed from the devices list and diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 05102769a918..a2c36666e6fc 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -100,6 +100,9 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, roce_netdev_callback cb, void *cookie); +void ib_device_enable_gid_updates(struct ib_device *device); +void ib_device_disable_gid_updates(struct ib_device *device); + typedef int (*nldev_callback)(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 1b5f1ee0a557..558b73940d66 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -93,6 +93,7 @@ static struct workqueue_struct *ib_unreg_wq; static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 +#define DEVICE_GID_UPDATES XA_MARK_2 static u32 highest_client_id; #define CLIENT_REGISTERED XA_MARK_1 @@ -2412,11 +2413,42 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, unsigned long index; down_read(&devices_rwsem); - xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) + xa_for_each_marked(&devices, index, dev, DEVICE_GID_UPDATES) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&devices_rwsem); } +/** + * ib_device_enable_gid_updates - Mark device as ready for GID cache updates + * @device: Device to mark + * + * Called after GID table is allocated and initialized. After this mark is set, + * netdevice event handlers can update the device's GID cache. This allows + * events that arrive during device registration to be processed, avoiding + * stale GID entries when netdev properties change during the device + * registration process. 
+ */ +void ib_device_enable_gid_updates(struct ib_device *device) +{ + down_write(&devices_rwsem); + xa_set_mark(&devices, device->index, DEVICE_GID_UPDATES); + up_write(&devices_rwsem); +} + +/** + * ib_device_disable_gid_updates - Clear the GID updates mark + * @device: Device to unmark + * + * Called before GID table cleanup to prevent event handlers from accessing + * the device while it's being torn down. + */ +void ib_device_disable_gid_updates(struct ib_device *device) +{ + down_write(&devices_rwsem); + xa_clear_mark(&devices, device->index, DEVICE_GID_UPDATES); + up_write(&devices_rwsem); +} + /* * ib_enum_all_devs - enumerate all ib_devices * @cb: Callback to call for each found ib_device -- cgit v1.2.3 From 7a23af417d9dd57b4382356b2e7442e5d2bf5bea Mon Sep 17 00:00:00 2001 From: Siva Reddy Kallam Date: Wed, 18 Feb 2026 09:12:45 +0000 Subject: RDMA/bng_re: Remove unnessary validity checks Fix below smatch warning: drivers/infiniband/hw/bng_re/bng_dev.c:113 bng_re_net_ring_free() warn: variable dereferenced before check 'rdev' (see line 107) current driver has unnessary validity checks. So, removing these unnessary validity checks. Fixes: 4f830cd8d7fe ("RDMA/bng_re: Add infrastructure for enabling Firmware channel") Fixes: 745065770c2d ("RDMA/bng_re: Register and get the resources from bnge driver") Fixes: 04e031ff6e60 ("RDMA/bng_re: Initialize the Firmware and Hardware") Fixes: d0da769c19d0 ("RDMA/bng_re: Add Auxiliary interface") Reported-by: Simon Horman Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202601010413.sWadrQel-lkp@intel.com/ Signed-off-by: Siva Reddy Kallam Link: https://patch.msgid.link/20260218091246.1764808-2-siva.kallam@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bng_re/bng_dev.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c index cf3bf08e5f26..b8df807c3a64 100644 --- a/drivers/infiniband/hw/bng_re/bng_dev.c +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -54,9 +54,6 @@ static void bng_re_destroy_chip_ctx(struct bng_re_dev *rdev) { struct bng_re_chip_ctx *chip_ctx; - if (!rdev->chip_ctx) - return; - kfree(rdev->dev_attr); rdev->dev_attr = NULL; @@ -124,12 +121,6 @@ static int bng_re_net_ring_free(struct bng_re_dev *rdev, struct bnge_fw_msg fw_msg = {}; int rc = -EINVAL; - if (!rdev) - return rc; - - if (!aux_dev) - return rc; - bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_FREE); req.ring_type = type; req.ring_id = cpu_to_le16(fw_ring_id); @@ -150,10 +141,7 @@ static int bng_re_net_ring_alloc(struct bng_re_dev *rdev, struct hwrm_ring_alloc_input req = {}; struct hwrm_ring_alloc_output resp; struct bnge_fw_msg fw_msg = {}; - int rc = -EINVAL; - - if (!aux_dev) - return rc; + int rc; bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC); req.enables = 0; @@ -184,10 +172,7 @@ static int bng_re_stats_ctx_free(struct bng_re_dev *rdev) struct hwrm_stat_ctx_free_input req = {}; struct hwrm_stat_ctx_free_output resp = {}; struct bnge_fw_msg fw_msg = {}; - int rc = -EINVAL; - - if (!aux_dev) - return rc; + int rc; bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE); req.stat_ctx_id = cpu_to_le32(rdev->stats_ctx.fw_id); @@ -208,13 +193,10 @@ static int bng_re_stats_ctx_alloc(struct bng_re_dev *rdev) struct hwrm_stat_ctx_alloc_output resp = {}; struct hwrm_stat_ctx_alloc_input req = {}; struct bnge_fw_msg fw_msg = {}; - int rc = -EINVAL; + int rc; stats->fw_id = 
BNGE_INVALID_STATS_CTX_ID; - if (!aux_dev) - return rc; - bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC); req.update_period_ms = cpu_to_le32(1000); req.stats_dma_addr = cpu_to_le64(stats->dma_map); @@ -486,8 +468,7 @@ static void bng_re_remove(struct auxiliary_device *adev) rdev = dev_info->rdev; - if (rdev) - bng_re_remove_device(rdev, adev); + bng_re_remove_device(rdev, adev); kfree(dev_info); } -- cgit v1.2.3 From 3d2e5d12a2eef0ca8a629a422aa593673235c77c Mon Sep 17 00:00:00 2001 From: Siva Reddy Kallam Date: Wed, 18 Feb 2026 09:12:46 +0000 Subject: RDMA/bng_re: Unwind bng_re_dev_init properly Fix below smatch warning: drivers/infiniband/hw/bng_re/bng_dev.c:270 bng_re_dev_init() warn: missing unwind goto? Current bng_re_dev_init function is not having clear unwinding code. So, added proper unwinding with ladder. Fixes: 4f830cd8d7fe ("RDMA/bng_re: Add infrastructure for enabling Firmware channel") Reported-by: Simon Horman Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202601010413.sWadrQel-lkp@intel.com/ Signed-off-by: Siva Reddy Kallam Link: https://patch.msgid.link/20260218091246.1764808-3-siva.kallam@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bng_re/bng_dev.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c index b8df807c3a64..d34b5f88cd40 100644 --- a/drivers/infiniband/hw/bng_re/bng_dev.c +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -285,7 +285,7 @@ static int bng_re_dev_init(struct bng_re_dev *rdev) if (rc) { ibdev_err(&rdev->ibdev, "Failed to register with netedev: %#x\n", rc); - return -EINVAL; + goto reg_netdev_fail; } set_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); @@ -294,19 +294,16 @@ static int bng_re_dev_init(struct bng_re_dev *rdev) ibdev_err(&rdev->ibdev, "RoCE requires minimum 2 MSI-X vectors, but only %d reserved\n", rdev->aux_dev->auxr_info->msix_requested); - bnge_unregister_dev(rdev->aux_dev); - clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); - return -EINVAL; + rc = -EINVAL; + goto msix_ctx_fail; } ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", rdev->aux_dev->auxr_info->msix_requested); rc = bng_re_setup_chip_ctx(rdev); if (rc) { - bnge_unregister_dev(rdev->aux_dev); - clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); ibdev_err(&rdev->ibdev, "Failed to get chip context\n"); - return -EINVAL; + goto msix_ctx_fail; } bng_re_query_hwrm_version(rdev); @@ -315,16 +312,14 @@ static int bng_re_dev_init(struct bng_re_dev *rdev) if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate RCFW Channel: %#x\n", rc); - goto fail; + goto alloc_fw_chl_fail; } /* Allocate nq record memory */ rdev->nqr = kzalloc_obj(*rdev->nqr); if (!rdev->nqr) { - bng_re_destroy_chip_ctx(rdev); - bnge_unregister_dev(rdev->aux_dev); - clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); - return -ENOMEM; + rc = -ENOMEM; + goto nq_alloc_fail; } rdev->nqr->num_msix = rdev->aux_dev->auxr_info->msix_requested; @@ -393,9 +388,15 @@ disable_rcfw: free_ring: bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type); free_rcfw: + kfree(rdev->nqr); +nq_alloc_fail: bng_re_free_rcfw_channel(&rdev->rcfw); -fail: - bng_re_dev_uninit(rdev); +alloc_fw_chl_fail: + bng_re_destroy_chip_ctx(rdev); +msix_ctx_fail: + bnge_unregister_dev(rdev->aux_dev); + clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); +reg_netdev_fail: return rc; } -- cgit v1.2.3 From 
7a648d598cb8e8c62af3f0e020a25820a3f3a9a7 Mon Sep 17 00:00:00 2001 From: Felix Gu Date: Sat, 14 Feb 2026 23:14:51 +0800 Subject: pinctrl: pinconf-generic: Fix memory leak in pinconf_generic_parse_dt_config() In pinconf_generic_parse_dt_config(), if parse_dt_cfg() fails, it returns directly. This bypasses the cleanup logic and results in a memory leak of the cfg buffer. Fix this by jumping to the out label on failure, ensuring kfree(cfg) is called before returning. Fixes: 90a18c512884 ("pinctrl: pinconf-generic: Handle string values for generic properties") Signed-off-by: Felix Gu Reviewed-by: Antonio Borneo Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index 94b1d057197c..2b030bd0e6ad 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -351,13 +351,13 @@ int pinconf_generic_parse_dt_config(struct device_node *np, ret = parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg); if (ret) - return ret; + goto out; if (pctldev && pctldev->desc->num_custom_params && pctldev->desc->custom_params) { ret = parse_dt_cfg(np, pctldev->desc->custom_params, pctldev->desc->num_custom_params, cfg, &ncfg); if (ret) - return ret; + goto out; } /* no configs found at all */ -- cgit v1.2.3 From a48150d05190b41b5eec19f74b751a75a15a456a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 16 Feb 2026 09:58:39 +0100 Subject: pinctrl: amdisp: Make amdisp_pinctrl_ops variable static File-scope 'amdisp_pinctrl_ops' is not used outside of this unit, so make it static to silence sparse warning: pinctrl-amdisp.c:83:26: warning: symbol 'amdisp_pinctrl_ops' was not declared. Should it be static? Signed-off-by: Krzysztof Kozlowski Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amdisp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-amdisp.c b/drivers/pinctrl/pinctrl-amdisp.c index efbf40c776ea..e0874cc086a7 100644 --- a/drivers/pinctrl/pinctrl-amdisp.c +++ b/drivers/pinctrl/pinctrl-amdisp.c @@ -80,7 +80,7 @@ static int amdisp_get_group_pins(struct pinctrl_dev *pctldev, return 0; } -const struct pinctrl_ops amdisp_pinctrl_ops = { +static const struct pinctrl_ops amdisp_pinctrl_ops = { .get_groups_count = amdisp_get_groups_count, .get_group_name = amdisp_get_group_name, .get_group_pins = amdisp_get_group_pins, -- cgit v1.2.3 From c2e174994c9e98956a85dee9af6fbb293f5ad673 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 16 Feb 2026 09:58:40 +0100 Subject: pinctrl: cix: sky1: Unexport sky1_pinctrl_pm_ops File-scope 'sky1_pinctrl_pm_ops' is not used outside of this unit (and it should not be!), so unexport it and make it static to silence sparse warning: pinctrl-sky1.c:525:25: warning: symbol 'sky1_pinctrl_pm_ops' was not declared. Should it be static? 
Signed-off-by: Krzysztof Kozlowski Signed-off-by: Linus Walleij --- drivers/pinctrl/cix/pinctrl-sky1.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pinctrl/cix/pinctrl-sky1.c b/drivers/pinctrl/cix/pinctrl-sky1.c index 5d0d8be815b2..938894058d86 100644 --- a/drivers/pinctrl/cix/pinctrl-sky1.c +++ b/drivers/pinctrl/cix/pinctrl-sky1.c @@ -522,11 +522,10 @@ static int __maybe_unused sky1_pinctrl_resume(struct device *dev) return pinctrl_force_default(spctl->pctl); } -const struct dev_pm_ops sky1_pinctrl_pm_ops = { +static const struct dev_pm_ops sky1_pinctrl_pm_ops = { SET_LATE_SYSTEM_SLEEP_PM_OPS(sky1_pinctrl_suspend, sky1_pinctrl_resume) }; -EXPORT_SYMBOL_GPL(sky1_pinctrl_pm_ops); static int sky1_pinctrl_probe(struct platform_device *pdev) { -- cgit v1.2.3 From e9e268ea9df102abef34d7afba59ef4d5868d5d7 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 16 Feb 2026 09:25:50 +0100 Subject: pinctrl: qcom: sdm660-lpass-lpi: Make groups and functions variables static File-scope 'sdm660_lpi_pinctrl_groups' and 'sdm660_lpi_pinctrl_functions' are not used outside of this unit, so make them static to silence sparse warnings: pinctrl-sdm660-lpass-lpi.c:79:27: warning: symbol 'sdm660_lpi_pinctrl_groups' was not declared. Should it be static? pinctrl-sdm660-lpass-lpi.c:116:27: warning: symbol 'sdm660_lpi_pinctrl_functions' was not declared. Should it be static? Signed-off-by: Krzysztof Kozlowski Reviewed-by: Konrad Dybcio Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c index d93af5f0e8d3..65411abfbfac 100644 --- a/drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c +++ b/drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c @@ -76,7 +76,7 @@ static const char * const pdm_clk_groups[] = { "gpio18" }; static const char * const pdm_rx_groups[] = { "gpio21", "gpio23", "gpio25" }; static const char * const pdm_sync_groups[] = { "gpio19" }; -const struct lpi_pingroup sdm660_lpi_pinctrl_groups[] = { +static const struct lpi_pingroup sdm660_lpi_pinctrl_groups[] = { LPI_PINGROUP_OFFSET(0, LPI_NO_SLEW, _, _, _, _, 0x0000), LPI_PINGROUP_OFFSET(1, LPI_NO_SLEW, _, _, _, _, 0x1000), LPI_PINGROUP_OFFSET(2, LPI_NO_SLEW, _, _, _, _, 0x2000), @@ -113,7 +113,7 @@ const struct lpi_pingroup sdm660_lpi_pinctrl_groups[] = { LPI_PINGROUP_OFFSET(31, LPI_NO_SLEW, _, _, _, _, 0xb010), }; -const struct lpi_function sdm660_lpi_pinctrl_functions[] = { +static const struct lpi_function sdm660_lpi_pinctrl_functions[] = { LPI_FUNCTION(comp_rx), LPI_FUNCTION(dmic1_clk), LPI_FUNCTION(dmic1_data), -- cgit v1.2.3 From a2539b92e4b791c1ba482930b5e51b1591975461 Mon Sep 17 00:00:00 2001 From: Felix Gu Date: Thu, 19 Feb 2026 00:51:22 +0800 Subject: pinctrl: meson: amlogic-a4: Fix device node reference leak in aml_dt_node_to_map_pinmux() The of_get_parent() function returns a device_node with an incremented reference count. Use the __free(device_node) cleanup attribute to ensure of_node_put() is automatically called when pnode goes out of scope, fixing a reference leak. 
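A minimal standalone illustration of that scope-based cleanup pattern from <linux/cleanup.h> (the function name here is made up; the mechanics match the hunk below):

#include <linux/cleanup.h>
#include <linux/of.h>

static int parent_example(struct device_node *np)
{
	/* of_node_put(pnode) runs automatically on every exit from this scope */
	struct device_node *pnode __free(device_node) = of_get_parent(np);

	if (!pnode)
		return -EINVAL;

	/* ... use pnode; no explicit of_node_put() needed on any return path ... */
	return 0;
}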
Fixes: 6e9be3abb78c ("pinctrl: Add driver support for Amlogic SoCs") Signed-off-by: Felix Gu Signed-off-by: Linus Walleij --- drivers/pinctrl/meson/pinctrl-amlogic-a4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c index dfa32b11555c..e2293a872dcb 100644 --- a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c +++ b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c @@ -679,7 +679,6 @@ static int aml_dt_node_to_map_pinmux(struct pinctrl_dev *pctldev, unsigned int *num_maps) { struct device *dev = pctldev->dev; - struct device_node *pnode; unsigned long *configs = NULL; unsigned int num_configs = 0; struct property *prop; @@ -693,7 +692,7 @@ static int aml_dt_node_to_map_pinmux(struct pinctrl_dev *pctldev, return -ENOENT; } - pnode = of_get_parent(np); + struct device_node *pnode __free(device_node) = of_get_parent(np); if (!pnode) { dev_info(dev, "Missing function node\n"); return -EINVAL; -- cgit v1.2.3 From fd5bed798f45eb3a178ad527b43ab92705faaf8a Mon Sep 17 00:00:00 2001 From: Felix Gu Date: Mon, 23 Feb 2026 17:39:07 +0800 Subject: pinctrl: cirrus: cs42l43: Fix double-put in cs42l43_pin_probe() devm_add_action_or_reset() already invokes the action on failure, so the explicit put causes a double-put. Fixes: 9b07cdf86a0b ("pinctrl: cirrus: Fix fwnode leak in cs42l43_pin_probe()") Signed-off-by: Felix Gu Reviewed-by: Charles Keepax Signed-off-by: Linus Walleij --- drivers/pinctrl/cirrus/pinctrl-cs42l43.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c index a8f82104a384..227c37c360e1 100644 --- a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c +++ b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c @@ -574,10 +574,9 @@ static int cs42l43_pin_probe(struct platform_device *pdev) if (child) { ret = devm_add_action_or_reset(&pdev->dev, cs42l43_fwnode_put, child); - if (ret) { - fwnode_handle_put(child); + if (ret) return ret; - } + if (!child->dev) child->dev = priv->dev; fwnode = child; -- cgit v1.2.3 From 45fe4592454368df24d18352be700ff40e7df0c0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 23 Feb 2026 11:57:18 +0100 Subject: pinctrl: rockchip: Fix configuring a deferred pin Commit e2c58cbe3aff ("pinctrl: rockchip: Simplify locking with scoped_guard()") added a scoped_guard() over existing code containing a "break" instruction. That "break" was for the outer (existing) for-loop, which now exits inner, scoped_guard() loop. If GPIO driver did not probe, then driver will not bail out, but instead continue to configure the pin. Fix the issue by simplifying the code - the break in original code was leading directly to end of the function returning 0, thus we can simply return here rockchip_pinconf_defer_pin status. 
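To make the pitfall concrete, a simplified sketch (not the driver code): scoped_guard() from <linux/cleanup.h> expands to a one-iteration for-loop, so a break inside it only leaves the guard scope, not the enclosing loop.

#include <linux/cleanup.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(lock);

static void guard_break_pitfall(int n)
{
	for (int i = 0; i < n; i++) {
		scoped_guard(mutex, &lock) {
			if (i == 2)
				break;	/* exits only the guard's hidden loop */
		}
		/* still reached when i == 2; the outer for-loop keeps going */
	}
}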
Reported-by: David Lechner Closes: https://lore.kernel.org/r/f5b38942-a584-4e78-a893-de4a219070b2@baylibre.com/ Fixes: e2c58cbe3aff ("pinctrl: rockchip: Simplify locking with scoped_guard()") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-rockchip.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index d87c0b1de616..f15b18f334ee 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -3640,14 +3640,10 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, * or the gpio driver hasn't probed yet. */ scoped_guard(mutex, &bank->deferred_lock) { - if (!gpio || !gpio->direction_output) { - rc = rockchip_pinconf_defer_pin(bank, - pin - bank->pin_base, - param, arg); - if (rc) - return rc; - break; - } + if (!gpio || !gpio->direction_output) + return rockchip_pinconf_defer_pin(bank, + pin - bank->pin_base, + param, arg); } } -- cgit v1.2.3 From 01e10d0272b932f908b4f9b6609a10cb1f35fafe Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Tue, 24 Feb 2026 17:24:18 +0800 Subject: pinctrl: sunxi: Implement gpiochip::get_direction() After commit 471e998c0e31 ("gpiolib: remove redundant callback check"), a warning will be printed if the gpio driver does not implement this callback. The warning was added in commit e623c4303ed1 ("gpiolib: sanitize the return value of gpio_chip::get_direction()"), but was masked by the "redundant" check. The warning can be triggered by any action that calls the callback, such as dumping the GPIO state from /sys/kernel/debug/gpio. Implement it for the sunxi driver. This is simply a matter of reading out the mux value from the registers, then checking if it is one of the GPIO functions and which direction it is. 
Signed-off-by: Chen-Yu Tsai Reviewed-by: Jernej Skrabec Reviewed-by: Bartosz Golaszewski Reviewed-by: Andre Przywara Signed-off-by: Linus Walleij --- drivers/pinctrl/sunxi/pinctrl-sunxi.c | 51 +++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi.c b/drivers/pinctrl/sunxi/pinctrl-sunxi.c index 48434292a39b..c990b6118172 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sunxi.c +++ b/drivers/pinctrl/sunxi/pinctrl-sunxi.c @@ -204,6 +204,32 @@ sunxi_pinctrl_desc_find_function_by_pin(struct sunxi_pinctrl *pctl, return NULL; } +static struct sunxi_desc_function * +sunxi_pinctrl_desc_find_function_by_pin_and_mux(struct sunxi_pinctrl *pctl, + const u16 pin_num, + const u8 muxval) +{ + for (unsigned int i = 0; i < pctl->desc->npins; i++) { + const struct sunxi_desc_pin *pin = pctl->desc->pins + i; + struct sunxi_desc_function *func = pin->functions; + + if (pin->pin.number != pin_num) + continue; + + if (pin->variant && !(pctl->variant & pin->variant)) + continue; + + while (func->name) { + if (func->muxval == muxval) + return func; + + func++; + } + } + + return NULL; +} + static int sunxi_pctrl_get_groups_count(struct pinctrl_dev *pctldev) { struct sunxi_pinctrl *pctl = pinctrl_dev_get_drvdata(pctldev); @@ -930,6 +956,30 @@ static const struct pinmux_ops sunxi_pmx_ops = { .strict = true, }; +static int sunxi_pinctrl_gpio_get_direction(struct gpio_chip *chip, + unsigned int offset) +{ + struct sunxi_pinctrl *pctl = gpiochip_get_data(chip); + const struct sunxi_desc_function *func; + u32 pin = offset + chip->base; + u32 reg, shift, mask; + u8 muxval; + + sunxi_mux_reg(pctl, offset, ®, &shift, &mask); + + muxval = (readl(pctl->membase + reg) & mask) >> shift; + + func = sunxi_pinctrl_desc_find_function_by_pin_and_mux(pctl, pin, muxval); + if (!func) + return -ENODEV; + + if (!strcmp(func->name, "gpio_out")) + return GPIO_LINE_DIRECTION_OUT; + if (!strcmp(func->name, "gpio_in") || !strcmp(func->name, "irq")) + return GPIO_LINE_DIRECTION_IN; + return -EINVAL; +} + static int sunxi_pinctrl_gpio_direction_input(struct gpio_chip *chip, unsigned offset) { @@ -1599,6 +1649,7 @@ int sunxi_pinctrl_init_with_flags(struct platform_device *pdev, pctl->chip->request = gpiochip_generic_request; pctl->chip->free = gpiochip_generic_free; pctl->chip->set_config = gpiochip_generic_config; + pctl->chip->get_direction = sunxi_pinctrl_gpio_get_direction; pctl->chip->direction_input = sunxi_pinctrl_gpio_direction_input; pctl->chip->direction_output = sunxi_pinctrl_gpio_direction_output; pctl->chip->get = sunxi_pinctrl_gpio_get; -- cgit v1.2.3 From fd80bd7105f88189f47d465ca8cb7d115570de30 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Fri, 20 Feb 2026 17:21:26 -0500 Subject: RDMA/ionic: Fix potential NULL pointer dereference in ionic_query_port The function ionic_query_port() calls ib_device_get_netdev() without checking the return value which could lead to NULL pointer dereference, Fix it by checking the return value and return -ENODEV if the 'ndev' is NULL. 
Fixes: 2075bbe8ef03 ("RDMA/ionic: Register device ops for miscellaneous functionality") Signed-off-by: Kamal Heib Link: https://patch.msgid.link/20260220222125.16973-2-kheib@redhat.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/ionic/ionic_ibdev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.c b/drivers/infiniband/hw/ionic/ionic_ibdev.c index 164046d00e5d..bd4c73e530d0 100644 --- a/drivers/infiniband/hw/ionic/ionic_ibdev.c +++ b/drivers/infiniband/hw/ionic/ionic_ibdev.c @@ -81,6 +81,8 @@ static int ionic_query_port(struct ib_device *ibdev, u32 port, return -EINVAL; ndev = ib_device_get_netdev(ibdev, port); + if (!ndev) + return -ENODEV; if (netif_running(ndev) && netif_carrier_ok(ndev)) { attr->state = IB_PORT_ACTIVE; -- cgit v1.2.3 From f22c77ce49db0589103d96487dca56f5b2136362 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 16 Feb 2026 11:02:47 -0400 Subject: RDMA/efa: Fix typo in efa_alloc_mr() The pattern is to check the entire driver request space, not just sizeof something unrelated. Fixes: 40909f664d27 ("RDMA/efa: Add EFA verbs implementation") Signed-off-by: Jason Gunthorpe Link: https://patch.msgid.link/1-v1-83e918d69e73+a9-rdma_udata_rc_jgg@nvidia.com Acked-by: Michael Margolin Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index b5b93b42e6c4..b0aa7c0238ed 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1661,7 +1661,7 @@ static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags, struct efa_mr *mr; if (udata && udata->inlen && - !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) { + !ib_is_udata_cleared(udata, 0, udata->inlen)) { ibdev_dbg(&dev->ibdev, "Incompatible ABI params, udata not cleared\n"); return ERR_PTR(-EINVAL); -- cgit v1.2.3 From 117942ca43e2e3c3d121faae530989931b7f67e1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 16 Feb 2026 11:02:48 -0400 Subject: IB/mthca: Add missed mthca_unmap_user_db() for mthca_create_srq() Fix a user triggerable leak on the system call failure path. 
Cc: stable@vger.kernel.org Fixes: ec34a922d243 ("[PATCH] IB/mthca: Add SRQ implementation") Signed-off-by: Jason Gunthorpe Link: https://patch.msgid.link/2-v1-83e918d69e73+a9-rdma_udata_rc_jgg@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mthca/mthca_provider.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index ef0635064fba..5c182ae4fc2f 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -428,6 +428,8 @@ static int mthca_create_srq(struct ib_srq *ibsrq, if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof(__u32))) { mthca_free_srq(to_mdev(ibsrq->device), srq); + mthca_unmap_user_db(to_mdev(ibsrq->device), &context->uar, + context->db_tab, ucmd.db_index); return -EFAULT; } @@ -436,6 +438,7 @@ static int mthca_create_srq(struct ib_srq *ibsrq, static int mthca_destroy_srq(struct ib_srq *srq, struct ib_udata *udata) { + mthca_free_srq(to_mdev(srq->device), to_msrq(srq)); if (udata) { struct mthca_ucontext *context = rdma_udata_to_drv_context( @@ -446,8 +449,6 @@ static int mthca_destroy_srq(struct ib_srq *srq, struct ib_udata *udata) mthca_unmap_user_db(to_mdev(srq->device), &context->uar, context->db_tab, to_msrq(srq)->db_index); } - - mthca_free_srq(to_mdev(srq->device), to_msrq(srq)); return 0; } -- cgit v1.2.3 From 74586c6da9ea222a61c98394f2fc0a604748438c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 16 Feb 2026 11:02:49 -0400 Subject: RDMA/irdma: Fix kernel stack leak in irdma_create_user_ah() struct irdma_create_ah_resp { // 8 bytes, no padding __u32 ah_id; // offset 0 - SET (uresp.ah_id = ah->sc_ah.ah_info.ah_idx) __u8 rsvd[4]; // offset 4 - NEVER SET <- LEAK }; rsvd[4]: 4 bytes of stack memory leaked unconditionally. Only ah_id is assigned before ib_respond_udata(). The reserved members of the structure were not zeroed. Cc: stable@vger.kernel.org Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Jason Gunthorpe Link: https://patch.msgid.link/3-v1-83e918d69e73+a9-rdma_udata_rc_jgg@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 15af53237217..7251cd7a2147 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -5212,7 +5212,7 @@ static int irdma_create_user_ah(struct ib_ah *ibah, #define IRDMA_CREATE_AH_MIN_RESP_LEN offsetofend(struct irdma_create_ah_resp, rsvd) struct irdma_ah *ah = container_of(ibah, struct irdma_ah, ibah); struct irdma_device *iwdev = to_iwdev(ibah->pd->device); - struct irdma_create_ah_resp uresp; + struct irdma_create_ah_resp uresp = {}; struct irdma_ah *parent_ah; int err; -- cgit v1.2.3 From faa72102b178c7ae6c6afea23879e7c84fc59b4e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 16 Feb 2026 11:02:50 -0400 Subject: RDMA/ionic: Fix kernel stack leak in ionic_create_cq() struct ionic_cq_resp resp { __u32 cqid[2]; // offset 0 - PARTIALLY SET (see below) __u8 udma_mask; // offset 8 - SET (resp.udma_mask = vcq->udma_mask) __u8 rsvd[7]; // offset 9 - NEVER SET <- LEAK }; rsvd[7]: 7 bytes of stack memory leaked unconditionally. cqid[2]: The loop at line 1256 iterates over udma_idx but skips indices where !(vcq->udma_mask & BIT(udma_idx)). 
The array has 2 entries but udma_count could be 1, meaning cqid[1] might never be written via ionic_create_cq_common(). If udma_mask only has bit 0 set, cqid[1] (4 bytes) is also leaked. So potentially 11 bytes leaked. Cc: stable@vger.kernel.org Fixes: e8521822c733 ("RDMA/ionic: Register device ops for control path") Signed-off-by: Jason Gunthorpe Link: https://patch.msgid.link/4-v1-83e918d69e73+a9-rdma_udata_rc_jgg@nvidia.com Acked-by: Abhijit Gangurde Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/ionic/ionic_controlpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ionic/ionic_controlpath.c b/drivers/infiniband/hw/ionic/ionic_controlpath.c index 5b3f40bd98d8..4842931f5316 100644 --- a/drivers/infiniband/hw/ionic/ionic_controlpath.c +++ b/drivers/infiniband/hw/ionic/ionic_controlpath.c @@ -1218,7 +1218,7 @@ int ionic_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, rdma_udata_to_drv_context(udata, struct ionic_ctx, ibctx); struct ionic_vcq *vcq = to_ionic_vcq(ibcq); struct ionic_tbl_buf buf = {}; - struct ionic_cq_resp resp; + struct ionic_cq_resp resp = {}; struct ionic_cq_req req; int udma_idx = 0, rc; -- cgit v1.2.3 From 78437ab3b769f80526416570f60173c89858dd84 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Fri, 13 Feb 2026 00:58:11 +0100 Subject: clk: scu/imx8qxp: do not register driver in probe() imx_clk_scu_init() registers the imx_clk_scu_driver while commonly being called from IMX driver's probe() callbacks. However, it neither makes sense to register drivers from probe() callbacks of other drivers, nor does the driver core allow registering drivers with a device lock already being held. The latter was revealed by commit dc23806a7c47 ("driver core: enforce device_lock for driver_match_device()") leading to a deadlock condition described in [1]. Besides that, nothing seems to unregister the imx_clk_scu_driver once the corresponding driver module is unloaded, which leaves the driver-core with a dangling pointer. Also, if there are multiple matching devices for the imx8qxp_clk_driver, imx8qxp_clk_probe() calls imx_clk_scu_init() multiple times. However, any subsequent call after the first one will fail, since the driver-core does not allow to register the same struct platform_driver multiple times. Hence, register the imx_clk_scu_driver from module_init() and unregister it in module_exit(). Note that we first register the imx8qxp_clk_driver and then call imx_clk_scu_module_init() to avoid having to call imx_clk_scu_module_exit() in the unwind path of imx8qxp_clk_init(). 
Fixes: dc23806a7c47 ("driver core: enforce device_lock for driver_match_device()") Fixes: 220175cd3979 ("clk: imx: scu: fix build break when compiled as modules") Reported-by: Alexander Stein Closes: https://lore.kernel.org/lkml/13955113.uLZWGnKmhe@steina-w/ Tested-by: Alexander Stein # TQMa8x/MBa8x Link: https://lore.kernel.org/lkml/DFU7CEPUSG9A.1KKGVW4HIPMSH@kernel.org/ [1] Acked-by: Abel Vesa Reviewed-by: Daniel Baluta Link: https://patch.msgid.link/20260212235842.85934-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- drivers/clk/imx/clk-imx8qxp.c | 24 +++++++++++++++++++++++- drivers/clk/imx/clk-scu.c | 12 +++++++++++- drivers/clk/imx/clk-scu.h | 2 ++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/clk/imx/clk-imx8qxp.c b/drivers/clk/imx/clk-imx8qxp.c index 3ae162625bb1..c781425a005e 100644 --- a/drivers/clk/imx/clk-imx8qxp.c +++ b/drivers/clk/imx/clk-imx8qxp.c @@ -346,7 +346,29 @@ static struct platform_driver imx8qxp_clk_driver = { }, .probe = imx8qxp_clk_probe, }; -module_platform_driver(imx8qxp_clk_driver); + +static int __init imx8qxp_clk_init(void) +{ + int ret; + + ret = platform_driver_register(&imx8qxp_clk_driver); + if (ret) + return ret; + + ret = imx_clk_scu_module_init(); + if (ret) + platform_driver_unregister(&imx8qxp_clk_driver); + + return ret; +} +module_init(imx8qxp_clk_init); + +static void __exit imx8qxp_clk_exit(void) +{ + imx_clk_scu_module_exit(); + platform_driver_unregister(&imx8qxp_clk_driver); +} +module_exit(imx8qxp_clk_exit); MODULE_AUTHOR("Aisheng Dong "); MODULE_DESCRIPTION("NXP i.MX8QXP clock driver"); diff --git a/drivers/clk/imx/clk-scu.c b/drivers/clk/imx/clk-scu.c index 33e637ad3579..a85ec48a798b 100644 --- a/drivers/clk/imx/clk-scu.c +++ b/drivers/clk/imx/clk-scu.c @@ -191,6 +191,16 @@ static bool imx_scu_clk_is_valid(u32 rsrc_id) return p != NULL; } +int __init imx_clk_scu_module_init(void) +{ + return platform_driver_register(&imx_clk_scu_driver); +} + +void __exit imx_clk_scu_module_exit(void) +{ + return platform_driver_unregister(&imx_clk_scu_driver); +} + int imx_clk_scu_init(struct device_node *np, const struct imx_clk_scu_rsrc_table *data) { @@ -215,7 +225,7 @@ int imx_clk_scu_init(struct device_node *np, rsrc_table = data; } - return platform_driver_register(&imx_clk_scu_driver); + return 0; } /* diff --git a/drivers/clk/imx/clk-scu.h b/drivers/clk/imx/clk-scu.h index af7b697f51ca..ca82f2cce897 100644 --- a/drivers/clk/imx/clk-scu.h +++ b/drivers/clk/imx/clk-scu.h @@ -25,6 +25,8 @@ extern const struct imx_clk_scu_rsrc_table imx_clk_scu_rsrc_imx8dxl; extern const struct imx_clk_scu_rsrc_table imx_clk_scu_rsrc_imx8qxp; extern const struct imx_clk_scu_rsrc_table imx_clk_scu_rsrc_imx8qm; +int __init imx_clk_scu_module_init(void); +void __exit imx_clk_scu_module_exit(void); int imx_clk_scu_init(struct device_node *np, const struct imx_clk_scu_rsrc_table *data); struct clk_hw *imx_scu_of_clk_src_get(struct of_phandle_args *clkspec, -- cgit v1.2.3 From ab39cc4cb8ceecdc2b61747433e7237f1ac2b789 Mon Sep 17 00:00:00 2001 From: David Arcari Date: Tue, 24 Feb 2026 07:21:06 -0500 Subject: cpufreq: intel_pstate: Fix NULL pointer dereference in update_cpu_qos_request() The update_cpu_qos_request() function attempts to initialize the 'freq' variable by dereferencing 'cpudata' before verifying if the 'policy' is valid. This issue occurs on systems booted with the "nosmt" parameter, where all_cpu_data[cpu] is NULL for the SMT sibling threads. 
As a result, any call to update_qos_requests() will result in a NULL pointer dereference as the code will attempt to access pstate.turbo_freq using the NULL cpudata pointer. Also, pstate.turbo_freq may be updated by intel_pstate_get_hwp_cap() after initializing the 'freq' variable, so it is better to defer the 'freq' until intel_pstate_get_hwp_cap() has been called. Fix this by deferring the 'freq' assignment until after the policy and driver_data have been validated. Fixes: ae1bdd23b99f ("cpufreq: intel_pstate: Adjust frequency percentage computations") Reported-by: Jirka Hladky Closes: https://lore.kernel.org/all/CAE4VaGDfiPvz3AzrwrwM4kWB3SCkMci25nPO8W1JmTBd=xHzZg@mail.gmail.com/ Signed-off-by: David Arcari Cc: 6.18+ # 6.18+ [ rjw: Added one paragraph to the changelog ] Link: https://patch.msgid.link/20260224122106.228116-1-darcari@redhat.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index a48af3540c74..bdc37080d319 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1647,8 +1647,8 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, static void update_cpu_qos_request(int cpu, enum freq_qos_req_type type) { struct cpudata *cpudata = all_cpu_data[cpu]; - unsigned int freq = cpudata->pstate.turbo_freq; struct freq_qos_request *req; + unsigned int freq; struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); if (!policy) @@ -1661,6 +1661,8 @@ static void update_cpu_qos_request(int cpu, enum freq_qos_req_type type) if (hwp_active) intel_pstate_get_hwp_cap(cpudata); + freq = cpudata->pstate.turbo_freq; + if (type == FREQ_QOS_MIN) { freq = DIV_ROUND_UP(freq * global.min_perf_pct, 100); } else { -- cgit v1.2.3 From 5ede90206273ff156a778254f0f972a55e973c89 Mon Sep 17 00:00:00 2001 From: Sofia Schneider Date: Sun, 22 Feb 2026 23:52:40 -0300 Subject: ACPI: OSI: Add DMI quirk for Acer Aspire One D255 The screen backlight turns off during boot (specifically during udev device initialization) when returning true for _OSI("Windows 2009"). Analyzing the device's DSDT reveals that the firmware takes a different code path when Windows 7 is reported, which leads to the backlight shutoff. Add a DMI quirk to invoke dmi_disable_osi_win7 for this model. Signed-off-by: Sofia Schneider Link: https://patch.msgid.link/20260223025240.518509-1-sofia@schn.dev Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/osi.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/acpi/osi.c b/drivers/acpi/osi.c index f2c943b934be..9470f1830ff5 100644 --- a/drivers/acpi/osi.c +++ b/drivers/acpi/osi.c @@ -389,6 +389,19 @@ static const struct dmi_system_id acpi_osi_dmi_table[] __initconst = { }, }, + /* + * The screen backlight turns off during udev device creation + * when returning true for _OSI("Windows 2009") + */ + { + .callback = dmi_disable_osi_win7, + .ident = "Acer Aspire One D255", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "AOD255"), + }, + }, + /* * The wireless hotkey does not work on those machines when * returning true for _OSI("Windows 2012") -- cgit v1.2.3 From 364410170ab33f6e7ef0eb2afb12bf89b0feb3a6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 5 Feb 2026 07:59:20 -0500 Subject: nfsd: report the requested maximum number of threads instead of number running The current netlink and /proc interfaces deviate from their traditional values when dynamic threading is enabled, and there is currently no way to know what the current setting is. This patch brings the reporting back in line with traditional behavior. Make these interfaces report the requested maximum number of threads instead of the number currently running. Also, update documentation and comments to reflect that this value represents a maximum and not the number currently running. Fixes: d8316b837c2c ("nfsd: add controls to set the minimum number of threads per pool") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/netlink/specs/nfsd.yaml | 4 ++-- fs/nfsd/nfsctl.c | 18 +++++++++--------- fs/nfsd/nfssvc.c | 7 ++++--- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml index badb2fe57c98..f87b5a05e5e9 100644 --- a/Documentation/netlink/specs/nfsd.yaml +++ b/Documentation/netlink/specs/nfsd.yaml @@ -152,7 +152,7 @@ operations: - compound-ops - name: threads-set - doc: set the number of running threads + doc: set the maximum number of running threads attribute-set: server flags: [admin-perm] do: @@ -165,7 +165,7 @@ operations: - min-threads - name: threads-get - doc: get the number of running threads + doc: get the maximum number of running threads attribute-set: server do: reply: diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b06adb5d5a2e..369da69d5efe 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -377,15 +377,15 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) } /* - * write_threads - Start NFSD, or report the current number of running threads + * write_threads - Start NFSD, or report the configured number of threads * * Input: * buf: ignored * size: zero * Output: * On success: passed-in buffer filled with '\n'-terminated C - * string numeric value representing the number of - * running NFSD threads; + * string numeric value representing the configured + * number of NFSD threads; * return code is the size in bytes of the string * On error: return code is zero * @@ -399,8 +399,8 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) * Output: * On success: NFS service is started; * passed-in buffer filled with '\n'-terminated C - * string numeric value representing the number of - * running NFSD threads; + * string numeric value representing the configured + * number of NFSD threads; * return code is the size in bytes of the string * On error: return code is zero or a 
negative errno value */ @@ -430,7 +430,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) } /* - * write_pool_threads - Set or report the current number of threads per pool + * write_pool_threads - Set or report the configured number of threads per pool * * Input: * buf: ignored @@ -447,7 +447,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) * Output: * On success: passed-in buffer filled with '\n'-terminated C * string containing integer values representing the - * number of NFSD threads in each pool; + * configured number of NFSD threads in each pool; * return code is the size in bytes of the string * On error: return code is zero or a negative errno value */ @@ -1657,7 +1657,7 @@ out_unlock: } /** - * nfsd_nl_threads_get_doit - get the number of running threads + * nfsd_nl_threads_get_doit - get the maximum number of running threads * @skb: reply buffer * @info: netlink metadata and command arguments * @@ -1700,7 +1700,7 @@ int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info) struct svc_pool *sp = &nn->nfsd_serv->sv_pools[i]; err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, - sp->sp_nrthreads); + sp->sp_nrthrmax); if (err) goto err_unlock; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 0887ee601d3c..4a04208393b8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -239,12 +239,13 @@ static void nfsd_net_free(struct percpu_ref *ref) int nfsd_nrthreads(struct net *net) { - int rv = 0; + int i, rv = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv) - rv = nn->nfsd_serv->sv_nrthreads; + for (i = 0; i < nn->nfsd_serv->sv_nrpools; ++i) + rv += nn->nfsd_serv->sv_pools[i].sp_nrthrmax; mutex_unlock(&nfsd_mutex); return rv; } @@ -659,7 +660,7 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) if (serv) for (i = 0; i < serv->sv_nrpools && i < n; i++) - nthreads[i] = serv->sv_pools[i].sp_nrthreads; + nthreads[i] = serv->sv_pools[i].sp_nrthrmax; return 0; } -- cgit v1.2.3 From 297318a1c26dabb5a2d8540fdf436c22094eb2d7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 24 Feb 2026 12:52:18 +0100 Subject: spi: dt-bindings: snps,dw-abp-ssi: Remove unused bindings As stated in the da0a672268b3 ("spi: dw: Remove not-going-to-be-supported code for Baikal SoC") the Baikal platforms are not supported and the respective driver code was removed. Remove the currently unused bindings. 
Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20260224115218.3499222-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- .../devicetree/bindings/spi/snps,dw-apb-ssi.yaml | 31 +--------------------- 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml b/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml index 81838577cf9c..8ebebcebca16 100644 --- a/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml +++ b/Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml @@ -22,21 +22,6 @@ allOf: properties: reg: minItems: 2 - - if: - properties: - compatible: - contains: - enum: - - baikal,bt1-sys-ssi - then: - properties: - mux-controls: - maxItems: 1 - required: - - mux-controls - else: - required: - - interrupts - if: properties: compatible: @@ -75,10 +60,6 @@ properties: const: intel,mountevans-imc-ssi - description: AMD Pensando Elba SoC SPI Controller const: amd,pensando-elba-spi - - description: Baikal-T1 SPI Controller - const: baikal,bt1-ssi - - description: Baikal-T1 System Boot SPI Controller - const: baikal,bt1-sys-ssi - description: Canaan Kendryte K210 SoS SPI Controller const: canaan,k210-spi - description: Renesas RZ/N1 SPI Controller @@ -170,6 +151,7 @@ required: - "#address-cells" - "#size-cells" - clocks + - interrupts examples: - | @@ -190,15 +172,4 @@ examples: rx-sample-delay-ns = <7>; }; }; - - | - spi@1f040100 { - compatible = "baikal,bt1-sys-ssi"; - reg = <0x1f040100 0x900>, - <0x1c000000 0x1000000>; - #address-cells = <1>; - #size-cells = <0>; - mux-controls = <&boot_mux>; - clocks = <&ccu_sys>; - clock-names = "ssi_clk"; - }; ... -- cgit v1.2.3 From 96a1fd0d84b17360840f344826897fa71049870e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 12 Feb 2026 14:50:38 -0700 Subject: cxl: Fix race of nvdimm_bus object when creating nvdimm objects Found issue during running of cxl-translate.sh unit test. Adding a 3s sleep right before the test seems to make the issue reproduce fairly consistently. The cxl_translate module has dependency on cxl_acpi and causes orphaned nvdimm objects to reprobe after cxl_acpi is removed. The nvdimm_bus object is registered by the cxl_nvb object when cxl_acpi_probe() is called. With the nvdimm_bus object missing, __nd_device_register() will trigger NULL pointer dereference when accessing the dev->parent that points to &nvdimm_bus->dev. [ 192.884510] BUG: kernel NULL pointer dereference, address: 000000000000006c [ 192.895383] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20250812-19.fc42 08/12/2025 [ 192.897721] Workqueue: cxl_port cxl_bus_rescan_queue [cxl_core] [ 192.899459] RIP: 0010:kobject_get+0xc/0x90 [ 192.924871] Call Trace: [ 192.925959] [ 192.926976] ? pm_runtime_init+0xb9/0xe0 [ 192.929712] __nd_device_register.part.0+0x4d/0xc0 [libnvdimm] [ 192.933314] __nvdimm_create+0x206/0x290 [libnvdimm] [ 192.936662] cxl_nvdimm_probe+0x119/0x1d0 [cxl_pmem] [ 192.940245] cxl_bus_probe+0x1a/0x60 [cxl_core] [ 192.943349] really_probe+0xde/0x380 This patch also relies on the previous change where devm_cxl_add_nvdimm_bridge() is called from drivers/cxl/pmem.c instead of drivers/cxl/core.c to ensure the dependency of cxl_acpi on cxl_pmem. 1. Set probe_type of cxl_nvb to PROBE_FORCE_SYNCHRONOUS to ensure the driver is probed synchronously when add_device() is called. 2. Add a check in __devm_cxl_add_nvdimm_bridge() to ensure that the cxl_nvb driver is attached during cxl_acpi_probe(). 3. 
Take the cxl_root uport_dev lock and the cxl_nvb->dev lock in devm_cxl_add_nvdimm() before checking nvdimm_bus is valid. 4. Set cxl_nvdimm flag to CXL_NVD_F_INVALIDATED so cxl_nvdimm_probe() will exit with -EBUSY. The removal of cxl_nvdimm devices should prevent any orphaned devices from probing once the nvdimm_bus is gone. [ dj: Fixed 0-day reported kdoc issue. ] [ dj: Fix cxl_nvb reference leak on error. Gregory (kreview-0811365) ] Suggested-by: Dan Williams Fixes: 8fdcb1704f61 ("cxl/pmem: Add initial infrastructure for pmem support") Tested-by: Alison Schofield Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260205001633.1813643-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pmem.c | 29 +++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 5 +++++ drivers/cxl/pmem.c | 10 ++++++++-- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index b652e3486038..68462e38a977 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -115,6 +115,15 @@ static void unregister_nvb(void *_cxl_nvb) device_unregister(&cxl_nvb->dev); } +static bool cxl_nvdimm_bridge_failed_attach(struct cxl_nvdimm_bridge *cxl_nvb) +{ + struct device *dev = &cxl_nvb->dev; + + guard(device)(dev); + /* If the device has no driver, then it failed to attach. */ + return dev->driver == NULL; +} + struct cxl_nvdimm_bridge *__devm_cxl_add_nvdimm_bridge(struct device *host, struct cxl_port *port) { @@ -138,6 +147,11 @@ struct cxl_nvdimm_bridge *__devm_cxl_add_nvdimm_bridge(struct device *host, if (rc) goto err; + if (cxl_nvdimm_bridge_failed_attach(cxl_nvb)) { + unregister_nvb(cxl_nvb); + return ERR_PTR(-ENODEV); + } + rc = devm_add_action_or_reset(host, unregister_nvb, cxl_nvb); if (rc) return ERR_PTR(rc); @@ -248,6 +262,21 @@ int devm_cxl_add_nvdimm(struct device *host, struct cxl_port *port, if (!cxl_nvb) return -ENODEV; + /* + * Take the uport_dev lock to guard against race of nvdimm_bus object. + * cxl_acpi_probe() registers the nvdimm_bus and is done under the + * root port uport_dev lock. + * + * Take the cxl_nvb device lock to ensure that cxl_nvb driver is in a + * consistent state. And the driver registers nvdimm_bus. 
+ */ + guard(device)(cxl_nvb->port->uport_dev); + guard(device)(&cxl_nvb->dev); + if (!cxl_nvb->nvdimm_bus) { + rc = -ENODEV; + goto err_alloc; + } + cxl_nvd = cxl_nvdimm_alloc(cxl_nvb, cxlmd); if (IS_ERR(cxl_nvd)) { rc = PTR_ERR(cxl_nvd); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index f5850800f400..9b947286eb9b 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -574,11 +574,16 @@ struct cxl_nvdimm_bridge { #define CXL_DEV_ID_LEN 19 +enum { + CXL_NVD_F_INVALIDATED = 0, +}; + struct cxl_nvdimm { struct device dev; struct cxl_memdev *cxlmd; u8 dev_id[CXL_DEV_ID_LEN]; /* for nvdimm, string of 'serial' */ u64 dirty_shutdowns; + unsigned long flags; }; struct cxl_pmem_region_mapping { diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index c67b30b516bf..082ec0f1c3a0 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -14,7 +14,7 @@ static __read_mostly DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); /** - * __devm_cxl_add_nvdimm_bridge() - add the root of a LIBNVDIMM topology + * devm_cxl_add_nvdimm_bridge() - add the root of a LIBNVDIMM topology * @host: platform firmware root device * @port: CXL port at the root of a CXL topology * @@ -143,6 +143,9 @@ static int cxl_nvdimm_probe(struct device *dev) struct nvdimm *nvdimm; int rc; + if (test_bit(CXL_NVD_F_INVALIDATED, &cxl_nvd->flags)) + return -EBUSY; + set_exclusive_cxl_commands(mds, exclusive_cmds); rc = devm_add_action_or_reset(dev, clear_exclusive, mds); if (rc) @@ -323,8 +326,10 @@ static int detach_nvdimm(struct device *dev, void *data) scoped_guard(device, dev) { if (dev->driver) { cxl_nvd = to_cxl_nvdimm(dev); - if (cxl_nvd->cxlmd && cxl_nvd->cxlmd->cxl_nvb == data) + if (cxl_nvd->cxlmd && cxl_nvd->cxlmd->cxl_nvb == data) { release = true; + set_bit(CXL_NVD_F_INVALIDATED, &cxl_nvd->flags); + } } } if (release) @@ -367,6 +372,7 @@ static struct cxl_driver cxl_nvdimm_bridge_driver = { .probe = cxl_nvdimm_bridge_probe, .id = CXL_DEVICE_NVDIMM_BRIDGE, .drv = { + .probe_type = PROBE_FORCE_SYNCHRONOUS, .suppress_bind_attrs = true, }, }; -- cgit v1.2.3 From 60b5d1f68338aff2c5af0113f04aefa7169c50c2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 19 Feb 2026 16:16:17 -0800 Subject: cxl/mbox: validate payload size before accessing contents in cxl_payload_from_user_allowed() cxl_payload_from_user_allowed() casts and dereferences the input payload without first verifying its size. 
When a raw mailbox command is sent with an undersized payload (ie: 1 byte for CXL_MBOX_OP_CLEAR_LOG, which expects a 16-byte UUID), uuid_equal() reads past the allocated buffer, triggering a KASAN splat: BUG: KASAN: slab-out-of-bounds in memcmp+0x176/0x1d0 lib/string.c:683 Read of size 8 at addr ffff88810130f5c0 by task syz.1.62/2258 CPU: 2 UID: 0 PID: 2258 Comm: syz.1.62 Not tainted 6.19.0-dirty #3 PREEMPT(voluntary) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.17.0-0-gb52ca86e094d-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0xab/0xe0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xce/0x650 mm/kasan/report.c:482 kasan_report+0xce/0x100 mm/kasan/report.c:595 memcmp+0x176/0x1d0 lib/string.c:683 uuid_equal include/linux/uuid.h:73 [inline] cxl_payload_from_user_allowed drivers/cxl/core/mbox.c:345 [inline] cxl_mbox_cmd_ctor drivers/cxl/core/mbox.c:368 [inline] cxl_validate_cmd_from_user drivers/cxl/core/mbox.c:522 [inline] cxl_send_cmd+0x9c0/0xb50 drivers/cxl/core/mbox.c:643 __cxl_memdev_ioctl drivers/cxl/core/memdev.c:698 [inline] cxl_memdev_ioctl+0x14f/0x190 drivers/cxl/core/memdev.c:713 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:597 [inline] __se_sys_ioctl fs/ioctl.c:583 [inline] __x64_sys_ioctl+0x18e/0x210 fs/ioctl.c:583 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xa8/0x330 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fdaf331ba79 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fdaf1d77038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fdaf3585fa0 RCX: 00007fdaf331ba79 RDX: 00002000000001c0 RSI: 00000000c030ce02 RDI: 0000000000000003 RBP: 00007fdaf33749df R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fdaf3586038 R14: 00007fdaf3585fa0 R15: 00007ffced2af768 Add 'in_size' parameter to cxl_payload_from_user_allowed() and validate the payload is large enough. Fixes: 6179045ccc0c ("cxl/mbox: Block immediate mode in SET_PARTITION_INFO command") Fixes: 206f9fa9d555 ("cxl/mbox: Add Clear Log mailbox command") Signed-off-by: Davidlohr Bueso Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260220001618.963490-2-dave@stgolabs.net Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index fa6dd0c94656..e7a6452bf544 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -311,6 +311,7 @@ static bool cxl_mem_raw_command_allowed(u16 opcode) * cxl_payload_from_user_allowed() - Check contents of in_payload. * @opcode: The mailbox command opcode. * @payload_in: Pointer to the input payload passed in from user space. + * @in_size: Size of @payload_in in bytes. * * Return: * * true - payload_in passes check for @opcode. @@ -325,12 +326,15 @@ static bool cxl_mem_raw_command_allowed(u16 opcode) * * The specific checks are determined by the opcode. 
*/ -static bool cxl_payload_from_user_allowed(u16 opcode, void *payload_in) +static bool cxl_payload_from_user_allowed(u16 opcode, void *payload_in, + size_t in_size) { switch (opcode) { case CXL_MBOX_OP_SET_PARTITION_INFO: { struct cxl_mbox_set_partition_info *pi = payload_in; + if (in_size < sizeof(*pi)) + return false; if (pi->flags & CXL_SET_PARTITION_IMMEDIATE_FLAG) return false; break; @@ -338,6 +342,8 @@ static bool cxl_payload_from_user_allowed(u16 opcode, void *payload_in) case CXL_MBOX_OP_CLEAR_LOG: { const uuid_t *uuid = (uuid_t *)payload_in; + if (in_size < sizeof(uuid_t)) + return false; /* * Restrict the ‘Clear log’ action to only apply to * Vendor debug logs. @@ -365,7 +371,8 @@ static int cxl_mbox_cmd_ctor(struct cxl_mbox_cmd *mbox_cmd, if (IS_ERR(mbox_cmd->payload_in)) return PTR_ERR(mbox_cmd->payload_in); - if (!cxl_payload_from_user_allowed(opcode, mbox_cmd->payload_in)) { + if (!cxl_payload_from_user_allowed(opcode, mbox_cmd->payload_in, + in_size)) { dev_dbg(cxl_mbox->host, "%s: input payload not allowed\n", cxl_mem_opcode_to_name(opcode)); kvfree(mbox_cmd->payload_in); -- cgit v1.2.3 From 0a70b7cd397e545e926c93715ff6366b67c716f6 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 23 Feb 2026 11:13:39 -0800 Subject: cxl: Test CXL_DECODER_F_LOCK as a bitmask The CXL decoder flags are defined as bitmasks, not bit indices. Using test_bit() to check them interprets the mask value as a bit index, which is the wrong test. For CXL_DECODER_F_LOCK the test reads beyond the defined bits, causing the test to always return false and allowing resets that should have been blocked. Replace test_bit() with a bitmask check. Fixes: 2230c4bdc412 ("cxl: Add handling of locked CXL decoder") Signed-off-by: Alison Schofield Reviewed-by: Gregory Price Tested-by: Gregory Price Link: https://patch.msgid.link/98851c4770e4631753cf9f75b58a3a6daeca2ea2.1771873256.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/hdm.c | 2 +- drivers/cxl/core/region.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index e3f0c39e6812..c222e98ae736 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -904,7 +904,7 @@ static void cxl_decoder_reset(struct cxl_decoder *cxld) if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0) return; - if (test_bit(CXL_DECODER_F_LOCK, &cxld->flags)) + if (cxld->flags & CXL_DECODER_F_LOCK) return; if (port->commit_end == id) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index fec37af1dfbf..780ec947ecf2 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1100,7 +1100,7 @@ static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr, static void cxl_region_setup_flags(struct cxl_region *cxlr, struct cxl_decoder *cxld) { - if (test_bit(CXL_DECODER_F_LOCK, &cxld->flags)) { + if (cxld->flags & CXL_DECODER_F_LOCK) { set_bit(CXL_REGION_F_LOCK, &cxlr->flags); clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); } -- cgit v1.2.3 From e46f25f5a81f6f1a9ab93bcda80d5dfaea9f4897 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 23 Feb 2026 11:13:40 -0800 Subject: cxl/region: Test CXL_DECODER_F_NORMALIZED_ADDRESSING as a bitmask The CXL decoder flags are defined as bitmasks, not bit indices. Using test_bit() to check them interprets the mask value as a bit index, which is the wrong test. 
For CXL_DECODER_F_NORMALIZED_ADDRESSING the test reads beyond the flags word, making the flag sometimes appear set and blocking creation of CXL region debugfs attributes that support poison operations. Replace test_bit() with a bitmask check. Found with cxl-test. Fixes: 208f432406b7 ("cxl: Disable HPA/SPA translation handlers for Normalized Addressing") Signed-off-by: Alison Schofield Reviewed-by: Gregory Price Tested-by: Gregory Price Link: https://patch.msgid.link/63fe4a6203e40e404347f1cdc7a1c55cb4959b86.1771873256.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 780ec947ecf2..42874948b589 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1105,7 +1105,7 @@ static void cxl_region_setup_flags(struct cxl_region *cxlr, clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); } - if (test_bit(CXL_DECODER_F_NORMALIZED_ADDRESSING, &cxld->flags)) + if (cxld->flags & CXL_DECODER_F_NORMALIZED_ADDRESSING) set_bit(CXL_REGION_F_NORMALIZED_ADDRESSING, &cxlr->flags); } -- cgit v1.2.3 From 43277d8f57b490550d33f6f4cb73c28ec6024696 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:19 -0800 Subject: selftests/bpf: Replace strncpy() with strscpy() strncpy() does not guarantee NULL-termination and is considered deprecated [1]. Replace strncpy() calls with strscpy(). [1] https://docs.kernel.org/process/deprecated.html#strncpy-on-nul-terminated-strings Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-4-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/network_helpers.c | 3 +-- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 3 +-- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/queue_stack_map.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/skc_to_unix_sock.c | 2 +- tools/testing/selftests/bpf/prog_tests/task_local_data.h | 2 +- tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 2 +- tools/testing/selftests/bpf/test_progs.c | 2 +- tools/testing/selftests/bpf/xdp_hw_metadata.c | 4 ++-- 9 files changed, 12 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 5374b7e16d53..b82f572641b7 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -581,8 +581,7 @@ int open_tuntap(const char *dev_name, bool need_mac) return -1; ifr.ifr_flags = IFF_NO_PI | (need_mac ? 
IFF_TAP : IFF_TUN); - strncpy(ifr.ifr_name, dev_name, IFNAMSIZ - 1); - ifr.ifr_name[IFNAMSIZ - 1] = '\0'; + strscpy(ifr.ifr_name, dev_name); err = ioctl(fd, TUNSETIFF, &ifr); if (!ASSERT_OK(err, "ioctl(TUNSETIFF)")) { diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 5225d69bf79b..c69080ca14f5 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -346,8 +346,7 @@ static void test_task_sleepable(void) close(finish_pipe[1]); test_data = malloc(sizeof(char) * 10); - strncpy(test_data, "test_data", 10); - test_data[9] = '\0'; + strscpy(test_data, "test_data", 10); test_data_long = malloc(sizeof(char) * 5000); for (int i = 0; i < 5000; ++i) { diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 08bae13248c4..fb4892681464 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -570,7 +570,7 @@ static int create_tap(const char *ifname) }; int fd, ret; - strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + strscpy(ifr.ifr_name, ifname); fd = open("/dev/net/tun", O_RDWR); if (fd < 0) @@ -599,7 +599,7 @@ static int ifup(const char *ifname) struct ifreq ifr = {}; int sk, ret; - strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + strscpy(ifr.ifr_name, ifname); sk = socket(PF_INET, SOCK_DGRAM, 0); if (sk < 0) diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c index a043af9cd6d9..41441325e179 100644 --- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c +++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c @@ -28,9 +28,9 @@ static void test_queue_stack_map_by_type(int type) vals[i] = rand(); if (type == QUEUE) - strncpy(file, "./test_queue_map.bpf.o", sizeof(file)); + strscpy(file, "./test_queue_map.bpf.o"); else if (type == STACK) - strncpy(file, "./test_stack_map.bpf.o", sizeof(file)); + strscpy(file, "./test_stack_map.bpf.o"); else return; diff --git a/tools/testing/selftests/bpf/prog_tests/skc_to_unix_sock.c b/tools/testing/selftests/bpf/prog_tests/skc_to_unix_sock.c index 3eefdfed1db9..657d897958b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/skc_to_unix_sock.c +++ b/tools/testing/selftests/bpf/prog_tests/skc_to_unix_sock.c @@ -34,7 +34,7 @@ void test_skc_to_unix_sock(void) memset(&sockaddr, 0, sizeof(sockaddr)); sockaddr.sun_family = AF_UNIX; - strncpy(sockaddr.sun_path, sock_path, strlen(sock_path)); + strscpy(sockaddr.sun_path, sock_path); sockaddr.sun_path[0] = '\0'; err = bind(sockfd, (struct sockaddr *)&sockaddr, sizeof(sockaddr)); diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h index 0f86b9275cf9..8342e2fe5260 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_data.h +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -262,7 +262,7 @@ retry: if (!atomic_compare_exchange_strong(&tld_meta_p->cnt, &cnt, cnt + 1)) goto retry; - strncpy(tld_meta_p->metadata[i].name, name, TLD_NAME_LEN); + strscpy(tld_meta_p->metadata[i].name, name); atomic_store(&tld_meta_p->metadata[i].size, size); return (tld_key_t){(__s16)off}; } diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 76d72a59365e..64fbda082309 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -1095,7 +1095,7 @@ static int tun_open(char *name) ifr.ifr_flags = IFF_TUN | IFF_NO_PI; if (*name) - strncpy(ifr.ifr_name, name, IFNAMSIZ); + strscpy(ifr.ifr_name, name); err = ioctl(fd, TUNSETIFF, &ifr); if (!ASSERT_OK(err, "ioctl TUNSETIFF")) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 02a85dda30e6..d1418ec1f351 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1799,7 +1799,7 @@ static int worker_main_send_subtests(int sock, struct test_state *state) msg.subtest_done.num = i; - strncpy(msg.subtest_done.name, subtest_state->name, MAX_SUBTEST_NAME); + strscpy(msg.subtest_done.name, subtest_state->name, MAX_SUBTEST_NAME); msg.subtest_done.error_cnt = subtest_state->error_cnt; msg.subtest_done.skipped = subtest_state->skipped; diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 3d8de0d4c96a..6db3b5555a22 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -550,7 +550,7 @@ static int rxq_num(const char *ifname) struct ifreq ifr = { .ifr_data = (void *)&ch, }; - strncpy(ifr.ifr_name, ifname, IF_NAMESIZE - 1); + strscpy(ifr.ifr_name, ifname); int fd, ret; fd = socket(AF_UNIX, SOCK_DGRAM, 0); @@ -571,7 +571,7 @@ static void hwtstamp_ioctl(int op, const char *ifname, struct hwtstamp_config *c struct ifreq ifr = { .ifr_data = (void *)cfg, }; - strncpy(ifr.ifr_name, ifname, IF_NAMESIZE - 1); + strscpy(ifr.ifr_name, ifname); int fd, ret; fd = socket(AF_UNIX, SOCK_DGRAM, 0); -- cgit v1.2.3 From 3ed0bc2d4994c718fb5476b45d4aafec42a1d040 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:20 -0800 Subject: selftests/bpf: Use strscpy in bpftool_helpers.c Replace strncpy() calls in bpftool_helpers.c with strscpy(). Pass the destination buffer size to detect_bpftool_path() instead of hardcoding BPFTOOL_PATH_MAX_LEN. 
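For background, a minimal userspace sketch of the strscpy() semantics the selftests rely on; demo_strscpy() is a simplified stand-in (the real helper is pulled in via bpf_util.h and returns -E2BIG on truncation), shown against an interface-name-sized buffer:

        #include <stdio.h>
        #include <string.h>

        /* Simplified stand-in for strscpy(): copy at most size - 1 bytes,
         * always NUL-terminate, and signal truncation (the kernel helper
         * returns -E2BIG; -1 is used here). */
        static long demo_strscpy(char *dst, const char *src, size_t size)
        {
                size_t len = strlen(src);

                if (!size)
                        return -1;
                if (len >= size) {
                        memcpy(dst, src, size - 1);
                        dst[size - 1] = '\0';
                        return -1;
                }
                memcpy(dst, src, len + 1);
                return len;
        }

        int main(void)
        {
                char name[8];

                /* strncpy(name, "veth_src_fwd", sizeof(name)) would fill all
                 * eight bytes and leave no terminator; the strscpy()-style
                 * copy truncates, terminates, and reports the truncation. */
                long n = demo_strscpy(name, "veth_src_fwd", sizeof(name));

                printf("%ld \"%s\"\n", n, name);
                return 0;
        }

The selftests' helper also appears to infer the destination size when it is an array, which would be why several call sites in the diffs above can drop the explicit length argument.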
Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-5-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpftool_helpers.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/bpftool_helpers.c b/tools/testing/selftests/bpf/bpftool_helpers.c index a5824945a4a5..595a636fa13b 100644 --- a/tools/testing/selftests/bpf/bpftool_helpers.c +++ b/tools/testing/selftests/bpf/bpftool_helpers.c @@ -1,15 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only -#include "bpftool_helpers.h" #include #include #include +#include "bpf_util.h" +#include "bpftool_helpers.h" + #define BPFTOOL_PATH_MAX_LEN 64 #define BPFTOOL_FULL_CMD_MAX_LEN 512 #define BPFTOOL_DEFAULT_PATH "tools/sbin/bpftool" -static int detect_bpftool_path(char *buffer) +static int detect_bpftool_path(char *buffer, size_t size) { char tmp[BPFTOOL_PATH_MAX_LEN]; @@ -18,7 +20,7 @@ static int detect_bpftool_path(char *buffer) */ snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "./%s", BPFTOOL_DEFAULT_PATH); if (access(tmp, X_OK) == 0) { - strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + strscpy(buffer, tmp, size); return 0; } @@ -27,7 +29,7 @@ static int detect_bpftool_path(char *buffer) */ snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "../%s", BPFTOOL_DEFAULT_PATH); if (access(tmp, X_OK) == 0) { - strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + strscpy(buffer, tmp, size); return 0; } @@ -44,7 +46,7 @@ static int run_command(char *args, char *output_buf, size_t output_max_len) int ret; /* Detect and cache bpftool binary location */ - if (bpftool_path[0] == 0 && detect_bpftool_path(bpftool_path)) + if (bpftool_path[0] == 0 && detect_bpftool_path(bpftool_path, sizeof(bpftool_path))) return 1; ret = snprintf(command, BPFTOOL_FULL_CMD_MAX_LEN, "%s %s%s", -- cgit v1.2.3 From 9d8685239e85ba5a4899b5b3534c732e3ae5f2aa Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:21 -0800 Subject: selftests/bpf: Use memcpy() for bounded non-NULL-terminated copies Replace strncpy() with memcpy() in cases where the source is non-NULL-terminated and the copy length is known. 
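For illustration, a stand-alone sketch of the memcpy-plus-explicit-terminator pattern applied here to counted, non-NUL-terminated slices; copy_slice() and the sample offsets are hypothetical, not taken from the tests:

        #include <stdio.h>
        #include <string.h>

        /* Copy a counted slice that may not be NUL-terminated into a C
         * string.  dst must have room for len + 1 bytes. */
        static void copy_slice(char *dst, const char *src, size_t len)
        {
                memcpy(dst, src, len);  /* bounded copy, no NUL search */
                dst[len] = '\0';        /* terminator strncpy() would not add */
        }

        int main(void)
        {
                /* e.g. a regex match: [start, start + len) inside a larger buffer */
                const char *line = "struct bpf_sock_ops field";
                char type[32];

                copy_slice(type, line + 7, 12);
                printf("%s\n", type);   /* prints "bpf_sock_ops" */
                return 0;
        }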
Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-6-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c | 6 ++++-- tools/testing/selftests/bpf/test_verifier.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c index dd75ccb03770..469e92869523 100644 --- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c +++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c @@ -308,8 +308,10 @@ static int find_field_offset(struct btf *btf, char *pattern, regmatch_t *matches return -1; } - strncpy(type_str, type, type_sz); - strncpy(field_str, field, field_sz); + memcpy(type_str, type, type_sz); + type_str[type_sz] = '\0'; + memcpy(field_str, field, field_sz); + field_str[field_sz] = '\0'; btf_id = btf__find_by_name(btf, type_str); if (btf_id < 0) { PRINT_FAIL("No BTF info for type %s\n", type_str); diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 27db34ecf3f5..a8ae03c57bba 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1320,7 +1320,7 @@ static bool cmp_str_seq(const char *log, const char *exp) printf("FAIL\nTestcase bug\n"); return false; } - strncpy(needle, exp, len); + memcpy(needle, exp, len); needle[len] = 0; q = strstr(log, needle); if (!q) { -- cgit v1.2.3 From 4021848a903e65f74cf88997c12b96d6bc2453e6 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:22 -0800 Subject: selftests/bpf: Pass through build flags to bpftool and resolve_btfids EXTRA_* and SAN_* build flags were not correctly propagated to bpftool and resolve_btfids when building selftests/bpf. This led to various build errors when attempting to build with SAN_CFLAGS="-fsanitize=address", for example. Fix the makefiles to address this: - Pass SAN_CFLAGS/SAN_LDFLAGS to bpftool and resolve_btfids build - Propagate EXTRA_LDFLAGS to resolve_btfids link command - Use pkg-config to detect zlib and zstd for resolve_btfids, similar to the libelf handling Also check for the ASAN flag in selftests/bpf/Makefile for convenience.
Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-7-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/resolve_btfids/Makefile | 7 +++++-- tools/testing/selftests/bpf/Makefile | 13 +++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index 1733a6e93a07..ef083602b73a 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -65,6 +65,9 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU LIBELF_FLAGS := $(shell $(HOSTPKG_CONFIG) libelf --cflags 2>/dev/null) LIBELF_LIBS := $(shell $(HOSTPKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf) +ZLIB_LIBS := $(shell $(HOSTPKG_CONFIG) zlib --libs 2>/dev/null || echo -lz) +ZSTD_LIBS := $(shell $(HOSTPKG_CONFIG) libzstd --libs 2>/dev/null || echo -lzstd) + HOSTCFLAGS_resolve_btfids += -g \ -I$(srctree)/tools/include \ -I$(srctree)/tools/include/uapi \ @@ -73,7 +76,7 @@ HOSTCFLAGS_resolve_btfids += -g \ $(LIBELF_FLAGS) \ -Wall -Werror -LIBS = $(LIBELF_LIBS) -lz +LIBS = $(LIBELF_LIBS) $(ZLIB_LIBS) $(ZSTD_LIBS) export srctree OUTPUT HOSTCFLAGS_resolve_btfids Q HOSTCC HOSTLD HOSTAR include $(srctree)/tools/build/Makefile.include @@ -83,7 +86,7 @@ $(BINARY_IN): fixdep FORCE prepare | $(OUTPUT) $(BINARY): $(BPFOBJ) $(SUBCMDOBJ) $(BINARY_IN) $(call msg,LINK,$@) - $(Q)$(HOSTCC) $(BINARY_IN) $(KBUILD_HOSTLDFLAGS) -o $@ $(BPFOBJ) $(SUBCMDOBJ) $(LIBS) + $(Q)$(HOSTCC) $(BINARY_IN) $(KBUILD_HOSTLDFLAGS) $(EXTRA_LDFLAGS) -o $@ $(BPFOBJ) $(SUBCMDOBJ) $(LIBS) clean_objects := $(wildcard $(OUTPUT)/*.o \ $(OUTPUT)/.*.o.cmd \ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6776158f1f3e..72a9ba41f95e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -27,7 +27,11 @@ ifneq ($(wildcard $(GENHDR)),) endif BPF_GCC ?= $(shell command -v bpf-gcc;) +ifdef ASAN +SAN_CFLAGS ?= -fsanitize=address -fno-omit-frame-pointer +else SAN_CFLAGS ?= +endif SAN_LDFLAGS ?= $(SAN_CFLAGS) RELEASE ?= OPT_FLAGS ?= $(if $(RELEASE),-O2,-O0) @@ -326,8 +330,8 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" \ - EXTRA_CFLAGS='-g $(OPT_FLAGS) $(EXTRA_CFLAGS)' \ - EXTRA_LDFLAGS='$(EXTRA_LDFLAGS)' \ + EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS) $(EXTRA_CFLAGS)' \ + EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/ \ LIBBPF_DESTDIR=$(HOST_SCRATCH_DIR)/ \ @@ -338,8 +342,8 @@ $(CROSS_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(BPFOBJ) | $(BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) \ - EXTRA_CFLAGS='-g $(OPT_FLAGS) $(EXTRA_CFLAGS)' \ - EXTRA_LDFLAGS='$(EXTRA_LDFLAGS)' \ + EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS) $(EXTRA_CFLAGS)' \ + EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' \ OUTPUT=$(BUILD_DIR)/bpftool/ \ LIBBPF_OUTPUT=$(BUILD_DIR)/libbpf/ \ LIBBPF_DESTDIR=$(SCRATCH_DIR)/ \ @@ -404,6 +408,7 @@ $(RESOLVE_BTFIDS): $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/resolve_btfids \ $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/resolve_btfids \ CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" \ LIBBPF_INCLUDE=$(HOST_INCLUDE_DIR) \ + EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' \ 
OUTPUT=$(HOST_BUILD_DIR)/resolve_btfids/ BPFOBJ=$(HOST_BPFOBJ) # Get Clang's default includes on this system, as opposed to those seen by -- cgit v1.2.3 From c5c1e313493cd836863bb673bb2b8beaa915cad9 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:23 -0800 Subject: resolve_btfids: Fix memory leaks reported by ASAN Running resolve_btfids with ASAN reveals memory leaks in btf_id handling. - Change get_id() to use a local buffer - Make btf_id__add() strdup the name internally - Add btf_id__free_all() that frees all nodese of a tree - Call the cleanup function on exit for every tree Acked-by: Jiri Olsa Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-8-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/resolve_btfids/main.c | 81 +++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index ca7fcd03efb6..5208f650080f 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -226,7 +226,7 @@ static struct btf_id *btf_id__find(struct rb_root *root, const char *name) } static struct btf_id *__btf_id__add(struct rb_root *root, - char *name, + const char *name, enum btf_id_kind kind, bool unique) { @@ -250,7 +250,11 @@ static struct btf_id *__btf_id__add(struct rb_root *root, id = zalloc(sizeof(*id)); if (id) { pr_debug("adding symbol %s\n", name); - id->name = name; + id->name = strdup(name); + if (!id->name) { + free(id); + return NULL; + } id->kind = kind; rb_link_node(&id->rb_node, parent, p); rb_insert_color(&id->rb_node, root); @@ -258,17 +262,21 @@ static struct btf_id *__btf_id__add(struct rb_root *root, return id; } -static inline struct btf_id *btf_id__add(struct rb_root *root, char *name, enum btf_id_kind kind) +static inline struct btf_id *btf_id__add(struct rb_root *root, + const char *name, + enum btf_id_kind kind) { return __btf_id__add(root, name, kind, false); } -static inline struct btf_id *btf_id__add_unique(struct rb_root *root, char *name, enum btf_id_kind kind) +static inline struct btf_id *btf_id__add_unique(struct rb_root *root, + const char *name, + enum btf_id_kind kind) { return __btf_id__add(root, name, kind, true); } -static char *get_id(const char *prefix_end) +static int get_id(const char *prefix_end, char *buf, size_t buf_sz) { /* * __BTF_ID__func__vfs_truncate__0 @@ -277,28 +285,28 @@ static char *get_id(const char *prefix_end) */ int len = strlen(prefix_end); int pos = sizeof("__") - 1; - char *p, *id; + char *p; if (pos >= len) - return NULL; + return -1; - id = strdup(prefix_end + pos); - if (id) { - /* - * __BTF_ID__func__vfs_truncate__0 - * id = ^ - * - * cut the unique id part - */ - p = strrchr(id, '_'); - p--; - if (*p != '_') { - free(id); - return NULL; - } - *p = '\0'; - } - return id; + if (len - pos >= buf_sz) + return -1; + + strcpy(buf, prefix_end + pos); + /* + * __BTF_ID__func__vfs_truncate__0 + * buf = ^ + * + * cut the unique id part + */ + p = strrchr(buf, '_'); + p--; + if (*p != '_') + return -1; + *p = '\0'; + + return 0; } static struct btf_id *add_set(struct object *obj, char *name, enum btf_id_kind kind) @@ -335,10 +343,9 @@ static struct btf_id *add_set(struct object *obj, char *name, enum btf_id_kind k static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) { - char *id; + char id[KSYM_NAME_LEN]; - id = get_id(name + size); - if (!id) { + if (get_id(name + size, id, sizeof(id))) { pr_err("FAILED 
to parse symbol name: %s\n", name); return NULL; } @@ -346,6 +353,21 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) return btf_id__add(root, id, BTF_ID_KIND_SYM); } +static void btf_id__free_all(struct rb_root *root) +{ + struct rb_node *next; + struct btf_id *id; + + next = rb_first(root); + while (next) { + id = rb_entry(next, struct btf_id, rb_node); + next = rb_next(&id->rb_node); + rb_erase(&id->rb_node, root); + free(id->name); + free(id); + } +} + static void bswap_32_data(void *data, u32 nr_bytes) { u32 cnt, i; @@ -1547,6 +1569,11 @@ dump_btf: out: btf__free(obj.base_btf); btf__free(obj.btf); + btf_id__free_all(&obj.structs); + btf_id__free_all(&obj.unions); + btf_id__free_all(&obj.typedefs); + btf_id__free_all(&obj.funcs); + btf_id__free_all(&obj.sets); if (obj.efile.elf) { elf_end(obj.efile.elf); close(obj.efile.fd); -- cgit v1.2.3 From 45897ced3c1d3940fb5b68fe48c9e144143ae0ae Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:24 -0800 Subject: selftests/bpf: Add DENYLIST.asan Add a denylist file for tests that should be skipped when built with userspace ASAN: $ make ... SAN_CFLAGS="-fsanitize=address -fno-omit-frame-pointer" Skip the following tests: - *arena*: userspace ASAN does not understand BPF arena maps and gets confused particularly when map_extra is non-zero - non-zero map_extra leads to mmap with MAP_FIXED, and ASAN treats this as an unknown memory region - task_local_data: ASAN complains about "incorrect" aligned_alloc() usage, but it's intentional in the test - uprobe_multi_test: very slow with ASAN enabled Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-9-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.asan | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/testing/selftests/bpf/DENYLIST.asan diff --git a/tools/testing/selftests/bpf/DENYLIST.asan b/tools/testing/selftests/bpf/DENYLIST.asan new file mode 100644 index 000000000000..d7fe372a2293 --- /dev/null +++ b/tools/testing/selftests/bpf/DENYLIST.asan @@ -0,0 +1,3 @@ +*arena* +task_local_data +uprobe_multi_test -- cgit v1.2.3 From a1a771bd649212ef32cf9b0bcc63213a762d354a Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:25 -0800 Subject: selftests/bpf: Refactor bpf_get_ksyms() trace helper ASAN reported a memory leak in bpf_get_ksyms(): it allocates a struct ksyms internally and never frees it. Move struct ksyms to trace_helpers.h and return it from the bpf_get_ksyms(), giving ownership to the caller. Add filtered_syms and filtered_cnt fields to the ksyms to hold the filtered array of symbols, previously returned by bpf_get_ksyms(). Fixup the call sites: kprobe_multi_test and bench_trigger. 
Signed-off-by: Ihor Solodrai Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260223190736.649171-10-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/benchs/bench_trigger.c | 14 +++++++------ .../selftests/bpf/prog_tests/kprobe_multi_test.c | 12 +++++------ tools/testing/selftests/bpf/trace_helpers.c | 23 +++++++++++----------- tools/testing/selftests/bpf/trace_helpers.h | 11 +++++++++-- 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index aeec9edd3851..f74b313d6ae4 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -230,8 +230,8 @@ static void trigger_fentry_setup(void) static void attach_ksyms_all(struct bpf_program *empty, bool kretprobe) { LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); - char **syms = NULL; - size_t cnt = 0; + struct bpf_link *link = NULL; + struct ksyms *ksyms = NULL; /* Some recursive functions will be skipped in * bpf_get_ksyms -> skip_entry, as they can introduce sufficient @@ -241,16 +241,18 @@ static void attach_ksyms_all(struct bpf_program *empty, bool kretprobe) * So, don't run the kprobe-multi-all and kretprobe-multi-all on * a debug kernel. */ - if (bpf_get_ksyms(&syms, &cnt, true)) { + if (bpf_get_ksyms(&ksyms, true)) { fprintf(stderr, "failed to get ksyms\n"); exit(1); } - opts.syms = (const char **) syms; - opts.cnt = cnt; + opts.syms = (const char **)ksyms->filtered_syms; + opts.cnt = ksyms->filtered_cnt; opts.retprobe = kretprobe; /* attach empty to all the kernel functions except bpf_get_numa_node_id. */ - if (!bpf_program__attach_kprobe_multi_opts(empty, NULL, &opts)) { + link = bpf_program__attach_kprobe_multi_opts(empty, NULL, &opts); + free_kallsyms_local(ksyms); + if (!link) { fprintf(stderr, "failed to attach bpf_program__attach_kprobe_multi_opts to all\n"); exit(1); } diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 9caef222e528..f81dcd609ee9 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -456,25 +456,23 @@ static void test_kprobe_multi_bench_attach(bool kernel) { LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); struct kprobe_multi_empty *skel = NULL; - char **syms = NULL; - size_t cnt = 0; + struct ksyms *ksyms = NULL; - if (!ASSERT_OK(bpf_get_ksyms(&syms, &cnt, kernel), "bpf_get_ksyms")) + if (!ASSERT_OK(bpf_get_ksyms(&ksyms, kernel), "bpf_get_ksyms")) return; skel = kprobe_multi_empty__open_and_load(); if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) goto cleanup; - opts.syms = (const char **) syms; - opts.cnt = cnt; + opts.syms = (const char **)ksyms->filtered_syms; + opts.cnt = ksyms->filtered_cnt; do_bench_test(skel, &opts); cleanup: kprobe_multi_empty__destroy(skel); - if (syms) - free(syms); + free_kallsyms_local(ksyms); } static void test_kprobe_multi_bench_attach_addr(bool kernel) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index eeaab7013ca2..0e63daf83ed5 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -24,12 +24,6 @@ #define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" #define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" -struct ksyms { - struct ksym *syms; - size_t sym_cap; - size_t 
sym_cnt; -}; - static struct ksyms *ksyms; static pthread_mutex_t ksyms_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -54,6 +48,8 @@ void free_kallsyms_local(struct ksyms *ksyms) if (!ksyms) return; + free(ksyms->filtered_syms); + if (!ksyms->syms) { free(ksyms); return; @@ -610,7 +606,7 @@ static int search_kallsyms_compare(const void *p1, const struct ksym *p2) return compare_name(p1, p2->name); } -int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel) +int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel) { size_t cap = 0, cnt = 0; char *name = NULL, *ksym_name, **syms = NULL; @@ -637,8 +633,10 @@ int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel) else f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r"); - if (!f) + if (!f) { + free_kallsyms_local(ksyms); return -EINVAL; + } map = hashmap__new(symbol_hash, symbol_equal, NULL); if (IS_ERR(map)) { @@ -679,15 +677,18 @@ int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel) syms[cnt++] = ksym_name; } - *symsp = syms; - *cntp = cnt; + ksyms->filtered_syms = syms; + ksyms->filtered_cnt = cnt; + *ksymsp = ksyms; error: free(name); fclose(f); hashmap__free(map); - if (err) + if (err) { free(syms); + free_kallsyms_local(ksyms); + } return err; } diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index a5576b2dfc26..d5bf1433675d 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -23,7 +23,14 @@ struct ksym { long addr; char *name; }; -struct ksyms; + +struct ksyms { + struct ksym *syms; + size_t sym_cap; + size_t sym_cnt; + char **filtered_syms; + size_t filtered_cnt; +}; typedef int (*ksym_cmp_t)(const void *p1, const void *p2); typedef int (*ksym_search_cmp_t)(const void *p1, const struct ksym *p2); @@ -53,7 +60,7 @@ ssize_t get_rel_offset(uintptr_t addr); int read_build_id(const char *path, char *build_id, size_t size); -int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel); +int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel); int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel); #endif -- cgit v1.2.3 From 9d0272c91fbc3ae86f2c3b6ba0a0b0ee77d862e3 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:26 -0800 Subject: selftests/bpf: Fix memory leaks in tests Fix trivial memory leaks detected by userspace ASAN: - htab_update: free value buffer in test_reenter_update cleanup - test_xsk: inline pkt_stream_replace() in testapp_stats_rx_full() and testapp_stats_fill_empty() - testing_helpers: free buffer allocated by getline() in parse_test_list_file Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-11-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/htab_update.c | 1 + tools/testing/selftests/bpf/prog_tests/test_xsk.c | 24 ++++++++++++++++++---- tools/testing/selftests/bpf/testing_helpers.c | 1 + 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c index d0b405eb2966..ea1a6766fbe9 100644 --- a/tools/testing/selftests/bpf/prog_tests/htab_update.c +++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c @@ -61,6 +61,7 @@ static void test_reenter_update(void) ASSERT_EQ(skel->bss->update_err, -EDEADLK, "no reentrancy"); out: + free(value); htab_update__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c 
b/tools/testing/selftests/bpf/prog_tests/test_xsk.c index bab4a31621c7..7e38ec6e656b 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c @@ -2003,9 +2003,17 @@ int testapp_stats_tx_invalid_descs(struct test_spec *test) int testapp_stats_rx_full(struct test_spec *test) { - if (pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE)) + struct pkt_stream *tmp; + + tmp = pkt_stream_generate(DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE); + if (!tmp) + return TEST_FAILURE; + test->ifobj_tx->xsk->pkt_stream = tmp; + + tmp = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + if (!tmp) return TEST_FAILURE; - test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + test->ifobj_rx->xsk->pkt_stream = tmp; test->ifobj_rx->xsk->rxqsize = DEFAULT_UMEM_BUFFERS; test->ifobj_rx->release_rx = false; @@ -2015,9 +2023,17 @@ int testapp_stats_rx_full(struct test_spec *test) int testapp_stats_fill_empty(struct test_spec *test) { - if (pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE)) + struct pkt_stream *tmp; + + tmp = pkt_stream_generate(DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE); + if (!tmp) + return TEST_FAILURE; + test->ifobj_tx->xsk->pkt_stream = tmp; + + tmp = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + if (!tmp) return TEST_FAILURE; - test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + test->ifobj_rx->xsk->pkt_stream = tmp; test->ifobj_rx->use_fill_ring = false; test->ifobj_rx->validation_func = validate_fill_empty; diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 16eb37e5bad6..66af0d13751a 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -212,6 +212,7 @@ int parse_test_list_file(const char *path, break; } + free(buf); fclose(f); return err; } -- cgit v1.2.3 From 3eb4a2e399c6766ea0c8291e2adb5e6dc42b6e68 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:27 -0800 Subject: selftests/bpf: Fix cleanup in check_fd_array_cnt__fd_array_too_big() The Close() macro uses the passed in expression three times, which leads to repeated execution in case it has side effects. That is, Close(i--) would decrement i three times. ASAN caught a stack-buffer-undeflow error at a point where this was overlooked. Fix it. 
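The multiple-evaluation hazard described above is easiest to see with a small userspace sketch. The macro below is hypothetical — it is not the selftest's actual Close() helper — but it shows why handing an expression with side effects, such as i--, to a macro that expands its argument several times misbehaves, and why moving the decrement into the loop condition (as the fix below does) is the safe form.

#include <stdio.h>
#include <unistd.h>

/*
 * Hypothetical stand-in for the selftest's Close() helper: any macro
 * that expands its argument more than once repeats the argument's
 * side effects.  Here "fd" is evaluated up to three times.
 */
#define CLOSE_CHECKED(fd)						\
	do {								\
		if ((fd) >= 0 && close(fd) < 0)				\
			fprintf(stderr, "close(%d) failed\n", (fd));	\
	} while (0)

static void close_all(int *fds, int n)
{
	int i = n;

	/*
	 * Buggy pattern: CLOSE_CHECKED(fds[--i]) decrements i on every
	 * expansion of "fd", walking past the start of the array.
	 *
	 * Safe pattern (mirroring the fix below): perform the side
	 * effect once, in the loop condition, and pass a side-effect
	 * free expression to the macro.
	 */
	while (i-- > 0)
		CLOSE_CHECKED(fds[i]);
}
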
Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-12-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/fd_array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/fd_array.c b/tools/testing/selftests/bpf/prog_tests/fd_array.c index c534b4d5f9da..3078d8264deb 100644 --- a/tools/testing/selftests/bpf/prog_tests/fd_array.c +++ b/tools/testing/selftests/bpf/prog_tests/fd_array.c @@ -412,8 +412,8 @@ static void check_fd_array_cnt__fd_array_too_big(void) ASSERT_EQ(prog_fd, -E2BIG, "prog should have been rejected with -E2BIG"); cleanup_fds: - while (i > 0) - Close(extra_fds[--i]); + while (i-- > 0) + Close(extra_fds[i]); } void test_fd_array_cnt(void) -- cgit v1.2.3 From ff9511410d5fbdec21a3bbfaa477aa1f56735b33 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:28 -0800 Subject: veristat: Fix a memory leak for preset ENUMERATOR ASAN detected a memory leak in veristat. The cleanup code handling ENUMERATOR value missed freeing strdup-ed svalue. Fix it. Acked-by: Mykyta Yatsenko Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-13-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 1be1e353d40a..75f85e0362f5 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -3378,6 +3378,8 @@ int main(int argc, char **argv) } } free(env.presets[i].atoms); + if (env.presets[i].value.type == ENUMERATOR) + free(env.presets[i].value.svalue); } free(env.presets); return -err; -- cgit v1.2.3 From f7ac552be7f13e834a942f64b1ac87e7755b32f4 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:29 -0800 Subject: selftests/bpf: Fix use-after-free in xdp_metadata test ASAN reported a use-after-free in close_xsk(). The xsk->socket internally references xsk->umem via socket->ctx->umem, so the socket must be deleted before the umem. Fix the order of operations in close_xsk(). Acked-by: Mykyta Yatsenko Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-14-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/xdp_metadata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index 19f92affc2da..5c31054ad4a4 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -126,10 +126,10 @@ static int open_xsk(int ifindex, struct xsk *xsk) static void close_xsk(struct xsk *xsk) { - if (xsk->umem) - xsk_umem__delete(xsk->umem); if (xsk->socket) xsk_socket__delete(xsk->socket); + if (xsk->umem) + xsk_umem__delete(xsk->umem); munmap(xsk->umem_area, UMEM_SIZE); } -- cgit v1.2.3 From 68b8bea1eec392aa50736a859b8f1418067a3bc4 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:30 -0800 Subject: selftests/bpf: Fix double thread join in uprobe_multi_test ASAN reported a "joining already joined thread" error. The release_child() may be called multiple times for the same struct child. Fix by resetting child->thread to 0 after pthread_join. 
Also memset(0) static child variable in test_attach_api(). Acked-by: Mykyta Yatsenko Acked-by: Jiri Olsa Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-15-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 2ee17ef1dae2..56cbea280fbd 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -62,8 +62,10 @@ static void release_child(struct child *child) return; close(child->go[1]); close(child->go[0]); - if (child->thread) + if (child->thread) { pthread_join(child->thread, NULL); + child->thread = 0; + } close(child->c2p[0]); close(child->c2p[1]); if (child->pid > 0) @@ -331,6 +333,8 @@ test_attach_api(const char *binary, const char *pattern, struct bpf_uprobe_multi { static struct child child; + memset(&child, 0, sizeof(child)); + /* no pid filter */ __test_attach_api(binary, pattern, opts, NULL); -- cgit v1.2.3 From 71dca2950fe9abf8e6fd3d9f5e6342da6634b898 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:31 -0800 Subject: selftests/bpf: Fix resource leaks caused by missing cleanups ASAN reported a number of resource leaks: - Add missing *__destroy(skel) calls - Replace bpf_link__detach() with bpf_link__destroy() where appropriate - cgrp_local_storage: Add bpf_link__destroy() when bpf_iter_create fails - dynptr: Add missing bpf_object__close() Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-16-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/cgrp_local_storage.c | 4 +++- tools/testing/selftests/bpf/prog_tests/dynptr.c | 5 +++- .../selftests/bpf/prog_tests/sockmap_basic.c | 28 +++++++++++----------- .../selftests/bpf/prog_tests/sockmap_listen.c | 2 +- .../bpf/prog_tests/struct_ops_private_stack.c | 4 +--- tools/testing/selftests/bpf/prog_tests/tc_opts.c | 6 ++--- .../selftests/bpf/prog_tests/test_tc_tunnel.c | 5 +++- 7 files changed, 29 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c index 9015e2c2ab12..478a77cb67e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c @@ -202,7 +202,7 @@ static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id) iter_fd = bpf_iter_create(bpf_link__fd(link)); if (!ASSERT_GE(iter_fd, 0, "iter_create")) - goto out; + goto out_link; /* trigger the program run */ (void)read(iter_fd, buf, sizeof(buf)); @@ -210,6 +210,8 @@ static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id) ASSERT_EQ(skel->bss->cgroup_id, cgroup_id, "cgroup_id"); close(iter_fd); +out_link: + bpf_link__destroy(link); out: cgrp_ls_sleepable__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index b9f86cb91e81..5fda11590708 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -137,11 +137,14 @@ static void verify_success(const char *prog_name, enum test_setup_type setup_typ ); link = bpf_program__attach(prog); - if 
(!ASSERT_OK_PTR(link, "bpf_program__attach")) + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) { + bpf_object__close(obj); goto cleanup; + } err = bpf_prog_test_run_opts(aux_prog_fd, &topts); bpf_link__destroy(link); + bpf_object__close(obj); if (!ASSERT_OK(err, "test_run")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 256707e7d20d..dd3c757859f6 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -204,7 +204,7 @@ static void test_skmsg_helpers_with_link(enum bpf_map_type map_type) /* Fail since bpf_link for the same prog type has been created. */ link2 = bpf_program__attach_sockmap(prog_clone, map); if (!ASSERT_ERR_PTR(link2, "bpf_program__attach_sockmap")) { - bpf_link__detach(link2); + bpf_link__destroy(link2); goto out; } @@ -230,7 +230,7 @@ static void test_skmsg_helpers_with_link(enum bpf_map_type map_type) if (!ASSERT_OK(err, "bpf_link_update")) goto out; out: - bpf_link__detach(link); + bpf_link__destroy(link); test_skmsg_load_helpers__destroy(skel); } @@ -417,7 +417,7 @@ static void test_sockmap_skb_verdict_attach_with_link(void) if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) goto out; - bpf_link__detach(link); + bpf_link__destroy(link); err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); if (!ASSERT_OK(err, "bpf_prog_attach")) @@ -426,7 +426,7 @@ static void test_sockmap_skb_verdict_attach_with_link(void) /* Fail since attaching with the same prog/map has been done. */ link = bpf_program__attach_sockmap(prog, map); if (!ASSERT_ERR_PTR(link, "bpf_program__attach_sockmap")) - bpf_link__detach(link); + bpf_link__destroy(link); err = bpf_prog_detach2(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT); if (!ASSERT_OK(err, "bpf_prog_detach2")) @@ -747,13 +747,13 @@ static void test_sockmap_skb_verdict_peek_with_link(void) test_sockmap_skb_verdict_peek_helper(map); ASSERT_EQ(pass->bss->clone_called, 1, "clone_called"); out: - bpf_link__detach(link); + bpf_link__destroy(link); test_sockmap_pass_prog__destroy(pass); } static void test_sockmap_unconnected_unix(void) { - int err, map, stream = 0, dgram = 0, zero = 0; + int err, map, stream = -1, dgram = -1, zero = 0; struct test_sockmap_pass_prog *skel; skel = test_sockmap_pass_prog__open_and_load(); @@ -764,22 +764,22 @@ static void test_sockmap_unconnected_unix(void) stream = xsocket(AF_UNIX, SOCK_STREAM, 0); if (stream < 0) - return; + goto out; dgram = xsocket(AF_UNIX, SOCK_DGRAM, 0); - if (dgram < 0) { - close(stream); - return; - } + if (dgram < 0) + goto out; err = bpf_map_update_elem(map, &zero, &stream, BPF_ANY); - ASSERT_ERR(err, "bpf_map_update_elem(stream)"); + if (!ASSERT_ERR(err, "bpf_map_update_elem(stream)")) + goto out; err = bpf_map_update_elem(map, &zero, &dgram, BPF_ANY); ASSERT_OK(err, "bpf_map_update_elem(dgram)"); - +out: close(stream); close(dgram); + test_sockmap_pass_prog__destroy(skel); } static void test_sockmap_many_socket(void) @@ -1027,7 +1027,7 @@ static void test_sockmap_skb_verdict_vsock_poll(void) if (xrecv_nonblock(conn, &buf, 1, 0) != 1) FAIL("xrecv_nonblock"); detach: - bpf_link__detach(link); + bpf_link__destroy(link); close: xclose(conn); xclose(peer); diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index f1bdccc7e4e7..cc0c68bab907 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ 
b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -899,7 +899,7 @@ static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *sk redir_to_listening(family, sotype, sock_map, verdict_map, REDIR_EGRESS); - bpf_link__detach(link); + bpf_link__destroy(link); } static void redir_partial(int family, int sotype, int sock_map, int parser_map) diff --git a/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c b/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c index 4006879ca3fe..d42123a0fb16 100644 --- a/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c +++ b/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c @@ -54,9 +54,7 @@ static void test_private_stack_fail(void) } err = struct_ops_private_stack_fail__load(skel); - if (!ASSERT_ERR(err, "struct_ops_private_stack_fail__load")) - goto cleanup; - return; + ASSERT_ERR(err, "struct_ops_private_stack_fail__load"); cleanup: struct_ops_private_stack_fail__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/tc_opts.c b/tools/testing/selftests/bpf/prog_tests/tc_opts.c index dd7a138d8c3d..2955750ddada 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_opts.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_opts.c @@ -1360,10 +1360,8 @@ static void test_tc_opts_dev_cleanup_target(int target) assert_mprog_count_ifindex(ifindex, target, 4); - ASSERT_OK(system("ip link del dev tcx_opts1"), "del veth"); - ASSERT_EQ(if_nametoindex("tcx_opts1"), 0, "dev1_removed"); - ASSERT_EQ(if_nametoindex("tcx_opts2"), 0, "dev2_removed"); - return; + goto cleanup; + cleanup3: err = bpf_prog_detach_opts(fd3, loopback, target, &optd); ASSERT_OK(err, "prog_detach"); diff --git a/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c index 0fe0a8f62486..7fc4d7dd70ef 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c @@ -699,7 +699,7 @@ void test_tc_tunnel(void) return; if (!ASSERT_OK(setup(), "global setup")) - return; + goto out; for (i = 0; i < ARRAY_SIZE(subtests_cfg); i++) { cfg = &subtests_cfg[i]; @@ -711,4 +711,7 @@ void test_tc_tunnel(void) subtest_cleanup(cfg); } cleanup(); + +out: + test_tc_tunnel__destroy(skel); } -- cgit v1.2.3 From 2bb270a0ac68990648c5e84abf73bb7493230ea6 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:32 -0800 Subject: selftests/bpf: Free bpf_object in test_sysctl ASAN reported a resource leak due to the bpf_object not being tracked in test_sysctl. Add obj field to struct sysctl_test to properly clean it up. 
Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-17-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/test_sysctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/test_sysctl.c b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c index 273dd41ca09e..2a1ff821bc97 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_sysctl.c +++ b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c @@ -27,6 +27,7 @@ struct sysctl_test { OP_EPERM, SUCCESS, } result; + struct bpf_object *obj; }; static struct sysctl_test tests[] = { @@ -1471,6 +1472,7 @@ static int load_sysctl_prog_file(struct sysctl_test *test) return -1; } + test->obj = obj; return prog_fd; } @@ -1573,6 +1575,7 @@ out: /* Detaching w/o checking return code: best effort attempt. */ if (progfd != -1) bpf_prog_detach(cgfd, atype); + bpf_object__close(test->obj); close(progfd); printf("[%s]\n", err ? "FAIL" : "PASS"); return err; -- cgit v1.2.3 From 3e711c8e4707c46cc45d9775b50204d0f2790d77 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:33 -0800 Subject: selftests/bpf: Fix array bounds warning in jit_disasm_helpers Compiler cannot infer upper bound for labels.cnt and warns about potential buffer overflow in snprintf. Add an explicit bounds check (... && i < MAX_LOCAL_LABELS) in the loop condition to fix the warning. Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-18-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/jit_disasm_helpers.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c index febd6b12e372..364c557c5115 100644 --- a/tools/testing/selftests/bpf/jit_disasm_helpers.c +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -122,15 +122,15 @@ static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len) pc += cnt; } qsort(labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); - for (i = 0; i < labels.cnt; ++i) - /* gcc is unable to infer upper bound for labels.cnt and assumes - * it to be U32_MAX. U32_MAX takes 10 decimal digits. - * snprintf below prints into labels.names[*], - * which has space only for two digits and a letter. - * To avoid truncation warning use (i % MAX_LOCAL_LABELS), - * which informs gcc about printed value upper bound. - */ - snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % MAX_LOCAL_LABELS); + /* gcc is unable to infer upper bound for labels.cnt and + * assumes it to be U32_MAX. U32_MAX takes 10 decimal digits. + * snprintf below prints into labels.names[*], which has space + * only for two digits and a letter. To avoid truncation + * warning use (i < MAX_LOCAL_LABELS), which informs gcc about + * printed value upper bound. 
+ */ + for (i = 0; i < labels.cnt && i < MAX_LOCAL_LABELS; ++i) + snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i); /* now print with labels */ labels.print_phase = true; -- cgit v1.2.3 From ad90ecedad755e2f3e31364ec3130e5bb2f4b64a Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:11:16 -0800 Subject: selftests/bpf: Fix out-of-bounds array access bugs reported by ASAN - kmem_cache_iter: remove unnecessary debug output - lwt_seg6local: change the type of foobar to char[] - the sizeof(foobar) returned the pointer size and not a string length as intended - verifier_log: increase prog_name buffer size in verif_log_subtest() - compiler has a conservative estimate of fixed_log_sz value, making ASAN complain on snprint() call Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223191118.655185-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c | 7 ++----- tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c | 2 +- tools/testing/selftests/bpf/prog_tests/verifier_log.c | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c index 6e35e13c2022..399fe9103f83 100644 --- a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c @@ -104,11 +104,8 @@ void test_kmem_cache_iter(void) if (!ASSERT_GE(iter_fd, 0, "iter_create")) goto destroy; - memset(buf, 0, sizeof(buf)); - while (read(iter_fd, buf, sizeof(buf)) > 0) { - /* Read out all contents */ - printf("%s", buf); - } + while (read(iter_fd, buf, sizeof(buf)) > 0) + ; /* Read out all contents */ /* Next reads should return 0 */ ASSERT_EQ(read(iter_fd, buf, sizeof(buf)), 0, "read"); diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c b/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c index 3bc730b7c7fa..1b25d5c5f8fb 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c @@ -117,7 +117,7 @@ void test_lwt_seg6local(void) const char *ns1 = NETNS_BASE "1"; const char *ns6 = NETNS_BASE "6"; struct nstoken *nstoken = NULL; - const char *foobar = "foobar"; + const char foobar[] = "foobar"; ssize_t bytes; int sfd, cfd; char buf[7]; diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_log.c b/tools/testing/selftests/bpf/prog_tests/verifier_log.c index 8337c6bc5b95..aaa2854974c0 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier_log.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier_log.c @@ -47,7 +47,7 @@ static int load_prog(struct bpf_prog_load_opts *opts, bool expect_load_error) static void verif_log_subtest(const char *name, bool expect_load_error, int log_level) { LIBBPF_OPTS(bpf_prog_load_opts, opts); - char *exp_log, prog_name[16], op_name[32]; + char *exp_log, prog_name[24], op_name[32]; struct test_log_buf *skel; struct bpf_program *prog; size_t fixed_log_sz; -- cgit v1.2.3 From a2714e730304c79bf85d2178387255ef8348b897 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:11:17 -0800 Subject: selftests/bpf: Check BPFTOOL env var in detect_bpftool_path() The bpftool_maps_access and bpftool_metadata tests may fail on BPF CI with "command not found", depending on a workflow. 
This happens because detect_bpftool_path() only checks two hardcoded relative paths: - ./tools/sbin/bpftool - ../tools/sbin/bpftool Add support for a BPFTOOL environment variable that allows specifying the exact path to the bpftool binary. Acked-by: Mykyta Yatsenko Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223191118.655185-2-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpftool_helpers.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/bpftool_helpers.c b/tools/testing/selftests/bpf/bpftool_helpers.c index 595a636fa13b..929fc257f431 100644 --- a/tools/testing/selftests/bpf/bpftool_helpers.c +++ b/tools/testing/selftests/bpf/bpftool_helpers.c @@ -14,6 +14,17 @@ static int detect_bpftool_path(char *buffer, size_t size) { char tmp[BPFTOOL_PATH_MAX_LEN]; + const char *env_path; + + /* First, check if BPFTOOL environment variable is set */ + env_path = getenv("BPFTOOL"); + if (env_path && access(env_path, X_OK) == 0) { + strscpy(buffer, env_path, size); + return 0; + } else if (env_path) { + fprintf(stderr, "bpftool '%s' doesn't exist or is not executable\n", env_path); + return 1; + } /* Check default bpftool location (will work if we are running the * default flavor of test_progs) @@ -33,7 +44,7 @@ static int detect_bpftool_path(char *buffer, size_t size) return 0; } - /* Failed to find bpftool binary */ + fprintf(stderr, "Failed to detect bpftool path, use BPFTOOL env var to override\n"); return 1; } -- cgit v1.2.3 From 4c9d07865c06bb9b9faff1ecf6c4b7cc8f7d67a9 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:11:18 -0800 Subject: selftests/bpf: Don't override SIGSEGV handler with ASAN test_progs has custom SIGSEGV handler, which interferes with the address sanitizer [1]. Add an #ifndef to avoid this. Additionally, declare an __asan_on_error() to dump the test logs in the same way it happens in the custom SIGSEGV handler. 
[1] https://lore.kernel.org/bpf/73d832948b01dbc0ebc60d85574bdf8537f3a810.camel@gmail.com/ Acked-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223191118.655185-3-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_progs.c | 36 +++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index d1418ec1f351..0929f4a7bda4 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1261,14 +1261,8 @@ int get_bpf_max_tramp_links(void) return ret; } -#define MAX_BACKTRACE_SZ 128 -void crash_handler(int signum) +static void dump_crash_log(void) { - void *bt[MAX_BACKTRACE_SZ]; - size_t sz; - - sz = backtrace(bt, ARRAY_SIZE(bt)); - fflush(stdout); stdout = env.stdout_saved; stderr = env.stderr_saved; @@ -1277,12 +1271,32 @@ void crash_handler(int signum) env.test_state->error_cnt++; dump_test_log(env.test, env.test_state, true, false, NULL); } +} + +#define MAX_BACKTRACE_SZ 128 + +void crash_handler(int signum) +{ + void *bt[MAX_BACKTRACE_SZ]; + size_t sz; + + sz = backtrace(bt, ARRAY_SIZE(bt)); + + dump_crash_log(); + if (env.worker_id != -1) fprintf(stderr, "[%d]: ", env.worker_id); fprintf(stderr, "Caught signal #%d!\nStack trace:\n", signum); backtrace_symbols_fd(bt, sz, STDERR_FILENO); } +#ifdef __SANITIZE_ADDRESS__ +void __asan_on_error(void) +{ + dump_crash_log(); +} +#endif + void hexdump(const char *prefix, const void *buf, size_t len) { for (int i = 0; i < len; i++) { @@ -1944,13 +1958,15 @@ int main(int argc, char **argv) .parser = parse_arg, .doc = argp_program_doc, }; + int err, i; + +#ifndef __SANITIZE_ADDRESS__ struct sigaction sigact = { .sa_handler = crash_handler, .sa_flags = SA_RESETHAND, - }; - int err, i; - + }; sigaction(SIGSEGV, &sigact, NULL); +#endif env.stdout_saved = stdout; env.stderr_saved = stderr; -- cgit v1.2.3 From 56e0a838277b12d48477b9c7478ceb5ae528ae2c Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Tue, 24 Feb 2026 15:57:53 +0800 Subject: MAINTAINERS: Update Shawn Guo's address for HiSilicon PCIe controller driver Shawn is no longer with Linaro. Use his korg email address instead. Signed-off-by: Shawn Guo Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260224075753.122091-1-shawnguo@kernel.org --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 55af015174a5..f02f50c9d695 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20509,7 +20509,7 @@ F: Documentation/devicetree/bindings/pci/hisilicon,kirin-pcie.yaml F: drivers/pci/controller/dwc/pcie-kirin.c PCIE DRIVER FOR HISILICON STB -M: Shawn Guo +M: Shawn Guo L: linux-pci@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pci/hisilicon-histb-pcie.txt -- cgit v1.2.3 From 30df81f2228d65bddf492db3929d9fcaffd38fc5 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Mon, 23 Feb 2026 14:56:09 +0800 Subject: scsi: ufs: core: Fix possible NULL pointer dereference in ufshcd_add_command_trace() The kernel log indicates a crash in ufshcd_add_command_trace, due to a NULL pointer dereference when accessing hwq->id. This can happen if ufshcd_mcq_req_to_hwq() returns NULL. This patch adds a NULL check for hwq before accessing its id field to prevent a kernel crash. 
Kernel log excerpt: [] notify_die+0x4c/0x8c [] __die+0x60/0xb0 [] die+0x4c/0xe0 [] die_kernel_fault+0x74/0x88 [] __do_kernel_fault+0x314/0x318 [] do_page_fault+0xa4/0x5f8 [] do_translation_fault+0x34/0x54 [] do_mem_abort+0x50/0xa8 [] el1_abort+0x3c/0x64 [] el1h_64_sync_handler+0x44/0xcc [] el1h_64_sync+0x80/0x88 [] ufshcd_add_command_trace+0x23c/0x320 [] ufshcd_compl_one_cqe+0xa4/0x404 [] ufshcd_mcq_poll_cqe_lock+0xac/0x104 [] ufs_mtk_mcq_intr+0x54/0x74 [ufs_mediatek_mod] [] __handle_irq_event_percpu+0xc8/0x348 [] handle_irq_event+0x3c/0xa8 [] handle_fasteoi_irq+0xf8/0x294 [] generic_handle_domain_irq+0x54/0x80 [] gic_handle_irq+0x1d4/0x330 [] call_on_irq_stack+0x44/0x68 [] do_interrupt_handler+0x78/0xd8 [] el1_interrupt+0x48/0xa8 [] el1h_64_irq_handler+0x14/0x24 [] el1h_64_irq+0x80/0x88 [] arch_local_irq_enable+0x4/0x1c [] cpuidle_enter+0x34/0x54 [] do_idle+0x1dc/0x2f8 [] cpu_startup_entry+0x30/0x3c [] secondary_start_kernel+0x134/0x1ac [] __secondary_switched+0xc4/0xcc Signed-off-by: Peter Wang Reviewed-by: Bart Van Assche Link: https://patch.msgid.link/20260223065657.2432447-1-peter.wang@mediatek.com Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 571a73860561..4dc7fbc2a6db 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -518,8 +518,8 @@ static void ufshcd_add_command_trace(struct ufs_hba *hba, struct scsi_cmnd *cmd, if (hba->mcq_enabled) { struct ufs_hw_queue *hwq = ufshcd_mcq_req_to_hwq(hba, rq); - - hwq_id = hwq->id; + if (hwq) + hwq_id = hwq->id; } else { doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL); } -- cgit v1.2.3 From 62c015373e1cdb1cdca824bd2dbce2dac0819467 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Mon, 23 Feb 2026 18:37:57 +0800 Subject: scsi: ufs: core: Move link recovery for hibern8 exit failure to wl_resume Move the link recovery trigger from ufshcd_uic_pwr_ctrl() to __ufshcd_wl_resume(). Ensure link recovery is only attempted when hibern8 exit fails during resume, not during hibern8 enter in suspend. Improve error handling and prevent unnecessary link recovery attempts. Fixes: 35dabf4503b9 ("scsi: ufs: core: Use link recovery when h8 exit fails during runtime resume") Signed-off-by: Peter Wang Reviewed-by: Bart Van Assche Link: https://patch.msgid.link/20260223103906.2533654-1-peter.wang@mediatek.com Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 4dc7fbc2a6db..85987373b961 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -4390,14 +4390,6 @@ out_unlock: spin_unlock_irqrestore(hba->host->host_lock, flags); mutex_unlock(&hba->uic_cmd_mutex); - /* - * If the h8 exit fails during the runtime resume process, it becomes - * stuck and cannot be recovered through the error handler. To fix - * this, use link recovery instead of the error handler. - */ - if (ret && hba->pm_op_in_progress) - ret = ufshcd_link_recovery(hba); - return ret; } @@ -10200,7 +10192,15 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op) } else { dev_err(hba->dev, "%s: hibern8 exit failed %d\n", __func__, ret); - goto vendor_suspend; + /* + * If the h8 exit fails during the runtime resume + * process, it becomes stuck and cannot be recovered + * through the error handler. 
To fix this, use link + * recovery instead of the error handler. + */ + ret = ufshcd_link_recovery(hba); + if (ret) + goto vendor_suspend; } } else if (ufshcd_is_link_off(hba)) { /* -- cgit v1.2.3 From 74b6e83942dcc9f3cca9e561b205a5b19940a344 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 19 Feb 2026 12:50:29 -0800 Subject: drm/gpusvm: Fix drm_gpusvm_pages_valid_unlocked() kernel-doc The kernel-doc for drm_gpusvm_pages_valid_unlocked() was stale and still referenced old range-based arguments and naming. Update the documentation to match the current function arguments and signature. Signed-off-by: Matthew Brost Reviewed-by: Maarten Lankhorst Link: https://patch.msgid.link/20260219205029.1011336-1-matthew.brost@intel.com --- drivers/gpu/drm/drm_gpusvm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c index 24180bfdf5a2..9ef9e52c0547 100644 --- a/drivers/gpu/drm/drm_gpusvm.c +++ b/drivers/gpu/drm/drm_gpusvm.c @@ -1338,14 +1338,14 @@ bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm, EXPORT_SYMBOL_GPL(drm_gpusvm_range_pages_valid); /** - * drm_gpusvm_range_pages_valid_unlocked() - GPU SVM range pages valid unlocked + * drm_gpusvm_pages_valid_unlocked() - GPU SVM pages valid unlocked * @gpusvm: Pointer to the GPU SVM structure - * @range: Pointer to the GPU SVM range structure + * @svm_pages: Pointer to the GPU SVM pages structure * - * This function determines if a GPU SVM range pages are valid. Expected be - * called without holding gpusvm->notifier_lock. + * This function determines if a GPU SVM pages are valid. Expected be called + * without holding gpusvm->notifier_lock. * - * Return: True if GPU SVM range has valid pages, False otherwise + * Return: True if GPU SVM pages are valid, False otherwise */ static bool drm_gpusvm_pages_valid_unlocked(struct drm_gpusvm *gpusvm, struct drm_gpusvm_pages *svm_pages) -- cgit v1.2.3 From 0902010c8d163f7b62e655efda1a843529152c7c Mon Sep 17 00:00:00 2001 From: Felix Gu Date: Tue, 24 Feb 2026 18:07:59 +0800 Subject: regulator: fp9931: Fix PM runtime reference leak in fp9931_hwmon_read() In fp9931_hwmon_read(), if regmap_read() failed, the function returned the error code without calling pm_runtime_put_autosuspend(), causing a PM reference leak. 
Fixes: 12d821bd13d4 ("regulator: Add FP9931/JD9930 driver") Signed-off-by: Felix Gu Reviewed-by: Andreas Kemnade Link: https://patch.msgid.link/20260224-fp9931-v1-1-1cf05cabef4a@gmail.com Signed-off-by: Mark Brown --- drivers/regulator/fp9931.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/regulator/fp9931.c b/drivers/regulator/fp9931.c index 7fbcc6327cc6..abea3b69d8a0 100644 --- a/drivers/regulator/fp9931.c +++ b/drivers/regulator/fp9931.c @@ -144,13 +144,12 @@ static int fp9931_hwmon_read(struct device *dev, enum hwmon_sensor_types type, return ret; ret = regmap_read(data->regmap, FP9931_REG_TMST_VALUE, &val); - if (ret) - return ret; + if (!ret) + *temp = (s8)val * 1000; pm_runtime_put_autosuspend(data->dev); - *temp = (s8)val * 1000; - return 0; + return ret; } static umode_t fp9931_hwmon_is_visible(const void *data, -- cgit v1.2.3 From 4baaddaa44af01cd4ce239493060738fd0881835 Mon Sep 17 00:00:00 2001 From: Felix Gu Date: Tue, 24 Feb 2026 19:19:03 +0800 Subject: regulator: bq257xx: Fix device node reference leak in bq257xx_reg_dt_parse_gpio() In bq257xx_reg_dt_parse_gpio(), if fails to get subchild, it returns without calling of_node_put(child), causing the device node reference leak. Fixes: 981dd162b635 ("regulator: bq257xx: Add bq257xx boost regulator driver") Signed-off-by: Felix Gu Link: https://patch.msgid.link/20260224-bq257-v1-1-8ebbc731c1c3@gmail.com Signed-off-by: Mark Brown --- drivers/regulator/bq257xx-regulator.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/regulator/bq257xx-regulator.c b/drivers/regulator/bq257xx-regulator.c index fc1ccede4468..dab8f1ab4450 100644 --- a/drivers/regulator/bq257xx-regulator.c +++ b/drivers/regulator/bq257xx-regulator.c @@ -115,11 +115,10 @@ static void bq257xx_reg_dt_parse_gpio(struct platform_device *pdev) return; subchild = of_get_child_by_name(child, pdata->desc.of_match); + of_node_put(child); if (!subchild) return; - of_node_put(child); - pdata->otg_en_gpio = devm_fwnode_gpiod_get_index(&pdev->dev, of_fwnode_handle(subchild), "enable", 0, -- cgit v1.2.3 From bfd7db781e2e7a99b086d645a104d16e368f58ff Mon Sep 17 00:00:00 2001 From: Felix Gu Date: Tue, 24 Feb 2026 20:23:30 +0800 Subject: regulator: Kconfig: fix a typo Fixes a typo in Kconfig, HWWON -> HWMON Signed-off-by: Felix Gu Link: https://patch.msgid.link/20260224-kconfig-v1-1-b0c5459ed7a0@gmail.com Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index a708fc63f581..d10b6f9243d5 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -508,7 +508,7 @@ config REGULATOR_FP9931 This driver supports the FP9931/JD9930 voltage regulator chip which is used to provide power to Electronic Paper Displays so it is found in E-Book readers. - If HWWON is enabled, it also provides temperature measurement. + If HWMON is enabled, it also provides temperature measurement. config REGULATOR_LM363X tristate "TI LM363X voltage regulators" -- cgit v1.2.3 From e08f2adcf990b8cf272e90898401a9e481c1c667 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 24 Feb 2026 13:36:09 +0200 Subject: Revert "irqchip/ls-extirq: Use for_each_of_imap_item iterator" This reverts commit 3ac6dfe3d7a2396602b67667249b146504dfbd2a. 
The ls-extirq uses interrupt-map but it's a non-standard use documented in fsl,ls-extirq.yaml: # The driver(drivers/irqchip/irq-ls-extirq.c) have not use standard DT # function to parser interrupt-map. So it doesn't consider '#address-size' # in parent interrupt controller, such as GIC. # # When dt-binding verify interrupt-map, item data matrix is spitted at # incorrect position. Remove interrupt-map restriction because it always # wrong. This means that by using for_each_of_imap_item and the underlying of_irq_parse_imap_parent() on its interrupt-map property will effectively break its functionality Revert the patch making use of for_each_of_imap_item() in ls-extirq. Fixes: 3ac6dfe3d7a2 ("irqchip/ls-extirq: Use for_each_of_imap_item iterator") Signed-off-by: Ioana Ciornei Signed-off-by: Thomas Gleixner Acked-by: Herve Codina Link: https://patch.msgid.link/20260224113610.1129022-2-ioana.ciornei@nxp.com --- drivers/irqchip/irq-ls-extirq.c | 47 ++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/drivers/irqchip/irq-ls-extirq.c b/drivers/irqchip/irq-ls-extirq.c index a7e9c3885b09..96f9c20621cf 100644 --- a/drivers/irqchip/irq-ls-extirq.c +++ b/drivers/irqchip/irq-ls-extirq.c @@ -125,32 +125,45 @@ static const struct irq_domain_ops extirq_domain_ops = { static int ls_extirq_parse_map(struct ls_extirq_data *priv, struct device_node *node) { - struct of_imap_parser imap_parser; - struct of_imap_item imap_item; + const __be32 *map; + u32 mapsize; int ret; - ret = of_imap_parser_init(&imap_parser, node, &imap_item); - if (ret) - return ret; + map = of_get_property(node, "interrupt-map", &mapsize); + if (!map) + return -ENOENT; + if (mapsize % sizeof(*map)) + return -EINVAL; + mapsize /= sizeof(*map); - for_each_of_imap_item(&imap_parser, &imap_item) { + while (mapsize) { struct device_node *ipar; - u32 hwirq; - int i; + u32 hwirq, intsize, j; - hwirq = imap_item.child_imap[0]; - if (hwirq >= MAXIRQ) { - of_node_put(imap_item.parent_args.np); + if (mapsize < 3) + return -EINVAL; + hwirq = be32_to_cpup(map); + if (hwirq >= MAXIRQ) return -EINVAL; - } priv->nirq = max(priv->nirq, hwirq + 1); - ipar = of_node_get(imap_item.parent_args.np); - priv->map[hwirq].fwnode = of_fwnode_handle(ipar); + ipar = of_find_node_by_phandle(be32_to_cpup(map + 2)); + map += 3; + mapsize -= 3; + if (!ipar) + return -EINVAL; + priv->map[hwirq].fwnode = &ipar->fwnode; + ret = of_property_read_u32(ipar, "#interrupt-cells", &intsize); + if (ret) + return ret; + + if (intsize > mapsize) + return -EINVAL; - priv->map[hwirq].param_count = imap_item.parent_args.args_count; - for (i = 0; i < priv->map[hwirq].param_count; i++) - priv->map[hwirq].param[i] = imap_item.parent_args.args[i]; + priv->map[hwirq].param_count = intsize; + for (j = 0; j < intsize; ++j) + priv->map[hwirq].param[j] = be32_to_cpup(map++); + mapsize -= intsize; } return 0; } -- cgit v1.2.3 From fe5669e363b129cde285bfb4d45abb72d1d77cfc Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 24 Feb 2026 13:36:10 +0200 Subject: irqchip/ls-extirq: Fix devm_of_iomap() error check The devm_of_iomap() function returns an ERR_PTR() encoded error code on failure. Replace the incorrect check against NULL with IS_ERR(). 
Fixes: 05cd654829dd ("irqchip/ls-extirq: Convert to a platform driver to make it work again") Reported-by: Dan Carpenter Signed-off-by: Ioana Ciornei Signed-off-by: Thomas Gleixner Reviewed-by: Herve Codina Link: https://patch.msgid.link/20260224113610.1129022-3-ioana.ciornei@nxp.com Closes: https://lore.kernel.org/all/aYXvfbfT6w0TMsXS@stanley.mountain/ --- drivers/irqchip/irq-ls-extirq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-ls-extirq.c b/drivers/irqchip/irq-ls-extirq.c index 96f9c20621cf..d724fe843980 100644 --- a/drivers/irqchip/irq-ls-extirq.c +++ b/drivers/irqchip/irq-ls-extirq.c @@ -190,8 +190,10 @@ static int ls_extirq_probe(struct platform_device *pdev) return dev_err_probe(dev, -ENOMEM, "Failed to allocate memory\n"); priv->intpcr = devm_of_iomap(dev, node, 0, NULL); - if (!priv->intpcr) - return dev_err_probe(dev, -ENOMEM, "Cannot ioremap OF node %pOF\n", node); + if (IS_ERR(priv->intpcr)) { + return dev_err_probe(dev, PTR_ERR(priv->intpcr), + "Cannot ioremap OF node %pOF\n", node); + } ret = ls_extirq_parse_map(priv, node); if (ret) -- cgit v1.2.3 From 095f5693329221ffd9dfe5fcd0079263d462e441 Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Tue, 17 Feb 2026 15:25:59 +0800 Subject: tools/sched_ext: Sync README.md Kconfig with upstream scx Sync the documentation with the upstream scx repository to reflect the current recommended configuration. Ref: https://github.com/sched-ext/scx/blob/main/README.md#build--install Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- tools/sched_ext/README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index 56a9d1557ac4..6e282bce453c 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -58,14 +58,8 @@ CONFIG_SCHED_CLASS_EXT=y CONFIG_BPF_SYSCALL=y CONFIG_BPF_JIT=y CONFIG_DEBUG_INFO_BTF=y -``` - -It's also recommended that you also include the following Kconfig options: - -``` CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y -CONFIG_PAHOLE_HAS_BTF_TAG=y ``` There is a `Kconfig` file in this directory whose contents you can append to -- cgit v1.2.3 From ee0ff6690f2641b8f6ba8026ec17f6bc48f86649 Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Tue, 17 Feb 2026 15:26:00 +0800 Subject: tools/sched_ext: Add Kconfig to sync with upstream Add the missing Kconfig file to tools/sched_ext/ as referenced in the README. Ref: https://github.com/sched-ext/scx/blob/main/kernel.config Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- tools/sched_ext/Kconfig | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 tools/sched_ext/Kconfig diff --git a/tools/sched_ext/Kconfig b/tools/sched_ext/Kconfig new file mode 100644 index 000000000000..275bd97ae62b --- /dev/null +++ b/tools/sched_ext/Kconfig @@ -0,0 +1,61 @@ +# sched-ext mandatory options +# +CONFIG_BPF=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_SCHED_CLASS_EXT=y + +# Required by some rust schedulers (e.g. scx_p2dq) +# +CONFIG_KALLSYMS_ALL=y + +# Required on arm64 +# +# CONFIG_DEBUG_INFO_REDUCED is not set + +# LAVD tracks futex to give an additional time slice for futex holder +# (i.e., avoiding lock holder preemption) for better system-wide progress. +# LAVD first tries to use ftrace to trace futex function calls. +# If that is not available, it tries to use a tracepoint. 
+CONFIG_FUNCTION_TRACER=y + +# Enable scheduling debugging +# +CONFIG_SCHED_DEBUG=y + +# Enable extra scheduling features (for a better code coverage while testing +# the schedulers) +# +CONFIG_SCHED_AUTOGROUP=y +CONFIG_SCHED_CORE=y +CONFIG_SCHED_MC=y + +# Enable fully preemptible kernel for a better test coverage of the schedulers +# +# CONFIG_PREEMPT_NONE is not set +# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_PREEMPT=y +CONFIG_PREEMPT_DYNAMIC=y + +# Additional debugging information (useful to catch potential locking issues) +CONFIG_DEBUG_LOCKDEP=y +CONFIG_DEBUG_ATOMIC_SLEEP=y +CONFIG_PROVE_LOCKING=y + +# Bpftrace headers (for additional debug info) +CONFIG_BPF_EVENTS=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_KPROBES=y +CONFIG_KPROBE_EVENTS=y +CONFIG_UPROBES=y +CONFIG_UPROBE_EVENTS=y +CONFIG_DEBUG_FS=y + +# Enable access to kernel configuration and headers at runtime +CONFIG_IKHEADERS=y +CONFIG_IKCONFIG_PROC=y +CONFIG_IKCONFIG=y -- cgit v1.2.3 From a46435537a844d0f7b4b620baf962cad136422de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Feb 2026 11:36:09 -0700 Subject: io_uring/cmd_net: use READ_ONCE() for ->addr3 read Any SQE read should use READ_ONCE(), to ensure the result is read once and only once. Doesn't really matter for this case, but it's better to keep these 100% consistent and always use READ_ONCE() for the prep side of SQE handling. Fixes: 5d24321e4c15 ("io_uring: Introduce getsockname io_uring cmd") Signed-off-by: Jens Axboe --- io_uring/cmd_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index 57ddaf874611..125a81c520a6 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -146,7 +146,7 @@ static int io_uring_cmd_getsockname(struct socket *sock, return -EINVAL; uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ulen = u64_to_user_ptr(sqe->addr3); + ulen = u64_to_user_ptr(READ_ONCE(sqe->addr3)); peer = READ_ONCE(sqe->optlen); if (peer > 1) return -EINVAL; -- cgit v1.2.3 From 09833d99db36d74456a4d13eb29c32d56ff8f2b6 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 13 Feb 2026 10:54:10 +0100 Subject: mm/kfence: disable KFENCE upon KASAN HW tags enablement KFENCE does not currently support KASAN hardware tags. As a result, the two features are incompatible when enabled simultaneously. Given that MTE provides deterministic protection and KFENCE is a sampling-based debugging tool, prioritize the stronger hardware protections. Disable KFENCE initialization and free the pre-allocated pool if KASAN hardware tags are detected to ensure the system maintains the security guarantees provided by MTE. Link: https://lkml.kernel.org/r/20260213095410.1862978-1-glider@google.com Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure") Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Reviewed-by: Marco Elver Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Ernesto Martinez Garcia Cc: Greg KH Cc: Kees Cook Cc: Signed-off-by: Andrew Morton --- mm/kfence/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index b4ea3262c925..b5aedf505cec 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -916,6 +917,20 @@ void __init kfence_alloc_pool_and_metadata(void) if (!kfence_sample_interval) return; + /* + * If KASAN hardware tags are enabled, disable KFENCE, because it + * does not support MTE yet. 
+ */ + if (kasan_hw_tags_enabled()) { + pr_info("disabled as KASAN HW tags are enabled\n"); + if (__kfence_pool) { + memblock_free(__kfence_pool, KFENCE_POOL_SIZE); + __kfence_pool = NULL; + } + kfence_sample_interval = 0; + return; + } + /* * If the pool has already been initialized by arch, there is no need to * re-allocate the memory pool. -- cgit v1.2.3 From eb9549346f7578eda3755683ac2cfb4d94c0675f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Feb 2026 13:17:44 +0100 Subject: mm: change vma_alloc_folio_noprof() macro to inline function In a few rare configurations with extra warnings eanbled, the new drm_pagemap_migrate_populate_ram_pfn() calls vma_alloc_folio_noprof() but that does not use all the arguments, leading to a harmless warning: drivers/gpu/drm/drm_pagemap.c: In function 'drm_pagemap_migrate_populate_ram_pfn': drivers/gpu/drm/drm_pagemap.c:701:63: error: parameter 'addr' set but not used [-Werror=unused-but-set-parameter=] 701 | unsigned long addr) | ~~~~~~~~~~~~~~^~~~ Replace the macro with an inline function so the compiler can see how the argument would be used, but is still able to optimize out the assignments. Link: https://lkml.kernel.org/r/20260216121751.2378374-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Lorenzo Stoakes Acked-by: Zi Yan Reviewed-by: Suren Baghdasaryan Cc: Alexei Starovoitov Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/gfp.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2b30a0529d48..f82d74a77cad 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -339,8 +339,11 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde { return folio_alloc_noprof(gfp, order); } -#define vma_alloc_folio_noprof(gfp, order, vma, addr) \ - folio_alloc_noprof(gfp, order) +static inline struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, + struct vm_area_struct *vma, unsigned long addr) +{ + return folio_alloc_noprof(gfp, order); +} #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -- cgit v1.2.3 From dd085fe9a8ebfc5d10314c60452db38d2b75e609 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Sat, 14 Feb 2026 05:45:35 +0530 Subject: mm: thp: deny THP for files on anonymous inodes file_thp_enabled() incorrectly allows THP for files on anonymous inodes (e.g. guest_memfd and secretmem). These files are created via alloc_file_pseudo(), which does not call get_write_access() and leaves inode->i_writecount at 0. Combined with S_ISREG(inode->i_mode) being true, they appear as read-only regular files when CONFIG_READ_ONLY_THP_FOR_FS is enabled, making them eligible for THP collapse. Anonymous inodes can never pass the inode_is_open_for_write() check since their i_writecount is never incremented through the normal VFS open path. The right thing to do is to exclude them from THP eligibility altogether, since CONFIG_READ_ONLY_THP_FOR_FS was designed for real filesystem files (e.g. shared libraries), not for pseudo-filesystem inodes. For guest_memfd, this allows khugepaged and MADV_COLLAPSE to create large folios in the page cache via the collapse path, but the guest_memfd fault handler does not support large folios. This triggers WARN_ON_ONCE(folio_test_large(folio)) in kvm_gmem_fault_user_mapping(). 
For secretmem, collapse_file() tries to copy page contents through the direct map, but secretmem pages are removed from the direct map. This can result in a kernel crash: BUG: unable to handle page fault for address: ffff88810284d000 RIP: 0010:memcpy_orig+0x16/0x130 Call Trace: collapse_file hpage_collapse_scan_file madvise_collapse Secretmem is not affected by the crash on upstream as the memory failure recovery handles the failed copy gracefully, but it still triggers confusing false memory failure reports: Memory failure: 0x106d96f: recovery action for clean unevictable LRU page: Recovered Check IS_ANON_FILE(inode) in file_thp_enabled() to deny THP for all anonymous inode files. Link: https://syzkaller.appspot.com/bug?extid=33a04338019ac7e43a44 Link: https://lore.kernel.org/linux-mm/CAEvNRgHegcz3ro35ixkDw39ES8=U6rs6S7iP0gkR9enr7HoGtA@mail.gmail.com Link: https://lkml.kernel.org/r/20260214001535.435626-1-kartikey406@gmail.com Fixes: 7fbb5e188248 ("mm: remove VM_EXEC requirement for THP eligibility") Signed-off-by: Deepanshu Kartikey Reported-by: syzbot+33a04338019ac7e43a44@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=33a04338019ac7e43a44 Tested-by: syzbot+33a04338019ac7e43a44@syzkaller.appspotmail.com Tested-by: Lance Yang Acked-by: David Hildenbrand (Arm) Reviewed-by: Barry Song Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Dev Jain Cc: Fangrui Song Cc: Liam Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d4ca8cfd7f9d..8e2746ea74ad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -94,6 +94,9 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) inode = file_inode(vma->vm_file); + if (IS_ANON_FILE(inode)) + return false; + return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -- cgit v1.2.3 From f85b1c6af5bc3872f994df0a5688c1162de07a62 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Mon, 16 Feb 2026 14:22:19 +0100 Subject: liveupdate: luo_file: remember retrieve() status LUO keeps track of successful retrieve attempts on a LUO file. It does so to avoid multiple retrievals of the same file. Multiple retrievals cause problems because once the file is retrieved, the serialized data structures are likely freed and the file is likely in a very different state from what the code expects. The retrieve boolean in struct luo_file keeps track of this, and is passed to the finish callback so it knows what work was already done and what it has left to do. All this works well when retrieve succeeds. When it fails, luo_retrieve_file() returns the error immediately, without ever storing anywhere that a retrieve was attempted or what its error code was. This results in an errored LIVEUPDATE_SESSION_RETRIEVE_FD ioctl to userspace, but nothing prevents it from trying this again. The retry is problematic for much of the same reasons listed above. The file is likely in a very different state than what the retrieve logic normally expects, and it might even have freed some serialization data structures. Attempting to access them or free them again is going to break things. For example, if memfd managed to restore 8 of its 10 folios, but fails on the 9th, a subsequent retrieve attempt will try to call kho_restore_folio() on the first folio again, and that will fail with a warning since it is an invalid operation. 
Apart from the retry, finish() also breaks. Since on failure the retrieved bool in luo_file is never touched, the finish() call on session close will tell the file handler that retrieve was never attempted, and it will try to access or free the data structures that might not exist, much in the same way as the retry attempt. There is no sane way of attempting the retrieve again. Remember the error retrieve returned and directly return it on a retry. Also pass this status code to finish() so it can make the right decision on the work it needs to do. This is done by changing the bool to an integer. A value of 0 means retrieve was never attempted, a positive value means it succeeded, and a negative value means it failed and the error code is the value. Link: https://lkml.kernel.org/r/20260216132221.987987-1-pratyush@kernel.org Fixes: 7c722a7f44e0 ("liveupdate: luo_file: implement file systems callbacks") Signed-off-by: Pratyush Yadav (Google) Reviewed-by: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 9 ++++++--- kernel/liveupdate/luo_file.c | 41 +++++++++++++++++++++++++---------------- mm/memfd_luo.c | 7 ++++++- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index fe82a6c3005f..dd11fdc76a5f 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -23,8 +23,11 @@ struct file; /** * struct liveupdate_file_op_args - Arguments for file operation callbacks. * @handler: The file handler being called. - * @retrieved: The retrieve status for the 'can_finish / finish' - * operation. + * @retrieve_status: The retrieve status for the 'can_finish / finish' + * operation. A value of 0 means the retrieve has not been + * attempted, a positive value means the retrieve was + * successful, and a negative value means the retrieve failed, + * and the value is the error code of the call. * @file: The file object. For retrieve: [OUT] The callback sets * this to the new file. For other ops: [IN] The caller sets * this to the file being operated on. @@ -40,7 +43,7 @@ struct file; */ struct liveupdate_file_op_args { struct liveupdate_file_handler *handler; - bool retrieved; + int retrieve_status; struct file *file; u64 serialized_data; void *private_data; diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 8c79058253e1..5acee4174bf0 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -134,9 +134,12 @@ static LIST_HEAD(luo_file_handler_list); * state that is not preserved. Set by the handler's .preserve() * callback, and must be freed in the handler's .unpreserve() * callback. - * @retrieved: A flag indicating whether a user/kernel in the new kernel has + * @retrieve_status: Status code indicating whether a user/kernel in the new kernel has * successfully called retrieve() on this file. This prevents - * multiple retrieval attempts. + * multiple retrieval attempts. A value of 0 means a retrieve() + * has not been attempted, a positive value means the retrieve() + * was successful, and a negative value means the retrieve() + * failed, and the value is the error code of the call. * @mutex: A mutex that protects the fields of this specific instance * (e.g., @retrieved, @file), ensuring that operations like * retrieving or finishing a file are atomic. 
@@ -161,7 +164,7 @@ struct luo_file { struct file *file; u64 serialized_data; void *private_data; - bool retrieved; + int retrieve_status; struct mutex mutex; struct list_head list; u64 token; @@ -298,7 +301,6 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) luo_file->file = file; luo_file->fh = fh; luo_file->token = token; - luo_file->retrieved = false; mutex_init(&luo_file->mutex); args.handler = fh; @@ -577,7 +579,12 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token, return -ENOENT; guard(mutex)(&luo_file->mutex); - if (luo_file->retrieved) { + if (luo_file->retrieve_status < 0) { + /* Retrieve was attempted and it failed. Return the error code. */ + return luo_file->retrieve_status; + } + + if (luo_file->retrieve_status > 0) { /* * Someone is asking for this file again, so get a reference * for them. @@ -590,16 +597,19 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token, args.handler = luo_file->fh; args.serialized_data = luo_file->serialized_data; err = luo_file->fh->ops->retrieve(&args); - if (!err) { - luo_file->file = args.file; - - /* Get reference so we can keep this file in LUO until finish */ - get_file(luo_file->file); - *filep = luo_file->file; - luo_file->retrieved = true; + if (err) { + /* Keep the error code for later use. */ + luo_file->retrieve_status = err; + return err; } - return err; + luo_file->file = args.file; + /* Get reference so we can keep this file in LUO until finish */ + get_file(luo_file->file); + *filep = luo_file->file; + luo_file->retrieve_status = 1; + + return 0; } static int luo_file_can_finish_one(struct luo_file_set *file_set, @@ -615,7 +625,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set, args.handler = luo_file->fh; args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; - args.retrieved = luo_file->retrieved; + args.retrieve_status = luo_file->retrieve_status; can_finish = luo_file->fh->ops->can_finish(&args); } @@ -632,7 +642,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set, args.handler = luo_file->fh; args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; - args.retrieved = luo_file->retrieved; + args.retrieve_status = luo_file->retrieve_status; luo_file->fh->ops->finish(&args); luo_flb_file_finish(luo_file->fh); @@ -788,7 +798,6 @@ int luo_file_deserialize(struct luo_file_set *file_set, luo_file->file = NULL; luo_file->serialized_data = file_ser[i].data; luo_file->token = file_ser[i].token; - luo_file->retrieved = false; mutex_init(&luo_file->mutex); list_add_tail(&luo_file->list, &file_set->files_list); } diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index 5c17da3880c5..e485b828d173 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -326,7 +326,12 @@ static void memfd_luo_finish(struct liveupdate_file_op_args *args) struct memfd_luo_folio_ser *folios_ser; struct memfd_luo_ser *ser; - if (args->retrieved) + /* + * If retrieve was successful, nothing to do. If it failed, retrieve() + * already cleaned up everything it could. So nothing to do there + * either. Only need to clean up when retrieve was not called. + */ + if (args->retrieve_status) return; ser = phys_to_virt(args->serialized_data); -- cgit v1.2.3 From 319d0bff22f3dd7a982c289e8336da69f0581299 Mon Sep 17 00:00:00 2001 From: "Vlastimil Babka (SUSE)" Date: Tue, 17 Feb 2026 11:21:52 +0100 Subject: MAINTAINERS, mailmap: update e-mail address for Vlastimil Babka Hopefully improve e-mail performance. 
Link: https://lkml.kernel.org/r/20260217102151.10425-2-vbabka@kernel.org Signed-off-by: Vlastimil Babka (SUSE) Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.mailmap b/.mailmap index e1cf6bb85d33..b3785362f73f 100644 --- a/.mailmap +++ b/.mailmap @@ -876,6 +876,7 @@ Vivien Didelot Vlad Dogaru Vladimir Davydov Vladimir Davydov +Vlastimil Babka WangYuli WangYuli Weiwen Hu diff --git a/MAINTAINERS b/MAINTAINERS index 55af015174a5..02fe14782533 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16656,7 +16656,7 @@ M: Andrew Morton M: David Hildenbrand R: Lorenzo Stoakes R: Liam R. Howlett -R: Vlastimil Babka +R: Vlastimil Babka R: Mike Rapoport R: Suren Baghdasaryan R: Michal Hocko @@ -16786,7 +16786,7 @@ M: Andrew Morton M: David Hildenbrand R: Lorenzo Stoakes R: Liam R. Howlett -R: Vlastimil Babka +R: Vlastimil Babka R: Mike Rapoport R: Suren Baghdasaryan R: Michal Hocko @@ -16841,7 +16841,7 @@ F: mm/oom_kill.c MEMORY MANAGEMENT - PAGE ALLOCATOR M: Andrew Morton -M: Vlastimil Babka +M: Vlastimil Babka R: Suren Baghdasaryan R: Michal Hocko R: Brendan Jackman @@ -16887,7 +16887,7 @@ M: David Hildenbrand M: Lorenzo Stoakes R: Rik van Riel R: Liam R. Howlett -R: Vlastimil Babka +R: Vlastimil Babka R: Harry Yoo R: Jann Horn L: linux-mm@kvack.org @@ -16986,7 +16986,7 @@ MEMORY MAPPING M: Andrew Morton M: Liam R. Howlett M: Lorenzo Stoakes -R: Vlastimil Babka +R: Vlastimil Babka R: Jann Horn R: Pedro Falcato L: linux-mm@kvack.org @@ -17016,7 +17016,7 @@ M: Andrew Morton M: Suren Baghdasaryan M: Liam R. Howlett M: Lorenzo Stoakes -R: Vlastimil Babka +R: Vlastimil Babka R: Shakeel Butt L: linux-mm@kvack.org S: Maintained @@ -17032,7 +17032,7 @@ M: Andrew Morton M: Liam R. Howlett M: Lorenzo Stoakes M: David Hildenbrand -R: Vlastimil Babka +R: Vlastimil Babka R: Jann Horn L: linux-mm@kvack.org S: Maintained @@ -23174,7 +23174,7 @@ K: \b(?i:rust)\b RUST [ALLOC] M: Danilo Krummrich R: Lorenzo Stoakes -R: Vlastimil Babka +R: Vlastimil Babka R: Liam R. Howlett R: Uladzislau Rezki L: rust-for-linux@vger.kernel.org @@ -24350,7 +24350,7 @@ F: Documentation/devicetree/bindings/nvmem/layouts/kontron,sl28-vpd.yaml F: drivers/nvmem/layouts/sl28vpd.c SLAB ALLOCATOR -M: Vlastimil Babka +M: Vlastimil Babka M: Andrew Morton R: Christoph Lameter R: David Rientjes -- cgit v1.2.3 From fdb24a820a5832ec4532273282cbd4f22c291a0d Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Tue, 17 Feb 2026 05:09:55 +0000 Subject: Squashfs: check metadata block offset is within range Syzkaller reports a "general protection fault in squashfs_copy_data" This is ultimately caused by a corrupted index look-up table, which produces a negative metadata block offset. This is subsequently passed to squashfs_copy_data (via squashfs_read_metadata) where the negative offset causes an out of bounds access. The fix is to check that the offset is within range in squashfs_read_metadata. This will trap this and other cases. 
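The three-line fix follows below; as a rough userspace illustration of the failure mode (made-up buffer size, not the squashfs code), an unvalidated negative offset indexes memory in front of the cached block:

#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 8192                 /* stands in for SQUASHFS_METADATA_SIZE */

static char cache[BLOCK_SIZE];

static int copy_metadata(char *dst, long offset, size_t len)
{
        /* Without this check, a corrupted index table that yields a
         * negative offset would make memcpy() read before 'cache'. */
        if (offset < 0 || (size_t)offset >= BLOCK_SIZE)
                return -5;              /* -EIO */
        memcpy(dst, cache + offset, len);
        return 0;
}

int main(void)
{
        char buf[16];

        printf("%d\n", copy_metadata(buf, -24, sizeof(buf)));  /* rejected */
        printf("%d\n", copy_metadata(buf, 128, sizeof(buf)));  /* copies fine */
        return 0;
}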
Link: https://lkml.kernel.org/r/20260217050955.138351-1-phillip@squashfs.org.uk Fixes: f400e12656ab ("Squashfs: cache operations") Reported-by: syzbot+a9747fe1c35a5b115d3f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/699234e2.a70a0220.2c38d7.00e2.GAE@google.com/ Signed-off-by: Phillip Lougher Cc: Christian Brauner Cc: Signed-off-by: Andrew Morton --- fs/squashfs/cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 8e958db5f786..67abd4dff222 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -344,6 +344,9 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer, if (unlikely(length < 0)) return -EIO; + if (unlikely(*offset < 0 || *offset >= SQUASHFS_METADATA_SIZE)) + return -EIO; + while (length) { entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); if (entry->error) { -- cgit v1.2.3 From c80f46ac228b48403866d65391ad09bdf0e8562a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 14 Feb 2026 13:41:21 -0800 Subject: mm/damon/core: disallow non-power of two min_region_sz DAMON core uses min_region_sz parameter value as the DAMON region alignment. The alignment is made using ALIGN() and ALIGN_DOWN(), which support only the power of two alignments. But DAMON core API callers can set min_region_sz to an arbitrary number. Users can also set it indirectly, using addr_unit. When the alignment is not properly set, DAMON behavior becomes difficult to expect and understand, makes it effectively broken. It doesn't cause a kernel crash-like significant issue, though. Fix the issue by disallowing min_region_sz input that is not a power of two. Add the check to damon_commit_ctx(), as all DAMON API callers who set min_region_sz uses the function. This can be a sort of behavioral change, but it does not break users, for the following reasons. As the symptom is making DAMON effectively broken, it is not reasonable to believe there are real use cases of non-power of two min_region_sz. There is no known use case or issue reports from the setup, either. In future, if we find real use cases of non-power of two alignments and we can support it with low enough overhead, we can consider moving the restriction. But, for now, simply disallowing the corner case should be good enough as a hot fix. Link: https://lkml.kernel.org/r/20260214214124.87689-1-sj@kernel.org Fixes: d8f867fa0825 ("mm/damon: add damon_ctx->min_sz_region") Signed-off-by: SeongJae Park Cc: Quanmin Yan Cc: [6.18+] Signed-off-by: Andrew Morton --- mm/damon/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 01eba1a547d4..adfc52fee9dc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1252,6 +1252,9 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) { int err; + if (!is_power_of_2(src->min_region_sz)) + return -EINVAL; + err = damon_commit_schemes(dst, src); if (err) return err; -- cgit v1.2.3 From d155aab90fffa00f93cea1f107aef0a3d548b2ff Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 20 Feb 2026 15:49:40 +0100 Subject: mm/kfence: fix KASAN hardware tag faults during late enablement When KASAN hardware tags are enabled, re-enabling KFENCE late (via /sys/module/kfence/parameters/sample_interval) causes KASAN faults. This happens because the KFENCE pool and metadata are allocated via the page allocator, which tags the memory, while KFENCE continues to access it using untagged pointers during initialization. 
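In kernel-style pseudocode, the shape of the fix described in the next paragraph is roughly the following (a hedged sketch with a hypothetical helper, not the actual KFENCE code):

/* Keep the late-allocated pool untagged so that KFENCE's accesses through
 * untagged linear-map pointers do not trip hardware tag-based KASAN. */
static void *kfence_alloc_untagged(size_t size)
{
        return alloc_pages_exact(size, GFP_KERNEL | __GFP_SKIP_KASAN);
}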
Use __GFP_SKIP_KASAN for late KFENCE pool and metadata allocations to ensure the memory remains untagged, consistent with early allocations from memblock. To support this, add __GFP_SKIP_KASAN to the allowlist in __alloc_contig_verify_gfp_mask(). Link: https://lkml.kernel.org/r/20260220144940.2779209-1-glider@google.com Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure") Signed-off-by: Alexander Potapenko Suggested-by: Ernesto Martinez Garcia Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Greg KH Cc: Kees Cook Cc: Marco Elver Cc: Signed-off-by: Andrew Morton --- mm/kfence/core.c | 14 ++++++++------ mm/page_alloc.c | 3 ++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index b5aedf505cec..7393957f9a20 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -1004,14 +1004,14 @@ static int kfence_init_late(void) #ifdef CONFIG_CONTIG_ALLOC struct page *pages; - pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node, - NULL); + pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL | __GFP_SKIP_KASAN, + first_online_node, NULL); if (!pages) return -ENOMEM; __kfence_pool = page_to_virt(pages); - pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node, - NULL); + pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL | __GFP_SKIP_KASAN, + first_online_node, NULL); if (pages) kfence_metadata_init = page_to_virt(pages); #else @@ -1021,11 +1021,13 @@ static int kfence_init_late(void) return -EINVAL; } - __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL); + __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, + GFP_KERNEL | __GFP_SKIP_KASAN); if (!__kfence_pool) return -ENOMEM; - kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL); + kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, + GFP_KERNEL | __GFP_SKIP_KASAN); #endif if (!kfence_metadata_init) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fcc32737f451..2d4b6f1a554e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6928,7 +6928,8 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) { const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM; const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | - __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO; + __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO | + __GFP_SKIP_KASAN; const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN; /* -- cgit v1.2.3 From 079c24d5690262e83ee476e2a548e416f3237511 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 19 Feb 2026 15:36:56 -0800 Subject: mm/tracing: rss_stat: ensure curr is false from kthread context The rss_stat trace event allows userspace tools, like Perfetto [1], to inspect per-process RSS metric changes over time. The curr field was introduced to rss_stat in commit e4dcad204d3a ("rss_stat: add support to detect RSS updates of external mm"). Its intent is to indicate whether the RSS update is for the mm_struct of the current execution context; and is set to false when operating on a remote mm_struct (e.g., via kswapd or a direct reclaimer). However, an issue arises when a kernel thread temporarily adopts a user process's mm_struct. Kernel threads do not have their own mm_struct and normally have current->mm set to NULL. To operate on user memory, they can "borrow" a memory context using kthread_use_mm(), which sets current->mm to the user process's mm. 
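A minimal kernel-style sketch of that borrowing pattern (hypothetical helper, not the FFS code discussed next):

static void copy_result_to_user(struct mm_struct *mm, void __user *dst,
                                const void *src, size_t len)
{
        kthread_use_mm(mm);     /* current->mm now points at the user's mm */
        if (copy_to_user(dst, src, len))
                pr_warn("short copy to user buffer\n");
        /* A page fault taken during the copy runs in the kthread's context,
         * yet current->mm == mm, which is what made rss_stat report curr=true. */
        kthread_unuse_mm(mm);
}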
This can be observed, for example, in the USB Function Filesystem (FFS) driver. The ffs_user_copy_worker() handles AIO completions and uses kthread_use_mm() to copy data to a user-space buffer. If a page fault occurs during this copy, the fault handler executes in the kthread's context. At this point, current is the kthread, but current->mm points to the user process's mm. Since the rss_stat event (from the page fault) is for that same mm, the condition current->mm == mm becomes true, causing curr to be incorrectly set to true when the trace event is emitted. This is misleading because it suggests the mm belongs to the kthread, confusing userspace tools that track per-process RSS changes and corrupting their mm_id-to-process association. Fix this by ensuring curr is always false when the trace event is emitted from a kthread context by checking for the PF_KTHREAD flag. Link: https://lkml.kernel.org/r/20260219233708.1971199-1-kaleshsingh@google.com Link: https://perfetto.dev/ [1] Fixes: e4dcad204d3a ("rss_stat: add support to detect RSS updates of external mm") Signed-off-by: Kalesh Singh Acked-by: Zi Yan Acked-by: SeongJae Park Reviewed-by: Pedro Falcato Cc: "David Hildenbrand (Arm)" Cc: Joel Fernandes Cc: Lorenzo Stoakes Cc: Minchan Kim Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: [5.10+] Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 7f93e754da5c..cd7920c81f85 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -440,7 +440,13 @@ TRACE_EVENT(rss_stat, TP_fast_assign( __entry->mm_id = mm_ptr_to_hash(mm); - __entry->curr = !!(current->mm == mm); + /* + * curr is true if the mm matches the current task's mm_struct. + * Since kthreads (PF_KTHREAD) have no mm_struct of their own + * but can borrow one via kthread_use_mm(), we must filter them + * out to avoid incorrectly attributing the RSS update to them. + */ + __entry->curr = current->mm == mm && !(current->flags & PF_KTHREAD); __entry->member = member; __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member]) << PAGE_SHIFT); -- cgit v1.2.3 From a4ab97e34bb687a2ca63fc70b47e8762e689797f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 22 Feb 2026 19:57:02 +0800 Subject: mm: fix NULL NODE_DATA dereference for memoryless nodes on boot Commit d49004c5f0c1 ("arch, mm: consolidate initialization of nodes, zones and memory map") moved free_area_init() from setup_arch() to mm_core_init_early(), which runs after setup_arch() returns. This changed the ordering relative to init_cpu_to_node() on x86. Before the commit, free_area_init() ran during paging_init() (called from setup_arch()) *before* init_cpu_to_node(). After the commit, it runs *after* init_cpu_to_node(). On machines with memoryless NUMA nodes (e.g., node 0 has CPUs but no memory), this causes a NULL pointer dereference: 1. numa_register_nodes() skips memoryless nodes: no alloc_node_data() and no node_set_online() for them. 2. init_cpu_to_node() sets memoryless nodes online (they have CPUs) but does not allocate NODE_DATA. 3. free_area_init() checks "if (!node_online(nid))" to decide whether to call alloc_offline_node_data(). Since the memoryless node is now online, the allocation is skipped, leaving NODE_DATA(nid) == NULL. 4. The immediate "pgdat = NODE_DATA(nid)" dereferences NULL. The crash happens before console_init(), so no output is visible without earlyprintk. 
With earlyprintk enabled, the following panic is observed: BUG: unable to handle page fault for address: 000000000002a1e0 Oops: Oops: 0000 [#1] SMP NOPTI RIP: 0010:free_area_init_node+0x3a/0x540 Call Trace: free_area_init+0x331/0x4e0 start_kernel+0x69/0x4a0 x86_64_start_reservations+0x24/0x30 x86_64_start_kernel+0x125/0x130 common_startup_64+0x13e/0x148 Kernel panic - not syncing: Attempted to kill the idle task! Fix this by checking "if (!NODE_DATA(nid))" instead of "if (!node_online(nid))". This directly tests whether the per-node data structure needs to be allocated, regardless of the node's online status. This change is also safe for non-x86 architectures as they all allocate NODE_DATA for every node including memoryless ones, so the check simply evaluates to false with no change in behavior. Link: https://lkml.kernel.org/r/20260222115702.3659-1-ming.lei@redhat.com Fixes: d49004c5f0c1 ("arch, mm: consolidate initialization of nodes, zones and memory map") Signed-off-by: Ming Lei Reviewed-by: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- mm/mm_init.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 61d983d23f55..df34797691bd 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1896,7 +1896,11 @@ static void __init free_area_init(void) for_each_node(nid) { pg_data_t *pgdat; - if (!node_online(nid)) + /* + * If an architecture has not allocated node data for + * this node, presume the node is memoryless or offline. + */ + if (!NODE_DATA(nid)) alloc_offline_node_data(nid); pgdat = NODE_DATA(nid); -- cgit v1.2.3 From 37a012c5c10c3364c5cba5def30dd7a17a6b587a Mon Sep 17 00:00:00 2001 From: Daniele Alessandrelli Date: Mon, 23 Feb 2026 17:09:05 +0000 Subject: mailmap: add entry for Daniele Alessandrelli My Intel email is going to bounce soon. Map it to my personal Gmail address. Link: https://lkml.kernel.org/r/20260223170905.278956-1-daniele.alessandrelli@intel.com Signed-off-by: Daniele Alessandrelli Cc: Daniele Alessandrelli Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index b3785362f73f..cd09bc545586 100644 --- a/.mailmap +++ b/.mailmap @@ -211,6 +211,7 @@ Daniel Borkmann Daniel Borkmann Daniel Borkmann Daniel Thompson +Daniele Alessandrelli Danilo Krummrich David Brownell David Collins -- cgit v1.2.3 From 410aed670cddac1de4f0c2865f30ec623fd20f78 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Mon, 23 Feb 2026 16:00:26 +0000 Subject: MAINTAINERS: update Yosry Ahmed's email address Use my kernel.org email address. 
Link: https://lkml.kernel.org/r/20260223160027.122307-1-yosry@kernel.org Signed-off-by: Yosry Ahmed Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- .mailmap | 3 ++- MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index cd09bc545586..c124a1306d26 100644 --- a/.mailmap +++ b/.mailmap @@ -892,7 +892,8 @@ Yanteng Si Ying Huang Yixun Lan Yixun Lan -Yosry Ahmed +Yosry Ahmed +Yosry Ahmed Yu-Chun Lin Yusuke Goda Zack Rusin diff --git a/MAINTAINERS b/MAINTAINERS index 02fe14782533..e4572a36afd2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -29186,7 +29186,7 @@ K: zstd ZSWAP COMPRESSED SWAP CACHING M: Johannes Weiner -M: Yosry Ahmed +M: Yosry Ahmed M: Nhat Pham R: Chengming Zhou L: linux-mm@kvack.org -- cgit v1.2.3 From 2f38fd99c0004676d835ae96ac4f3b54edc02c82 Mon Sep 17 00:00:00 2001 From: wangshuaiwei Date: Tue, 24 Feb 2026 14:32:28 +0800 Subject: scsi: ufs: core: Fix shift out of bounds when MAXQ=32 According to JESD223F, the maximum number of queues (MAXQ) is 32. When MCQ is enabled and ESI is disabled, nr_hw_queues=32 causes a shift overflow problem. Fix this by using 64-bit intermediate values to handle the nr_hw_queues=32 case safely. Signed-off-by: wangshuaiwei Reviewed-by: Bart Van Assche Link: https://patch.msgid.link/20260224063228.50112-1-wangshuaiwei1@xiaomi.com Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 85987373b961..899e663fea6e 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -7110,7 +7110,7 @@ static irqreturn_t ufshcd_handle_mcq_cq_events(struct ufs_hba *hba) ret = ufshcd_vops_get_outstanding_cqs(hba, &outstanding_cqs); if (ret) - outstanding_cqs = (1U << hba->nr_hw_queues) - 1; + outstanding_cqs = (1ULL << hba->nr_hw_queues) - 1; /* Exclude the poll queues */ nr_queues = hba->nr_hw_queues - hba->nr_queues[HCTX_TYPE_POLL]; -- cgit v1.2.3 From 6acf7860dcc79ed045cc9e6a79c8a8bb6959dba7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Feb 2026 06:21:44 -0800 Subject: zloop: advertise a volatile write cache Zloop is file system backed and thus needs to sync the underlying file system to persist data. Set BLK_FEAT_WRITE_CACHE so that the block layer actually send flush commands, and fix the flush implementation as sync_filesystem requires s_umount to be held and the code currently misses that. Fixes: eb0570c7df23 ("block: new zoned loop block device driver") Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/zloop.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 8e334f5025fc..ae9bf2a85c21 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -542,6 +542,21 @@ out: zloop_put_cmd(cmd); } +/* + * Sync the entire FS containing the zone files instead of walking all files. 
+ */ +static int zloop_flush(struct zloop_device *zlo) +{ + struct super_block *sb = file_inode(zlo->data_dir)->i_sb; + int ret; + + down_read(&sb->s_umount); + ret = sync_filesystem(sb); + up_read(&sb->s_umount); + + return ret; +} + static void zloop_handle_cmd(struct zloop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); @@ -562,11 +577,7 @@ static void zloop_handle_cmd(struct zloop_cmd *cmd) zloop_rw(cmd); return; case REQ_OP_FLUSH: - /* - * Sync the entire FS containing the zone files instead of - * walking all files - */ - cmd->ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb); + cmd->ret = zloop_flush(zlo); break; case REQ_OP_ZONE_RESET: cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq)); @@ -981,7 +992,8 @@ static int zloop_ctl_add(struct zloop_options *opts) struct queue_limits lim = { .max_hw_sectors = SZ_1M >> SECTOR_SHIFT, .chunk_sectors = opts->zone_size, - .features = BLK_FEAT_ZONED, + .features = BLK_FEAT_ZONED | BLK_FEAT_WRITE_CACHE, + }; unsigned int nr_zones, i, j; struct zloop_device *zlo; -- cgit v1.2.3 From 3c4617117a2b7682cf037be5e5533e379707f050 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Feb 2026 06:21:45 -0800 Subject: zloop: check for spurious options passed to remove Zloop uses a command option parser for all control commands, but most options are only valid for adding a new device. Check for incorrectly specified options in the remove handler. Fixes: eb0570c7df23 ("block: new zoned loop block device driver") Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/zloop.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index ae9bf2a85c21..9e3bb538d5fc 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -1174,7 +1174,12 @@ static int zloop_ctl_remove(struct zloop_options *opts) int ret; if (!(opts->mask & ZLOOP_OPT_ID)) { - pr_err("No ID specified\n"); + pr_err("No ID specified for remove\n"); + return -EINVAL; + } + + if (opts->mask & ~ZLOOP_OPT_ID) { + pr_err("Invalid option specified for remove\n"); return -EINVAL; } -- cgit v1.2.3 From 4b44cbb264d0ed3f2f2bc2659db6ce45882f4670 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Feb 2026 15:24:52 -0800 Subject: overflow: Make sure size helpers are always inlined With kmalloc_obj() performing implicit size calculations, the embedded size_mul() calls, while marked inline, were not always being inlined. I noticed a couple places where allocations were making a call out for things that would otherwise be compile-time calculated. Force the compilers to always inline these calculations. Reviewed-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20260224232451.work.614-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/overflow.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/overflow.h b/include/linux/overflow.h index eddd987a8513..a8cb6319b4fb 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -42,7 +42,7 @@ * both the type-agnostic benefits of the macros while also being able to * enforce that the return value is, in fact, checked. */ -static inline bool __must_check __must_check_overflow(bool overflow) +static __always_inline bool __must_check __must_check_overflow(bool overflow) { return unlikely(overflow); } @@ -327,7 +327,7 @@ static inline bool __must_check __must_check_overflow(bool overflow) * with any overflow causing the return value to be SIZE_MAX. 
The * lvalue must be size_t to avoid implicit type conversion. */ -static inline size_t __must_check size_mul(size_t factor1, size_t factor2) +static __always_inline size_t __must_check size_mul(size_t factor1, size_t factor2) { size_t bytes; @@ -346,7 +346,7 @@ static inline size_t __must_check size_mul(size_t factor1, size_t factor2) * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ -static inline size_t __must_check size_add(size_t addend1, size_t addend2) +static __always_inline size_t __must_check size_add(size_t addend1, size_t addend2) { size_t bytes; @@ -367,7 +367,7 @@ static inline size_t __must_check size_add(size_t addend1, size_t addend2) * argument may be SIZE_MAX (or the result with be forced to SIZE_MAX). * The lvalue must be size_t to avoid implicit type conversion. */ -static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) +static __always_inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) { size_t bytes; -- cgit v1.2.3 From c601fd5414315fc515f746b499110e46272e7243 Mon Sep 17 00:00:00 2001 From: Jonathan Cavitt Date: Tue, 24 Feb 2026 22:12:28 +0000 Subject: drm/client: Do not destroy NULL modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'modes' in drm_client_modeset_probe may fail to kcalloc. If this occurs, we jump to 'out', calling modes_destroy on it, which dereferences it. This may result in a NULL pointer dereference in the error case. Prevent that. Fixes: 3039cc0c0653 ("drm/client: Make copies of modes") Signed-off-by: Jonathan Cavitt Cc: Ville Syrjälä Signed-off-by: Ville Syrjälä Link: https://patch.msgid.link/20260224221227.69126-2-jonathan.cavitt@intel.com --- drivers/gpu/drm/drm_client_modeset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c index 262b1b8773c5..bb49b8361271 100644 --- a/drivers/gpu/drm/drm_client_modeset.c +++ b/drivers/gpu/drm/drm_client_modeset.c @@ -930,7 +930,8 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, mutex_unlock(&client->modeset_mutex); out: kfree(crtcs); - modes_destroy(dev, modes, connector_count); + if (modes) + modes_destroy(dev, modes, connector_count); kfree(modes); kfree(offsets); kfree(enabled); -- cgit v1.2.3 From 83236b2e43dba00bee5b82eb5758816b1a674f6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2026 21:39:58 -1000 Subject: sched_ext: Disable preemption between scx_claim_exit() and kicking helper work scx_claim_exit() atomically sets exit_kind, which prevents scx_error() from triggering further error handling. After claiming exit, the caller must kick the helper kthread work which initiates bypass mode and teardown. If the calling task gets preempted between claiming exit and kicking the helper work, and the BPF scheduler fails to schedule it back (since error handling is now disabled), the helper work is never queued, bypass mode never activates, tasks stop being dispatched, and the system wedges. Disable preemption across scx_claim_exit() and the subsequent work kicking in all callers - scx_disable() and scx_vexit(). Add lockdep_assert_preemption_disabled() to scx_claim_exit() to enforce the requirement. 
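The resulting pattern in both callers looks roughly like this (hedged sketch with hypothetical types, mirroring the patch below):

static void trigger_exit(struct my_sched *sch, int kind)
{
        /* Preemption stays disabled from claiming the exit until the helper
         * work is queued, so the teardown can no longer be lost. */
        guard(preempt)();

        if (!claim_exit(sch, kind))
                return;
        kthread_queue_work(sch->helper, &sch->disable_work);
}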
Fixes: f0e1a0643a59 ("sched_ext: Implement BPF extensible scheduler class") Cc: stable@vger.kernel.org # v6.12+ Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c18e81e8ef51..9280381f8923 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4423,10 +4423,19 @@ done: scx_bypass(false); } +/* + * Claim the exit on @sch. The caller must ensure that the helper kthread work + * is kicked before the current task can be preempted. Once exit_kind is + * claimed, scx_error() can no longer trigger, so if the current task gets + * preempted and the BPF scheduler fails to schedule it back, the helper work + * will never be kicked and the whole system can wedge. + */ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) { int none = SCX_EXIT_NONE; + lockdep_assert_preemption_disabled(); + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) return false; @@ -4449,6 +4458,7 @@ static void scx_disable(enum scx_exit_kind kind) rcu_read_lock(); sch = rcu_dereference(scx_root); if (sch) { + guard(preempt)(); scx_claim_exit(sch, kind); kthread_queue_work(sch->helper, &sch->disable_work); } @@ -4771,6 +4781,8 @@ static bool scx_vexit(struct scx_sched *sch, { struct scx_exit_info *ei = sch->exit_info; + guard(preempt)(); + if (!scx_claim_exit(sch, kind)) return false; -- cgit v1.2.3 From 29c8b85adb47daefc213381bc1831787f512d89b Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 25 Feb 2026 08:31:40 +0000 Subject: irqchip/gic-v5: Fix inversion of IRS_IDR0.virt flag It appears that a !! became ! during a cleanup, resulting in inverted logic when detecting if a host GICv5 implementation is capable of virtualization. Re-add the missing !, fixing the behaviour. Fixes: 3227c3a89d65f ("irqchip/gic-v5: Check if impl is virt capable") Signed-off-by: Sascha Bischoff Link: https://patch.msgid.link/20260225083130.3378490-1-sascha.bischoff@arm.com Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v5-irs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c index eeeb40fb0eaa..4ed2bfa17c01 100644 --- a/drivers/irqchip/irq-gic-v5-irs.c +++ b/drivers/irqchip/irq-gic-v5-irs.c @@ -744,7 +744,7 @@ static int __init gicv5_irs_init(struct device_node *node) */ if (list_empty(&irs_nodes)) { idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR0); - gicv5_global_data.virt_capable = !FIELD_GET(GICV5_IRS_IDR0_VIRT, idr); + gicv5_global_data.virt_capable = !!FIELD_GET(GICV5_IRS_IDR0_VIRT, idr); idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR1); irs_setup_pri_bits(idr); -- cgit v1.2.3 From 7fe8dec3f628e9779f1631576f8e693370050348 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 25 Feb 2026 09:52:28 +0100 Subject: ALSA: usb-audio: Cap the packet size pre-calculations We calculate the possible packet sizes beforehand for adaptive and synchronous endpoints, but we didn't take care of the max frame size for those pre-calculated values. When a device or a bus limits the packet size, a high sample rate or a high number of channels may lead to the packet sizes that are larger than the given limit, which results in an error from the USB core at submitting URBs. As a simple workaround, just add the sanity checks of pre-calculated packet sizes to have the upper boundary of ep->maxframesize. 
Fixes: f0bd62b64016 ("ALSA: usb-audio: Improve frames size computation") Link: https://bugzilla.kernel.org/show_bug.cgi?id=221076 Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260225085233.316306-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/endpoint.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index 73bce9712dbd..c887c2f5b25d 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -1378,6 +1378,9 @@ int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, return -EINVAL; } + ep->packsize[0] = min(ep->packsize[0], ep->maxframesize); + ep->packsize[1] = min(ep->packsize[1], ep->maxframesize); + /* calculate the frequency in 16.16 format */ ep->freqm = ep->freqn; ep->freqshift = INT_MIN; -- cgit v1.2.3 From 7cb2a5422f5bbdf1cf32eae0eda41000485b9346 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 25 Feb 2026 09:52:29 +0100 Subject: ALSA: usb-audio: Check max frame size for implicit feedback mode, too When the packet sizes are taken from the capture stream in the implicit feedback mode, the sizes might be larger than the upper boundary defined by the descriptor. As already done for other transfer modes, we have to cap the sizes accordingly at sending, otherwise this would lead to an error in USB core at submission of URBs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=221076 Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260225085233.316306-3-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/endpoint.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index c887c2f5b25d..2e70dbc479b4 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -221,6 +221,7 @@ int snd_usb_endpoint_next_packet_size(struct snd_usb_endpoint *ep, packet = ctx->packet_size[idx]; if (packet) { + packet = min(packet, ep->maxframesize); if (avail && packet >= avail) return -EAGAIN; return packet; -- cgit v1.2.3 From c5bf24c8aba1ff711226ee0f039ff01a5754692b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 25 Feb 2026 09:52:30 +0100 Subject: ALSA: usb-audio: Avoid implicit feedback mode on DIYINHK USB Audio 2.0 Although DIYINHK USB Audio 2.0 (ID 20b1:2009) shows the implicit feedback source for the capture stream, this would cause several problems for the playback. Namely, the device can get wMaxPackSize 1024 for 24/32 bit format with 6 channels, and when a high sample rate like 352.8kHz or 384kHz is played, the packet size overflows the max limit. Also, the device has another two playback altsets, and those aren't properly handled with the implicit feedback. Since the device has been working well even before introducing the implicit feedback, we can assume that it works fine in the async mode. This patch adds the explicit skip of the implicit fb detection to make the playback running in the async mode. 
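A quick back-of-the-envelope check shows why those rates overflow the 1024-byte limit, assuming high-speed USB (8000 microframes per second) and a 32-bit sample container: 384000 Hz / 8000 microframes = 48 samples per microframe, and 48 samples x 6 channels x 4 bytes = 1152 bytes per packet, which exceeds the 1024-byte wMaxPacketSize.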
Link: https://bugzilla.kernel.org/show_bug.cgi?id=221076 Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260225085233.316306-4-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index fbceed8e8d36..c6a78efbcaa3 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2365,6 +2365,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7281, /* Hauppauge HVR-950Q-MXL */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), + DEVICE_FLG(0x20b1, 0x2009, /* XMOS Ltd DIYINHK USB Audio 2.0 */ + QUIRK_FLAG_SKIP_IMPLICIT_FB | QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x2040, 0x8200, /* Hauppauge Woodbury */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x21b4, 0x0081, /* AudioQuest DragonFly */ -- cgit v1.2.3 From 4e9113c533acee2ba1f72fd68ee6ecd36b64484e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 25 Feb 2026 09:52:31 +0100 Subject: ALSA: usb-audio: Use inclusive terms Replace the remaining with inclusive terms; it's only this function name we overlooked at the previous conversion. Fixes: 53837b4ac2bd ("ALSA: usb-audio: Replace slave/master terms") Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260225085233.316306-5-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/endpoint.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index 2e70dbc479b4..bf4401aba76c 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -160,8 +160,8 @@ int snd_usb_endpoint_implicit_feedback_sink(struct snd_usb_endpoint *ep) * This won't be used for implicit feedback which takes the packet size * returned from the sync source */ -static int slave_next_packet_size(struct snd_usb_endpoint *ep, - unsigned int avail) +static int synced_next_packet_size(struct snd_usb_endpoint *ep, + unsigned int avail) { unsigned int phase; int ret; @@ -228,7 +228,7 @@ int snd_usb_endpoint_next_packet_size(struct snd_usb_endpoint *ep, } if (ep->sync_source) - return slave_next_packet_size(ep, avail); + return synced_next_packet_size(ep, avail); else return next_packet_size(ep, avail); } -- cgit v1.2.3 From 54e367cb94d6bef941bbc1132d9959dc73bd4b6f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 25 Feb 2026 10:47:18 +0000 Subject: KVM: arm64: Deduplicate ASID retrieval code We currently have three versions of the ASID retrieval code, one in the S1 walker, and two in the VNCR handling (although the last two are limited to the EL2&0 translation regime). Make this code common, and take this opportunity to also simplify the code a bit while switching over to the TTBRx_EL1_ASID macro. 
Reviewed-by: Joey Gouly Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260225104718.14209-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_nested.h | 2 ++ arch/arm64/kvm/at.c | 27 ++--------------- arch/arm64/kvm/nested.c | 60 +++++++++++++++++++------------------ 3 files changed, 35 insertions(+), 54 deletions(-) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 905c658057a4..091544e6af44 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -397,6 +397,8 @@ int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu); int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu); void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val); +u16 get_asid_by_regime(struct kvm_vcpu *vcpu, enum trans_regime regime); + #define vncr_fixmap(c) \ ({ \ u32 __c = (c); \ diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 885bd5bb2f41..6588ea251ed7 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -540,31 +540,8 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); - if (wr->nG) { - u64 asid_ttbr, tcr; - - switch (wi->regime) { - case TR_EL10: - tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); - asid_ttbr = ((tcr & TCR_A1) ? - vcpu_read_sys_reg(vcpu, TTBR1_EL1) : - vcpu_read_sys_reg(vcpu, TTBR0_EL1)); - break; - case TR_EL20: - tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); - asid_ttbr = ((tcr & TCR_A1) ? - vcpu_read_sys_reg(vcpu, TTBR1_EL2) : - vcpu_read_sys_reg(vcpu, TTBR0_EL2)); - break; - default: - BUG(); - } - - wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr); - if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || - !(tcr & TCR_ASID16)) - wr->asid &= GENMASK(7, 0); - } + if (wr->nG) + wr->asid = get_asid_by_regime(vcpu, wi->regime); return 0; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 7f1ea85dc67a..787776aaf4ad 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -854,6 +854,33 @@ int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2) return kvm_inject_nested_sync(vcpu, esr_el2); } +u16 get_asid_by_regime(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + enum vcpu_sysreg ttbr_elx; + u64 tcr; + u16 asid; + + switch (regime) { + case TR_EL10: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + ttbr_elx = (tcr & TCR_A1) ? TTBR1_EL1 : TTBR0_EL1; + break; + case TR_EL20: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + ttbr_elx = (tcr & TCR_A1) ? TTBR1_EL2 : TTBR0_EL2; + break; + default: + BUG(); + } + + asid = FIELD_GET(TTBRx_EL1_ASID, vcpu_read_sys_reg(vcpu, ttbr_elx)); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + asid &= GENMASK(7, 0); + + return asid; +} + static void invalidate_vncr(struct vncr_tlb *vt) { vt->valid = false; @@ -1333,20 +1360,8 @@ static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu) if (read_vncr_el2(vcpu) != vt->gva) return false; - if (vt->wr.nG) { - u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); - u64 ttbr = ((tcr & TCR_A1) ? 
- vcpu_read_sys_reg(vcpu, TTBR1_EL2) : - vcpu_read_sys_reg(vcpu, TTBR0_EL2)); - u16 asid; - - asid = FIELD_GET(TTBR_ASID_MASK, ttbr); - if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || - !(tcr & TCR_ASID16)) - asid &= GENMASK(7, 0); - - return asid == vt->wr.asid; - } + if (vt->wr.nG) + return get_asid_by_regime(vcpu, TR_EL20) == vt->wr.asid; return true; } @@ -1449,21 +1464,8 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu) if (read_vncr_el2(vcpu) != vt->gva) return; - if (vt->wr.nG) { - u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); - u64 ttbr = ((tcr & TCR_A1) ? - vcpu_read_sys_reg(vcpu, TTBR1_EL2) : - vcpu_read_sys_reg(vcpu, TTBR0_EL2)); - u16 asid; - - asid = FIELD_GET(TTBR_ASID_MASK, ttbr); - if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || - !(tcr & TCR_ASID16)) - asid &= GENMASK(7, 0); - - if (asid != vt->wr.asid) - return; - } + if (vt->wr.nG && get_asid_by_regime(vcpu, TR_EL20) != vt->wr.asid) + return; vt->cpu = smp_processor_id(); -- cgit v1.2.3 From 93a4a9b732fbb479f95d327aa867d094aed3f712 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 24 Feb 2026 17:59:52 +0100 Subject: RDMA/core: Check id_priv->restricted_node_type in cma_listen_on_dev() When listening on wildcard addresses we have a global list for the application layer rdma_cm_id and for any existing device or any device added in future we try to listen on any wildcard listener. When the listener has a restricted_node_type we should prevent listening on devices with a different node type. While there fix the documentation comment of rdma_restrict_node_type() to include rdma_resolve_addr() instead of having rdma_bind_addr() twice. Fixes: a760e80e90f5 ("RDMA/core: introduce rdma_restrict_node_type()") Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Steve French Cc: Namjae Jeon Cc: Tom Talpey Cc: Long Li Cc: linux-rdma@vger.kernel.org Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Link: https://patch.msgid.link/20260224165951.3582093-2-metze@samba.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 6 +++++- include/rdma/rdma_cm.h | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index e54c07c74575..9480d1a51c11 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2729,6 +2729,9 @@ static int cma_listen_on_dev(struct rdma_id_private *id_priv, *to_destroy = NULL; if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return 0; + if (id_priv->restricted_node_type != RDMA_NODE_UNSPECIFIED && + id_priv->restricted_node_type != cma_dev->device->node_type) + return 0; dev_id_priv = __rdma_create_id(net, cma_listen_handler, id_priv, @@ -2736,6 +2739,7 @@ static int cma_listen_on_dev(struct rdma_id_private *id_priv, if (IS_ERR(dev_id_priv)) return PTR_ERR(dev_id_priv); + dev_id_priv->restricted_node_type = id_priv->restricted_node_type; dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); @@ -4194,7 +4198,7 @@ int rdma_restrict_node_type(struct rdma_cm_id *id, u8 node_type) } mutex_lock(&lock); - if (id_priv->cma_dev) + if (READ_ONCE(id_priv->state) != RDMA_CM_IDLE) ret = -EALREADY; else id_priv->restricted_node_type = node_type; diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 6de6fd8bd15e..d639ff889e64 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -181,7 +181,7 
@@ void rdma_destroy_id(struct rdma_cm_id *id); * * It needs to be called before the RDMA identifier is bound * to an device, which mean it should be called before - * rdma_bind_addr(), rdma_bind_addr() and rdma_listen(). + * rdma_bind_addr(), rdma_resolve_addr() and rdma_listen(). */ int rdma_restrict_node_type(struct rdma_cm_id *id, u8 node_type); -- cgit v1.2.3 From 104016eb671e19709721c1b0048dd912dc2e96be Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Tue, 24 Feb 2026 23:41:53 +0000 Subject: RDMA/umem: Fix double dma_buf_unpin in failure path In ib_umem_dmabuf_get_pinned_with_dma_device(), the call to ib_umem_dmabuf_map_pages() can fail. If this occurs, the dmabuf is immediately unpinned but the umem_dmabuf->pinned flag is still set. Then, when ib_umem_release() is called, it calls ib_umem_dmabuf_revoke() which will call dma_buf_unpin() again. Fix this by removing the immediate unpin upon failure and just let the ib_umem_release/revoke path handle it. This also ensures the proper unmap-unpin unwind ordering if the dmabuf_map_pages call happened to fail due to dma_resv_wait_timeout (and therefore has a non-NULL umem_dmabuf->sgt). Fixes: 1e4df4a21c5a ("RDMA/umem: Allow pinned dmabuf umem usage") Signed-off-by: Jacob Moroni Link: https://patch.msgid.link/20260224234153.1207849-1-jmoroni@google.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_dmabuf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index f5298c33e581..d30f24b90bca 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -218,13 +218,11 @@ ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, err = ib_umem_dmabuf_map_pages(umem_dmabuf); if (err) - goto err_unpin; + goto err_release; dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); return umem_dmabuf; -err_unpin: - dma_buf_unpin(umem_dmabuf->attach); err_release: dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); ib_umem_release(&umem_dmabuf->umem); -- cgit v1.2.3 From 18c16f602a67782f5eb4b5ab9ba73350b9f711ec Mon Sep 17 00:00:00 2001 From: "Nirjhar Roy (IBM)" Date: Wed, 4 Feb 2026 20:36:26 +0530 Subject: xfs: Replace ASSERT with XFS_IS_CORRUPT in xfs_rtcopy_summary() Replace ASSERT(sum > 0) with an XFS_IS_CORRUPT() and place it just after the call to xfs_rtget_summary() so that we don't end up using an illegal value of sum. Signed-off-by: Nirjhar Roy (IBM) Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_rtalloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 90a94a5b6f7e..aab59f66384e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -112,6 +112,10 @@ xfs_rtcopy_summary( error = xfs_rtget_summary(oargs, log, bbno, &sum); if (error) goto out; + if (XFS_IS_CORRUPT(oargs->mp, sum < 0)) { + error = -EFSCORRUPTED; + goto out; + } if (sum == 0) continue; error = xfs_rtmodify_summary(oargs, log, bbno, -sum); @@ -120,7 +124,6 @@ xfs_rtcopy_summary( error = xfs_rtmodify_summary(nargs, log, bbno, sum); if (error) goto out; - ASSERT(sum > 0); } } error = 0; -- cgit v1.2.3 From a49b7ff63f98ba1c4503869c568c99ecffa478f2 Mon Sep 17 00:00:00 2001 From: "Nirjhar Roy (IBM)" Date: Wed, 11 Feb 2026 19:25:13 +0530 Subject: xfs: Refactoring the nagcount and delta calculation Introduce xfs_growfs_compute_delta() to calculate the nagcount and delta blocks and refactor the code from xfs_growfs_data_private(). No functional changes. Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Nirjhar Roy (IBM) Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_ag.c | 28 ++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_ag.h | 3 +++ fs/xfs/xfs_fsops.c | 17 ++--------------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 9c6765cc2d44..bd8fbb40b49e 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -872,6 +872,34 @@ resv_err: return err2; } +void +xfs_growfs_compute_deltas( + struct xfs_mount *mp, + xfs_rfsblock_t nb, + int64_t *deltap, + xfs_agnumber_t *nagcountp) +{ + xfs_rfsblock_t nb_div, nb_mod; + int64_t delta; + xfs_agnumber_t nagcount; + + nb_div = nb; + nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); + if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS) + nb_div++; + else if (nb_mod) + nb = nb_div * mp->m_sb.sb_agblocks; + + if (nb_div > XFS_MAX_AGNUMBER + 1) { + nb_div = XFS_MAX_AGNUMBER + 1; + nb = nb_div * mp->m_sb.sb_agblocks; + } + nagcount = nb_div; + delta = nb - mp->m_sb.sb_dblocks; + *deltap = delta; + *nagcountp = nagcount; +} + /* * Extent the AG indicated by the @id by the length passed in */ diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 1f24cfa27321..3cd4790768ff 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -331,6 +331,9 @@ struct aghdr_init_data { int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp, xfs_extlen_t delta); +void +xfs_growfs_compute_deltas(struct xfs_mount *mp, xfs_rfsblock_t nb, + int64_t *deltap, xfs_agnumber_t *nagcountp); int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t len); int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 17255c41786b..8d64d904d73c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -95,18 +95,17 @@ xfs_growfs_data_private( struct xfs_growfs_data *in) /* growfs data input struct */ { xfs_agnumber_t oagcount = mp->m_sb.sb_agcount; + xfs_rfsblock_t nb = in->newblocks; struct xfs_buf *bp; int error; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; - xfs_rfsblock_t nb, nb_div, nb_mod; int64_t delta; bool lastag_extended = false; struct xfs_trans *tp; struct aghdr_init_data id = {}; struct xfs_perag *last_pag; - nb = in->newblocks; error = 
xfs_sb_validate_fsb_count(&mp->m_sb, nb); if (error) return error; @@ -125,20 +124,8 @@ xfs_growfs_data_private( mp->m_sb.sb_rextsize); if (error) return error; + xfs_growfs_compute_deltas(mp, nb, &delta, &nagcount); - nb_div = nb; - nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); - if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS) - nb_div++; - else if (nb_mod) - nb = nb_div * mp->m_sb.sb_agblocks; - - if (nb_div > XFS_MAX_AGNUMBER + 1) { - nb_div = XFS_MAX_AGNUMBER + 1; - nb = nb_div * mp->m_sb.sb_agblocks; - } - nagcount = nb_div; - delta = nb - mp->m_sb.sb_dblocks; /* * Reject filesystems with a single AG because they are not * supported, and reject a shrink operation that would cause a -- cgit v1.2.3 From 4ad85e633bc576a5cc8c8310aab141af7ed20efa Mon Sep 17 00:00:00 2001 From: "Nirjhar Roy (IBM)" Date: Wed, 11 Feb 2026 19:25:14 +0530 Subject: xfs: Replace &rtg->rtg_group with rtg_group() Use the already existing rtg_group() wrapper instead of directly accessing the struct xfs_group member in struct xfs_rtgroup. Reviewed-by: Christoph Hellwig Signed-off-by: Nirjhar Roy (IBM) [cem: Conflict resolution against 06873dbd940d] Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 6 +++--- fs/xfs/xfs_zone_gc.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 67e0c8f5800f..e3d19b6dc64a 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -78,7 +78,7 @@ xfs_zone_account_reclaimable( struct xfs_rtgroup *rtg, uint32_t freed) { - struct xfs_group *xg = &rtg->rtg_group; + struct xfs_group *xg = rtg_group(rtg); struct xfs_mount *mp = rtg_mount(rtg); struct xfs_zone_info *zi = mp->m_zone_info; uint32_t used = rtg_rmap(rtg)->i_used_blocks; @@ -759,7 +759,7 @@ xfs_zone_alloc_blocks( trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb); - *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + *sector = xfs_gbno_to_daddr(rtg_group(rtg), 0); *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); if (!*is_seq) *sector += XFS_FSB_TO_BB(mp, allocated); @@ -1080,7 +1080,7 @@ xfs_init_zone( if (write_pointer == 0) { /* zone is empty */ atomic_inc(&zi->zi_nr_free_zones); - xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE); iz->available += rtg_blocks(rtg); } else if (write_pointer < rtg_blocks(rtg)) { /* zone is open */ diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 48c6cf584447..7efeecd2d85f 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -627,7 +627,7 @@ xfs_zone_gc_alloc_blocks( if (!*count_fsb) return NULL; - *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); + *daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0); *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); if (!*is_seq) *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated); @@ -702,7 +702,7 @@ xfs_zone_gc_start_chunk( chunk->data = data; chunk->oz = oz; chunk->victim_rtg = iter->victim_rtg; - atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); + atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref); atomic_inc(&chunk->victim_rtg->rtg_gccount); bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); @@ -788,7 +788,7 @@ xfs_zone_gc_split_write( atomic_inc(&chunk->oz->oz_ref); split_chunk->victim_rtg = chunk->victim_rtg; - atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); + atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref); atomic_inc(&chunk->victim_rtg->rtg_gccount); chunk->offset += split_len; @@ -888,7 +888,7 @@ 
xfs_zone_gc_finish_reset( goto out; } - xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE); atomic_inc(&zi->zi_nr_free_zones); xfs_zoned_add_available(mp, rtg_blocks(rtg)); @@ -917,7 +917,7 @@ xfs_submit_zone_reset_bio( XFS_STATS_INC(mp, xs_gc_zone_reset_calls); - bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + bio->bi_iter.bi_sector = xfs_gbno_to_daddr(rtg_group(rtg), 0); if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { /* * Also use the bio to drive the state machine when neither -- cgit v1.2.3 From fd81d3fd01a5ee4bd26a7dc440e7a2209277d14b Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Fri, 13 Feb 2026 08:50:06 +1000 Subject: xfs: fix code alignment issues in xfs_ondisk.c Fixup some code alignment issues in xfs_ondisk.c Signed-off-by: Wilfred Mallawa Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_ondisk.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 2e9715cc1641..70605019383c 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -73,7 +73,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64); - XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_entry, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_hdr, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_map, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_local, 4); @@ -116,7 +116,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_da_intnode, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_entry, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_hdr, 16); - XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16); XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, freetag, 0); XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, length, 2); -- cgit v1.2.3 From 03a6d6c4c85d2758534638fb2bb5f72e0f8877d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Feb 2026 15:14:31 +0100 Subject: xfs: cleanup inode counter stats Most of them are unused, so mark them as such. Give the remaining ones names that match their use instead of the historic IRIX ones based on vnodes. Note that the names are purely internal to the XFS code, the user interface is based on section names and arrays of counters. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_icache.c | 6 +++--- fs/xfs/xfs_stats.c | 10 +++++----- fs/xfs/xfs_stats.h | 16 ++++++++-------- fs/xfs/xfs_super.c | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index dbaab4ae709f..f76c6decdaa3 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -106,7 +106,7 @@ xfs_inode_alloc( mapping_set_folio_min_order(VFS_I(ip)->i_mapping, M_IGEO(mp)->min_folio_order); - XFS_STATS_INC(mp, vn_active); + XFS_STATS_INC(mp, xs_inodes_active); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(ip->i_ino == 0); @@ -172,7 +172,7 @@ __xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list)); - XFS_STATS_DEC(ip->i_mount, vn_active); + XFS_STATS_DEC(ip->i_mount, xs_inodes_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } @@ -2234,7 +2234,7 @@ xfs_inode_mark_reclaimable( struct xfs_mount *mp = ip->i_mount; bool need_inactive; - XFS_STATS_INC(mp, vn_reclaim); + XFS_STATS_INC(mp, xs_inode_mark_reclaimable); /* * We should never get here with any of the reclaim flags already set. diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 017db0361cd8..bc4a5d6dc795 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -42,7 +42,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "xstrat", xfsstats_offset(xs_write_calls) }, { "rw", xfsstats_offset(xs_attr_get) }, { "attr", xfsstats_offset(xs_iflush_count)}, - { "icluster", xfsstats_offset(vn_active) }, + { "icluster", xfsstats_offset(xs_inodes_active) }, { "vnodes", xfsstats_offset(xb_get) }, { "buf", xfsstats_offset(xs_abtb_2) }, { "abtb2", xfsstats_offset(xs_abtc_2) }, @@ -100,15 +100,15 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) void xfs_stats_clearall(struct xfsstats __percpu *stats) { int c; - uint32_t vn_active; + uint32_t xs_inodes_active; xfs_notice(NULL, "Clearing xfsstats"); for_each_possible_cpu(c) { preempt_disable(); - /* save vn_active, it's a universal truth! */ - vn_active = per_cpu_ptr(stats, c)->s.vn_active; + /* save xs_inodes_active, it's a universal truth! 
*/ + xs_inodes_active = per_cpu_ptr(stats, c)->s.xs_inodes_active; memset(per_cpu_ptr(stats, c), 0, sizeof(*stats)); - per_cpu_ptr(stats, c)->s.vn_active = vn_active; + per_cpu_ptr(stats, c)->s.xs_inodes_active = xs_inodes_active; preempt_enable(); } } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 153d2381d0a8..64bc0cc18126 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -100,14 +100,14 @@ struct __xfsstats { uint32_t xs_iflush_count; uint32_t xs_icluster_flushcnt; uint32_t xs_icluster_flushinode; - uint32_t vn_active; /* # vnodes not on free lists */ - uint32_t vn_alloc; /* # times vn_alloc called */ - uint32_t vn_get; /* # times vn_get called */ - uint32_t vn_hold; /* # times vn_hold called */ - uint32_t vn_rele; /* # times vn_rele called */ - uint32_t vn_reclaim; /* # times vn_reclaim called */ - uint32_t vn_remove; /* # times vn_remove called */ - uint32_t vn_free; /* # times vn_free called */ + uint32_t xs_inodes_active; + uint32_t __unused_vn_alloc; + uint32_t __unused_vn_get; + uint32_t __unused_vn_hold; + uint32_t xs_inode_destroy; + uint32_t xs_inode_destroy2; /* same as xs_inode_destroy */ + uint32_t xs_inode_mark_reclaimable; + uint32_t __unused_vn_free; uint32_t xb_get; uint32_t xb_create; uint32_t xb_get_locked; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index abc45f860a73..f8de44443e81 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -712,8 +712,8 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); ASSERT(!rwsem_is_locked(&inode->i_rwsem)); - XFS_STATS_INC(ip->i_mount, vn_rele); - XFS_STATS_INC(ip->i_mount, vn_remove); + XFS_STATS_INC(ip->i_mount, xs_inode_destroy); + XFS_STATS_INC(ip->i_mount, xs_inode_destroy2); xfs_inode_mark_reclaimable(ip); } -- cgit v1.2.3 From 47553dd60b1da88df2354f841a4f71dd4de6478a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Feb 2026 15:14:32 +0100 Subject: xfs: remove metafile inodes from the active inode stat The active inode (or active vnode until recently) stat can get much larger than expected on file systems with a lot of metafile inodes like zoned file systems on SMR hard disks with 10.000s of rtg rmap inodes. Remove all metafile inodes from the active counter to make it more useful to track actual workloads and add a separate counter for active metafile inodes. This fixes xfs/177 on SMR hard drives. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_inode_buf.c | 4 ++++ fs/xfs/libxfs/xfs_metafile.c | 5 +++++ fs/xfs/xfs_icache.c | 5 ++++- fs/xfs/xfs_stats.c | 11 ++++++++--- fs/xfs/xfs_stats.h | 3 ++- 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index a017016e9075..3794e5412eba 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -268,6 +268,10 @@ xfs_inode_from_disk( } if (xfs_is_reflink_inode(ip)) xfs_ifork_init_cow(ip); + if (xfs_is_metadir_inode(ip)) { + XFS_STATS_DEC(ip->i_mount, xs_inodes_active); + XFS_STATS_INC(ip->i_mount, xs_inodes_meta); + } return 0; out_destroy_data_fork: diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index cf239f862212..71f004e9dc64 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -61,6 +61,9 @@ xfs_metafile_set_iflag( ip->i_diflags2 |= XFS_DIFLAG2_METADATA; ip->i_metatype = metafile_type; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_DEC(ip->i_mount, xs_inodes_active); + XFS_STATS_INC(ip->i_mount, xs_inodes_meta); } /* Clear the metadata directory inode flag. */ @@ -74,6 +77,8 @@ xfs_metafile_clear_iflag( ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + XFS_STATS_INC(ip->i_mount, xs_inodes_active); + XFS_STATS_DEC(ip->i_mount, xs_inodes_meta); } /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f76c6decdaa3..f2d4294efd37 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -172,7 +172,10 @@ __xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list)); - XFS_STATS_DEC(ip->i_mount, xs_inodes_active); + if (xfs_is_metadir_inode(ip)) + XFS_STATS_DEC(ip->i_mount, xs_inodes_meta); + else + XFS_STATS_DEC(ip->i_mount, xs_inodes_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index bc4a5d6dc795..c13d600732c9 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -59,7 +59,8 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "rtrefcntbt", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ { "qm", xfsstats_offset(xs_gc_read_calls)}, - { "zoned", xfsstats_offset(__pad1)}, + { "zoned", xfsstats_offset(xs_inodes_meta)}, + { "metafile", xfsstats_offset(xs_xstrat_bytes)}, }; /* Loop over all stats groups */ @@ -99,16 +100,20 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) void xfs_stats_clearall(struct xfsstats __percpu *stats) { + uint32_t xs_inodes_active, xs_inodes_meta; int c; - uint32_t xs_inodes_active; xfs_notice(NULL, "Clearing xfsstats"); for_each_possible_cpu(c) { preempt_disable(); - /* save xs_inodes_active, it's a universal truth! */ + /* + * Save the active / meta inode counters, as they are stateful. 
+ */ xs_inodes_active = per_cpu_ptr(stats, c)->s.xs_inodes_active; + xs_inodes_meta = per_cpu_ptr(stats, c)->s.xs_inodes_meta; memset(per_cpu_ptr(stats, c), 0, sizeof(*stats)); per_cpu_ptr(stats, c)->s.xs_inodes_active = xs_inodes_active; + per_cpu_ptr(stats, c)->s.xs_inodes_meta = xs_inodes_meta; preempt_enable(); } } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 64bc0cc18126..57c32b86c358 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -142,7 +142,8 @@ struct __xfsstats { uint32_t xs_gc_read_calls; uint32_t xs_gc_write_calls; uint32_t xs_gc_zone_reset_calls; - uint32_t __pad1; +/* Metafile counters */ + uint32_t xs_inodes_meta; /* Extra precision counters */ uint64_t xs_xstrat_bytes; uint64_t xs_write_bytes; -- cgit v1.2.3 From cddfa648f1ab99e30e91455be19cd5ade26338c2 Mon Sep 17 00:00:00 2001 From: Ethan Tidmore Date: Thu, 19 Feb 2026 21:38:25 -0600 Subject: xfs: Fix error pointer dereference The function try_lookup_noperm() can return an error pointer and is not checked for one. Add checks for error pointer in xrep_adoption_check_dcache() and xrep_adoption_zap_dcache(). Detected by Smatch: fs/xfs/scrub/orphanage.c:449 xrep_adoption_check_dcache() error: 'd_child' dereferencing possible ERR_PTR() fs/xfs/scrub/orphanage.c:485 xrep_adoption_zap_dcache() error: 'd_child' dereferencing possible ERR_PTR() Fixes: 73597e3e42b4 ("xfs: ensure dentry consistency when the orphanage adopts a file") Cc: stable@vger.kernel.org # v6.16 Signed-off-by: Ethan Tidmore Reviewed-by: Darrick J. Wong Reviewed-by: Nirjhar Roy (IBM) Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/orphanage.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 52a108f6d5f4..33c6db6b4498 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -442,6 +442,11 @@ xrep_adoption_check_dcache( return 0; d_child = try_lookup_noperm(&qname, d_orphanage); + if (IS_ERR(d_child)) { + dput(d_orphanage); + return PTR_ERR(d_child); + } + if (d_child) { trace_xrep_adoption_check_child(sc->mp, d_child); @@ -479,7 +484,7 @@ xrep_adoption_zap_dcache( return; d_child = try_lookup_noperm(&qname, d_orphanage); - while (d_child != NULL) { + while (!IS_ERR_OR_NULL(d_child)) { trace_xrep_adoption_invalidate_child(sc->mp, d_child); ASSERT(d_is_negative(d_child)); -- cgit v1.2.3 From e764dd439d68cfc16724e469db390d779ab49521 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 18 Feb 2026 15:25:35 -0800 Subject: xfs: fix copy-paste error in previous fix Chris Mason noticed that there is a copy-paste error in a recent change to xrep_dir_teardown that nulls out pointers after freeing the resources. Fixes: ba408d299a3bb3c ("xfs: only call xf{array,blob}_destroy if we have a valid pointer") Link: https://lore.kernel.org/linux-xfs/20260205194211.2307232-1-clm@meta.com/ Reported-by: Chris Mason Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/dir_repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c index 9dc55c918c78..23b80c54aa60 100644 --- a/fs/xfs/scrub/dir_repair.c +++ b/fs/xfs/scrub/dir_repair.c @@ -177,7 +177,7 @@ xrep_dir_teardown( rd->dir_names = NULL; if (rd->dir_entries) xfarray_destroy(rd->dir_entries); - rd->dir_names = NULL; + rd->dir_entries = NULL; } /* Set up for a directory repair. 
*/ -- cgit v1.2.3 From 161456987a1fe4ad73c3f36dec1f684316ac9bdd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 18 Feb 2026 15:25:35 -0800 Subject: xfs: fix xfs_group release bug in xfs_verify_report_losses Chris Mason reports that his AI tools noticed that we were using xfs_perag_put and xfs_group_put to release the group reference returned by xfs_group_next_range. However, the iterator function returns an object with an active refcount, which means that we must use the correct function to release the active refcount, which is _rele. Fixes: b8accfd65d31f2 ("xfs: add media verification ioctl") Reported-by: Chris Mason Link: https://lore.kernel.org/linux-xfs/20260206030527.2506821-1-clm@meta.com/ Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_verify_media.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_verify_media.c b/fs/xfs/xfs_verify_media.c index 069cd371619d..8bbd4ec567f8 100644 --- a/fs/xfs/xfs_verify_media.c +++ b/fs/xfs/xfs_verify_media.c @@ -122,7 +122,7 @@ xfs_verify_report_losses( error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); if (error) { - xfs_perag_put(pag); + xfs_perag_rele(pag); break; } @@ -158,7 +158,7 @@ xfs_verify_report_losses( if (rtg) xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); if (error) { - xfs_group_put(xg); + xfs_group_rele(xg); break; } } -- cgit v1.2.3 From eb8550fb75a875657dc29e3925a40244ec6b6bd6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 18 Feb 2026 15:25:36 -0800 Subject: xfs: fix xfs_group release bug in xfs_dax_notify_dev_failure Chris Mason reports that his AI tools noticed that we were using xfs_perag_put and xfs_group_put to release the group reference returned by xfs_group_next_range. However, the iterator function returns an object with an active refcount, which means that we must use the correct function to release the active refcount, which is _rele. Cc: # v6.0 Fixes: 6f643c57d57c56 ("xfs: implement ->notify_failure() for XFS") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_notify_failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 6be19fa1ebe2..64c8afb935c2 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -304,7 +304,7 @@ xfs_dax_notify_dev_failure( error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); if (error) { - xfs_perag_put(pag); + xfs_perag_rele(pag); break; } @@ -340,7 +340,7 @@ xfs_dax_notify_dev_failure( if (rtg) xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); if (error) { - xfs_group_put(xg); + xfs_group_rele(xg); break; } } -- cgit v1.2.3 From 94014a23e91a3944947048169ccf38b4561cfd0c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 18 Feb 2026 15:25:37 -0800 Subject: xfs: fix potential pointer access race in xfs_healthmon_get Pankaj Raghav asks about this code in xfs_healthmon_get: hm = mp->m_healthmon; if (hm && !refcount_inc_not_zero(&hm->ref)) hm = NULL; rcu_read_unlock(); return hm; (slightly edited to compress a mailing list thread) "Nit: Should we do a READ_ONCE(mp->m_healthmon) here to avoid any compiler tricks that can result in an undefined behaviour? I am not sure if I am being paranoid here. 
"So this is my understanding: RCU guarantees that we get a valid object (actual data of m_healthmon) but does not guarantee the compiler will not reread the pointer between checking if hm is !NULL and accessing the pointer as we are doing it lockless. "So just a barrier() call in rcu_read_lock is enough to make sure this doesn't happen and probably adding a READ_ONCE() is not needed?" After some initial confusion I concluded that he's correct. The compiler could very well eliminate the hm variable in favor of walking the pointers twice, turning the code into: if (mp->m_healthmon && !refcount_inc_not_zero(&mp->m_healthmon->ref)) If this happens, then xfs_healthmon_detach can sneak in between the two sides of the && expression and set mp->m_healthmon to NULL, and thereby cause a null pointer dereference crash. Fix this by using the rcu pointer assignment and dereference functions, which ensure that the proper reordering barriers are in place. Practically speaking, gcc seems to allocate an actual variable for hm and only reads mp->m_healthmon once (as intended), but we ought to be more explicit about requiring this. Reported-by: Pankaj Raghav Fixes: a48373e7d35a89f6f ("xfs: start creating infrastructure for health monitoring") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Pankaj Raghav Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_healthmon.c | 11 +++++++---- fs/xfs/xfs_mount.h | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c index e37c18cec372..4a06d6632f65 100644 --- a/fs/xfs/xfs_healthmon.c +++ b/fs/xfs/xfs_healthmon.c @@ -69,7 +69,7 @@ xfs_healthmon_get( struct xfs_healthmon *hm; rcu_read_lock(); - hm = mp->m_healthmon; + hm = rcu_dereference(mp->m_healthmon); if (hm && !refcount_inc_not_zero(&hm->ref)) hm = NULL; rcu_read_unlock(); @@ -110,13 +110,13 @@ xfs_healthmon_attach( struct xfs_healthmon *hm) { spin_lock(&xfs_healthmon_lock); - if (mp->m_healthmon != NULL) { + if (rcu_access_pointer(mp->m_healthmon) != NULL) { spin_unlock(&xfs_healthmon_lock); return -EEXIST; } refcount_inc(&hm->ref); - mp->m_healthmon = hm; + rcu_assign_pointer(mp->m_healthmon, hm); hm->mount_cookie = (uintptr_t)mp->m_super; spin_unlock(&xfs_healthmon_lock); @@ -128,13 +128,16 @@ STATIC void xfs_healthmon_detach( struct xfs_healthmon *hm) { + struct xfs_mount *mp; + spin_lock(&xfs_healthmon_lock); if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) { spin_unlock(&xfs_healthmon_lock); return; } - XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL; + mp = XFS_M((struct super_block *)hm->mount_cookie); + rcu_assign_pointer(mp->m_healthmon, NULL); hm->mount_cookie = DETACHED_MOUNT_COOKIE; spin_unlock(&xfs_healthmon_lock); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 61c71128d171..ddd4028be8d6 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -345,7 +345,7 @@ typedef struct xfs_mount { struct xfs_hooks m_dir_update_hooks; /* Private data referring to a health monitor object. */ - struct xfs_healthmon *m_healthmon; + struct xfs_healthmon __rcu *m_healthmon; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) -- cgit v1.2.3 From 75690e5fdd74fc4d2a4aec58be9a82aec7cee721 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 18 Feb 2026 15:25:38 -0800 Subject: xfs: don't report metadata inodes to fserror Internal metadata inodes are not exposed to userspace programs, so it makes no sense to pass them to the fserror functions (aka fsnotify). 
Instead, report metadata file problems as general filesystem corruption. Fixes: 5eb4cb18e445d0 ("xfs: convey metadata health events to the health monitor") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_health.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 169123772cb3..6475159eb930 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -314,6 +314,18 @@ xfs_rgno_mark_sick( xfs_rtgroup_put(rtg); } +static inline void xfs_inode_report_fserror(struct xfs_inode *ip) +{ + /* Report metadata inodes as general filesystem corruption */ + if (xfs_is_internal_inode(ip)) { + fserror_report_metadata(ip->i_mount->m_super, -EFSCORRUPTED, + GFP_NOFS); + return; + } + + fserror_report_file_metadata(VFS_I(ip), -EFSCORRUPTED, GFP_NOFS); +} + /* Mark the unhealthy parts of an inode. */ void xfs_inode_mark_sick( @@ -339,7 +351,7 @@ xfs_inode_mark_sick( inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); - fserror_report_file_metadata(VFS_I(ip), -EFSCORRUPTED, GFP_NOFS); + xfs_inode_report_fserror(ip); if (mask) xfs_healthmon_report_inode(ip, XFS_HEALTHMON_SICK, old_mask, mask); @@ -371,7 +383,7 @@ xfs_inode_mark_corrupt( inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); - fserror_report_file_metadata(VFS_I(ip), -EFSCORRUPTED, GFP_NOFS); + xfs_inode_report_fserror(ip); if (mask) xfs_healthmon_report_inode(ip, XFS_HEALTHMON_CORRUPT, old_mask, mask); -- cgit v1.2.3 From 115ea07b94d2f13942fbd93c6acde376db36b16a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 18 Feb 2026 15:25:38 -0800 Subject: xfs: don't report half-built inodes to fserror Sam Sun apparently found a syzbot way to fuzz a filesystem such that xfs_iget_cache_miss would free the inode before the fserror code could catch up. Frustratingly he doesn't use the syzbot dashboard so there's no C reproducer and not even a full error report, so I'm guessing that: Inodes that are being constructed or torn down inside XFS are not visible to the VFS. They should never be reported to fserror. Also, any inode that has been freshly allocated in _cache_miss should be marked INEW immediately because, well, it's an incompletely constructed inode that isn't yet visible to the VFS. Reported-by: Sam Sun Fixes: 5eb4cb18e445d0 ("xfs: convey metadata health events to the health monitor") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_health.c | 8 ++++++-- fs/xfs/xfs_icache.c | 9 ++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 6475159eb930..239b843e83d4 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -316,8 +316,12 @@ xfs_rgno_mark_sick( static inline void xfs_inode_report_fserror(struct xfs_inode *ip) { - /* Report metadata inodes as general filesystem corruption */ - if (xfs_is_internal_inode(ip)) { + /* + * Do not report inodes being constructed or freed, or metadata inodes, + * to fsnotify. 
+ */ + if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIM) || + xfs_is_internal_inode(ip)) { fserror_report_metadata(ip->i_mount->m_super, -EFSCORRUPTED, GFP_NOFS); return; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f2d4294efd37..a7a09e7eec81 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -639,6 +639,14 @@ xfs_iget_cache_miss( if (!ip) return -ENOMEM; + /* + * Set XFS_INEW as early as possible so that the health code won't pass + * the inode to the fserror code if the ondisk inode cannot be loaded. + * We're going to free the xfs_inode immediately if that happens, which + * would lead to UAF problems. + */ + xfs_iflags_set(ip, XFS_INEW); + error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags); if (error) goto out_destroy; @@ -716,7 +724,6 @@ xfs_iget_cache_miss( ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; - xfs_iflags_set(ip, XFS_INEW); /* insert the new inode */ spin_lock(&pag->pag_ici_lock); -- cgit v1.2.3 From 8baa9bccc0156d6952d337bf17f57ce15902dfe4 Mon Sep 17 00:00:00 2001 From: "Nirjhar Roy (IBM)" Date: Fri, 20 Feb 2026 12:23:58 +0530 Subject: xfs: Fix xfs_last_rt_bmblock() Bug description: If the size of the last rtgroup, i.e., the rtg passed to xfs_last_rt_bmblock(), is such that the last rtextent falls in the 0th word offset of a bmblock of the bitmap file tracking this (last) rtgroup, then in that case xfs_last_rt_bmblock() incorrectly returns the next bmblock number instead of the current/last used bmblock number. When xfs_last_rt_bmblock() incorrectly returns the next bmblock, the loop to grow/modify the bmblocks in xfs_growfs_rtg() doesn't execute and xfs_growfs basically does a nop in certain cases. xfs_growfs will do a nop when the new size of the fs will have the same number of rtgroups, i.e., we are only growing the last rtgroup. Reproduce: $ mkfs.xfs -m metadir=0 -r rtdev=/dev/loop1 /dev/loop0 \ -r size=32769b -f $ mount -o rtdev=/dev/loop1 /dev/loop0 /mnt/scratch $ xfs_growfs -R $(( 32769 + 1 )) /mnt/scratch $ xfs_info /mnt/scratch | grep rtextents $ # We can see that rtextents hasn't changed Fix: Fix this by returning the current/last used bmblock when the last rtgroup size is not a multiple of xfs_rtbitmap_rtx_per_rbmblock() and the next bmblock when the rtgroup size is a multiple of xfs_rtbitmap_rtx_per_rbmblock(), i.e., the existing blocks are completely used up. Also, I have renamed xfs_last_rt_bmblock() to xfs_last_rt_bmblock_to_extend() to signify that this function returns the bmblock number to extend and NOT always the last used bmblock number. Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Nirjhar Roy (IBM) Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_rtalloc.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index aab59f66384e..ae53ba2093b2 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1082,17 +1082,27 @@ xfs_last_rtgroup_extents( } /* - * Calculate the last rbmblock currently used. + * This will return the bitmap block number (indexed at 0) that will be + * extended/modified. There are 2 cases here: + * 1. The size of the rtg is such that it is a multiple of + * xfs_rtbitmap_rtx_per_rbmblock() i.e, an integral number of bitmap blocks + * are completely filled up. In this case, we should return + * 1 + (the last used bitmap block number). + * 2. The size of the rtg is not an multiple of xfs_rtbitmap_rtx_per_rbmblock().
+ Here we will return the block number of last used block number. In this + case, we will modify the last used bitmap block to extend the size of the + rtgroup. * * This also deals with the case where there were no rtextents before. */ static xfs_fileoff_t -xfs_last_rt_bmblock( +xfs_last_rt_bmblock_to_extend( struct xfs_rtgroup *rtg) { struct xfs_mount *mp = rtg_mount(rtg); xfs_rgnumber_t rgno = rtg_rgno(rtg); xfs_fileoff_t bmbno = 0; + unsigned int mod = 0; ASSERT(!mp->m_sb.sb_rgcount || rgno >= mp->m_sb.sb_rgcount - 1); @@ -1100,9 +1110,16 @@ xfs_last_rt_bmblock( xfs_rtxnum_t nrext = xfs_last_rtgroup_extents(mp); /* Also fill up the previous block if not entirely full. */ - bmbno = xfs_rtbitmap_blockcount_len(mp, nrext); - if (xfs_rtx_to_rbmword(mp, nrext) != 0) - bmbno--; + /* We are doing a -1 to convert it to a 0 based index */ + bmbno = xfs_rtbitmap_blockcount_len(mp, nrext) - 1; + div_u64_rem(nrext, xfs_rtbitmap_rtx_per_rbmblock(mp), &mod); + /* + * mod = 0 means that all the current blocks are full. So + * return the next block number to be used for the rtgroup + * growth. + */ + if (mod == 0) + bmbno++; } return bmbno; @@ -1207,7 +1224,8 @@ xfs_growfs_rtg( goto out_rele; } - for (bmbno = xfs_last_rt_bmblock(rtg); bmbno < bmblocks; bmbno++) { + for (bmbno = xfs_last_rt_bmblock_to_extend(rtg); bmbno < bmblocks; + bmbno++) { error = xfs_growfs_rt_bmblock(rtg, nrblocks, rextsize, bmbno); if (error) goto out_error; -- cgit v1.2.3 From ac1d977096a17d56c55bd7f90be48e81ac4cec3f Mon Sep 17 00:00:00 2001 From: "Nirjhar Roy (IBM)" Date: Fri, 20 Feb 2026 12:23:59 +0530 Subject: xfs: Add a comment in xfs_log_sb() Add a comment explaining why sb_frextents is updated outside the if (xfs_has_lazycount(mp)) check even though it is a lazy counter. RT groups are supported only in v5 filesystems, which always have lazy counters enabled - so putting it inside the if (xfs_has_lazycount(mp)) check is redundant. Suggested-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Nirjhar Roy (IBM) Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_sb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 38d16fe1f6d8..47322adb7690 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1347,6 +1347,9 @@ xfs_log_sb( * feature was introduced. This counter can go negative due to the way * we handle nearly-lockless reservations, so we must use the _positive * variant here to avoid writing out nonsense frextents. + * + * RT groups are only supported on v5 file systems, which always + * have lazy SB counters. */ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { mp->m_sb.sb_frextents = -- cgit v1.2.3 From c2368fc89a684be2900daaa2bbf68cbc147e8d3d Mon Sep 17 00:00:00 2001 From: "Nirjhar Roy (IBM)" Date: Fri, 20 Feb 2026 12:24:00 +0530 Subject: xfs: Update lazy counters in xfs_growfs_rt_bmblock() Update lazy counters in xfs_growfs_rt_bmblock() similar to the way it is done in xfs_growfs_data_private(). This is because the lazy counters are not always updated, and syncing the counters avoids inconsistencies between frextents and rtextents (the total realtime extent count). This will be more useful once realtime shrink is implemented, as it prevents a transient state where frextents might be greater than the total rtextents. Reviewed-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig Signed-off-by: Nirjhar Roy (IBM) Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_rtalloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ae53ba2093b2..153f3c378f9f 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1050,6 +1050,15 @@ xfs_growfs_rt_bmblock( */ xfs_trans_resv_calc(mp, &mp->m_resv); + /* + * Sync sb counters now to reflect the updated values. Lazy counters are + * not always updated and in order to avoid inconsistencies between + * frextents and rtextents, it is better to sync the counters. + */ + + if (xfs_has_lazysbcount(mp)) + xfs_log_sb(args.tp); + error = xfs_trans_commit(args.tp); if (error) goto out_free; -- cgit v1.2.3 From 9a654a8fa3191e9ea32c4494b943c0872a3f5d27 Mon Sep 17 00:00:00 2001 From: "Nirjhar Roy (IBM)" Date: Fri, 20 Feb 2026 12:24:01 +0530 Subject: xfs: Add comments for usages of some macros. Add comments explaining when to use XFS_IS_CORRUPT() and ASSERT() Suggested-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Nirjhar Roy (IBM) Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_platform.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/xfs_platform.h b/fs/xfs/xfs_platform.h index 1e59bf94d1f2..59a33c60e0ca 100644 --- a/fs/xfs/xfs_platform.h +++ b/fs/xfs/xfs_platform.h @@ -235,6 +235,10 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, #ifdef XFS_WARN +/* + * Please note that this ASSERT doesn't kill the kernel. It will if the kernel + * has panic_on_warn set. + */ #define ASSERT(expr) \ (likely(expr) ? (void)0 : asswarn(NULL, #expr, __FILE__, __LINE__)) @@ -245,6 +249,11 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, #endif /* XFS_WARN */ #endif /* DEBUG */ +/* + * Use this to catch metadata corruptions that are not caught by block or + * structure verifiers. The reason is that the verifiers check corruptions only + * within the scope of the object being verified. + */ #define XFS_IS_CORRUPT(mp, expr) \ (unlikely(expr) ? xfs_corruption_error(#expr, XFS_ERRLEVEL_LOW, (mp), \ NULL, 0, __FILE__, __LINE__, \ -- cgit v1.2.3 From e97cbf863d8918452c9f81bebdade8d04e2e7b60 Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Wed, 11 Feb 2026 13:29:02 +1000 Subject: xfs: remove duplicate static size checks In libxfs/xfs_ondisk.h, remove some duplicate entries of XFS_CHECK_STRUCT_SIZE(). 
Signed-off-by: Wilfred Mallawa Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_ondisk.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 70605019383c..7bccfa7b695c 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -136,16 +136,7 @@ xfs_check_ondisk_structs(void) /* ondisk dir/attr structures from xfs/122 */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_entry, 3); XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); - XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_unused, 6); - XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16); - XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16); - XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16); - XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8); - XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16); - XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4); - XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3); - XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10); /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); -- cgit v1.2.3 From 650b774cf94495465d6a38c31bb1a6ce697b6b37 Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Wed, 11 Feb 2026 13:29:04 +1000 Subject: xfs: add static size checks for ioctl UABI The ioctl structures in libxfs/xfs_fs.h are missing static size checks. It is useful to have static size checks for these structures as adding new fields to them could cause issues (e.g. extra padding that may be inserted by the compiler). So add these checks to xfs/xfs_ondisk.h. Due to different padding/alignment requirements across different architectures, to avoid build failures, some structures are omitted from the size checks. For example, structures with "compat_" definitions in xfs/xfs_ioctl32.h are omitted. Signed-off-by: Wilfred Mallawa Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_ondisk.h | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 7bccfa7b695c..23cde1248f01 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -208,11 +208,6 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0); XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0); - XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat, 192); - XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24); - XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64); - XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64); - /* * Make sure the incore inode timestamp range corresponds to hand * converted values based on the ondisk format specification. @@ -292,6 +287,40 @@ xfs_check_ondisk_structs(void) XFS_CHECK_SB_OFFSET(sb_pad, 281); XFS_CHECK_SB_OFFSET(sb_rtstart, 288); XFS_CHECK_SB_OFFSET(sb_rtreserved, 296); + + /* + * ioctl UABI + * + * Due to different padding/alignment requirements across + * different architectures, some structures are ommited from + * the size checks. In addition, structures with architecture + * dependent size fields are also ommited (e.g. __kernel_long_t).
+ */ + XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat, 192); + XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24); + XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64); + XFS_CHECK_STRUCT_SIZE(struct dioattr, 12); + XFS_CHECK_STRUCT_SIZE(struct getbmap, 32); + XFS_CHECK_STRUCT_SIZE(struct getbmapx, 48); + XFS_CHECK_STRUCT_SIZE(struct xfs_attrlist_cursor, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_attrlist, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_attrlist, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_attrlist_ent, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_ag_geometry, 128); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtgroup_geometry, 128); + XFS_CHECK_STRUCT_SIZE(struct xfs_error_injection, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_fsop_geom, 256); + XFS_CHECK_STRUCT_SIZE(struct xfs_fsop_geom_v4, 112); + XFS_CHECK_STRUCT_SIZE(struct xfs_fsop_counts, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_fsop_resblks, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_growfs_log, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_bulk_ireq, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_fs_eofblocks, 128); + XFS_CHECK_STRUCT_SIZE(struct xfs_fsid, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_scrub_metadata, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_scrub_vec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_scrub_vec_head, 40); } #endif /* __XFS_ONDISK_H */ -- cgit v1.2.3 From 6b050482ec40569429d963ac52afa878691b04c9 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 24 Feb 2026 16:17:52 -0800 Subject: cpufreq: intel_pstate: Fix crash during turbo disable When the system is booted with kernel command line argument "nosmt" or "maxcpus" to limit the number of CPUs, disabling turbo via: echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo results in a crash: PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI ... RIP: 0010:store_no_turbo+0x100/0x1f0 ... This occurs because for_each_possible_cpu() returns CPUs even if they are not online. For those CPUs, all_cpu_data[] will be NULL. Since commit 973207ae3d7c ("cpufreq: intel_pstate: Rearrange max frequency updates handling code"), all_cpu_data[] is dereferenced even for CPUs which are not online, causing the NULL pointer dereference. To fix that, pass CPU number to intel_pstate_update_max_freq() and use all_cpu_data[] for those CPUs for which there is a valid cpufreq policy. Fixes: 973207ae3d7c ("cpufreq: intel_pstate: Rearrange max frequency updates handling code") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221068 Signed-off-by: Srinivas Pandruvada Cc: 6.16+ # 6.16+ Link: https://patch.msgid.link/20260225001752.890164-1-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index bdc37080d319..11c58af41900 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1476,13 +1476,13 @@ static void __intel_pstate_update_max_freq(struct cpufreq_policy *policy, refresh_frequency_limits(policy); } -static bool intel_pstate_update_max_freq(struct cpudata *cpudata) +static bool intel_pstate_update_max_freq(int cpu) { - struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); + struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); if (!policy) return false; - __intel_pstate_update_max_freq(policy, cpudata); + __intel_pstate_update_max_freq(policy, all_cpu_data[cpu]); return true; } @@ -1501,7 +1501,7 @@ static void intel_pstate_update_limits_for_all(void) int cpu; for_each_possible_cpu(cpu) - intel_pstate_update_max_freq(all_cpu_data[cpu]); + intel_pstate_update_max_freq(cpu); mutex_lock(&hybrid_capacity_lock); @@ -1910,7 +1910,7 @@ static void intel_pstate_notify_work(struct work_struct *work) struct cpudata *cpudata = container_of(to_delayed_work(work), struct cpudata, hwp_notify_work); - if (intel_pstate_update_max_freq(cpudata)) { + if (intel_pstate_update_max_freq(cpudata->cpu)) { /* * The driver will not be unregistered while this function is * running, so update the capacity without acquiring the driver -- cgit v1.2.3 From c9bc1753b3cc41d0e01fbca7f035258b5f4db0ae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 13:29:09 +0100 Subject: perf: Fix __perf_event_overflow() vs perf_remove_from_context() race Make sure that __perf_event_overflow() runs with IRQs disabled for all possible callchains. Specifically the software events can end up running it with only preemption disabled. This opens up a race vs perf_event_exit_event() and friends that will go and free various things the overflow path expects to be present, like the BPF program. Fixes: 592903cdcbf6 ("perf_counter: add an event_list") Reported-by: Simond Hu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Simond Hu Link: https://patch.msgid.link/20260224122909.GV1395416@noisy.programming.kicks-ass.net --- kernel/events/core.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 22a0f405585b..1f5699b339ec 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10777,6 +10777,13 @@ int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + /* + * Entry point from hardware PMI, interrupts should be disabled here. + * This serializes us against perf_event_remove_from_context() in + * things like perf_event_release_kernel(). + */ + lockdep_assert_irqs_disabled(); + return __perf_event_overflow(event, 1, data, regs); } @@ -10853,6 +10860,19 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, { struct hw_perf_event *hwc = &event->hw; + /* + * This is: + * - software preempt + * - tracepoint preempt + * - tp_target_task irq (ctx->lock) + * - uprobes preempt/irq + * - kprobes preempt/irq + * - hw_breakpoint irq + * + * Any of these are sufficient to hold off RCU and thus ensure @event + * exists. 
+ */ + lockdep_assert_preemption_disabled(); local64_add(nr, &event->count); if (!regs) @@ -10861,6 +10881,16 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!is_sampling_event(event)) return; + /* + * Serialize against event_function_call() IPIs like normal overflow + * event handling. Specifically, must not allow + * perf_event_release_kernel() -> perf_remove_from_context() to make + * progress and 'release' the event from under us. + */ + guard(irqsave)(); + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { data->period = nr; return perf_swevent_overflow(event, 1, data, regs); @@ -11359,6 +11389,11 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct perf_sample_data data; struct perf_event *event; + /* + * Per being a tracepoint, this runs with preemption disabled. + */ + lockdep_assert_preemption_disabled(); + struct perf_raw_record raw = { .frag = { .size = entry_size, @@ -11691,6 +11726,11 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; + /* + * Exception context, will have interrupts disabled. + */ + lockdep_assert_irqs_disabled(); + perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) @@ -12155,7 +12195,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) - if (__perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) ret = HRTIMER_NORESTART; } -- cgit v1.2.3 From ab6088e7a95943af3452b20e3b96caaaef3eeebd Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 24 Feb 2026 16:16:30 -0700 Subject: lib/Kconfig.debug: Require a release version of LLVM 22 for context analysis Using a prerelease version as a minimum supported version for CONFIG_WARN_CONTEXT_ANALYSIS was reasonable to do while LLVM 22 was the development version so that people could immediately build from main and start testing and validating this in their own code. However, it can be problematic when using prerelease versions of LLVM 22, such as Android clang 22.0.1 (the current android mainline compiler) or when bisecting LLVM between llvmorg-22-init and llvmorg-23-init, to build the kernel, as all compiler fixes for the context analysis may not be present, potentially resulting in warnings that can easily turn into errors. Now that LLVM 22 is released as 22.1.0, upgrade the check to require at least this version to ensure that a user's toolchain actually has all the changes needed for a smooth experience with context analysis. 
Fixes: 3269701cb256 ("compiler-context-analysis: Add infrastructure for Context Analysis with Clang") Signed-off-by: Nathan Chancellor Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver Link: https://patch.msgid.link/20260224-bump-clang-ver-context-analysis-v1-1-16cc7a90a040@kernel.org --- lib/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4e2dfbbd3d78..8e2b858078e6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -630,7 +630,7 @@ config DEBUG_FORCE_WEAK_PER_CPU config WARN_CONTEXT_ANALYSIS bool "Compiler context-analysis warnings" - depends on CC_IS_CLANG && CLANG_VERSION >= 220000 + depends on CC_IS_CLANG && CLANG_VERSION >= 220100 # Branch profiling re-defines "if", which messes with the compiler's # ability to analyze __cond_acquires(..), resulting in false positives. depends on !TRACE_BRANCH_PROFILING @@ -641,7 +641,7 @@ config WARN_CONTEXT_ANALYSIS and releasing user-definable "context locks". Clang's name of the feature is "Thread Safety Analysis". Requires - Clang 22 or later. + Clang 22.1.0 or later. Produces warnings by default. Select CONFIG_WERROR if you wish to turn these warnings into errors. -- cgit v1.2.3 From e9fb2028f1eb563e653cff3b0d1c87c5e0203d45 Mon Sep 17 00:00:00 2001 From: Panagiotis Foliadis Date: Wed, 25 Feb 2026 14:53:43 +0000 Subject: ALSA: hda/intel: increase default bdl_pos_adj for Nvidia controllers The default bdl_pos_adj of 32 for Nvidia HDA controllers is insufficient on GA102 (and likely other recent Nvidia GPUs) after S3 suspend/resume. The controller's DMA timing degrades after resume, causing premature IRQ detection in azx_position_ok() which results in silent HDMI/DP audio output despite userspace reporting a valid playback state and correct ELD data. Increase bdl_pos_adj to 64 for AZX_DRIVER_NVIDIA, matching the value already used by Intel Apollo Lake for the same class of timing issue. Cc: stable@vger.kernel.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221069 Suggested-by: Charalampos Mitrodimas Signed-off-by: Panagiotis Foliadis Link: https://patch.msgid.link/20260225-nvidia-audio-fix-v1-1-b1383c37ec49@posteo.net Signed-off-by: Takashi Iwai --- sound/hda/controllers/intel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/hda/controllers/intel.c b/sound/hda/controllers/intel.c index 6fddf400c4a3..3f434994c18d 100644 --- a/sound/hda/controllers/intel.c +++ b/sound/hda/controllers/intel.c @@ -1751,6 +1751,8 @@ static int default_bdl_pos_adj(struct azx *chip) return 1; case AZX_DRIVER_ZHAOXINHDMI: return 128; + case AZX_DRIVER_NVIDIA: + return 64; default: return 32; } -- cgit v1.2.3 From 85f6c439a69afe4fa8a688512e586971e97e273a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Feb 2026 10:35:57 +0000 Subject: io_uring/timeout: READ_ONCE sqe->addr We should use READ_ONCE when reading from a SQE, make sure timeout gets a stable timespec address. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/timeout.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 84dda24f3eb2..cb61d4862fc6 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -462,7 +462,7 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) tr->ltimeout = true; if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; - if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) + if (get_timespec64(&tr->ts, u64_to_user_ptr(READ_ONCE(sqe->addr2)))) return -EFAULT; if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) return -EINVAL; @@ -557,7 +557,7 @@ static int __io_timeout_prep(struct io_kiocb *req, data->req = req; data->flags = flags; - if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) + if (get_timespec64(&data->ts, u64_to_user_ptr(READ_ONCE(sqe->addr)))) return -EFAULT; if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) -- cgit v1.2.3 From 0d785e2c324c90662baa4fe07a0d02233ff92824 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Feb 2026 15:20:04 +0100 Subject: s390/idle: Fix cpu idle exit cpu time accounting With the conversion to generic entry [1] cpu idle exit cpu time accounting was converted from assembly to C. This introduced an reversed order of cpu time accounting. On cpu idle exit the current accounting happens with the following call chain: -> do_io_irq()/do_ext_irq() -> irq_enter_rcu() -> account_hardirq_enter() -> vtime_account_irq() -> vtime_account_kernel() vtime_account_kernel() accounts the passed cpu time since last_update_timer as system time, and updates last_update_timer to the current cpu timer value. However the subsequent call of -> account_idle_time_irq() will incorrectly subtract passed cpu time from timer_idle_enter to the updated last_update_timer value from system_timer. Then last_update_timer is updated to a sys_enter_timer, which means that last_update_timer goes back in time. Subsequently account_hardirq_exit() will account too much cpu time as hardirq time. The sum of all accounted cpu times is still correct, however some cpu time which was previously accounted as system time is now accounted as hardirq time, plus there is the oddity that last_update_timer goes back in time. Restore previous behavior by extracting cpu time accounting code from account_idle_time_irq() into a new update_timer_idle() function and call it before irq_enter_rcu(). 
Fixes: 56e62a737028 ("s390: convert to generic entry") [1] Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/idle.h | 1 + arch/s390/kernel/idle.c | 13 +++++++++---- arch/s390/kernel/irq.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index 09f763b9eb40..133059d9a949 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -23,5 +23,6 @@ extern struct device_attribute dev_attr_idle_count; extern struct device_attribute dev_attr_idle_time_us; void psw_idle(struct s390_idle_data *data, unsigned long psw_mask); +void update_timer_idle(void); #endif /* _S390_IDLE_H */ diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 39cb8d0ae348..0f9e53f0a068 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -21,11 +21,10 @@ static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); -void account_idle_time_irq(void) +void update_timer_idle(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); struct lowcore *lc = get_lowcore(); - unsigned long idle_time; u64 cycles_new[8]; int i; @@ -35,13 +34,19 @@ void account_idle_time_irq(void) this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); } - idle_time = lc->int_clock - idle->clock_idle_enter; - lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; lc->last_update_clock = lc->int_clock; lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; lc->last_update_timer = lc->sys_enter_timer; +} + +void account_idle_time_irq(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + unsigned long idle_time; + + idle_time = get_lowcore()->int_clock - idle->clock_idle_enter; /* Account time spent with enabled wait psw loaded as idle time. */ WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index f81723bc8856..d10a17e6531d 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -146,6 +146,10 @@ void noinstr do_io_irq(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); bool from_idle; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) + update_timer_idle(); + irq_enter_rcu(); if (user_mode(regs)) { @@ -154,7 +158,6 @@ void noinstr do_io_irq(struct pt_regs *regs) current->thread.last_break = regs->last_break; } - from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); @@ -182,6 +185,10 @@ void noinstr do_ext_irq(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); bool from_idle; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) + update_timer_idle(); + irq_enter_rcu(); if (user_mode(regs)) { @@ -194,7 +201,6 @@ void noinstr do_ext_irq(struct pt_regs *regs) regs->int_parm = get_lowcore()->ext_params; regs->int_parm_long = get_lowcore()->ext_params2; - from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); -- cgit v1.2.3 From dbc0fb35679ed5d0adecf7d02137ac2c77244b3b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Feb 2026 15:20:05 +0100 Subject: s390/vtime: Fix virtual timer forwarding Since delayed accounting of system time [1] the virtual timer is forwarded by do_account_vtime() but also vtime_account_kernel(), vtime_account_softirq(), and vtime_account_hardirq(). This leads to double accounting of system, guest, softirq, and hardirq time. 
Remove accounting from the vtime_account*() family to restore old behavior. There is only one user of the vtimer interface, which might explain why nobody noticed this so far. Fixes: b7394a5f4ce9 ("sched/cputime, s390: Implement delayed accounting of system time") [1] Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vtime.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 234a0ba30510..122d30b10440 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -225,10 +225,6 @@ static u64 vtime_delta(void) return timer - lc->last_update_timer; } -/* - * Update process times based on virtual cpu times stored by entry.S - * to the lowcore fields user_timer, system_timer & steal_clock. - */ void vtime_account_kernel(struct task_struct *tsk) { struct lowcore *lc = get_lowcore(); @@ -238,27 +234,17 @@ void vtime_account_kernel(struct task_struct *tsk) lc->guest_timer += delta; else lc->system_timer += delta; - - virt_timer_forward(delta); } EXPORT_SYMBOL_GPL(vtime_account_kernel); void vtime_account_softirq(struct task_struct *tsk) { - u64 delta = vtime_delta(); - - get_lowcore()->softirq_timer += delta; - - virt_timer_forward(delta); + get_lowcore()->softirq_timer += vtime_delta(); } void vtime_account_hardirq(struct task_struct *tsk) { - u64 delta = vtime_delta(); - - get_lowcore()->hardirq_timer += delta; - - virt_timer_forward(delta); + get_lowcore()->hardirq_timer += vtime_delta(); } /* -- cgit v1.2.3 From aefa6ec890f0d39a89fc80e95908ec1996337eee Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Feb 2026 15:20:06 +0100 Subject: s390/idle: Add comment for non obvious code Add a comment to update_timer_idle() which describes why wall time (not steal time) is added to steal_timer. This is not obvious and was reported by Frederic Weisbecker. Reported-by: Frederic Weisbecker Closes: https://lore.kernel.org/all/aXEVM-04lj0lntMr@localhost.localdomain/ Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/idle.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 0f9e53f0a068..4e09f126d4fc 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -34,6 +34,15 @@ void update_timer_idle(void) this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); } + /* + * This is a bit subtle: Forward last_update_clock so it excludes idle + * time. For correct steal time calculation in do_account_vtime() add + * passed wall time before idle_enter to steal_timer: + * During the passed wall time before idle_enter CPU time may have + * been accounted to system, hardirq, softirq, etc. lowcore fields. + * The accounted CPU times will be subtracted again from steal_timer + * when accumulated steal time is calculated in do_account_vtime(). 
+ */ lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; lc->last_update_clock = lc->int_clock; -- cgit v1.2.3 From 00d8b035eb71262abb547d47717b19f4a55ad4f2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Feb 2026 15:20:07 +0100 Subject: s390/idle: Slightly optimize idle time accounting Slightly optimize account_idle_time_irq() and update_timer_idle(): - Use fast single instruction __atomic64() primitives to update per cpu idle_time and idle_count, instead of READ_ONCE() / WRITE_ONCE() pairs - stcctm() is an inline assembly with a full memory barrier. This leads to a not necessary extra dereference of smp_cpu_mtid in update_timer_idle(). Avoid this and read smp_cpu_mtid into a variable - Use __this_cpu_add() instead of this_cpu_add() to avoid disabling / enabling of preemption several times in a loop in update_timer_idle(). Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/idle.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 4e09f126d4fc..627d82dd900e 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -26,12 +26,13 @@ void update_timer_idle(void) struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); struct lowcore *lc = get_lowcore(); u64 cycles_new[8]; - int i; + int i, mtid; - if (smp_cpu_mtid) { - stcctm(MT_DIAG, smp_cpu_mtid, cycles_new); - for (i = 0; i < smp_cpu_mtid; i++) - this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); + mtid = smp_cpu_mtid; + if (mtid) { + stcctm(MT_DIAG, mtid, cycles_new); + for (i = 0; i < mtid; i++) + __this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); } /* @@ -58,8 +59,8 @@ void account_idle_time_irq(void) idle_time = get_lowcore()->int_clock - idle->clock_idle_enter; /* Account time spent with enabled wait psw loaded as idle time. */ - WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); - WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1); + __atomic64_add(idle_time, &idle->idle_time); + __atomic64_add_const(1, &idle->idle_count); account_idle_time(cputime_to_nsecs(idle_time)); } -- cgit v1.2.3 From 257c14e5a1d8de40fded80fd8b525af55a6ded26 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Feb 2026 15:20:08 +0100 Subject: s390/idle: Inline update_timer_idle() Inline update_timer_idle() again to avoid an extra function call. This way the generated code is close to old assembler version again. 
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/idle.h | 3 ++- arch/s390/include/asm/vtime.h | 34 ++++++++++++++++++++++++++++++++++ arch/s390/kernel/entry.h | 2 -- arch/s390/kernel/idle.c | 34 ++-------------------------------- 4 files changed, 38 insertions(+), 35 deletions(-) diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index 133059d9a949..8c5aa7329461 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -19,10 +19,11 @@ struct s390_idle_data { unsigned long mt_cycles_enter[8]; }; +DECLARE_PER_CPU(struct s390_idle_data, s390_idle); + extern struct device_attribute dev_attr_idle_count; extern struct device_attribute dev_attr_idle_time_us; void psw_idle(struct s390_idle_data *data, unsigned long psw_mask); -void update_timer_idle(void); #endif /* _S390_IDLE_H */ diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h index 9d25fb35a042..b1db75d14e9d 100644 --- a/arch/s390/include/asm/vtime.h +++ b/arch/s390/include/asm/vtime.h @@ -2,6 +2,12 @@ #ifndef _S390_VTIME_H #define _S390_VTIME_H +#include +#include +#include + +DECLARE_PER_CPU(u64, mt_cycles[8]); + static inline void update_timer_sys(void) { struct lowcore *lc = get_lowcore(); @@ -20,4 +26,32 @@ static inline void update_timer_mcck(void) lc->last_update_timer = lc->mcck_enter_timer; } +static inline void update_timer_idle(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + struct lowcore *lc = get_lowcore(); + u64 cycles_new[8]; + int i, mtid; + + mtid = smp_cpu_mtid; + if (mtid) { + stcctm(MT_DIAG, mtid, cycles_new); + for (i = 0; i < mtid; i++) + __this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); + } + /* + * This is a bit subtle: Forward last_update_clock so it excludes idle + * time. For correct steal time calculation in do_account_vtime() add + * passed wall time before idle_enter to steal_timer: + * During the passed wall time before idle_enter CPU time may have + * been accounted to system, hardirq, softirq, etc. lowcore fields. + * The accounted CPU times will be subtracted again from steal_timer + * when accumulated steal time is calculated in do_account_vtime(). 
+ */ + lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; + lc->last_update_clock = lc->int_clock; + lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; + lc->last_update_timer = lc->sys_enter_timer; +} + #endif /* _S390_VTIME_H */ diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index dd55cc6bbc28..fb67b4abe68c 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -56,8 +56,6 @@ long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t); long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t); long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user *return_code, unsigned long flags); -DECLARE_PER_CPU(u64, mt_cycles[8]); - unsigned long stack_alloc(void); void stack_free(unsigned long stack); diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 627d82dd900e..1f1b06b6b4ef 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -15,41 +15,11 @@ #include #include #include +#include #include #include -#include "entry.h" -static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); - -void update_timer_idle(void) -{ - struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - struct lowcore *lc = get_lowcore(); - u64 cycles_new[8]; - int i, mtid; - - mtid = smp_cpu_mtid; - if (mtid) { - stcctm(MT_DIAG, mtid, cycles_new); - for (i = 0; i < mtid; i++) - __this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); - } - - /* - * This is a bit subtle: Forward last_update_clock so it excludes idle - * time. For correct steal time calculation in do_account_vtime() add - * passed wall time before idle_enter to steal_timer: - * During the passed wall time before idle_enter CPU time may have - * been accounted to system, hardirq, softirq, etc. lowcore fields. - * The accounted CPU times will be subtracted again from steal_timer - * when accumulated steal time is calculated in do_account_vtime(). - */ - lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; - lc->last_update_clock = lc->int_clock; - - lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; - lc->last_update_timer = lc->sys_enter_timer; -} +DEFINE_PER_CPU(struct s390_idle_data, s390_idle); void account_idle_time_irq(void) { -- cgit v1.2.3 From d8b5cf9c63143fae54a734c41e3bb55cf3f365c7 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Feb 2026 15:20:09 +0100 Subject: s390/irq/idle: Remove psw bits early Remove wait, io, external interrupt bits early in do_io_irq()/do_ext_irq() when previous context was idle. This saves one conditional branch and is closer to the original old assembly code. 
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/irq.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index d10a17e6531d..7fdf960191d3 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -147,8 +147,10 @@ void noinstr do_io_irq(struct pt_regs *regs) bool from_idle; from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); - if (from_idle) + if (from_idle) { update_timer_idle(); + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); + } irq_enter_rcu(); @@ -174,9 +176,6 @@ void noinstr do_io_irq(struct pt_regs *regs) set_irq_regs(old_regs); irqentry_exit(regs, state); - - if (from_idle) - regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); } void noinstr do_ext_irq(struct pt_regs *regs) @@ -186,8 +185,10 @@ void noinstr do_ext_irq(struct pt_regs *regs) bool from_idle; from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); - if (from_idle) + if (from_idle) { update_timer_idle(); + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); + } irq_enter_rcu(); @@ -209,9 +210,6 @@ void noinstr do_ext_irq(struct pt_regs *regs) irq_exit_rcu(); set_irq_regs(old_regs); irqentry_exit(regs, state); - - if (from_idle) - regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); } static void show_msi_interrupt(struct seq_file *p, int irq) -- cgit v1.2.3 From e282ccd308de7b2a65eb4596da8030c18b543071 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Feb 2026 15:20:10 +0100 Subject: s390/vtime: Use __this_cpu_read() / get rid of READ_ONCE() do_account_vtime() runs always with interrupts disabled, therefore use __this_cpu_read() instead of this_cpu_read() to get rid of a pointless preempt_disable() / preempt_enable() pair. Also there are no concurrent writers to the cpu time accounting fields in lowcore. Therefore get rid of READ_ONCE() usages. 
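For illustration, a minimal sketch of the distinction (simplified, not taken from the patch; it assumes a per-cpu u64 such as mt_scaling_jiffies declared in the same file and a caller that already runs with interrupts disabled):

/* Sketch only. */
static u64 read_mt_scaling_jiffies(void)
{
        /* Interrupts are off, so no preemption and no CPU migration here. */
        lockdep_assert_irqs_disabled();

        /*
         * The generic this_cpu_read() fallback would bracket the access
         * with preempt_disable()/preempt_enable(); __this_cpu_read() does
         * only the raw per-cpu access (plus a preemption check when
         * CONFIG_DEBUG_PREEMPT is enabled).
         */
        return __this_cpu_read(mt_scaling_jiffies);
}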
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vtime.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 122d30b10440..4111ff4d727c 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -137,23 +137,16 @@ static int do_account_vtime(struct task_struct *tsk) lc->system_timer += timer; /* Update MT utilization calculation */ - if (smp_cpu_mtid && - time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) + if (smp_cpu_mtid && time_after64(jiffies_64, __this_cpu_read(mt_scaling_jiffies))) update_mt_scaling(); /* Calculate cputime delta */ - user = update_tsk_timer(&tsk->thread.user_timer, - READ_ONCE(lc->user_timer)); - guest = update_tsk_timer(&tsk->thread.guest_timer, - READ_ONCE(lc->guest_timer)); - system = update_tsk_timer(&tsk->thread.system_timer, - READ_ONCE(lc->system_timer)); - hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, - READ_ONCE(lc->hardirq_timer)); - softirq = update_tsk_timer(&tsk->thread.softirq_timer, - READ_ONCE(lc->softirq_timer)); - lc->steal_timer += - clock - user - guest - system - hardirq - softirq; + user = update_tsk_timer(&tsk->thread.user_timer, lc->user_timer); + guest = update_tsk_timer(&tsk->thread.guest_timer, lc->guest_timer); + system = update_tsk_timer(&tsk->thread.system_timer, lc->system_timer); + hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, lc->hardirq_timer); + softirq = update_tsk_timer(&tsk->thread.softirq_timer, lc->softirq_timer); + lc->steal_timer += clock - user - guest - system - hardirq - softirq; /* Push account value */ if (user) { -- cgit v1.2.3 From 80f63fd09efe9d9c016fd6378116eb7394edd2f2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Feb 2026 15:20:11 +0100 Subject: s390/vtime: Use lockdep_assert_irqs_disabled() instead of BUG_ON() Use lockdep_assert_irqs_disabled() instead of BUG_ON(). This avoids crashing the kernel, and generates better code if CONFIG_PROVE_LOCKING is disabled. Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vtime.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 4111ff4d727c..bf48744d0912 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -48,8 +48,7 @@ static inline void set_vtimer(u64 expires) static inline int virt_timer_forward(u64 elapsed) { - BUG_ON(!irqs_disabled()); - + lockdep_assert_irqs_disabled(); if (list_empty(&virt_timer_list)) return 0; elapsed = atomic64_add_return(elapsed, &virt_timer_elapsed); -- cgit v1.2.3 From acbc0437ca3cf6c19005476a4f1f5a7c96ff4256 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Feb 2026 15:20:12 +0100 Subject: s390/idle: Remove psw_idle() prototype psw_idle() does not exist anymore. Remove its prototype. 
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/idle.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index 8c5aa7329461..32536ee34aa0 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -24,6 +24,4 @@ DECLARE_PER_CPU(struct s390_idle_data, s390_idle); extern struct device_attribute dev_attr_idle_count; extern struct device_attribute dev_attr_idle_time_us; -void psw_idle(struct s390_idle_data *data, unsigned long psw_mask); - #endif /* _S390_IDLE_H */ -- cgit v1.2.3 From 1623a554c68f352c17d0a358bc62580dc187f06b Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 23 Feb 2026 23:33:52 +0100 Subject: s390/kexec: Disable stack protector in s390_reset_system() s390_reset_system() calls set_prefix(0), which switches back to the absolute lowcore. At that point the stack protector canary no longer matches the canary from the lowcore the function was entered with, so the stack check fails. Mark s390_reset_system() __no_stack_protector. This is safe here since its callers (__do_machine_kdump() and __do_machine_kexec()) are effectively no-return and fall back to disabled_wait() on failure. Fixes: f5730d44e05e ("s390: Add stackprotector support") Reported-by: Nikita Dubrovskii Reviewed-by: Heiko Carstens Acked-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/kernel/ipl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index dcdc7e274848..049c557c452f 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -2377,7 +2377,7 @@ void __init setup_ipl(void) atomic_notifier_chain_register(&panic_notifier_list, &on_panic_nb); } -void s390_reset_system(void) +void __no_stack_protector s390_reset_system(void) { /* Disable prefixing */ set_prefix(0); -- cgit v1.2.3 From d879ac6756b662a085a743e76023c768c3241579 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 24 Feb 2026 07:41:07 +0100 Subject: s390/pfault: Fix virtual vs physical address confusion When Linux is running as guest, runs a user space process and the user space process accesses a page that the host has paged out, the guest gets a pfault interrupt and schedules a different process. Without this mechanism the host would have to suspend the whole virtual CPU until the page has been paged in. To setup the pfault interrupt the real address of parameter list should be passed to DIAGNOSE 0x258, but a virtual address is passed instead. That has a performance impact, since the pfault setup never succeeds, the interrupt is never delivered to a guest and the whole virtual CPU is suspended as result. 
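A minimal sketch of the addressing rule the fix applies (identifiers as used in the file below, otherwise simplified): the parameter block must be handed to DIAGNOSE 0x258 by its real address, so the kernel virtual pointer has to go through virt_to_phys() before it becomes the instruction operand.

/* Sketch only: translate before use, do not pass the virtual address. */
static unsigned long pfault_refbk_addr(struct pfault_refbk *refbk)
{
        return virt_to_phys(refbk);     /* operand for "diag ...,0x258" */
}

In the actual fix this simply means the inline assembly operand changes from the address of the refbk structure to virt_to_phys() of it, as shown in the hunks below.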
Cc: stable@vger.kernel.org Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") Reported-by: Claudio Imbrenda Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/mm/pfault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c index 2f829448c719..6ecd6b0a22a8 100644 --- a/arch/s390/mm/pfault.c +++ b/arch/s390/mm/pfault.c @@ -62,7 +62,7 @@ int __pfault_init(void) "0: nopr %%r7\n" EX_TABLE(0b, 0b) : [rc] "+d" (rc) - : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : [refbk] "a" (virt_to_phys(&pfault_init_refbk)), "m" (pfault_init_refbk) : "cc"); return rc; } @@ -84,7 +84,7 @@ void __pfault_fini(void) "0: nopr %%r7\n" EX_TABLE(0b, 0b) : - : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : [refbk] "a" (virt_to_phys(&pfault_fini_refbk)), "m" (pfault_fini_refbk) : "cc"); } -- cgit v1.2.3 From 3ea20672d23b21327266534e08ca10d8f281f4ab Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 24 Feb 2026 13:41:20 +0100 Subject: MAINTAINERS: Update contact with the kernel.org address Use the kernel.org address as a unified single entry to send patches to. At the same time, update mailmap to group all past contributions. Signed-off-by: Daniel Lezcano --- .mailmap | 4 ++++ MAINTAINERS | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.mailmap b/.mailmap index e1cf6bb85d33..463de4784ecb 100644 --- a/.mailmap +++ b/.mailmap @@ -210,6 +210,10 @@ Daniel Borkmann Daniel Borkmann Daniel Borkmann Daniel Borkmann +Daniel Lezcano +Daniel Lezcano +Daniel Lezcano +Daniel Lezcano Daniel Thompson Danilo Krummrich David Brownell diff --git a/MAINTAINERS b/MAINTAINERS index 55af015174a5..6c8752099924 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6280,7 +6280,7 @@ S: Maintained F: include/linux/clk.h CLOCKSOURCE, CLOCKEVENT DRIVERS -M: Daniel Lezcano +M: Daniel Lezcano M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Supported @@ -6669,7 +6669,7 @@ F: rust/kernel/cpu.rs CPU IDLE TIME MANAGEMENT FRAMEWORK M: "Rafael J. Wysocki" -M: Daniel Lezcano +M: Daniel Lezcano R: Christian Loehle L: linux-pm@vger.kernel.org S: Maintained @@ -6699,7 +6699,7 @@ F: arch/x86/kernel/msr.c CPUIDLE DRIVER - ARM BIG LITTLE M: Lorenzo Pieralisi -M: Daniel Lezcano +M: Daniel Lezcano L: linux-pm@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@ -6707,7 +6707,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git F: drivers/cpuidle/cpuidle-big_little.c CPUIDLE DRIVER - ARM EXYNOS -M: Daniel Lezcano +M: Daniel Lezcano M: Kukjin Kim R: Krzysztof Kozlowski L: linux-pm@vger.kernel.org @@ -26217,7 +26217,7 @@ F: drivers/media/radio/radio-raremono.c THERMAL M: Rafael J. 
Wysocki -M: Daniel Lezcano +M: Daniel Lezcano R: Zhang Rui R: Lukasz Luba L: linux-pm@vger.kernel.org @@ -26247,7 +26247,7 @@ F: drivers/thermal/amlogic_thermal.c THERMAL/CPU_COOLING M: Amit Daniel Kachhap -M: Daniel Lezcano +M: Daniel Lezcano M: Viresh Kumar R: Lukasz Luba L: linux-pm@vger.kernel.org -- cgit v1.2.3 From 085f067389d12bd9800c0a9672a174c1de7a8069 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 25 Feb 2026 01:15:23 +0000 Subject: cgroup/cpuset: fix null-ptr-deref in rebuild_sched_domains_cpuslocked A null-pointer-dereference bug was reported by syzbot: Oops: general protection fault, probably for address 0xdffffc0000000000: KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:bitmap_subset include/linux/bitmap.h:433 [inline] RIP: 0010:cpumask_subset include/linux/cpumask.h:836 [inline] RIP: 0010:rebuild_sched_domains_locked kernel/cgroup/cpuset.c:967 RSP: 0018:ffffc90003ecfbc0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000020 RDX: ffff888028de0000 RSI: ffffffff8200f003 RDI: ffffffff8df14f28 RBP: 0000000000000000 R08: 0000000000000cc0 R09: 00000000ffffffff R10: ffffffff8e7d95b3 R11: 0000000000000001 R12: 0000000000000000 R13: 00000000000f4240 R14: dffffc0000000000 R15: 0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2f463fff CR3: 000000003704c000 CR4: 00000000003526f0 Call Trace: rebuild_sched_domains_cpuslocked kernel/cgroup/cpuset.c:983 [inline] rebuild_sched_domains+0x21/0x40 kernel/cgroup/cpuset.c:990 sched_rt_handler+0xb5/0xe0 kernel/sched/rt.c:2911 proc_sys_call_handler+0x47f/0x5a0 fs/proc/proc_sysctl.c:600 new_sync_write fs/read_write.c:595 [inline] vfs_write+0x6ac/0x1070 fs/read_write.c:688 ksys_write+0x12a/0x250 fs/read_write.c:740 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x106/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f The issue occurs when generate_sched_domains() returns ndoms = 1 and doms = NULL due to a kmalloc failure. This leads to a null-pointer dereference when accessing doms in rebuild_sched_domains_locked(). Fix this by adding a NULL check for doms before accessing it. Fixes: 6ee43047e8ad ("cpuset: Remove unnecessary checks in rebuild_sched_domains_locked") Reported-by: syzbot+460792609a79c085f79f@syzkaller.appspotmail.com Acked-by: Waiman Long Signed-off-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index aa45351c6f38..271bb99b1b9d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1002,7 +1002,7 @@ void rebuild_sched_domains_locked(void) * offline CPUs, a warning is emitted and we return directly to * prevent the panic. */ - for (i = 0; i < ndoms; ++i) { + for (i = 0; doms && i < ndoms; i++) { if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) return; } -- cgit v1.2.3 From 712896ac4bce38a965a1c175f6e7804ed0381334 Mon Sep 17 00:00:00 2001 From: Li Li Date: Mon, 5 Jan 2026 06:47:28 +0000 Subject: idpf: increment completion queue next_to_clean in sw marker wait routine Currently, in idpf_wait_for_sw_marker_completion(), when an IDPF_TXD_COMPLT_SW_MARKER packet is found, the routine breaks out of the for loop and does not increment the next_to_clean counter. 
This causes the subsequent NAPI polls to run into the same IDPF_TXD_COMPLT_SW_MARKER packet again and print out the following: [ 23.261341] idpf 0000:05:00.0 eth1: Unknown TX completion type: 5 Instead, we should increment next_to_clean regardless when an IDPF_TXD_COMPLT_SW_MARKER packet is found. Tested: with the patch applied, we do not see the errors above from NAPI polls anymore. Fixes: 9d39447051a0 ("idpf: remove SW marker handling from NAPI") Signed-off-by: Li Li Reviewed-by: Aleksandr Loktionov Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 376050308b06..761a77510467 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2348,7 +2348,7 @@ void idpf_wait_for_sw_marker_completion(const struct idpf_tx_queue *txq) do { struct idpf_splitq_4b_tx_compl_desc *tx_desc; - struct idpf_tx_queue *target; + struct idpf_tx_queue *target = NULL; u32 ctype_gen, id; tx_desc = flow ? &complq->comp[ntc].common : @@ -2368,14 +2368,14 @@ void idpf_wait_for_sw_marker_completion(const struct idpf_tx_queue *txq) target = complq->txq_grp->txqs[id]; idpf_queue_clear(SW_MARKER, target); - if (target == txq) - break; next: if (unlikely(++ntc == complq->desc_count)) { ntc = 0; gen_flag = !gen_flag; } + if (target == txq) + break; } while (time_before(jiffies, timeout)); idpf_queue_assign(GEN_CHK, complq, gen_flag); -- cgit v1.2.3 From d11e5da2d6d87d123e339dc9af7bc1682fc533ce Mon Sep 17 00:00:00 2001 From: Li Li Date: Mon, 12 Jan 2026 23:09:43 +0000 Subject: idpf: skip deallocating bufq_sets from rx_qgrp if it is NULL In idpf_rxq_group_alloc(), if rx_qgrp->splitq.bufq_sets failed to get allocated: rx_qgrp->splitq.bufq_sets = kcalloc(vport->num_bufqs_per_qgrp, sizeof(struct idpf_bufq_set), GFP_KERNEL); if (!rx_qgrp->splitq.bufq_sets) { err = -ENOMEM; goto err_alloc; } idpf_rxq_group_rel() would attempt to deallocate it in idpf_rxq_sw_queue_rel(), causing a kernel panic: ``` [ 7.967242] early-network-sshd-n-rexd[3148]: knetbase: Info: [ 8.127804] BUG: kernel NULL pointer dereference, address: 00000000000000c0 ... [ 8.129779] RIP: 0010:idpf_rxq_group_rel+0x101/0x170 ... [ 8.133854] Call Trace: [ 8.133980] [ 8.134092] idpf_vport_queues_alloc+0x286/0x500 [ 8.134313] idpf_vport_open+0x4d/0x3f0 [ 8.134498] idpf_open+0x71/0xb0 [ 8.134668] __dev_open+0x142/0x260 [ 8.134840] netif_open+0x2f/0xe0 [ 8.135004] dev_open+0x3d/0x70 [ 8.135166] bond_enslave+0x5ed/0xf50 [ 8.135345] ? nla_put_ifalias+0x3d/0x90 [ 8.135533] ? kvfree_call_rcu+0xb5/0x3b0 [ 8.135725] ? kvfree_call_rcu+0xb5/0x3b0 [ 8.135916] do_set_master+0x114/0x160 [ 8.136098] do_setlink+0x412/0xfb0 [ 8.136269] ? security_sock_rcv_skb+0x2a/0x50 [ 8.136509] ? sk_filter_trim_cap+0x7c/0x320 [ 8.136714] ? skb_queue_tail+0x20/0x50 [ 8.136899] ? __nla_validate_parse+0x92/0xe50 [ 8.137112] ? security_capable+0x35/0x60 [ 8.137304] rtnl_newlink+0x95c/0xa00 [ 8.137483] ? __rtnl_unlock+0x37/0x70 [ 8.137664] ? netdev_run_todo+0x63/0x530 [ 8.137855] ? allocate_slab+0x280/0x870 [ 8.138044] ? security_capable+0x35/0x60 [ 8.138235] rtnetlink_rcv_msg+0x2e6/0x340 [ 8.138431] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 8.138650] netlink_rcv_skb+0x16a/0x1a0 [ 8.138840] netlink_unicast+0x20a/0x320 [ 8.139028] netlink_sendmsg+0x304/0x3b0 [ 8.139217] __sock_sendmsg+0x89/0xb0 [ 8.139399] ____sys_sendmsg+0x167/0x1c0 [ 8.139588] ? 
____sys_recvmsg+0xed/0x150 [ 8.139780] ___sys_sendmsg+0xdd/0x120 [ 8.139960] ? ___sys_recvmsg+0x124/0x1e0 [ 8.140152] ? rcutree_enqueue+0x1f/0xb0 [ 8.140341] ? rcutree_enqueue+0x1f/0xb0 [ 8.140528] ? call_rcu+0xde/0x2a0 [ 8.140695] ? evict+0x286/0x2d0 [ 8.140856] ? rcutree_enqueue+0x1f/0xb0 [ 8.141043] ? kmem_cache_free+0x2c/0x350 [ 8.141236] __x64_sys_sendmsg+0x72/0xc0 [ 8.141424] do_syscall_64+0x6f/0x890 [ 8.141603] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 8.141841] RIP: 0033:0x7f2799d21bd0 ... [ 8.149905] Kernel panic - not syncing: Fatal exception [ 8.175940] Kernel Offset: 0xf800000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 8.176425] Rebooting in 10 seconds.. ``` Tested: With this patch, the kernel panic no longer appears. Fixes: 95af467d9a4e ("idpf: configure resources for RX queues") Signed-off-by: Li Li Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 761a77510467..59aafadae3d0 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -1341,6 +1341,9 @@ static void idpf_txq_group_rel(struct idpf_q_vec_rsrc *rsrc) */ static void idpf_rxq_sw_queue_rel(struct idpf_rxq_group *rx_qgrp) { + if (!rx_qgrp->splitq.bufq_sets) + return; + for (unsigned int i = 0; i < rx_qgrp->splitq.num_bufq_sets; i++) { struct idpf_bufq_set *bufq_set = &rx_qgrp->splitq.bufq_sets[i]; -- cgit v1.2.3 From e403bf4013a665961bdfdf65193e3076c3d155a5 Mon Sep 17 00:00:00 2001 From: Li Li Date: Mon, 12 Jan 2026 23:09:44 +0000 Subject: idpf: skip deallocating txq group's txqs if it is NULL In idpf_txq_group_alloc(), if any txq group's txqs failed to allocate memory: for (j = 0; j < tx_qgrp->num_txq; j++) { tx_qgrp->txqs[j] = kzalloc(sizeof(*tx_qgrp->txqs[j]), GFP_KERNEL); if (!tx_qgrp->txqs[j]) goto err_alloc; } It would cause a NULL ptr kernel panic in idpf_txq_group_rel(): for (j = 0; j < txq_grp->num_txq; j++) { if (flow_sch_en) { kfree(txq_grp->txqs[j]->refillq); txq_grp->txqs[j]->refillq = NULL; } kfree(txq_grp->txqs[j]); txq_grp->txqs[j] = NULL; } [ 6.532461] BUG: kernel NULL pointer dereference, address: 0000000000000058 ... [ 6.534433] RIP: 0010:idpf_txq_group_rel+0xc9/0x110 ... [ 6.538513] Call Trace: [ 6.538639] [ 6.538760] idpf_vport_queues_alloc+0x75/0x550 [ 6.538978] idpf_vport_open+0x4d/0x3f0 [ 6.539164] idpf_open+0x71/0xb0 [ 6.539324] __dev_open+0x142/0x260 [ 6.539506] netif_open+0x2f/0xe0 [ 6.539670] dev_open+0x3d/0x70 [ 6.539827] bond_enslave+0x5ed/0xf50 [ 6.540005] ? rcutree_enqueue+0x1f/0xb0 [ 6.540193] ? call_rcu+0xde/0x2a0 [ 6.540375] ? barn_get_empty_sheaf+0x5c/0x80 [ 6.540594] ? __kfree_rcu_sheaf+0xb6/0x1a0 [ 6.540793] ? nla_put_ifalias+0x3d/0x90 [ 6.540981] ? kvfree_call_rcu+0xb5/0x3b0 [ 6.541173] ? kvfree_call_rcu+0xb5/0x3b0 [ 6.541365] do_set_master+0x114/0x160 [ 6.541547] do_setlink+0x412/0xfb0 [ 6.541717] ? security_sock_rcv_skb+0x2a/0x50 [ 6.541931] ? sk_filter_trim_cap+0x7c/0x320 [ 6.542136] ? skb_queue_tail+0x20/0x50 [ 6.542322] ? __nla_validate_parse+0x92/0xe50 ro[o t t o6 .d5e4f2a5u4l0t]- ? security_capable+0x35/0x60 [ 6.542792] rtnl_newlink+0x95c/0xa00 [ 6.542972] ? __rtnl_unlock+0x37/0x70 [ 6.543152] ? netdev_run_todo+0x63/0x530 [ 6.543343] ? allocate_slab+0x280/0x870 [ 6.543531] ? 
security_capable+0x35/0x60 [ 6.543722] rtnetlink_rcv_msg+0x2e6/0x340 [ 6.543918] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 6.544138] netlink_rcv_skb+0x16a/0x1a0 [ 6.544328] netlink_unicast+0x20a/0x320 [ 6.544516] netlink_sendmsg+0x304/0x3b0 [ 6.544748] __sock_sendmsg+0x89/0xb0 [ 6.544928] ____sys_sendmsg+0x167/0x1c0 [ 6.545116] ? ____sys_recvmsg+0xed/0x150 [ 6.545308] ___sys_sendmsg+0xdd/0x120 [ 6.545489] ? ___sys_recvmsg+0x124/0x1e0 [ 6.545680] ? rcutree_enqueue+0x1f/0xb0 [ 6.545867] ? rcutree_enqueue+0x1f/0xb0 [ 6.546055] ? call_rcu+0xde/0x2a0 [ 6.546222] ? evict+0x286/0x2d0 [ 6.546389] ? rcutree_enqueue+0x1f/0xb0 [ 6.546577] ? kmem_cache_free+0x2c/0x350 [ 6.546784] __x64_sys_sendmsg+0x72/0xc0 [ 6.546972] do_syscall_64+0x6f/0x890 [ 6.547150] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 6.547393] RIP: 0033:0x7fc1a3347bd0 ... [ 6.551375] RIP: 0010:idpf_txq_group_rel+0xc9/0x110 ... [ 6.578856] Rebooting in 10 seconds.. We should skip deallocating txqs[j] if it is NULL in the first place. Tested: with this patch, the kernel panic no longer appears. Fixes: 1c325aac10a8 ("idpf: configure resources for TX queues") Signed-off-by: Li Li Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 59aafadae3d0..7613f020b622 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -1316,6 +1316,9 @@ static void idpf_txq_group_rel(struct idpf_q_vec_rsrc *rsrc) struct idpf_txq_group *txq_grp = &rsrc->txq_grps[i]; for (unsigned int j = 0; j < txq_grp->num_txq; j++) { + if (!txq_grp->txqs[j]) + continue; + if (idpf_queue_has(FLOW_SCH_EN, txq_grp->txqs[j])) { kfree(txq_grp->txqs[j]->refillq); txq_grp->txqs[j]->refillq = NULL; -- cgit v1.2.3 From bc3b31977314433e4685ae92853d1b6e91ad7bc2 Mon Sep 17 00:00:00 2001 From: Li Li Date: Fri, 23 Jan 2026 06:58:06 +0000 Subject: idpf: nullify pointers after they are freed rss_data->rss_key needs to be nullified after it is freed. Checks like "if (!rss_data->rss_key)" in the code could fail if it is not nullified. Tested: built and booted the kernel. Fixes: 83f38f210b85 ("idpf: Fix RSS LUT NULL pointer crash on early ethtool operations") Signed-off-by: Li Li Reviewed-by: Aleksandr Loktionov Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index 94da5fbd56f1..7ce6a0e4acb6 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -1320,6 +1320,7 @@ static struct idpf_vport *idpf_vport_alloc(struct idpf_adapter *adapter, free_rss_key: kfree(rss_data->rss_key); + rss_data->rss_key = NULL; free_qreg_chunks: idpf_vport_deinit_queue_reg_chunks(adapter->vport_config[idx]); free_vector_idxs: -- cgit v1.2.3 From 1500a8662d2d41d6bb03e034de45ddfe6d7d362d Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Mon, 26 Jan 2026 21:55:59 +0000 Subject: idpf: change IRQ naming to match netdev and ethtool queue numbering The code uses the vidx for the IRQ name but that doesn't match ethtool reporting nor netdev naming, this makes it hard to tune the device and associate queues with IRQs. Sequentially requesting irqs starting from '0' makes the output consistent. 
This commit changes the interrupt numbering but preserves the name format, maintaining ABI compatibility. Existing tools relying on the old numbering are already non-functional, as they lack a useful correlation to the interrupts. Before: ethtool -L eth1 tx 1 combined 3 grep . /proc/irq/*/*idpf*/../smp_affinity_list /proc/irq/67/idpf-Mailbox-0/../smp_affinity_list:0-55,112-167 /proc/irq/68/idpf-eth1-TxRx-1/../smp_affinity_list:0 /proc/irq/70/idpf-eth1-TxRx-3/../smp_affinity_list:1 /proc/irq/71/idpf-eth1-TxRx-4/../smp_affinity_list:2 /proc/irq/72/idpf-eth1-Tx-5/../smp_affinity_list:3 ethtool -S eth1 | grep -v ': 0' NIC statistics: tx_q-0_pkts: 1002 tx_q-1_pkts: 2679 tx_q-2_pkts: 1113 tx_q-3_pkts: 1192 <----- tx_q-3 vs idpf-eth1-Tx-5 rx_q-0_pkts: 1143 rx_q-1_pkts: 3172 rx_q-2_pkts: 1074 After: ethtool -L eth1 tx 1 combined 3 grep . /proc/irq/*/*idpf*/../smp_affinity_list /proc/irq/67/idpf-Mailbox-0/../smp_affinity_list:0-55,112-167 /proc/irq/68/idpf-eth1-TxRx-0/../smp_affinity_list:0 /proc/irq/70/idpf-eth1-TxRx-1/../smp_affinity_list:1 /proc/irq/71/idpf-eth1-TxRx-2/../smp_affinity_list:2 /proc/irq/72/idpf-eth1-Tx-3/../smp_affinity_list:3 ethtool -S eth1 | grep -v ': 0' NIC statistics: tx_q-0_pkts: 118 tx_q-1_pkts: 134 tx_q-2_pkts: 228 tx_q-3_pkts: 138 <--- tx_q-3 matches idpf-eth1-Tx-3 rx_q-0_pkts: 111 rx_q-1_pkts: 366 rx_q-2_pkts: 120 Fixes: d4d558718266 ("idpf: initialize interrupts and enable vport") Signed-off-by: Brian Vazquez Reviewed-by: Brett Creeley Reviewed-by: Aleksandr Loktionov Reviewed-by: Paul Menzel Reviewed-by: Eric Dumazet Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 7613f020b622..b7e922fffc96 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -4077,7 +4077,7 @@ static int idpf_vport_intr_req_irq(struct idpf_vport *vport, continue; name = kasprintf(GFP_KERNEL, "%s-%s-%s-%d", drv_name, if_name, - vec_name, vidx); + vec_name, vector); err = request_irq(irq_num, idpf_vport_intr_clean_queues, 0, name, q_vector); -- cgit v1.2.3 From 2c31557336a8e4d209ed8d4513cef2c0f15e7ef4 Mon Sep 17 00:00:00 2001 From: Sreedevi Joshi Date: Tue, 13 Jan 2026 12:01:13 -0600 Subject: idpf: Fix flow rule delete failure due to invalid validation When deleting a flow rule using "ethtool -N delete ", idpf_sideband_action_ena() incorrectly validates fsp->ring_cookie even though ethtool doesn't populate this field for delete operations. The uninitialized ring_cookie may randomly match RX_CLS_FLOW_DISC or RX_CLS_FLOW_WAKE, causing validation to fail and preventing legitimate rule deletions. Remove the unnecessary sideband action enable check and ring_cookie validation during delete operations since action validation is not required when removing existing rules. 
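For illustration, a sketch of the hazard being removed (simplified, and assuming, as described above, that ethtool leaves ring_cookie unset for delete requests): validating a field the delete path never populates makes the outcome depend on stale memory contents.

/* Sketch only: on delete, only the rule location is meaningful. */
static bool del_request_rejected(const struct ethtool_rx_flow_spec *fsp)
{
        /* Can "match" purely by accident when ring_cookie holds stale data. */
        return fsp->ring_cookie == RX_CLS_FLOW_DISC ||
               fsp->ring_cookie == RX_CLS_FLOW_WAKE;
}

Dropping the check on the delete path avoids this, since removing an existing rule does not require re-validating its action.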
Fixes: ada3e24b84a0 ("idpf: add flow steering support") Signed-off-by: Sreedevi Joshi Reviewed-by: Aleksandr Loktionov Reviewed-by: Simon Horman Reviewed-by: Paul Menzel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_ethtool.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_ethtool.c b/drivers/net/ethernet/intel/idpf/idpf_ethtool.c index 1d78a621d65b..3275b0215655 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_ethtool.c +++ b/drivers/net/ethernet/intel/idpf/idpf_ethtool.c @@ -307,9 +307,6 @@ static int idpf_del_flow_steer(struct net_device *netdev, vport_config = vport->adapter->vport_config[np->vport_idx]; user_config = &vport_config->user_config; - if (!idpf_sideband_action_ena(vport, fsp)) - return -EOPNOTSUPP; - rule = kzalloc(struct_size(rule, rule_info, 1), GFP_KERNEL); if (!rule) return -ENOMEM; -- cgit v1.2.3 From 6aa07e23dd3ccd35a0100c06fcb6b6c3b01e7965 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Thu, 29 Jan 2026 12:00:26 +0800 Subject: ice: recap the VSI and QoS info after rebuild Fix IRDMA hardware initialization timeout (-110) after resume by separating VSI-dependent configuration from RDMA resource allocation, ensuring VSI is rebuilt before IRDMA accesses it. After resume from suspend, IRDMA hardware initialization fails: ice: IRDMA hardware initialization FAILED init_state=4 status=-110 Separate RDMA initialization into two phases: 1. ice_init_rdma() - Allocate resources only (no VSI/QoS access, no plug) 2. ice_rdma_finalize_setup() - Assign VSI/QoS info and plug device This allows: - ice_init_rdma() to stay in ice_resume() (mirrors ice_deinit_rdma() in ice_suspend()) - VSI assignment deferred until after ice_vsi_rebuild() completes - QoS info updated after ice_dcb_rebuild() completes - Device plugged only when control queues, VSI, and DCB are all ready Fixes: bc69ad74867db ("ice: avoid IRQ collision to fix init failure on ACPI S3 resume") Reviewed-by: Aleksandr Loktionov Signed-off-by: Aaron Ma Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 + drivers/net/ethernet/intel/ice/ice_idc.c | 44 ++++++++++++++++++++++++------- drivers/net/ethernet/intel/ice/ice_main.c | 7 ++++- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index def7efa15447..2b2b22af42be 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -987,6 +987,7 @@ int ice_schedule_reset(struct ice_pf *pf, enum ice_reset_req reset); void ice_print_link_msg(struct ice_vsi *vsi, bool isup); int ice_plug_aux_dev(struct ice_pf *pf); void ice_unplug_aux_dev(struct ice_pf *pf); +void ice_rdma_finalize_setup(struct ice_pf *pf); int ice_init_rdma(struct ice_pf *pf); void ice_deinit_rdma(struct ice_pf *pf); bool ice_is_wol_supported(struct ice_hw *hw); diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c index 420d45c2558b..ded029aa71d7 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc.c +++ b/drivers/net/ethernet/intel/ice/ice_idc.c @@ -360,6 +360,39 @@ void ice_unplug_aux_dev(struct ice_pf *pf) auxiliary_device_uninit(adev); } +/** + * ice_rdma_finalize_setup - Complete RDMA setup after VSI is ready + * @pf: ptr to ice_pf + * + * Sets VSI-dependent information and plugs aux device. + * Must be called after ice_init_rdma(), ice_vsi_rebuild(), and + * ice_dcb_rebuild() complete. 
+ */ +void ice_rdma_finalize_setup(struct ice_pf *pf) +{ + struct device *dev = ice_pf_to_dev(pf); + struct iidc_rdma_priv_dev_info *privd; + int ret; + + if (!ice_is_rdma_ena(pf) || !pf->cdev_info) + return; + + privd = pf->cdev_info->iidc_priv; + if (!privd || !pf->vsi || !pf->vsi[0] || !pf->vsi[0]->netdev) + return; + + /* Assign VSI info now that VSI is valid */ + privd->netdev = pf->vsi[0]->netdev; + privd->vport_id = pf->vsi[0]->vsi_num; + + /* Update QoS info after DCB has been rebuilt */ + ice_setup_dcb_qos_info(pf, &privd->qos_info); + + ret = ice_plug_aux_dev(pf); + if (ret) + dev_warn(dev, "Failed to plug RDMA aux device: %d\n", ret); +} + /** * ice_init_rdma - initializes PF for RDMA use * @pf: ptr to ice_pf @@ -398,22 +431,14 @@ int ice_init_rdma(struct ice_pf *pf) } cdev->iidc_priv = privd; - privd->netdev = pf->vsi[0]->netdev; privd->hw_addr = (u8 __iomem *)pf->hw.hw_addr; cdev->pdev = pf->pdev; - privd->vport_id = pf->vsi[0]->vsi_num; pf->cdev_info->rdma_protocol |= IIDC_RDMA_PROTOCOL_ROCEV2; - ice_setup_dcb_qos_info(pf, &privd->qos_info); - ret = ice_plug_aux_dev(pf); - if (ret) - goto err_plug_aux_dev; + return 0; -err_plug_aux_dev: - pf->cdev_info->adev = NULL; - xa_erase(&ice_aux_id, pf->aux_idx); err_alloc_xa: kfree(privd); err_privd_alloc: @@ -432,7 +457,6 @@ void ice_deinit_rdma(struct ice_pf *pf) if (!ice_is_rdma_ena(pf)) return; - ice_unplug_aux_dev(pf); xa_erase(&ice_aux_id, pf->aux_idx); kfree(pf->cdev_info->iidc_priv); kfree(pf->cdev_info); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 4da37caa3ec9..62914167d121 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5138,6 +5138,9 @@ int ice_load(struct ice_pf *pf) if (err) goto err_init_rdma; + /* Finalize RDMA: VSI already created, assign info and plug device */ + ice_rdma_finalize_setup(pf); + ice_service_task_restart(pf); clear_bit(ICE_DOWN, pf->state); @@ -5169,6 +5172,7 @@ void ice_unload(struct ice_pf *pf) devl_assert_locked(priv_to_devlink(pf)); + ice_unplug_aux_dev(pf); ice_deinit_rdma(pf); ice_deinit_features(pf); ice_tc_indir_block_unregister(vsi); @@ -5595,6 +5599,7 @@ static int ice_suspend(struct device *dev) */ disabled = ice_service_task_stop(pf); + ice_unplug_aux_dev(pf); ice_deinit_rdma(pf); /* Already suspended?, then there is nothing to do */ @@ -7859,7 +7864,7 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) ice_health_clear(pf); - ice_plug_aux_dev(pf); + ice_rdma_finalize_setup(pf); if (ice_is_feature_supported(pf, ICE_F_SRIOV_LAG)) ice_lag_rebuild(pf); -- cgit v1.2.3 From a9c354e656597aededa027d63d2ff0973f6b033f Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Mon, 2 Feb 2026 11:17:54 +0100 Subject: ice: fix crash in ethtool offline loopback test Since the conversion of ice to page pool, the ethtool loopback test crashes: BUG: kernel NULL pointer dereference, address: 000000000000000c #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 1100f1067 P4D 0 Oops: Oops: 0002 [#1] SMP NOPTI CPU: 23 UID: 0 PID: 5904 Comm: ethtool Kdump: loaded Not tainted 6.19.0-0.rc7.260128g1f97d9dcf5364.49.eln154.x86_64 #1 PREEMPT(lazy) Hardware name: [...] 
RIP: 0010:ice_alloc_rx_bufs+0x1cd/0x310 [ice] Code: 83 6c 24 30 01 66 41 89 47 08 0f 84 c0 00 00 00 41 0f b7 dc 48 8b 44 24 18 48 c1 e3 04 41 bb 00 10 00 00 48 8d 2c 18 8b 04 24 <89> 45 0c 41 8b 4d 00 49 d3 e3 44 3b 5c 24 24 0f 83 ac fe ff ff 44 RSP: 0018:ff7894738aa1f768 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000700 RDI: 0000000000000000 RBP: 0000000000000000 R08: ff16dcae79880200 R09: 0000000000000019 R10: 0000000000000001 R11: 0000000000001000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: ff16dcae6c670000 FS: 00007fcf428850c0(0000) GS:ff16dcb149710000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000000c CR3: 0000000121227005 CR4: 0000000000773ef0 PKRU: 55555554 Call Trace: ice_vsi_cfg_rxq+0xca/0x460 [ice] ice_vsi_cfg_rxqs+0x54/0x70 [ice] ice_loopback_test+0xa9/0x520 [ice] ice_self_test+0x1b9/0x280 [ice] ethtool_self_test+0xe5/0x200 __dev_ethtool+0x1106/0x1a90 dev_ethtool+0xbe/0x1a0 dev_ioctl+0x258/0x4c0 sock_do_ioctl+0xe3/0x130 __x64_sys_ioctl+0xb9/0x100 do_syscall_64+0x7c/0x700 entry_SYSCALL_64_after_hwframe+0x76/0x7e [...] It crashes because we have not initialized libeth for the rx ring. Fix it by treating ICE_VSI_LB VSIs slightly more like normal PF VSIs and letting them have a q_vector. It's just a dummy, because the loopback test does not use interrupts, but it contains a napi struct that can be passed to libeth_rx_fq_create() called from ice_vsi_cfg_rxq() -> ice_rxq_pp_create(). Fixes: 93f53db9f9dc ("ice: switch to Page Pool") Signed-off-by: Michal Schmidt Reviewed-by: Aleksandr Loktionov Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_base.c | 5 ++++- drivers/net/ethernet/intel/ice/ice_ethtool.c | 4 ++++ drivers/net/ethernet/intel/ice/ice_lib.c | 15 ++++++++++----- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index afbff8aa9ceb..49b4304c4975 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -124,6 +124,8 @@ static int ice_vsi_alloc_q_vector(struct ice_vsi *vsi, u16 v_idx) if (vsi->type == ICE_VSI_VF) { ice_calc_vf_reg_idx(vsi->vf, q_vector); goto out; + } else if (vsi->type == ICE_VSI_LB) { + goto skip_alloc; } else if (vsi->type == ICE_VSI_CTRL && vsi->vf) { struct ice_vsi *ctrl_vsi = ice_get_vf_ctrl_vsi(pf, vsi); @@ -662,7 +664,8 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) u32 rx_buf_len; int err; - if (ring->vsi->type == ICE_VSI_PF || ring->vsi->type == ICE_VSI_SF) { + if (ring->vsi->type == ICE_VSI_PF || ring->vsi->type == ICE_VSI_SF || + ring->vsi->type == ICE_VSI_LB) { if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ring->q_index, diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index c6bc29cfb8e6..ff003529beba 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -1289,6 +1289,10 @@ static u64 ice_loopback_test(struct net_device *netdev) test_vsi->netdev = netdev; tx_ring = test_vsi->tx_rings[0]; rx_ring = test_vsi->rx_rings[0]; + /* Dummy q_vector and napi. Fill the minimum required for + * ice_rxq_pp_create(). 
+ */ + rx_ring->q_vector->napi.dev = netdev; if (ice_lbtest_prepare_rings(test_vsi)) { ret = 2; diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index d921269e1fe7..3cd6296a1fbe 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -107,10 +107,6 @@ static int ice_vsi_alloc_arrays(struct ice_vsi *vsi) if (!vsi->rxq_map) goto err_rxq_map; - /* There is no need to allocate q_vectors for a loopback VSI. */ - if (vsi->type == ICE_VSI_LB) - return 0; - /* allocate memory for q_vector pointers */ vsi->q_vectors = devm_kcalloc(dev, vsi->num_q_vectors, sizeof(*vsi->q_vectors), GFP_KERNEL); @@ -241,6 +237,8 @@ static void ice_vsi_set_num_qs(struct ice_vsi *vsi) case ICE_VSI_LB: vsi->alloc_txq = 1; vsi->alloc_rxq = 1; + /* A dummy q_vector, no actual IRQ. */ + vsi->num_q_vectors = 1; break; default: dev_warn(ice_pf_to_dev(pf), "Unknown VSI type %d\n", vsi_type); @@ -2428,14 +2426,21 @@ static int ice_vsi_cfg_def(struct ice_vsi *vsi) } break; case ICE_VSI_LB: - ret = ice_vsi_alloc_rings(vsi); + ret = ice_vsi_alloc_q_vectors(vsi); if (ret) goto unroll_vsi_init; + ret = ice_vsi_alloc_rings(vsi); + if (ret) + goto unroll_alloc_q_vector; + ret = ice_vsi_alloc_ring_stats(vsi); if (ret) goto unroll_vector_base; + /* Simply map the dummy q_vector to the only rx_ring */ + vsi->rx_rings[0]->q_vector = vsi->q_vectors[0]; + break; default: /* clean up the resources and exit */ -- cgit v1.2.3 From 4b3d54a85bd37ebf2d9836f0d0de775c0ff21af9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 7 Feb 2026 11:50:23 +0100 Subject: i40e: Fix preempt count leak in napi poll tracepoint Using get_cpu() in the tracepoint assignment causes an obvious preempt count leak because nothing invokes put_cpu() to undo it: softirq: huh, entered softirq 3 NET_RX with preempt_count 00000100, exited with 00000101? This clearly has seen a lot of testing in the last 3+ years... Use smp_processor_id() instead. Fixes: 6d4d584a7ea8 ("i40e: Add i40e_napi_poll tracepoint") Signed-off-by: Thomas Gleixner Cc: Tony Nguyen Cc: Przemek Kitszel Cc: intel-wired-lan@lists.osuosl.org Cc: netdev@vger.kernel.org Reviewed-by: Joe Damato Reviewed-by: Aleksandr Loktionov Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_trace.h b/drivers/net/ethernet/intel/i40e/i40e_trace.h index 759f3d1c4c8f..dde0ccd789ed 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_trace.h +++ b/drivers/net/ethernet/intel/i40e/i40e_trace.h @@ -88,7 +88,7 @@ TRACE_EVENT(i40e_napi_poll, __entry->rx_clean_complete = rx_clean_complete; __entry->tx_clean_complete = tx_clean_complete; __entry->irq_num = q->irq_num; - __entry->curr_cpu = get_cpu(); + __entry->curr_cpu = smp_processor_id(); __assign_str(qname); __assign_str(dev_name); __assign_bitmask(irq_affinity, cpumask_bits(&q->affinity_mask), -- cgit v1.2.3 From feae40a6a178bb525a15f19288016e5778102a99 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Wed, 10 Dec 2025 12:26:51 +0100 Subject: ixgbevf: fix link setup issue It may happen that VF spawned for E610 adapter has problem with setting link up. This happens when ixgbevf supporting mailbox API 1.6 cooperates with PF driver which doesn't support this version of API, and hence doesn't support new approach for getting PF link data. 
In that case VF asks PF to provide link data but as PF doesn't support it, returns -EOPNOTSUPP what leads to early bail from link configuration sequence. Avoid such situation by using legacy VFLINKS approach whenever negotiated API version is less than 1.6. To reproduce the issue just create VF and set its link up - adapter must be any from the E610 family, ixgbevf must support API 1.6 or higher while ixgbevf must not. Fixes: 53f0eb62b4d2 ("ixgbevf: fix getting link speed data for E610 devices") Reviewed-by: Aleksandr Loktionov Reviewed-by: Piotr Kwapulinski Reviewed-by: Paul Menzel Cc: stable@vger.kernel.org Signed-off-by: Jedrzej Jagielski Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbevf/vf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c index 74d320879513..b67b580f7f1c 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.c +++ b/drivers/net/ethernet/intel/ixgbevf/vf.c @@ -852,7 +852,8 @@ static s32 ixgbevf_check_mac_link_vf(struct ixgbe_hw *hw, if (!mac->get_link_status) goto out; - if (hw->mac.type == ixgbe_mac_e610_vf) { + if (hw->mac.type == ixgbe_mac_e610_vf && + hw->api_version >= ixgbe_mbox_api_16) { ret_val = ixgbevf_get_pf_link_state(hw, speed, link_up); if (ret_val) goto out; -- cgit v1.2.3 From 5b644464eeeac485685c6632bcc95347ae6c8677 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Tue, 6 Jan 2026 16:14:19 +0200 Subject: e1000e: introduce new board type for Panther Lake PCH Add new board type for Panther Lake devices for separating device-specific features and flows. Additionally, remove the deprecated device IDs 0x57B5 and 0x57B6, which are not used by any existing devices. Signed-off-by: Vitaly Lifshits Tested-by: Avigail Dahan Reviewed-by: Aleksandr Loktionov Reviewed-by: Paul Menzel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/e1000.h | 4 +++- drivers/net/ethernet/intel/e1000e/hw.h | 2 -- drivers/net/ethernet/intel/e1000e/ich8lan.c | 22 +++++++++++++++++++++- drivers/net/ethernet/intel/e1000e/netdev.c | 15 +++++++-------- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index aa08f397988e..63ebe00376f5 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -117,7 +117,8 @@ enum e1000_boards { board_pch_cnp, board_pch_tgp, board_pch_adp, - board_pch_mtp + board_pch_mtp, + board_pch_ptp }; struct e1000_ps_page { @@ -527,6 +528,7 @@ extern const struct e1000_info e1000_pch_cnp_info; extern const struct e1000_info e1000_pch_tgp_info; extern const struct e1000_info e1000_pch_adp_info; extern const struct e1000_info e1000_pch_mtp_info; +extern const struct e1000_info e1000_pch_ptp_info; extern const struct e1000_info e1000_es2_info; void e1000e_ptp_init(struct e1000_adapter *adapter); diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h index fc8ed38aa095..c7ac599e5a7a 100644 --- a/drivers/net/ethernet/intel/e1000e/hw.h +++ b/drivers/net/ethernet/intel/e1000e/hw.h @@ -118,8 +118,6 @@ struct e1000_hw; #define E1000_DEV_ID_PCH_ARL_I219_V24 0x57A1 #define E1000_DEV_ID_PCH_PTP_I219_LM25 0x57B3 #define E1000_DEV_ID_PCH_PTP_I219_V25 0x57B4 -#define E1000_DEV_ID_PCH_PTP_I219_LM26 0x57B5 -#define E1000_DEV_ID_PCH_PTP_I219_V26 0x57B6 #define E1000_DEV_ID_PCH_PTP_I219_LM27 0x57B7 #define E1000_DEV_ID_PCH_PTP_I219_V27 0x57B8 #define 
E1000_DEV_ID_PCH_NVL_I219_LM29 0x57B9 diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 0ff8688ac3b8..23421a640097 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -528,7 +528,7 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw) phy->id = e1000_phy_unknown; - if (hw->mac.type == e1000_pch_mtp) { + if (hw->mac.type == e1000_pch_mtp || hw->mac.type == e1000_pch_ptp) { phy->retry_count = 2; e1000e_enable_phy_retry(hw); } @@ -6208,3 +6208,23 @@ const struct e1000_info e1000_pch_mtp_info = { .phy_ops = &ich8_phy_ops, .nvm_ops = &spt_nvm_ops, }; + +const struct e1000_info e1000_pch_ptp_info = { + .mac = e1000_pch_ptp, + .flags = FLAG_IS_ICH + | FLAG_HAS_WOL + | FLAG_HAS_HW_TIMESTAMP + | FLAG_HAS_CTRLEXT_ON_LOAD + | FLAG_HAS_AMT + | FLAG_HAS_FLASH + | FLAG_HAS_JUMBO_FRAMES + | FLAG_APME_IN_WUC, + .flags2 = FLAG2_HAS_PHY_STATS + | FLAG2_HAS_EEE, + .pba = 26, + .max_hw_frame_size = 9022, + .get_variants = e1000_get_variants_ich8lan, + .mac_ops = &ich8_mac_ops, + .phy_ops = &ich8_phy_ops, + .nvm_ops = &spt_nvm_ops, +}; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index ddbe2f7d8112..814698807f3d 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -55,6 +55,7 @@ static const struct e1000_info *e1000_info_tbl[] = { [board_pch_tgp] = &e1000_pch_tgp_info, [board_pch_adp] = &e1000_pch_adp_info, [board_pch_mtp] = &e1000_pch_mtp_info, + [board_pch_ptp] = &e1000_pch_ptp_info, }; struct e1000_reg_info { @@ -7925,14 +7926,12 @@ static const struct pci_device_id e1000_pci_tbl[] = { { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_LNP_I219_V21), board_pch_mtp }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ARL_I219_LM24), board_pch_mtp }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_ARL_I219_V24), board_pch_mtp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_PTP_I219_LM25), board_pch_mtp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_PTP_I219_V25), board_pch_mtp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_PTP_I219_LM26), board_pch_mtp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_PTP_I219_V26), board_pch_mtp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_PTP_I219_LM27), board_pch_mtp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_PTP_I219_V27), board_pch_mtp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_NVL_I219_LM29), board_pch_mtp }, - { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_NVL_I219_V29), board_pch_mtp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_PTP_I219_LM25), board_pch_ptp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_PTP_I219_V25), board_pch_ptp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_PTP_I219_LM27), board_pch_ptp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_PTP_I219_V27), board_pch_ptp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_NVL_I219_LM29), board_pch_ptp }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_PCH_NVL_I219_V29), board_pch_ptp }, { 0, 0, 0, 0, 0, 0, 0 } /* terminate list */ }; -- cgit v1.2.3 From 0942fc6d324eb9c6b16187b2aa994c0823557f06 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Tue, 6 Jan 2026 16:14:20 +0200 Subject: e1000e: clear DPG_EN after reset to avoid autonomous power-gating Panther Lake systems introduced an autonomous power gating feature for the integrated Gigabit Ethernet in shutdown state (S5) state. As part of it, the reset value of DPG_EN bit was changed to 1. Clear this bit after performing hardware reset to avoid errors such as Tx/Rx hangs, or packet loss/corruption. 
Fixes: 0c9183ce61bc ("e1000e: Add support for the next LOM generation") Signed-off-by: Vitaly Lifshits Reviewed-by: Aleksandr Loktionov Tested-by: Avigail Dahan Reviewed-by: Paul Menzel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/defines.h | 1 + drivers/net/ethernet/intel/e1000e/ich8lan.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index ba331899d186..d4a1041e456d 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -33,6 +33,7 @@ /* Extended Device Control */ #define E1000_CTRL_EXT_LPCD 0x00000004 /* LCD Power Cycle Done */ +#define E1000_CTRL_EXT_DPG_EN 0x00000008 /* Dynamic Power Gating Enable */ #define E1000_CTRL_EXT_SDP3_DATA 0x00000080 /* Value of SW Definable Pin 3 */ #define E1000_CTRL_EXT_FORCE_SMBUS 0x00000800 /* Force SMBus mode */ #define E1000_CTRL_EXT_EE_RST 0x00002000 /* Reinitialize from EEPROM */ diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 23421a640097..dea208db1be5 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -4932,6 +4932,15 @@ static s32 e1000_reset_hw_ich8lan(struct e1000_hw *hw) reg |= E1000_KABGTXD_BGSQLBIAS; ew32(KABGTXD, reg); + /* The hardware reset value of the DPG_EN bit is 1. + * Clear DPG_EN to prevent unexpected autonomous power gating. + */ + if (hw->mac.type >= e1000_pch_ptp) { + reg = er32(CTRL_EXT); + reg &= ~E1000_CTRL_EXT_DPG_EN; + ew32(CTRL_EXT, reg); + } + return 0; } -- cgit v1.2.3 From f6bf47ab32e0863df50f5501d207dcdddb7fc507 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 23 Feb 2026 22:10:10 +0000 Subject: arm64: io: Rename ioremap_prot() to __ioremap_prot() Rename our ioremap_prot() implementation to __ioremap_prot() and convert all arch-internal callers over to the new function. ioremap_prot() remains as a #define to __ioremap_prot() for generic_access_phys() and will be subsequently extended to handle user permissions in 'prot'. 
Cc: Zeng Heng Cc: Jinjiang Tu Cc: Catalin Marinas Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/io.h | 11 ++++++----- arch/arm64/kernel/acpi.c | 2 +- arch/arm64/mm/ioremap.c | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 83e03abbb2ca..cd2fddfe814a 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -264,19 +264,20 @@ __iowrite64_copy(void __iomem *to, const void *from, size_t count) typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size, pgprot_t *prot); int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); +void __iomem *__ioremap_prot(phys_addr_t phys, size_t size, pgprot_t prot); -#define ioremap_prot ioremap_prot +#define ioremap_prot __ioremap_prot #define _PAGE_IOREMAP PROT_DEVICE_nGnRE #define ioremap_wc(addr, size) \ - ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC)) + __ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC)) #define ioremap_np(addr, size) \ - ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE)) + __ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE)) #define ioremap_encrypted(addr, size) \ - ioremap_prot((addr), (size), PAGE_KERNEL) + __ioremap_prot((addr), (size), PAGE_KERNEL) /* * io{read,write}{16,32,64}be() macros @@ -297,7 +298,7 @@ static inline void __iomem *ioremap_cache(phys_addr_t addr, size_t size) if (pfn_is_map_memory(__phys_to_pfn(addr))) return (void __iomem *)__phys_to_virt(addr); - return ioremap_prot(addr, size, __pgprot(PROT_NORMAL)); + return __ioremap_prot(addr, size, __pgprot(PROT_NORMAL)); } /* diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index af90128cfed5..a9d884fd1d00 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -377,7 +377,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) prot = __acpi_get_writethrough_mem_attribute(); } } - return ioremap_prot(phys, size, prot); + return __ioremap_prot(phys, size, prot); } /* diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index b12cbed9b5ad..6eef48ef6278 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -14,8 +14,8 @@ int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook) return 0; } -void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - pgprot_t pgprot) +void __iomem *__ioremap_prot(phys_addr_t phys_addr, size_t size, + pgprot_t pgprot) { unsigned long last_addr = phys_addr + size - 1; @@ -39,7 +39,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, return generic_ioremap_prot(phys_addr, size, pgprot); } -EXPORT_SYMBOL(ioremap_prot); +EXPORT_SYMBOL(__ioremap_prot); /* * Must be called after early_fixmap_init -- cgit v1.2.3 From 8f098037139b294050053123ab2bc0f819d08932 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 23 Feb 2026 22:10:11 +0000 Subject: arm64: io: Extract user memory type in ioremap_prot() The only caller of ioremap_prot() outside of the generic ioremap() implementation is generic_access_phys(), which passes a 'pgprot_t' value determined from the user mapping of the target 'pfn' being accessed by the kernel. On arm64, the 'pgprot_t' contains all of the non-address bits from the pte, including the permission controls, and so we end up returning a new user mapping from ioremap_prot() which faults when accessed from the kernel on systems with PAN: | Unable to handle kernel read from unreadable memory at virtual address ffff80008ea89000 | ... 
| Call trace: | __memcpy_fromio+0x80/0xf8 | generic_access_phys+0x20c/0x2b8 | __access_remote_vm+0x46c/0x5b8 | access_remote_vm+0x18/0x30 | environ_read+0x238/0x3e8 | vfs_read+0xe4/0x2b0 | ksys_read+0xcc/0x178 | __arm64_sys_read+0x4c/0x68 Extract only the memory type from the user 'pgprot_t' in ioremap_prot() and assert that we're being passed a user mapping, to protect us against any changes in future that may require additional handling. To avoid falsely flagging users of ioremap(), provide our own ioremap() macro which simply wraps __ioremap_prot(). Cc: Zeng Heng Cc: Jinjiang Tu Cc: Catalin Marinas Fixes: 893dea9ccd08 ("arm64: Add HAVE_IOREMAP_PROT support") Reported-by: Jinjiang Tu Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/io.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index cd2fddfe814a..8cbd1e96fd50 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -266,10 +266,23 @@ typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size, int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); void __iomem *__ioremap_prot(phys_addr_t phys, size_t size, pgprot_t prot); -#define ioremap_prot __ioremap_prot +static inline void __iomem *ioremap_prot(phys_addr_t phys, size_t size, + pgprot_t user_prot) +{ + pgprot_t prot; + ptdesc_t user_prot_val = pgprot_val(user_prot); + + if (WARN_ON_ONCE(!(user_prot_val & PTE_USER))) + return NULL; -#define _PAGE_IOREMAP PROT_DEVICE_nGnRE + prot = __pgprot_modify(PAGE_KERNEL, PTE_ATTRINDX_MASK, + user_prot_val & PTE_ATTRINDX_MASK); + return __ioremap_prot(phys, size, prot); +} +#define ioremap_prot ioremap_prot +#define ioremap(addr, size) \ + __ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) #define ioremap_wc(addr, size) \ __ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC)) #define ioremap_np(addr, size) \ -- cgit v1.2.3 From 8a85b3131225a8c8143ba2ae29c0eef8c1f9117f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Feb 2026 17:45:30 +0000 Subject: arm64: gcs: Do not set PTE_SHARED on GCS mappings if FEAT_LPA2 is enabled When FEAT_LPA2 is enabled, bits 8-9 of the PTE replace the shareability attribute with bits 50-51 of the output address. 
The _PAGE_GCS{,_RO} definitions include the PTE_SHARED bits as 0b11 (this matches the other _PAGE_* definitions) but using this macro directly leads to the following panic when enabling GCS on a system/model with LPA2: Unable to handle kernel paging request at virtual address fffff1ffc32d8008 Mem abort info: ESR = 0x0000000096000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault Data abort info: ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 swapper pgtable: 4k pages, 52-bit VAs, pgdp=0000000060f4d000 [fffff1ffc32d8008] pgd=100000006184b003, p4d=0000000000000000 Internal error: Oops: 0000000096000004 [#1] SMP CPU: 0 UID: 0 PID: 513 Comm: gcs_write_fault Tainted: G M 7.0.0-rc1 #1 PREEMPT Tainted: [M]=MACHINE_CHECK Hardware name: QEMU QEMU Virtual Machine, BIOS 2025.02-8+deb13u1 11/08/2025 pstate: 03402005 (nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : zap_huge_pmd+0x168/0x468 lr : zap_huge_pmd+0x2c/0x468 sp : ffff800080beb660 x29: ffff800080beb660 x28: fff00000c2058180 x27: ffff800080beb898 x26: fff00000c2058180 x25: ffff800080beb820 x24: 00c800010b600f41 x23: ffffc1ffc30af1a8 x22: fff00000c2058180 x21: 0000ffff8dc00000 x20: fff00000c2bc6370 x19: ffff800080beb898 x18: ffff800080bebb60 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000007 x14: 000000000000000a x13: 0000aaaacbbbffff x12: 0000000000000000 x11: 0000ffff8ddfffff x10: 00000000000001fe x9 : 0000ffff8ddfffff x8 : 0000ffff8de00000 x7 : 0000ffff8da00000 x6 : fff00000c2bc6370 x5 : 0000ffff8da00000 x4 : 000000010b600000 x3 : ffffc1ffc0000000 x2 : fff00000c2058180 x1 : fffff1ffc32d8000 x0 : 000000c00010b600 Call trace: zap_huge_pmd+0x168/0x468 (P) unmap_page_range+0xd70/0x1560 unmap_single_vma+0x48/0x80 unmap_vmas+0x90/0x180 unmap_region+0x88/0xe4 vms_complete_munmap_vmas+0xf8/0x1e0 do_vmi_align_munmap+0x158/0x180 do_vmi_munmap+0xac/0x160 __vm_munmap+0xb0/0x138 vm_munmap+0x14/0x20 gcs_free+0x70/0x80 mm_release+0x1c/0xc8 exit_mm_release+0x28/0x38 do_exit+0x190/0x8ec do_group_exit+0x34/0x90 get_signal+0x794/0x858 arch_do_signal_or_restart+0x11c/0x3e0 exit_to_user_mode_loop+0x10c/0x17c el0_da+0x8c/0x9c el0t_64_sync_handler+0xd0/0xf0 el0t_64_sync+0x198/0x19c Code: aa1603e2 d34cfc00 cb813001 8b011861 (f9400420) Similarly to how the kernel handles protection_map[], use a gcs_page_prot variable to store the protection bits and clear PTE_SHARED if LPA2 is enabled. Also remove the unused PAGE_GCS{,_RO} macros. 
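To make the shape of the fix concrete, here is a minimal sketch of the pattern described above. It is illustrative only and condensed from this patch: _PAGE_GCS_RO, PTE_SHARED, ptdesc_t and lpa2_is_enabled() are the symbols used in the diff below, while the initcall name is made up.

/* Keep the shadow-stack protection bits in a variable rather than using
 * the _PAGE_GCS_RO constant directly, so early init can strip PTE_SHARED
 * when LPA2 repurposes PTE bits 8-9 as output-address bits 50-51. */
static ptdesc_t gcs_page_prot __ro_after_init = _PAGE_GCS_RO;

static int __init gcs_adjust_page_prot(void)
{
	if (lpa2_is_enabled())
		gcs_page_prot &= ~PTE_SHARED;
	return 0;
}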
Signed-off-by: Catalin Marinas Fixes: 6497b66ba694 ("arm64/mm: Map pages for guarded control stack") Reported-by: Emanuele Rocca Cc: stable@vger.kernel.org Cc: Mark Brown Cc: Will Deacon Reviewed-by: David Hildenbrand (Arm) Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-prot.h | 3 --- arch/arm64/mm/mmap.c | 8 ++++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index d27e8872fe3c..2b32639160de 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -164,9 +164,6 @@ static inline bool __pure lpa2_is_enabled(void) #define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) #define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) -#define PAGE_GCS __pgprot(_PAGE_GCS) -#define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) - #define PIE_E0 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 08ee177432c2..75f343009b4b 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -34,6 +34,8 @@ static pgprot_t protection_map[16] __ro_after_init = { [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC }; +static ptdesc_t gcs_page_prot __ro_after_init = _PAGE_GCS_RO; + /* * You really shouldn't be using read() or write() on /dev/mem. This might go * away in the future. @@ -73,9 +75,11 @@ static int __init adjust_protection_map(void) protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; } - if (lpa2_is_enabled()) + if (lpa2_is_enabled()) { for (int i = 0; i < ARRAY_SIZE(protection_map); i++) pgprot_val(protection_map[i]) &= ~PTE_SHARED; + gcs_page_prot &= ~PTE_SHARED; + } return 0; } @@ -87,7 +91,7 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags) /* Short circuit GCS to avoid bloating the table. */ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { - prot = _PAGE_GCS_RO; + prot = gcs_page_prot; } else { prot = pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); -- cgit v1.2.3 From 47a8aad135ac1aed04b7b0c0a8157fd208075827 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Feb 2026 17:45:31 +0000 Subject: arm64: gcs: Honour mprotect(PROT_NONE) on shadow stack mappings vm_get_page_prot() short-circuits the protection_map[] lookup for a VM_SHADOW_STACK mapping since it uses a different PIE index from the typical read/write/exec permissions. However, the side effect is that it also ignores mprotect(PROT_NONE) by creating an accessible PTE. Special-case the !(vm_flags & VM_ACCESS_FLAGS) flags to use the protection_map[VM_NONE] permissions instead. No GCS attributes are required for an inaccessible PTE. Signed-off-by: Catalin Marinas Fixes: 6497b66ba694 ("arm64/mm: Map pages for guarded control stack") Cc: stable@vger.kernel.org Cc: Mark Brown Cc: Will Deacon Cc: David Hildenbrand Reviewed-by: David Hildenbrand (Arm) Signed-off-by: Will Deacon --- arch/arm64/mm/mmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 75f343009b4b..92b2f5097a96 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -91,7 +91,11 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags) /* Short circuit GCS to avoid bloating the table. 
*/ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { - prot = gcs_page_prot; + /* Honour mprotect(PROT_NONE) on shadow stack mappings */ + if (vm_flags & VM_ACCESS_FLAGS) + prot = gcs_page_prot; + else + prot = pgprot_val(protection_map[VM_NONE]); } else { prot = pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); -- cgit v1.2.3 From 9d1a7c4a457eac8a7e07e1685f95e54fa551db5e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Feb 2026 17:45:32 +0000 Subject: kselftest: arm64: Check access to GCS after mprotect(PROT_NONE) A GCS mapping should not be accessible after mprotect(PROT_NONE). Add a kselftest for this scenario. Signed-off-by: Catalin Marinas Cc: Shuah Khan Cc: Mark Brown Cc: Will Deacon Cc: David Hildenbrand Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- .../arm64/signal/testcases/gcs_prot_none_fault.c | 76 ++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c new file mode 100644 index 000000000000..2259f454a202 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2026 ARM Limited + */ + +#include +#include +#include + +#include +#include + +#include "test_signals_utils.h" +#include "testcases.h" + +static uint64_t *gcs_page; +static bool post_mprotect; + +#ifndef __NR_map_shadow_stack +#define __NR_map_shadow_stack 453 +#endif + +static bool alloc_gcs(struct tdescr *td) +{ + long page_size = sysconf(_SC_PAGE_SIZE); + + gcs_page = (void *)syscall(__NR_map_shadow_stack, 0, + page_size, 0); + if (gcs_page == MAP_FAILED) { + fprintf(stderr, "Failed to map %ld byte GCS: %d\n", + page_size, errno); + return false; + } + + return true; +} + +static int gcs_prot_none_fault_trigger(struct tdescr *td) +{ + /* Verify that the page is readable (ie, not completely unmapped) */ + fprintf(stderr, "Read value 0x%lx\n", gcs_page[0]); + + if (mprotect(gcs_page, sysconf(_SC_PAGE_SIZE), PROT_NONE) != 0) { + fprintf(stderr, "mprotect(PROT_NONE) failed: %d\n", errno); + return 0; + } + post_mprotect = true; + + /* This should trigger a fault if PROT_NONE is honoured for the GCS page */ + fprintf(stderr, "Read value after mprotect(PROT_NONE) 0x%lx\n", gcs_page[0]); + return 0; +} + +static int gcs_prot_none_fault_signal(struct tdescr *td, siginfo_t *si, + ucontext_t *uc) +{ + ASSERT_GOOD_CONTEXT(uc); + + /* A fault before mprotect(PROT_NONE) is unexpected. */ + if (!post_mprotect) + return 0; + + return 1; +} + +struct tdescr tde = { + .name = "GCS PROT_NONE fault", + .descr = "Read from GCS after mprotect(PROT_NONE) segfaults", + .feats_required = FEAT_GCS, + .timeout = 3, + .sig_ok = SIGSEGV, + .sanity_disabled = true, + .init = alloc_gcs, + .trigger = gcs_prot_none_fault_trigger, + .run = gcs_prot_none_fault_signal, +}; -- cgit v1.2.3 From bfd9c931d19aa59fb8371d557774fa169b15db9a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 18 Feb 2026 16:43:47 +0000 Subject: arm64: tlb: Allow XZR argument to TLBI ops The TLBI instruction accepts XZR as a register argument, and for TLBI operations with a register argument, there is no functional difference between using XZR or another GPR which contains zeroes. Operations without a register argument are encoded as if XZR were used. 
Allow the __TLBI_1() macro to use XZR when a register argument is all zeroes. Today this only results in a trivial code saving in __do_compat_cache_op()'s workaround for Neoverse-N1 erratum #1542419. In subsequent patches this pattern will be used more generally. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Marc Zyngier Cc: Oliver Upton Cc: Ryan Roberts Cc: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/include/asm/tlbflush.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a2d65d7d6aae..bf1cc9949dc8 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -38,12 +38,12 @@ : : ) #define __TLBI_1(op, arg) asm (ARM64_ASM_PREAMBLE \ - "tlbi " #op ", %0\n" \ + "tlbi " #op ", %x0\n" \ ALTERNATIVE("nop\n nop", \ - "dsb ish\n tlbi " #op ", %0", \ + "dsb ish\n tlbi " #op ", %x0", \ ARM64_WORKAROUND_REPEAT_TLBI, \ CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \ - : : "r" (arg)) + : : "rZ" (arg)) #define __TLBI_N(op, arg, n, ...) __TLBI_##n(op, arg) -- cgit v1.2.3 From a8f78680ee6bf795086384e8aea159a52814f827 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 18 Feb 2026 16:43:48 +0000 Subject: arm64: tlb: Optimize ARM64_WORKAROUND_REPEAT_TLBI The ARM64_WORKAROUND_REPEAT_TLBI workaround is used to mitigate several errata where broadcast TLBI;DSB sequences don't provide all the architecturally required synchronization. The workaround performs more work than necessary, and can have significant overhead. This patch optimizes the workaround, as explained below. The workaround was originally added for Qualcomm Falkor erratum 1009 in commit: d9ff80f83ecb ("arm64: Work around Falkor erratum 1009") As noted in the message for that commit, the workaround is applied even in cases where it is not strictly necessary. The workaround was later reused without changes for: * Arm Cortex-A76 erratum #1286807 SDEN v33: https://developer.arm.com/documentation/SDEN-885749/33-0/ * Arm Cortex-A55 erratum #2441007 SDEN v16: https://developer.arm.com/documentation/SDEN-859338/1600/ * Arm Cortex-A510 erratum #2441009 SDEN v19: https://developer.arm.com/documentation/SDEN-1873351/1900/ The important details to note are as follows: 1. All relevant errata only affect the ordering and/or completion of memory accesses which have been translated by an invalidated TLB entry. The actual invalidation of TLB entries is unaffected. 2. The existing workaround is applied to both broadcast and local TLB invalidation, whereas for all relevant errata it is only necessary to apply a workaround for broadcast invalidation. 3. The existing workaround replaces every TLBI with a TLBI;DSB;TLBI sequence, whereas for all relevant errata it is only necessary to execute a single additional TLBI;DSB sequence after any number of TLBIs are completed by a DSB. For example, for a sequence of batched TLBIs: TLBI [, ] TLBI [, ] TLBI [, ] DSB ISH ... the existing workaround will expand this to: TLBI [, ] DSB ISH // additional TLBI [, ] // additional TLBI [, ] DSB ISH // additional TLBI [, ] // additional TLBI [, ] DSB ISH // additional TLBI [, ] // additional DSB ISH ... 
whereas it is sufficient to have: TLBI [, ] TLBI [, ] TLBI [, ] DSB ISH TLBI [, ] // additional DSB ISH // additional Using a single additional TBLI and DSB at the end of the sequence can have significantly lower overhead as each DSB which completes a TLBI must synchronize with other PEs in the system, with potential performance effects both locally and system-wide. 4. The existing workaround repeats each specific TLBI operation, whereas for all relevant errata it is sufficient for the additional TLBI to use *any* operation which will be broadcast, regardless of which translation regime or stage of translation the operation applies to. For example, for a single TLBI: TLBI ALLE2IS DSB ISH ... the existing workaround will expand this to: TLBI ALLE2IS DSB ISH TLBI ALLE2IS // additional DSB ISH // additional ... whereas it is sufficient to have: TLBI ALLE2IS DSB ISH TLBI VALE1IS, XZR // additional DSB ISH // additional As the additional TLBI doesn't have to match a specific earlier TLBI, the additional TLBI can be implemented in separate code, with no memory of the earlier TLBIs. The additional TLBI can also use a cheaper TLBI operation. 5. The existing workaround is applied to both Stage-1 and Stage-2 TLB invalidation, whereas for all relevant errata it is only necessary to apply a workaround for Stage-1 invalidation. Architecturally, TLBI operations which invalidate only Stage-2 information (e.g. IPAS2E1IS) are not required to invalidate TLB entries which combine information from Stage-1 and Stage-2 translation table entries, and consequently may not complete memory accesses translated by those combined entries. In these cases, completion of memory accesses is only guaranteed after subsequent invalidation of Stage-1 information (e.g. VMALLE1IS). Taking the above points into account, this patch reworks the workaround logic to reduce overhead: * New __tlbi_sync_s1ish() and __tlbi_sync_s1ish_hyp() functions are added and used in place of any dsb(ish) which is used to complete broadcast Stage-1 TLB maintenance. When the ARM64_WORKAROUND_REPEAT_TLBI workaround is enabled, these helpers will execute an additional TLBI;DSB sequence. For consistency, it might make sense to add __tlbi_sync_*() helpers for local and stage 2 maintenance. For now I've left those with open-coded dsb() to keep the diff small. * The duplication of TLBIs in __TLBI_0() and __TLBI_1() is removed. This is no longer needed as the necessary synchronization will happen in __tlbi_sync_s1ish() or __tlbi_sync_s1ish_hyp(). * The additional TLBI operation is chosen to have minimal impact: - __tlbi_sync_s1ish() uses "TLBI VALE1IS, XZR". This is only used at EL1 or at EL2 with {E2H,TGE}=={1,1}, where it will target an unused entry for the reserved ASID in the kernel's own translation regime, and have no adverse affect. - __tlbi_sync_s1ish_hyp() uses "TLBI VALE2IS, XZR". This is only used in hyp code, where it will target an unused entry in the hyp code's TTBR0 mapping, and should have no adverse effect. * As __TLBI_0() and __TLBI_1() no longer replace each TLBI with a TLBI;DSB;TLBI sequence, batching TLBIs is worthwhile, and there's no need for arch_tlbbatch_should_defer() to consider ARM64_WORKAROUND_REPEAT_TLBI. 
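As a concrete illustration of the resulting code shape, the following is a hedged sketch of a batched stage-1 flush using the helpers introduced below; va0..va2 stand for already-encoded TLBI operands and are illustrative, not taken from the kernel.

/* The individual TLBIs are no longer duplicated; only the completion
 * point pays for the erratum, and only on affected CPUs. */
__tlbi(vale1is, va0);
__tlbi(vale1is, va1);
__tlbi(vale1is, va2);
__tlbi_sync_s1ish();	/* dsb(ish), then one extra "tlbi vale1is, xzr;
			 * dsb ish" pair via the REPEAT_TLBI alternative */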
When building defconfig with GCC 15.1.0, compared to v6.19-rc1, this patch saves ~1KiB of text, makes the vmlinux ~42KiB smaller, and makes the resulting Image 64KiB smaller: | [mark@lakrids:~/src/linux]% size vmlinux-* | text data bss dec hex filename | 21179831 19660919 708216 41548966 279fca6 vmlinux-after | 21181075 19660903 708216 41550194 27a0172 vmlinux-before | [mark@lakrids:~/src/linux]% ls -l vmlinux-* | -rwxr-xr-x 1 mark mark 157771472 Feb 4 12:05 vmlinux-after | -rwxr-xr-x 1 mark mark 157815432 Feb 4 12:05 vmlinux-before | [mark@lakrids:~/src/linux]% ls -l Image-* | -rw-r--r-- 1 mark mark 41007616 Feb 4 12:05 Image-after | -rw-r--r-- 1 mark mark 41073152 Feb 4 12:05 Image-before Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Marc Zyngier Cc: Oliver Upton Cc: Ryan Roberts Cc: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/include/asm/tlbflush.h | 59 +++++++++++++++++++++++---------------- arch/arm64/kernel/sys_compat.c | 2 +- arch/arm64/kvm/hyp/nvhe/mm.c | 2 +- arch/arm64/kvm/hyp/nvhe/tlb.c | 8 +++--- arch/arm64/kvm/hyp/pgtable.c | 2 +- arch/arm64/kvm/hyp/vhe/tlb.c | 10 +++---- 6 files changed, 47 insertions(+), 36 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index bf1cc9949dc8..1416e652612b 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -31,18 +31,10 @@ */ #define __TLBI_0(op, arg) asm (ARM64_ASM_PREAMBLE \ "tlbi " #op "\n" \ - ALTERNATIVE("nop\n nop", \ - "dsb ish\n tlbi " #op, \ - ARM64_WORKAROUND_REPEAT_TLBI, \ - CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \ : : ) #define __TLBI_1(op, arg) asm (ARM64_ASM_PREAMBLE \ "tlbi " #op ", %x0\n" \ - ALTERNATIVE("nop\n nop", \ - "dsb ish\n tlbi " #op ", %x0", \ - ARM64_WORKAROUND_REPEAT_TLBI, \ - CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \ : : "rZ" (arg)) #define __TLBI_N(op, arg, n, ...) __TLBI_##n(op, arg) @@ -181,6 +173,34 @@ static inline unsigned long get_trans_granule(void) (__pages >> (5 * (scale) + 1)) - 1; \ }) +#define __repeat_tlbi_sync(op, arg...) \ +do { \ + if (!alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI)) \ + break; \ + __tlbi(op, ##arg); \ + dsb(ish); \ +} while (0) + +/* + * Complete broadcast TLB maintenance issued by the host which invalidates + * stage 1 information in the host's own translation regime. + */ +static inline void __tlbi_sync_s1ish(void) +{ + dsb(ish); + __repeat_tlbi_sync(vale1is, 0); +} + +/* + * Complete broadcast TLB maintenance issued by hyp code which invalidates + * stage 1 translation information in any translation regime. 
+ */ +static inline void __tlbi_sync_s1ish_hyp(void) +{ + dsb(ish); + __repeat_tlbi_sync(vale2is, 0); +} + /* * TLB Invalidation * ================ @@ -279,7 +299,7 @@ static inline void flush_tlb_all(void) { dsb(ishst); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish(); isb(); } @@ -291,7 +311,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) asid = __TLBI_VADDR(0, ASID(mm)); __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); - dsb(ish); + __tlbi_sync_s1ish(); mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } @@ -345,20 +365,11 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { flush_tlb_page_nosync(vma, uaddr); - dsb(ish); + __tlbi_sync_s1ish(); } static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) { - /* - * TLB flush deferral is not required on systems which are affected by - * ARM64_WORKAROUND_REPEAT_TLBI, as __tlbi()/__tlbi_user() implementation - * will have two consecutive TLBI instructions with a dsb(ish) in between - * defeating the purpose (i.e save overall 'dsb ish' cost). - */ - if (alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI)) - return false; - return true; } @@ -374,7 +385,7 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) */ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { - dsb(ish); + __tlbi_sync_s1ish(); } /* @@ -509,7 +520,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, { __flush_tlb_range_nosync(vma->vm_mm, start, end, stride, last_level, tlb_level); - dsb(ish); + __tlbi_sync_s1ish(); } static inline void local_flush_tlb_contpte(struct vm_area_struct *vma, @@ -557,7 +568,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end dsb(ishst); __flush_tlb_range_op(vaale1is, start, pages, stride, 0, TLBI_TTL_UNKNOWN, false, lpa2_is_enabled()); - dsb(ish); + __tlbi_sync_s1ish(); isb(); } @@ -571,7 +582,7 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr) dsb(ishst); __tlbi(vaae1is, addr); - dsb(ish); + __tlbi_sync_s1ish(); isb(); } diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 4a609e9b65de..b9d4998c97ef 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -37,7 +37,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end) * We pick the reserved-ASID to minimise the impact. 
*/ __tlbi(aside1is, __TLBI_VADDR(0, 0)); - dsb(ish); + __tlbi_sync_s1ish(); } ret = caches_clean_inval_user_pou(start, start + chunk); diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index ae8391baebc3..218976287d3f 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -271,7 +271,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) */ dsb(ishst); __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 48da9ca9763f..3dc1ce0d27fe 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -169,7 +169,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, */ dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -226,7 +226,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -240,7 +240,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) enter_vmid_context(mmu, &cxt, false); __tlbi(vmalls12e1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -266,5 +266,5 @@ void __kvm_flush_vm_context(void) /* Same remark as in enter_vmid_context() */ dsb(ish); __tlbi(alle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 0e4ddd28ef5d..9b480f947da2 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -501,7 +501,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, *unmapped += granule; } - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); mm_ops->put_page(ctx->ptep); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index ec2569818629..35855dadfb1b 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -115,7 +115,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, */ dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -176,7 +176,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -192,7 +192,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) enter_vmid_context(mmu, &cxt); __tlbi(vmalls12e1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -217,7 +217,7 @@ void __kvm_flush_vm_context(void) { dsb(ishst); __tlbi(alle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); } /* @@ -358,7 +358,7 @@ int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding) default: ret = -EINVAL; } - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); if (mmu) -- cgit v1.2.3 From 468711a40d5dfc01bf0a24c1981246a2c93ac405 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 10 Feb 2026 19:12:25 +0100 Subject: PCI: dwc: ep: Refresh MSI Message Address cache on change Endpoint drivers use dw_pcie_ep_raise_msi_irq() to raise MSI interrupts to the host. After 8719c64e76bf ("PCI: dwc: ep: Cache MSI outbound iATU mapping"), dw_pcie_ep_raise_msi_irq() caches the Message Address from the MSI Capability in ep->msi_msg_addr. But that Message Address is controlled by the host, and it may change. 
For example, if: - firmware on the host configures the Message Address and triggers an MSI, - a driver on the Endpoint raises the MSI via dw_pcie_ep_raise_msi_irq(), which caches the Message Address, - a kernel on the host reconfigures the Message Address and the host kernel driver triggers another MSI, dw_pcie_ep_raise_msi_irq() notices that the Message Address no longer matches the cached ep->msi_msg_addr, warns about it, and returns error instead of raising the MSI. The host kernel may hang because it never receives the MSI. This was seen with the nvmet_pci_epf_driver: the host UEFI performs NVMe commands, e.g. Identify Controller to get the name of the controller, nvmet-pci-epf posts the completion queue entry and raises an IRQ using dw_pcie_ep_raise_msi_irq(). When the host boots Linux, we see a WARN_ON_ONCE() from dw_pcie_ep_raise_msi_irq(), and the host kernel hangs because the nvme driver never gets an IRQ. Remove the warning when dw_pcie_ep_raise_msi_irq() notices that Message Address has changed, remap using the new address, and update the ep->msi_msg_addr cache. Fixes: 8719c64e76bf ("PCI: dwc: ep: Cache MSI outbound iATU mapping") Signed-off-by: Niklas Cassel [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Tested-by: Shin'ichiro Kawasaki Tested-by: Koichiro Den Acked-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20260210181225.3926165-2-cassel@kernel.org --- drivers/pci/controller/dwc/pcie-designware-ep.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 295076cf70de..35d8fc5e1d20 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -905,6 +905,19 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no, * supported, so we avoid reprogramming the region on every MSI, * specifically unmapping immediately after writel(). */ + if (ep->msi_iatu_mapped && (ep->msi_msg_addr != msg_addr || + ep->msi_map_size != map_size)) { + /* + * The host changed the MSI target address or the required + * mapping size changed. Reprogramming the iATU when there are + * operations in flight is unsafe on this controller. However, + * there is no unified way to check if we have operations in + * flight, thus we don't know if we should WARN() or not. + */ + dw_pcie_ep_unmap_addr(epc, func_no, 0, ep->msi_mem_phys); + ep->msi_iatu_mapped = false; + } + if (!ep->msi_iatu_mapped) { ret = dw_pcie_ep_map_addr(epc, func_no, 0, ep->msi_mem_phys, msg_addr, @@ -915,15 +928,6 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no, ep->msi_iatu_mapped = true; ep->msi_msg_addr = msg_addr; ep->msi_map_size = map_size; - } else if (WARN_ON_ONCE(ep->msi_msg_addr != msg_addr || - ep->msi_map_size != map_size)) { - /* - * The host changed the MSI target address or the required - * mapping size changed. Reprogramming the iATU at runtime is - * unsafe on this controller, so bail out instead of trying to - * update the existing region. - */ - return -EINVAL; } writel(msg_data | (interrupt_num - 1), ep->msi_mem + offset); -- cgit v1.2.3 From c22533c66ccae10511ad6a7afc34bb26c47577e3 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 11 Feb 2026 18:55:41 +0100 Subject: PCI: dwc: ep: Flush MSI-X write before unmapping its ATU entry Endpoint drivers use dw_pcie_ep_raise_msix_irq() to raise an MSI-X interrupt to the host using a writel(), which generates a PCI posted write transaction. 
There's no completion for posted writes, so the writel() may return before the PCI write completes. dw_pcie_ep_raise_msix_irq() also unmaps the outbound ATU entry used for the PCI write, so the write races with the unmap. If the PCI write loses the race with the ATU unmap, the write may corrupt host memory or cause IOMMU errors, e.g., these when running fio with a larger queue depth against nvmet-pci-epf: arm-smmu-v3 fc900000.iommu: 0x0000010000000010 arm-smmu-v3 fc900000.iommu: 0x0000020000000000 arm-smmu-v3 fc900000.iommu: 0x000000090000f040 arm-smmu-v3 fc900000.iommu: 0x0000000000000000 arm-smmu-v3 fc900000.iommu: event: F_TRANSLATION client: 0000:01:00.0 sid: 0x100 ssid: 0x0 iova: 0x90000f040 ipa: 0x0 arm-smmu-v3 fc900000.iommu: unpriv data write s1 "Input address caused fault" stag: 0x0 Flush the write by performing a readl() of the same address to ensure that the write has reached the destination before the ATU entry is unmapped. The same problem was solved for dw_pcie_ep_raise_msi_irq() in commit 8719c64e76bf ("PCI: dwc: ep: Cache MSI outbound iATU mapping"), but there it was solved by dedicating an outbound iATU only for MSI. We can't do the same for MSI-X because each vector can have a different msg_addr and the msg_addr may be changed while the vector is masked. Fixes: beb4641a787d ("PCI: dwc: Add MSI-X callbacks handler") Signed-off-by: Niklas Cassel [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Frank Li Link: https://patch.msgid.link/20260211175540.105677-2-cassel@kernel.org --- drivers/pci/controller/dwc/pcie-designware-ep.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 35d8fc5e1d20..c57ae4d6c5c0 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -1014,6 +1014,9 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no, writel(msg_data, ep->msi_mem + offset); + /* flush posted write before unmap */ + readl(ep->msi_mem + offset); + dw_pcie_ep_unmap_addr(epc, func_no, 0, ep->msi_mem_phys); return 0; -- cgit v1.2.3 From 75c151ceaacf5ca8f2f34ebf863d88002fb12587 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 25 Feb 2026 12:47:52 -0800 Subject: accel/amdxdna: Use a different name for latest firmware Using legacy driver with latest firmware causes a power off issue. Fix this by assigning a different filename (npu_7.sbin) to the latest firmware. The driver attempts to load the latest firmware first and falls back to the previous firmware version if loading fails. 
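The loading strategy boils down to a short fallback loop. A minimal sketch of the pattern is shown below; it uses generic names (fw_dir, dev, fw, ret and i are assumed to be declared in the surrounding function) rather than the exact driver code.

static const char * const fw_names[] = { "npu_7.sbin", "npu.sbin" };

/* Try the newest firmware first and silently fall back to the older one. */
for (i = 0; i < ARRAY_SIZE(fw_names); i++) {
	char *path = kasprintf(GFP_KERNEL, "%s%s", fw_dir, fw_names[i]);

	if (!path)
		return -ENOMEM;
	ret = firmware_request_nowarn(&fw, path, dev);
	kfree(path);
	if (!ret)
		break;	/* fw_names[i] was found and loaded */
}
if (ret)
	return ret;	/* none of the candidates could be loaded */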
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/5009 Fixes: f1eac46fe5f7 ("accel/amdxdna: Update firmware version check for latest firmware") Reviewed-by: Mario Limonciello (AMD) Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260225204752.2711734-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/aie2_pci.c | 20 +++++++++++++++++++- drivers/accel/amdxdna/amdxdna_pci_drv.c | 3 +++ drivers/accel/amdxdna/npu1_regs.c | 2 +- drivers/accel/amdxdna/npu4_regs.c | 2 +- drivers/accel/amdxdna/npu5_regs.c | 2 +- drivers/accel/amdxdna/npu6_regs.c | 2 +- 6 files changed, 26 insertions(+), 5 deletions(-) diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c index 4b3e6bb97bd2..85079b6fc5d9 100644 --- a/drivers/accel/amdxdna/aie2_pci.c +++ b/drivers/accel/amdxdna/aie2_pci.c @@ -32,6 +32,11 @@ static int aie2_max_col = XRS_MAX_COL; module_param(aie2_max_col, uint, 0600); MODULE_PARM_DESC(aie2_max_col, "Maximum column could be used"); +static char *npu_fw[] = { + "npu_7.sbin", + "npu.sbin" +}; + /* * The management mailbox channel is allocated by firmware. * The related register and ring buffer information is on SRAM BAR. @@ -489,6 +494,7 @@ static int aie2_init(struct amdxdna_dev *xdna) struct psp_config psp_conf; const struct firmware *fw; unsigned long bars = 0; + char *fw_full_path; int i, nvec, ret; if (!hypervisor_is_type(X86_HYPER_NATIVE)) { @@ -503,7 +509,19 @@ static int aie2_init(struct amdxdna_dev *xdna) ndev->priv = xdna->dev_info->dev_priv; ndev->xdna = xdna; - ret = request_firmware(&fw, ndev->priv->fw_path, &pdev->dev); + for (i = 0; i < ARRAY_SIZE(npu_fw); i++) { + fw_full_path = kasprintf(GFP_KERNEL, "%s%s", ndev->priv->fw_path, npu_fw[i]); + if (!fw_full_path) + return -ENOMEM; + + ret = firmware_request_nowarn(&fw, fw_full_path, &pdev->dev); + kfree(fw_full_path); + if (!ret) { + XDNA_INFO(xdna, "Load firmware %s%s", ndev->priv->fw_path, npu_fw[i]); + break; + } + } + if (ret) { XDNA_ERR(xdna, "failed to request_firmware %s, ret %d", ndev->priv->fw_path, ret); diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c index 4ada45d06fcf..a4384593bdcc 100644 --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c @@ -23,6 +23,9 @@ MODULE_FIRMWARE("amdnpu/1502_00/npu.sbin"); MODULE_FIRMWARE("amdnpu/17f0_10/npu.sbin"); MODULE_FIRMWARE("amdnpu/17f0_11/npu.sbin"); MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin"); +MODULE_FIRMWARE("amdnpu/1502_00/npu_7.sbin"); +MODULE_FIRMWARE("amdnpu/17f0_10/npu_7.sbin"); +MODULE_FIRMWARE("amdnpu/17f0_11/npu_7.sbin"); /* * 0.0: Initial version diff --git a/drivers/accel/amdxdna/npu1_regs.c b/drivers/accel/amdxdna/npu1_regs.c index 6f36a27b5a02..6e3d3ca69c04 100644 --- a/drivers/accel/amdxdna/npu1_regs.c +++ b/drivers/accel/amdxdna/npu1_regs.c @@ -72,7 +72,7 @@ static const struct aie2_fw_feature_tbl npu1_fw_feature_table[] = { }; static const struct amdxdna_dev_priv npu1_dev_priv = { - .fw_path = "amdnpu/1502_00/npu.sbin", + .fw_path = "amdnpu/1502_00/", .rt_config = npu1_default_rt_cfg, .dpm_clk_tbl = npu1_dpm_clk_table, .fw_feature_tbl = npu1_fw_feature_table, diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c index a8d6f76dde5f..ce25eef5fc34 100644 --- a/drivers/accel/amdxdna/npu4_regs.c +++ b/drivers/accel/amdxdna/npu4_regs.c @@ -98,7 +98,7 @@ const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = { }; static const struct amdxdna_dev_priv npu4_dev_priv = { - .fw_path = "amdnpu/17f0_10/npu.sbin", + .fw_path = 
"amdnpu/17f0_10/", .rt_config = npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, .fw_feature_tbl = npu4_fw_feature_table, diff --git a/drivers/accel/amdxdna/npu5_regs.c b/drivers/accel/amdxdna/npu5_regs.c index c0a35cfd886c..c0ac5daf32ee 100644 --- a/drivers/accel/amdxdna/npu5_regs.c +++ b/drivers/accel/amdxdna/npu5_regs.c @@ -63,7 +63,7 @@ #define NPU5_SRAM_BAR_BASE MMNPU_APERTURE1_BASE static const struct amdxdna_dev_priv npu5_dev_priv = { - .fw_path = "amdnpu/17f0_11/npu.sbin", + .fw_path = "amdnpu/17f0_11/", .rt_config = npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, .fw_feature_tbl = npu4_fw_feature_table, diff --git a/drivers/accel/amdxdna/npu6_regs.c b/drivers/accel/amdxdna/npu6_regs.c index 1fb07df99186..ce591ed0d483 100644 --- a/drivers/accel/amdxdna/npu6_regs.c +++ b/drivers/accel/amdxdna/npu6_regs.c @@ -63,7 +63,7 @@ #define NPU6_SRAM_BAR_BASE MMNPU_APERTURE1_BASE static const struct amdxdna_dev_priv npu6_dev_priv = { - .fw_path = "amdnpu/17f0_10/npu.sbin", + .fw_path = "amdnpu/17f0_10/", .rt_config = npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, .fw_feature_tbl = npu4_fw_feature_table, -- cgit v1.2.3 From 49abfa812617a7f2d0132c70d23ac98b389c6ec1 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Mon, 23 Feb 2026 12:41:30 +0000 Subject: drm/amdgpu/userq: Fix reference leak in amdgpu_userq_wait_ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop reference to syncobj and timeline fence when aborting the ioctl due output array being too small. Reviewed-by: Alex Deucher Signed-off-by: Tvrtko Ursulin Fixes: a292fdecd728 ("drm/amdgpu: Implement userqueue signal/wait IOCTL") Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 68951e9c3e6bb22396bc42ef2359751c8315dd27) Cc: # v6.16+ --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 8013260e29dc..9b9947b94b89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -876,6 +876,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, dma_fence_unwrap_for_each(f, &iter, fence) { if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { r = -EINVAL; + dma_fence_put(fence); goto free_fences; } @@ -900,6 +901,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { r = -EINVAL; + dma_fence_put(fence); goto free_fences; } -- cgit v1.2.3 From 7b7d7693a55d606d700beb9549c9f7f0e5d9c24f Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Mon, 23 Feb 2026 12:41:31 +0000 Subject: drm/amdgpu/userq: Do not allow userspace to trivially triger kernel warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userspace can either deliberately pass in the too small num_fences, or the required number can legitimately grow between the two calls to the userq wait ioctl. In both cases we do not want the emit the kernel warning backtrace since nothing is wrong with the kernel and userspace will simply get an errno reported back. So lets simply drop the WARN_ONs. 
Reviewed-by: Alex Deucher Signed-off-by: Tvrtko Ursulin Fixes: a292fdecd728 ("drm/amdgpu: Implement userqueue signal/wait IOCTL") Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 2c333ea579de6cc20ea7bc50e9595ef72863e65c) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 9b9947b94b89..d972dc46f5a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -833,7 +833,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, dma_resv_for_each_fence(&resv_cursor, gobj_read[i]->resv, DMA_RESV_USAGE_READ, fence) { - if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { + if (num_fences >= wait_info->num_fences) { r = -EINVAL; goto free_fences; } @@ -850,7 +850,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, dma_resv_for_each_fence(&resv_cursor, gobj_write[i]->resv, DMA_RESV_USAGE_WRITE, fence) { - if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { + if (num_fences >= wait_info->num_fences) { r = -EINVAL; goto free_fences; } @@ -874,7 +874,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, goto free_fences; dma_fence_unwrap_for_each(f, &iter, fence) { - if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { + if (num_fences >= wait_info->num_fences) { r = -EINVAL; dma_fence_put(fence); goto free_fences; @@ -899,7 +899,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, if (r) goto free_fences; - if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { + if (num_fences >= wait_info->num_fences) { r = -EINVAL; dma_fence_put(fence); goto free_fences; -- cgit v1.2.3 From ea78f8c68f4f6211c557df49174c54d167821962 Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Fri, 20 Feb 2026 13:47:58 +0530 Subject: drm/amdgpu: add upper bound check on user inputs in signal ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Huge input values in amdgpu_userq_signal_ioctl can lead to an OOM and could be exploited. So check these input values against AMDGPU_USERQ_MAX_HANDLES, which is a big enough value for genuine use cases and avoids the potential OOM.
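The underlying problem and the guard can be sketched generically; the field and constant names below match the diff that follows, but the snippet is illustrative rather than the exact driver code.

/* A 32-bit count copied verbatim from userspace sizes the allocation,
 * so e.g. 0xffffffff handles ask memdup_user() for a ~16 GiB buffer. */
if (args->num_syncobj_handles > AMDGPU_USERQ_MAX_HANDLES)
	return -EINVAL;			/* bound the count before allocating */

syncobj_handles = memdup_user(u64_to_user_ptr(args->syncobj_handles),
			      size_mul(sizeof(u32), args->num_syncobj_handles));
if (IS_ERR(syncobj_handles))
	return PTR_ERR(syncobj_handles);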
Signed-off-by: Sunil Khatri Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit be267e15f99bc97cbe202cd556717797cdcf79a5) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index d972dc46f5a8..c5f5af20af75 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -35,6 +35,8 @@ static const struct dma_fence_ops amdgpu_userq_fence_ops; static struct kmem_cache *amdgpu_userq_fence_slab; +#define AMDGPU_USERQ_MAX_HANDLES (1U << 16) + int amdgpu_userq_fence_slab_init(void) { amdgpu_userq_fence_slab = kmem_cache_create("amdgpu_userq_fence", @@ -478,6 +480,11 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, if (!amdgpu_userq_enabled(dev)) return -ENOTSUPP; + if (args->num_syncobj_handles > AMDGPU_USERQ_MAX_HANDLES || + args->num_bo_write_handles > AMDGPU_USERQ_MAX_HANDLES || + args->num_bo_read_handles > AMDGPU_USERQ_MAX_HANDLES) + return -EINVAL; + num_syncobj_handles = args->num_syncobj_handles; syncobj_handles = memdup_user(u64_to_user_ptr(args->syncobj_handles), size_mul(sizeof(u32), num_syncobj_handles)); -- cgit v1.2.3 From 64ac7c09fc44985ec9bb6a9db740899fa40ca613 Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Tue, 24 Feb 2026 12:13:09 +0530 Subject: drm/amdgpu: add upper bound check on user inputs in wait ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Huge input values in amdgpu_userq_wait_ioctl can lead to an OOM and could be exploited. So check these input values against AMDGPU_USERQ_MAX_HANDLES, which is a big enough value for genuine use cases and avoids the potential OOM. v2: squash in Srini's fix Signed-off-by: Sunil Khatri Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit fcec012c664247531aed3e662f4280ff804d1476) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index c5f5af20af75..7e9cf1868cc9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -671,6 +671,11 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, if (!amdgpu_userq_enabled(dev)) return -ENOTSUPP; + if (wait_info->num_syncobj_handles > AMDGPU_USERQ_MAX_HANDLES || + wait_info->num_bo_write_handles > AMDGPU_USERQ_MAX_HANDLES || + wait_info->num_bo_read_handles > AMDGPU_USERQ_MAX_HANDLES) + return -EINVAL; + num_read_bo_handles = wait_info->num_bo_read_handles; bo_handles_read = memdup_user(u64_to_user_ptr(wait_info->bo_read_handles), size_mul(sizeof(u32), num_read_bo_handles)); -- cgit v1.2.3 From 28dfe4317541e57fe52f9a290394cd29c348228b Mon Sep 17 00:00:00 2001 From: Natalie Vock Date: Mon, 23 Feb 2026 12:45:37 +0100 Subject: drm/amd/display: Use GFP_ATOMIC in dc_create_stream_for_sink This can be called while preemption is disabled, for example by dcn32_internal_validate_bw, which is called with the FPU active. Fixes "BUG: scheduling while atomic" messages I encounter on my Navi31 machine.
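The constraint being respected here is the usual one for allocating inside a non-preemptible section. A tiny generic sketch follows; preempt_disable() stands in for the FPU-protected region and do_something() is a placeholder, not driver code.

preempt_disable();			/* e.g. entered via the DC FPU guard */

/* GFP_KERNEL may sleep here and triggers "BUG: scheduling while atomic";
 * GFP_ATOMIC never sleeps, at the cost of failing more readily. */
obj = kzalloc(sizeof(*obj), GFP_ATOMIC);
if (obj)
	do_something(obj);

preempt_enable();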
Signed-off-by: Natalie Vock Signed-off-by: Alex Deucher (cherry picked from commit b42dae2ebc5c84a68de63ec4ffdfec49362d53f1) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/core/dc_stream.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c index 246893d80f1f..baf820e6eae8 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c @@ -170,11 +170,11 @@ struct dc_stream_state *dc_create_stream_for_sink( if (sink == NULL) goto fail; - stream = kzalloc_obj(struct dc_stream_state); + stream = kzalloc_obj(struct dc_stream_state, GFP_ATOMIC); if (stream == NULL) goto fail; - stream->update_scratch = kzalloc((int32_t) dc_update_scratch_space_size(), GFP_KERNEL); + stream->update_scratch = kzalloc((int32_t) dc_update_scratch_space_size(), GFP_ATOMIC); if (stream->update_scratch == NULL) goto fail; -- cgit v1.2.3 From 5e0bcc7b88bcd081aaae6f481b10d9ab294fcb69 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 23 Feb 2026 14:00:07 -0800 Subject: drm/amdgpu: Unlock a mutex before destroying it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mutexes must be unlocked before these are destroyed. This has been detected by the Clang thread-safety analyzer. Cc: Alex Deucher Cc: Christian König Cc: Yang Wang Cc: Hawking Zhang Cc: amd-gfx@lists.freedesktop.org Fixes: f5e4cc8461c4 ("drm/amdgpu: implement RAS ACA driver framework") Reviewed-by: Yang Wang Acked-by: Christian König Signed-off-by: Bart Van Assche Signed-off-by: Alex Deucher (cherry picked from commit 270258ba320beb99648dceffb67e86ac76786e55) --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index afe5ca81beec..db7858fe0c3d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -641,6 +641,7 @@ static void aca_error_fini(struct aca_error *aerr) aca_bank_error_remove(aerr, bank_error); out_unlock: + mutex_unlock(&aerr->lock); mutex_destroy(&aerr->lock); } -- cgit v1.2.3 From 480ad5f6ead4a47b969aab6618573cd6822bb6a4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 23 Feb 2026 13:50:23 -0800 Subject: drm/amdgpu: Fix locking bugs in error paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not unlock psp->ras_context.mutex if it has not been locked. This has been detected by the Clang thread-safety analyzer. 
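Both of these locking fixes are instances of the same rule: each error label must undo exactly the state that has been established by the time it is reached. A generic sketch of the corrected ordering is shown below; all function and field names are illustrative.

	ret = prepare_buffer(&buf);
	if (ret)
		return ret;

	ret = check_context(ctx);
	if (ret)
		goto free_buf;		/* lock not taken yet: skip the unlock */

	mutex_lock(&ctx->lock);
	ret = do_invoke(ctx, buf);
	if (ret)
		goto unlock;		/* lock held: drop it, then free */

	ret = copy_result(ctx, buf);
unlock:
	mutex_unlock(&ctx->lock);
free_buf:
	kfree(buf);
	return ret;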
Cc: Alex Deucher Cc: Christian König Cc: YiPeng Chai Cc: Hawking Zhang Cc: amd-gfx@lists.freedesktop.org Fixes: b3fb79cda568 ("drm/amdgpu: add mutex to protect ras shared memory") Acked-by: Christian König Signed-off-by: Bart Van Assche Signed-off-by: Alex Deucher (cherry picked from commit 6fa01b4335978051d2cd80841728fd63cc597970) --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c index 6e8aad91bcd3..0d3c18f04ac3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c @@ -332,13 +332,13 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size if (!context || !context->initialized) { dev_err(adev->dev, "TA is not initialized\n"); ret = -EINVAL; - goto err_free_shared_buf; + goto free_shared_buf; } if (!psp->ta_funcs || !psp->ta_funcs->fn_ta_invoke) { dev_err(adev->dev, "Unsupported function to invoke TA\n"); ret = -EOPNOTSUPP; - goto err_free_shared_buf; + goto free_shared_buf; } context->session_id = ta_id; @@ -346,7 +346,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size mutex_lock(&psp->ras_context.mutex); ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len); if (ret) - goto err_free_shared_buf; + goto unlock; ret = psp_fn_ta_invoke(psp, cmd_id); if (ret || context->resp_status) { @@ -354,15 +354,17 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size ret, context->resp_status); if (!ret) { ret = -EINVAL; - goto err_free_shared_buf; + goto unlock; } } if (copy_to_user((char *)&buf[copy_pos], context->mem_context.shared_buf, shared_buf_len)) ret = -EFAULT; -err_free_shared_buf: +unlock: mutex_unlock(&psp->ras_context.mutex); + +free_shared_buf: kfree(shared_buf); return ret; -- cgit v1.2.3 From a5fe1a54513196e4bc8f9170006057dc31e7155e Mon Sep 17 00:00:00 2001 From: sguttula Date: Sat, 21 Feb 2026 10:03:32 +0530 Subject: drm/amdgpu/vcn5: Add SMU dpm interface type This will set AMDGPU_VCN_SMU_DPM_INTERFACE_* smu_type based on soc type and fixing ring timeout issue seen for DPM enabled case. Signed-off-by: sguttula Reviewed-by: Pratik Vishwakarma Signed-off-by: Alex Deucher (cherry picked from commit f0f23c315b38c55e8ce9484cf59b65811f350630) --- drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c index 0202df5db1e1..6109124f852e 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c @@ -174,6 +174,10 @@ static int vcn_v5_0_0_sw_init(struct amdgpu_ip_block *ip_block) fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE); fw_shared->sq.is_enabled = 1; + fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_SMU_DPM_INTERFACE_FLAG); + fw_shared->smu_dpm_interface.smu_interface_type = (adev->flags & AMD_IS_APU) ? + AMDGPU_VCN_SMU_DPM_INTERFACE_APU : AMDGPU_VCN_SMU_DPM_INTERFACE_DGPU; + if (amdgpu_vcnfw_log) amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]); -- cgit v1.2.3 From b57c4ec98c17789136a4db948aec6daadceb5024 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 24 Feb 2026 10:18:51 +0530 Subject: drm/amdgpu: Fix error handling in slot reset If the device has not recovered after slot reset is called, it goes to out label for error handling. 
There it could make decision based on uninitialized hive pointer and could result in accessing an uninitialized list. Initialize the list and hive properly so that it handles the error situation and also releases the reset domain lock which is acquired during error_detected callback. Fixes: 732c6cefc1ec ("drm/amdgpu: Replace tmp_adev with hive in amdgpu_pci_slot_reset") Signed-off-by: Lijo Lazar Reviewed-by: Ce Sun Signed-off-by: Alex Deucher (cherry picked from commit bb71362182e59caa227e4192da5a612b09349696) --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d9789e0b5201..3e19b51a2763 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -7059,6 +7059,15 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) dev_info(adev->dev, "PCI error: slot reset callback!!\n"); memset(&reset_context, 0, sizeof(reset_context)); + INIT_LIST_HEAD(&device_list); + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + mutex_lock(&hive->hive_lock); + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) + list_add_tail(&tmp_adev->reset_list, &device_list); + } else { + list_add_tail(&adev->reset_list, &device_list); + } if (adev->pcie_reset_ctx.swus) link_dev = adev->pcie_reset_ctx.swus; @@ -7099,19 +7108,13 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) reset_context.reset_req_dev = adev; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); - INIT_LIST_HEAD(&device_list); - hive = amdgpu_get_xgmi_hive(adev); if (hive) { - mutex_lock(&hive->hive_lock); reset_context.hive = hive; - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) tmp_adev->pcie_reset_ctx.in_link_reset = true; - list_add_tail(&tmp_adev->reset_list, &device_list); - } } else { set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); - list_add_tail(&adev->reset_list, &device_list); } r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); -- cgit v1.2.3 From 6b0d812971370c64b837a2db4275410f478272fe Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 25 Feb 2026 10:51:16 -0600 Subject: drm/amd: Disable MES LR compute W/A A workaround was introduced in commit 1fb710793ce2 ("drm/amdgpu: Enable MES lr_compute_wa by default") to help with some hangs observed in gfx1151. This WA didn't fully fix the issue. It was actually fixed by adjusting the VGPR size to the correct value that matched the hardware in commit b42f3bf9536c ("drm/amdkfd: bump minimum vgpr size for gfx1151"). There are reports of instability on other products with newer GC microcode versions, and I believe they're caused by this workaround. As we don't need the workaround any more, remove it. 
Fixes: b42f3bf9536c ("drm/amdkfd: bump minimum vgpr size for gfx1151") Acked-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 9973e64bd6ee7642860a6f3b6958cbf14e89cabd) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 5 ----- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 5 ----- 2 files changed, 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 09ebb13ca5e8..a926a330700e 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -720,11 +720,6 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) mes_set_hw_res_pkt.enable_reg_active_poll = 1; mes_set_hw_res_pkt.enable_level_process_quantum_check = 1; mes_set_hw_res_pkt.oversubscription_timer = 50; - if ((mes->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x7f) - mes_set_hw_res_pkt.enable_lr_compute_wa = 1; - else - dev_info_once(mes->adev->dev, - "MES FW version must be >= 0x7f to enable LR compute workaround.\n"); if (amdgpu_mes_log_enable) { mes_set_hw_res_pkt.enable_mes_event_int_logging = 1; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index b1c864dc79a8..5bfa5d1d0b36 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -779,11 +779,6 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe) mes_set_hw_res_pkt.use_different_vmid_compute = 1; mes_set_hw_res_pkt.enable_reg_active_poll = 1; mes_set_hw_res_pkt.enable_level_process_quantum_check = 1; - if ((mes->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x82) - mes_set_hw_res_pkt.enable_lr_compute_wa = 1; - else - dev_info_once(adev->dev, - "MES FW version must be >= 0x82 to enable LR compute workaround.\n"); /* * Keep oversubscribe timer for sdma . When we have unmapped doorbell -- cgit v1.2.3 From 2a064262eb378263792cf1fb044de631ac41bcc5 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Wed, 25 Feb 2026 21:15:23 +0000 Subject: sched_ext: Fix out-of-bounds access in scx_idle_init_masks() scx_idle_node_masks is allocated with num_possible_nodes() elements but indexed by NUMA node IDs via for_each_node(). On systems with non-contiguous NUMA node numbering (e.g. nodes 0 and 4), node IDs can exceed the array size, causing out-of-bounds memory corruption. Use nr_node_ids instead, which represents the maximum node ID range and is the correct size for arrays indexed by node ID. 
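To make the failure mode concrete, consider a machine whose possible NUMA nodes are {0, 4}. The sketch below is illustrative (init_idle_mask() is a placeholder), while kcalloc(), for_each_node(), num_possible_nodes() and nr_node_ids behave as in the kernel.

/* node_possible_map = {0, 4}: num_possible_nodes() == 2, nr_node_ids == 5 */
masks = kcalloc(num_possible_nodes(), sizeof(*masks), GFP_KERNEL);	/* 2 slots */
for_each_node(node)			/* visits node 0, then node 4 */
	init_idle_mask(&masks[node]);	/* masks[4] writes past the array */

/* Correct: size the array for the highest possible node ID. */
masks = kcalloc(nr_node_ids, sizeof(*masks), GFP_KERNEL);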
Fixes: 7c60329e3521 ("sched_ext: Add NUMA-awareness to the default idle selection policy") Signed-off-by: David Carlier Signed-off-by: Tejun Heo --- kernel/sched/ext_idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 3d9d404d5cd2..e2da6c3968a6 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -663,8 +663,8 @@ void scx_idle_init_masks(void) BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); - /* Allocate per-node idle cpumasks */ - scx_idle_node_masks = kcalloc(num_possible_nodes(), + /* Allocate per-node idle cpumasks (use nr_node_ids for non-contiguous NUMA nodes) */ + scx_idle_node_masks = kcalloc(nr_node_ids, sizeof(*scx_idle_node_masks), GFP_KERNEL); BUG_ON(!scx_idle_node_masks); -- cgit v1.2.3 From aa4876fe2d9fcbcaa0592b25f34ec6f6ea7876c1 Mon Sep 17 00:00:00 2001 From: Zhang Heng Date: Mon, 9 Feb 2026 21:41:49 +0800 Subject: ALSA: hda/realtek: add quirk for Acer Nitro ANV15-51 fix mute/micmute LEDs and headset microphone for Acer Nitro ANV15-51. [ The headset microphone issue is solved by Kailang] Link: https://bugzilla.kernel.org/show_bug.cgi?id=220279 Cc: stable@vger.kernel.org Signed-off-by: Zhang Heng Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260209134149.3076957-1-zhangheng@kylinos.cn --- sound/hda/codecs/realtek/alc269.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 43ecfc63ef87..86bb22d19629 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -4075,6 +4075,7 @@ enum { ALC236_FIXUP_HP_MUTE_LED_MICMUTE_GPIO, ALC233_FIXUP_LENOVO_GPIO2_MIC_HOTKEY, ALC245_FIXUP_BASS_HP_DAC, + ALC245_FIXUP_ACER_MICMUTE_LED, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -6599,6 +6600,12 @@ static const struct hda_fixup alc269_fixups[] = { /* Borrow the DAC routing selected for those Thinkpads */ .v.func = alc285_fixup_thinkpad_x1_gen7, }, + [ALC245_FIXUP_ACER_MICMUTE_LED] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_hp_coef_micmute_led, + .chained = true, + .chain_id = ALC2XX_FIXUP_HEADSET_MIC, + } }; static const struct hda_quirk alc269_fixup_tbl[] = { @@ -6651,6 +6658,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x159c, "Acer Nitro 5 AN515-58", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x1597, "Acer Nitro 5 AN517-55", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x169a, "Acer Swift SFG16", ALC256_FIXUP_ACER_SFG16_MICMUTE_LED), + SND_PCI_QUIRK(0x1025, 0x171e, "Acer Nitro ANV15-51", ALC245_FIXUP_ACER_MICMUTE_LED), SND_PCI_QUIRK(0x1025, 0x1826, "Acer Helios ZPC", ALC287_FIXUP_PREDATOR_SPK_CS35L41_I2C_2), SND_PCI_QUIRK(0x1025, 0x182c, "Acer Helios ZPD", ALC287_FIXUP_PREDATOR_SPK_CS35L41_I2C_2), SND_PCI_QUIRK(0x1025, 0x1844, "Acer Helios ZPS", ALC287_FIXUP_PREDATOR_SPK_CS35L41_I2C_2), -- cgit v1.2.3 From 54f9d645a5453d0bfece0c465d34aaf072ea99fa Mon Sep 17 00:00:00 2001 From: Jun Seo Date: Thu, 26 Feb 2026 10:08:20 +0900 Subject: ALSA: usb-audio: Use correct version for UAC3 header validation The entry of the validators table for UAC3 AC header descriptor is defined with the wrong protocol version UAC_VERSION_2, while it should have been UAC_VERSION_3. This results in the validator never matching for actual UAC3 devices (protocol == UAC_VERSION_3), causing their header descriptors to bypass validation entirely. 
A malicious USB device presenting a truncated UAC3 header could exploit this to cause out-of-bounds reads when the driver later accesses unvalidated descriptor fields. The bug was introduced in the same commit as the recently fixed UAC3 feature unit sub-type typo, and appears to be from the same copy-paste error when the UAC3 section was created from the UAC2 section. Fixes: 57f8770620e9 ("ALSA: usb-audio: More validations of descriptor units") Cc: Signed-off-by: Jun Seo Link: https://patch.msgid.link/20260226010820.36529-1-jun.seo.93@proton.me Signed-off-by: Takashi Iwai --- sound/usb/validate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/validate.c b/sound/usb/validate.c index 4bb4893f6e74..f62b7cc041dc 100644 --- a/sound/usb/validate.c +++ b/sound/usb/validate.c @@ -281,7 +281,7 @@ static const struct usb_desc_validator audio_validators[] = { /* UAC_VERSION_2, UAC2_SAMPLE_RATE_CONVERTER: not implemented yet */ /* UAC3 */ - FIXED(UAC_VERSION_2, UAC_HEADER, struct uac3_ac_header_descriptor), + FIXED(UAC_VERSION_3, UAC_HEADER, struct uac3_ac_header_descriptor), FIXED(UAC_VERSION_3, UAC_INPUT_TERMINAL, struct uac3_input_terminal_descriptor), FIXED(UAC_VERSION_3, UAC_OUTPUT_TERMINAL, -- cgit v1.2.3 From cd3c877d04683b44a4d50dcdfad54b356e65d158 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 24 Feb 2026 07:46:37 -0800 Subject: iomap: don't report direct-io retries to fserror iomap's directio implementation has two magic errno codes that it uses to signal callers -- ENOTBLK tells the filesystem that it should retry a write with the pagecache; and EAGAIN tells the caller that pagecache flushing or invalidation failed and that it should try again. Neither of these indicate data loss, so let's not report them. Fixes: a9d573ee88af98 ("iomap: report file I/O errors to the VFS") Signed-off-by: Darrick J. Wong Link: https://patch.msgid.link/20260224154637.GD2390381@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 95254aa1b654..e911daedff65 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -87,6 +87,19 @@ static inline enum fserror_type iomap_dio_err_type(const struct iomap_dio *dio) return FSERR_DIRECTIO_READ; } +static inline bool should_report_dio_fserror(const struct iomap_dio *dio) +{ + switch (dio->error) { + case 0: + case -EAGAIN: + case -ENOTBLK: + /* don't send fsnotify for success or magic retry codes */ + return false; + default: + return true; + } +} + ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; @@ -96,7 +109,7 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) if (dops && dops->end_io) ret = dops->end_io(iocb, dio->size, ret, dio->flags); - if (dio->error) + if (should_report_dio_fserror(dio)) fserror_report_io(file_inode(iocb->ki_filp), iomap_dio_err_type(dio), offset, dio->size, dio->error, GFP_NOFS); -- cgit v1.2.3 From 28aaa9c39945b7925a1cc1d513c8f21ed38f5e4f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 10:43:55 +0100 Subject: kthread: consolidate kthread exit paths to prevent use-after-free Guillaume reported crashes via corrupted RCU callback function pointers during KUnit testing. The crash was traced back to the pidfs rhashtable conversion which replaced the 24-byte rb_node with an 8-byte rhash_head in struct pid, shrinking it from 160 to 144 bytes. 
struct kthread (without CONFIG_BLK_CGROUP) is also 144 bytes. With CONFIG_SLAB_MERGE_DEFAULT and SLAB_HWCACHE_ALIGN both round up to 192 bytes and share the same slab cache. struct pid.rcu.func and struct kthread.affinity_node both sit at offset 0x78. When a kthread exits via make_task_dead() it bypasses kthread_exit() and misses the affinity_node cleanup. free_kthread_struct() frees the memory while the node is still linked into the global kthread_affinity_list. A subsequent list_del() by another kthread writes through dangling list pointers into the freed and reused memory, corrupting the pid's rcu.func pointer. Instead of patching free_kthread_struct() to handle the missed cleanup, consolidate all kthread exit paths. Turn kthread_exit() into a macro that calls do_exit() and add kthread_do_exit() which is called from do_exit() for any task with PF_KTHREAD set. This guarantees that kthread-specific cleanup always happens regardless of the exit path - make_task_dead(), direct do_exit(), or kthread_exit(). Replace __to_kthread() with a new tsk_is_kthread() accessor in the public header. Export do_exit() since module code using the kthread_exit() macro now needs it directly. Reported-by: Guillaume Tucker Tested-by: Guillaume Tucker Tested-by: Mark Brown Tested-by: David Gow Cc: Link: https://lore.kernel.org/all/20260224-mittlerweile-besessen-2738831ae7f6@brauner Co-developed-by: Linus Torvalds Fixes: 4d13f4304fa4 ("kthread: Implement preferred affinity") Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- include/linux/kthread.h | 21 ++++++++++++++++++++- kernel/exit.c | 6 ++++++ kernel/kthread.c | 41 +++++------------------------------------ 3 files changed, 31 insertions(+), 37 deletions(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c92c1149ee6e..a01a474719a7 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -7,6 +7,24 @@ struct mm_struct; +/* opaque kthread data */ +struct kthread; + +/* + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. For tasks that are not kthreads + * p->worker_private is used to point to other things. + * + * Return NULL for any task that is not a kthread. 
+ */ +static inline struct kthread *tsk_is_kthread(struct task_struct *p) +{ + if (p->flags & PF_KTHREAD) + return p->worker_private; + return NULL; +} + __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, @@ -98,9 +116,10 @@ void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); -void kthread_exit(long result) __noreturn; +#define kthread_exit(result) do_exit(result) void kthread_complete_and_exit(struct completion *, long) __noreturn; int kthreads_update_housekeeping(void); +void kthread_do_exit(struct kthread *, long); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; diff --git a/kernel/exit.c b/kernel/exit.c index 8a87021211ae..ede3117fa7d4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -896,11 +896,16 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) void __noreturn do_exit(long code) { struct task_struct *tsk = current; + struct kthread *kthread; int group_dead; WARN_ON(irqs_disabled()); WARN_ON(tsk->plug); + kthread = tsk_is_kthread(tsk); + if (unlikely(kthread)) + kthread_do_exit(kthread, code); + kcov_task_exit(tsk); kmsan_task_exit(tsk); @@ -1013,6 +1018,7 @@ void __noreturn do_exit(long code) lockdep_free_task(tsk); do_task_dead(); } +EXPORT_SYMBOL(do_exit); void __noreturn make_task_dead(int signr) { diff --git a/kernel/kthread.c b/kernel/kthread.c index 20451b624b67..791210daf8b4 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -85,24 +85,6 @@ static inline struct kthread *to_kthread(struct task_struct *k) return k->worker_private; } -/* - * Variant of to_kthread() that doesn't assume @p is a kthread. - * - * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will - * always remain a kthread. For kthreads p->worker_private always - * points to a struct kthread. For tasks that are not kthreads - * p->worker_private is used to point to other things. - * - * Return NULL for any task that is not a kthread. - */ -static inline struct kthread *__to_kthread(struct task_struct *p) -{ - void *kthread = p->worker_private; - if (kthread && !(p->flags & PF_KTHREAD)) - kthread = NULL; - return kthread; -} - void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); @@ -193,7 +175,7 @@ EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { - struct kthread *kthread = __to_kthread(current); + struct kthread *kthread = tsk_is_kthread(current); if (!kthread) return false; @@ -234,7 +216,7 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); */ void *kthread_func(struct task_struct *task) { - struct kthread *kthread = __to_kthread(task); + struct kthread *kthread = tsk_is_kthread(task); if (kthread) return kthread->threadfn; return NULL; @@ -266,7 +248,7 @@ EXPORT_SYMBOL_GPL(kthread_data); */ void *kthread_probe_data(struct task_struct *task) { - struct kthread *kthread = __to_kthread(task); + struct kthread *kthread = tsk_is_kthread(task); void *data = NULL; if (kthread) @@ -309,19 +291,8 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); -/** - * kthread_exit - Cause the current kthread return @result to kthread_stop(). - * @result: The integer value to return to kthread_stop(). - * - * While kthread_exit can be called directly, it exists so that - * functions which do some additional work in non-modular code such as - * module_put_and_kthread_exit can be implemented. 
- * - * Does not return. - */ -void __noreturn kthread_exit(long result) +void kthread_do_exit(struct kthread *kthread, long result) { - struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->affinity_node)) { mutex_lock(&kthread_affinity_lock); @@ -333,9 +304,7 @@ void __noreturn kthread_exit(long result) kthread->preferred_affinity = NULL; } } - do_exit(0); } -EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. @@ -683,7 +652,7 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu) bool kthread_is_per_cpu(struct task_struct *p) { - struct kthread *kthread = __to_kthread(p); + struct kthread *kthread = tsk_is_kthread(p); if (!kthread) return false; -- cgit v1.2.3 From 7c2889af823340d1d410939b9d547bf184d5fa54 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 25 Feb 2026 15:48:59 +0200 Subject: RDMA/uverbs: Import DMA-BUF module in uverbs_std_types_dmabuf file Fix the following compilation error: ERROR: modpost: module ib_uverbs uses symbol dma_buf_move_notify from namespace DMA_BUF, but does not import it. Fixes: 0ac6f4056c4a ("RDMA/uverbs: Add DMABUF object type and operations") Link: https://patch.msgid.link/20260225-fix-uverbs-compilation-v1-1-acf7b3d0f9fa@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_std_types_dmabuf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/uverbs_std_types_dmabuf.c b/drivers/infiniband/core/uverbs_std_types_dmabuf.c index dfdfcd1d1a44..4a7f8b6f9dc8 100644 --- a/drivers/infiniband/core/uverbs_std_types_dmabuf.c +++ b/drivers/infiniband/core/uverbs_std_types_dmabuf.c @@ -10,6 +10,8 @@ #include "rdma_core.h" #include "uverbs.h" +MODULE_IMPORT_NS("DMA_BUF"); + static int uverbs_dmabuf_attach(struct dma_buf *dmabuf, struct dma_buf_attachment *attachment) { -- cgit v1.2.3 From 003ce8c9b2ca28fbb4860651e76fb1c9a91f2ea1 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 26 Feb 2026 11:17:28 +0000 Subject: ALSA: hda: cs35l56: Fix signedness error in cs35l56_hda_posture_put() In cs35l56_hda_posture_put() assign ucontrol->value.integer.value[0] to a long instead of an unsigned long. ucontrol->value.integer.value[0] is a long. This fixes the sparse warning: sound/hda/codecs/side-codecs/cs35l56_hda.c:256:20: warning: unsigned value that used to be signed checked against zero? 
sound/hda/codecs/side-codecs/cs35l56_hda.c:252:29: signed value source Signed-off-by: Richard Fitzgerald Fixes: 73cfbfa9caea8 ("ALSA: hda/cs35l56: Add driver for Cirrus Logic CS35L56 amplifier") Link: https://patch.msgid.link/20260226111728.1700431-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/side-codecs/cs35l56_hda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda.c b/sound/hda/codecs/side-codecs/cs35l56_hda.c index cfc8de2ae499..eb66827eabf8 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda.c @@ -249,7 +249,7 @@ static int cs35l56_hda_posture_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct cs35l56_hda *cs35l56 = snd_kcontrol_chip(kcontrol); - unsigned long pos = ucontrol->value.integer.value[0]; + long pos = ucontrol->value.integer.value[0]; bool changed; int ret; -- cgit v1.2.3 From 786ea2b694f48e1b34f1dcf104e09357fc99ef34 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 26 Feb 2026 12:41:15 +0000 Subject: ALSA: hda: cs35l56: Remove unnecessary struct cs_dsp_client_ops Since commit af37511305c0 ("firmware: cs_dsp: Don't require client to provide a struct cs_dsp_client_ops") the client doesn't have to provide a struct cs_dsp_client_ops. So remove the dummy cs_dsp_client_ops. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260226124115.1811187-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/side-codecs/cs35l56_hda.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda.c b/sound/hda/codecs/side-codecs/cs35l56_hda.c index eb66827eabf8..1ace4beef508 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda.c @@ -403,10 +403,6 @@ static void cs35l56_hda_remove_controls(struct cs35l56_hda *cs35l56) snd_ctl_remove(cs35l56->codec->card, cs35l56->volume_ctl); } -static const struct cs_dsp_client_ops cs35l56_hda_client_ops = { - /* cs_dsp requires the client to provide this even if it is empty */ -}; - static int cs35l56_hda_request_firmware_file(struct cs35l56_hda *cs35l56, const struct firmware **firmware, char **filename, const char *base_name, const char *system_name, @@ -1149,7 +1145,6 @@ int cs35l56_hda_common_probe(struct cs35l56_hda *cs35l56, int hid, int id) cs35l56->base.cal_index = cs35l56->index; cs35l56_init_cs_dsp(&cs35l56->base, &cs35l56->cs_dsp); - cs35l56->cs_dsp.client_ops = &cs35l56_hda_client_ops; if (cs35l56->base.reset_gpio) { dev_dbg(cs35l56->base.dev, "Hard reset\n"); -- cgit v1.2.3 From a0b4c7a49137ed21279f354eb59f49ddae8dffc2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 26 Feb 2026 13:32:33 +0000 Subject: netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict sequence Fix netfslib such that when it's making an unbuffered or DIO write, to make sure that it sends each subrequest strictly sequentially, waiting till the previous one is 'committed' before sending the next so that we don't have pieces landing out of order and potentially leaving a hole if an error occurs (ENOSPC for example). This is done by copying in just those bits of issuing, collecting and retrying subrequests that are necessary to do one subrequest at a time. 
Retrying, in particular, is simpler because if the current subrequest needs retrying, the source iterator can just be copied again and the subrequest prepped and issued again without needing to be concerned about whether it needs merging with the previous or next in the sequence. Note that the issuing loop waits for a subrequest to complete right after issuing it, but this wait could be moved elsewhere allowing preparatory steps to be performed whilst the subrequest is in progress. In particular, once content encryption is available in netfslib, that could be done whilst waiting, as could cleanup of buffers that have been completed. Fixes: 153a9961b551 ("netfs: Implement unbuffered/DIO write support") Signed-off-by: David Howells Link: https://patch.msgid.link/58526.1772112753@warthog.procyon.org.uk Tested-by: Steve French Reviewed-by: Paulo Alcantara (Red Hat) cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/direct_write.c | 228 ++++++++++++++++++++++++++++++++++++++++--- fs/netfs/internal.h | 4 +- fs/netfs/write_collect.c | 21 ---- fs/netfs/write_issue.c | 41 +------- include/trace/events/netfs.h | 4 +- 5 files changed, 221 insertions(+), 77 deletions(-) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index a9d1c3b2c084..dd1451bf7543 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -9,6 +9,202 @@ #include #include "internal.h" +/* + * Perform the cleanup rituals after an unbuffered write is complete. + */ +static void netfs_unbuffered_write_done(struct netfs_io_request *wreq) +{ + struct netfs_inode *ictx = netfs_inode(wreq->inode); + + _enter("R=%x", wreq->debug_id); + + /* Okay, declare that all I/O is complete. */ + trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); + + if (!wreq->error) + netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred); + + if (wreq->origin == NETFS_DIO_WRITE && + wreq->mapping->nrpages) { + /* mmap may have got underfoot and we may now have folios + * locally covering the region we just wrote. Attempt to + * discard the folios, but leave in place any modified locally. + * ->write_iter() is prevented from interfering by the DIO + * counter. + */ + pgoff_t first = wreq->start >> PAGE_SHIFT; + pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT; + + invalidate_inode_pages2_range(wreq->mapping, first, last); + } + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_end(wreq->inode); + + _debug("finished"); + netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); + /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ + + if (wreq->iocb) { + size_t written = umin(wreq->transferred, wreq->len); + + wreq->iocb->ki_pos += written; + if (wreq->iocb->ki_complete) { + trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete); + wreq->iocb->ki_complete(wreq->iocb, wreq->error ?: written); + } + wreq->iocb = VFS_PTR_POISON; + } + + netfs_clear_subrequests(wreq); +} + +/* + * Collect the subrequest results of unbuffered write subrequests. 
+ */ +static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + struct netfs_io_subrequest *subreq) +{ + trace_netfs_collect_sreq(wreq, subreq); + + spin_lock(&wreq->lock); + list_del_init(&subreq->rreq_link); + spin_unlock(&wreq->lock); + + wreq->transferred += subreq->transferred; + iov_iter_advance(&wreq->buffer.iter, subreq->transferred); + + stream->collected_to = subreq->start + subreq->transferred; + wreq->collected_to = stream->collected_to; + netfs_put_subrequest(subreq, netfs_sreq_trace_put_done); + + trace_netfs_collect_stream(wreq, stream); + trace_netfs_collect_state(wreq, wreq->collected_to, 0); +} + +/* + * Write data to the server without going through the pagecache and without + * writing it to the local cache. We dispatch the subrequests serially and + * wait for each to complete before dispatching the next, lest we leave a gap + * in the data written due to a failure such as ENOSPC. We could, however + * attempt to do preparation such as content encryption for the next subreq + * whilst the current is in progress. + */ +static int netfs_unbuffered_write(struct netfs_io_request *wreq) +{ + struct netfs_io_subrequest *subreq = NULL; + struct netfs_io_stream *stream = &wreq->io_streams[0]; + int ret; + + _enter("%llx", wreq->len); + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_begin(wreq->inode); + + stream->collected_to = wreq->start; + + for (;;) { + bool retry = false; + + if (!subreq) { + netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred); + subreq = stream->construct; + stream->construct = NULL; + stream->front = NULL; + } + + /* Check if (re-)preparation failed. */ + if (unlikely(test_bit(NETFS_SREQ_FAILED, &subreq->flags))) { + netfs_write_subrequest_terminated(subreq, subreq->error); + wreq->error = subreq->error; + break; + } + + iov_iter_truncate(&subreq->io_iter, wreq->len - wreq->transferred); + if (!iov_iter_count(&subreq->io_iter)) + break; + + subreq->len = netfs_limit_iter(&subreq->io_iter, 0, + stream->sreq_max_len, + stream->sreq_max_segs); + iov_iter_truncate(&subreq->io_iter, subreq->len); + stream->submit_extendable_to = subreq->len; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + stream->issue_write(subreq); + + /* Async, need to wait. */ + netfs_wait_for_in_progress_stream(wreq, stream); + + if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + retry = true; + } else if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) { + ret = subreq->error; + wreq->error = ret; + netfs_see_subrequest(subreq, netfs_sreq_trace_see_failed); + subreq = NULL; + break; + } + ret = 0; + + if (!retry) { + netfs_unbuffered_write_collect(wreq, stream, subreq); + subreq = NULL; + if (wreq->transferred >= wreq->len) + break; + if (!wreq->iocb && signal_pending(current)) { + ret = wreq->transferred ? -EINTR : -ERESTARTSYS; + trace_netfs_rreq(wreq, netfs_rreq_trace_intr); + break; + } + continue; + } + + /* We need to retry the last subrequest, so first reset the + * iterator, taking into account what, if anything, we managed + * to transfer. 
+ */ + subreq->error = -EAGAIN; + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + if (subreq->transferred > 0) + iov_iter_advance(&wreq->buffer.iter, subreq->transferred); + + if (stream->source == NETFS_UPLOAD_TO_SERVER && + wreq->netfs_ops->retry_request) + wreq->netfs_ops->retry_request(wreq, stream); + + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); + __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); + subreq->io_iter = wreq->buffer.iter; + subreq->start = wreq->start + wreq->transferred; + subreq->len = wreq->len - wreq->transferred; + subreq->transferred = 0; + subreq->retry_count += 1; + stream->sreq_max_len = UINT_MAX; + stream->sreq_max_segs = INT_MAX; + + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + stream->prepare_write(subreq); + + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_wh_retry_write_subreq); + } + + netfs_unbuffered_write_done(wreq); + _leave(" = %d", ret); + return ret; +} + +static void netfs_unbuffered_write_async(struct work_struct *work) +{ + struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work); + + netfs_unbuffered_write(wreq); + netfs_put_request(wreq, netfs_rreq_trace_put_complete); +} + /* * Perform an unbuffered write where we may have to do an RMW operation on an * encrypted file. This can also be used for direct I/O writes. @@ -70,35 +266,35 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * */ wreq->buffer.iter = *iter; } + + wreq->len = iov_iter_count(&wreq->buffer.iter); } __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); - if (async) - __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); /* Copy the data into the bounce buffer and encrypt it. */ // TODO /* Dispatch the write. 
*/ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); - if (async) - wreq->iocb = iocb; - wreq->len = iov_iter_count(&wreq->buffer.iter); - ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); - if (ret < 0) { - _debug("begin = %zd", ret); - goto out; - } - if (!async) { - ret = netfs_wait_for_write(wreq); - if (ret > 0) - iocb->ki_pos += ret; - } else { + if (async) { + INIT_WORK(&wreq->work, netfs_unbuffered_write_async); + wreq->iocb = iocb; + queue_work(system_dfl_wq, &wreq->work); ret = -EIOCBQUEUED; + } else { + ret = netfs_unbuffered_write(wreq); + if (ret < 0) { + _debug("begin = %zd", ret); + } else { + iocb->ki_pos += wreq->transferred; + ret = wreq->transferred ?: wreq->error; + } + + netfs_put_request(wreq, netfs_rreq_trace_put_complete); } -out: netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 4319611f5354..d436e20d3418 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -198,6 +198,9 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, struct file *file, loff_t start, enum netfs_io_origin origin); +void netfs_prepare_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start); void netfs_reissue_write(struct netfs_io_stream *stream, struct netfs_io_subrequest *subreq, struct iov_iter *source); @@ -212,7 +215,6 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c struct folio **writethrough_cache); ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *writethrough_cache); -int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); /* * write_retry.c diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 61eab34ea67e..83eb3dc1adf8 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -399,27 +399,6 @@ bool netfs_write_collection(struct netfs_io_request *wreq) ictx->ops->invalidate_cache(wreq); } - if ((wreq->origin == NETFS_UNBUFFERED_WRITE || - wreq->origin == NETFS_DIO_WRITE) && - !wreq->error) - netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred); - - if (wreq->origin == NETFS_DIO_WRITE && - wreq->mapping->nrpages) { - /* mmap may have got underfoot and we may now have folios - * locally covering the region we just wrote. Attempt to - * discard the folios, but leave in place any modified locally. - * ->write_iter() is prevented from interfering by the DIO - * counter. - */ - pgoff_t first = wreq->start >> PAGE_SHIFT; - pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT; - invalidate_inode_pages2_range(wreq->mapping, first, last); - } - - if (wreq->origin == NETFS_DIO_WRITE) - inode_dio_end(wreq->inode); - _debug("finished"); netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 34894da5a23e..437268f65640 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -154,9 +154,9 @@ EXPORT_SYMBOL(netfs_prepare_write_failed); * Prepare a write subrequest. We need to allocate a new subrequest * if we don't have one. 
*/ -static void netfs_prepare_write(struct netfs_io_request *wreq, - struct netfs_io_stream *stream, - loff_t start) +void netfs_prepare_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start) { struct netfs_io_subrequest *subreq; struct iov_iter *wreq_iter = &wreq->buffer.iter; @@ -698,41 +698,6 @@ ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_c return ret; } -/* - * Write data to the server without going through the pagecache and without - * writing it to the local cache. - */ -int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len) -{ - struct netfs_io_stream *upload = &wreq->io_streams[0]; - ssize_t part; - loff_t start = wreq->start; - int error = 0; - - _enter("%zx", len); - - if (wreq->origin == NETFS_DIO_WRITE) - inode_dio_begin(wreq->inode); - - while (len) { - // TODO: Prepare content encryption - - _debug("unbuffered %zx", len); - part = netfs_advance_write(wreq, upload, start, len, false); - start += part; - len -= part; - rolling_buffer_advance(&wreq->buffer, part); - if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) - netfs_wait_for_paused_write(wreq); - if (test_bit(NETFS_RREQ_FAILED, &wreq->flags)) - break; - } - - netfs_end_issue_write(wreq); - _leave(" = %d", error); - return error; -} - /* * Write some of a pending folio data back to the server and/or the cache. */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 64a382fbc31a..2d366be46a1c 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -57,6 +57,7 @@ EM(netfs_rreq_trace_done, "DONE ") \ EM(netfs_rreq_trace_end_copy_to_cache, "END-C2C") \ EM(netfs_rreq_trace_free, "FREE ") \ + EM(netfs_rreq_trace_intr, "INTR ") \ EM(netfs_rreq_trace_ki_complete, "KI-CMPL") \ EM(netfs_rreq_trace_recollect, "RECLLCT") \ EM(netfs_rreq_trace_redirty, "REDIRTY") \ @@ -169,7 +170,8 @@ EM(netfs_sreq_trace_put_oom, "PUT OOM ") \ EM(netfs_sreq_trace_put_wip, "PUT WIP ") \ EM(netfs_sreq_trace_put_work, "PUT WORK ") \ - E_(netfs_sreq_trace_put_terminated, "PUT TERM ") + EM(netfs_sreq_trace_put_terminated, "PUT TERM ") \ + E_(netfs_sreq_trace_see_failed, "SEE FAILED ") #define netfs_folio_traces \ EM(netfs_folio_is_uptodate, "mod-uptodate") \ -- cgit v1.2.3 From 2970525f789c080e7e82ecb09cd85a8bb1d284e4 Mon Sep 17 00:00:00 2001 From: Jingkai Tan Date: Thu, 22 Jan 2026 21:14:10 +0000 Subject: btrfs: handle discard errors in in btrfs_finish_extent_commit() Coverity (ID: 1226842) reported that the return value of btrfs_discard_extent() is assigned to ret but is immediately overwritten by unpin_extent_range() without being checked. Use the same error handling that is done later in the same function. 
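In the abstract, the pattern being fixed looks like the sketch below (plain C with stand-in functions, not the btrfs code): a status captured into ret is dead as soon as the next call overwrites it, so it has to be acted on, at least by logging, before that happens.

#include <stdio.h>

static int step_a(void) { return -5; } /* stand-in for the call that can fail */
static int step_b(void) { return 0; }  /* stand-in for the call that follows */

int main(void)
{
    int ret;

    ret = step_a();
    if (ret)                /* handle it here, before ret is reused */
        fprintf(stderr, "step_a failed: %d\n", ret);

    ret = step_b();         /* previously this silently clobbered the error */
    return ret ? 1 : 0;
}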
Signed-off-by: Jingkai Tan Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 03cf9f242c70..b0d9baf5b412 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2933,9 +2933,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) while (!TRANS_ABORTED(trans) && cached_state) { struct extent_state *next_state; - if (btrfs_test_opt(fs_info, DISCARD_SYNC)) + if (btrfs_test_opt(fs_info, DISCARD_SYNC)) { ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL, true); + if (ret) { + btrfs_warn(fs_info, + "discard failed for extent [%llu, %llu]: errno=%d %s", + start, end, ret, btrfs_decode_error(ret)); + } + } next_state = btrfs_next_extent_state(unpin, cached_state); btrfs_clear_extent_dirty(unpin, start, end, &cached_state); -- cgit v1.2.3 From a4fe134fc1d8eb7dcd07e0961c5711b443decd89 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Feb 2026 14:01:45 +1030 Subject: btrfs: fix a double release on reserved extents in cow_one_range() [BUG] Commit c28214bde6da ("btrfs: refactor the main loop of cow_file_range()") refactored the handling of COWing one range. However it changed the error handling of the reserved extent. The old cleanup looks like this: out_drop_extent_cache: btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); [...] clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, * then it means that when we reserved the extent we decremented the * extent's size from the data space_info's bytes_may_use counter and * incremented the space_info's bytes_reserved counter by the same * amount. We must make sure extent_clear_unlock_delalloc() does not try * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ if (cur_alloc_size) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } Which only calls EXTENT_CLEAR_META_RESV. As the reserved extent is properly handled by btrfs_free_reserved_extent(). However the new cleanup is: extent_clear_unlock_delalloc(inode, file_offset, cur_end, locked_folio, cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_qgroup_free_data(inode, NULL, file_offset, cur_len, NULL); btrfs_dec_block_group_reservations(fs_info, ins->objectid); btrfs_free_reserved_extent(fs_info, ins->objectid, ins->offset, true); The flag EXTENT_DO_ACCOUNTING implies both EXTENT_CLEAR_META_RESV and EXTENT_CLEAR_DATA_RESV, which will release the bytes_may_use, which later btrfs_free_reserved_extent() will do again, causing incorrect double release (and may underflow bytes_may_use). [FIX] Use EXTENT_CLEAR_META_RESV to replace EXTENT_DO_ACCOUNTING, and add back the comments on why we only use EXTENT_CLEAR_META_RESV. 
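The double release boils down to an over-broad flag mask, which the sketch below reproduces with invented flag names and a plain counter (it is not the btrfs implementation): the composite flag implies the data-reservation release, and the dedicated free afterwards releases the same bytes again.

#include <stdio.h>

#define CLEAR_META_RESV 0x1
#define CLEAR_DATA_RESV 0x2
#define DO_ACCOUNTING   (CLEAR_META_RESV | CLEAR_DATA_RESV)

static long bytes_may_use = 4096;    /* stand-in for the space_info counter */

static void clear_and_unlock(unsigned int flags)
{
    if (flags & CLEAR_DATA_RESV)
        bytes_may_use -= 4096;       /* first release */
}

static void free_reserved_extent(void)
{
    bytes_may_use -= 4096;           /* releases the reservation again */
}

int main(void)
{
    /* Passing DO_ACCOUNTING releases the data reservation twice; passing
     * only CLEAR_META_RESV leaves it for free_reserved_extent() alone. */
    clear_and_unlock(DO_ACCOUNTING);
    free_reserved_extent();
    printf("bytes_may_use = %ld (negative: released twice)\n", bytes_may_use);
    return 0;
}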
Fixes: c28214bde6da ("btrfs: refactor the main loop of cow_file_range()") Reported-by: Chris Mason Link: https://lore.kernel.org/linux-btrfs/20260208184920.1102719-1-clm@meta.com/ Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b6c763a17406..ade590707793 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1392,10 +1392,25 @@ static int cow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, return ret; free_reserved: + /* + * If we have reserved an extent for the current range and failed to + * create the respective extent map or ordered extent, it means that + * when we reserved the extent we decremented the extent's size from + * the data space_info's bytes_may_use counter and + * incremented the space_info's bytes_reserved counter by the same + * amount. + * + * We must make sure extent_clear_unlock_delalloc() does not try + * to decrement again the data space_info's bytes_may_use counter, which + * will be handled by btrfs_free_reserved_extent(). + * + * Therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV, but only + * EXTENT_CLEAR_META_RESV. + */ extent_clear_unlock_delalloc(inode, file_offset, cur_end, locked_folio, cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_qgroup_free_data(inode, NULL, file_offset, cur_len, NULL); -- cgit v1.2.3 From 0649355303995d6539df1fa8feaf50497f94db9b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Feb 2026 15:27:14 +0000 Subject: btrfs: change warning messages to error level in open_ctree() Failure to read the fs root results in a mount error, but we log a warning message. Same goes for checking the UUID tree, an error results in a mount failure but we log a warning message. Change the level of the logged messages from warning to error. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 13e400046c87..de8ce9c588fe 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3642,7 +3642,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); if (IS_ERR(fs_info->fs_root)) { ret = PTR_ERR(fs_info->fs_root); - btrfs_warn(fs_info, "failed to read fs tree: %d", ret); + btrfs_err(fs_info, "failed to read fs tree: %d", ret); fs_info->fs_root = NULL; goto fail_qgroup; } @@ -3663,8 +3663,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { - btrfs_warn(fs_info, - "failed to check the UUID tree: %d", ret); + btrfs_err(fs_info, "failed to check the UUID tree: %d", ret); close_ctree(fs_info); return ret; } -- cgit v1.2.3 From 64def7d7d62b61044fe619e3cc9cbe60454decd9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Feb 2026 15:37:41 +0000 Subject: btrfs: remove redundant warning message in btrfs_check_uuid_tree() If we fail to start the UUID rescan kthread, btrfs_check_uuid_tree() logs an error message and returns the error to the single caller, open_ctree(). 
This however is redundant since the caller already logs an error message, which is also more informative since it logs the error code. Some remove the warning message from btrfs_check_uuid_tree() as it doesn't add any value. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index de8ce9c588fe..b855e2cec2ec 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2972,7 +2972,6 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); if (IS_ERR(task)) { /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_rescan task"); up(&fs_info->uuid_tree_rescan_sem); return PTR_ERR(task); } -- cgit v1.2.3 From 4db8d56c6f4cbea7293d4236efac4a507dbfa6b1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Feb 2026 15:47:57 +0000 Subject: btrfs: remove btrfs_handle_fs_error() after failure to recover log trees There is no need to call btrfs_handle_fs_error() (which we are trying to deprecate) if we fail to recover log trees: 1) Such a failure results in failing the mount immediately; 2) If the recovery started a transaction before failing, it has already aborted the transaction down in the call chain. So remove the btrfs_handle_fs_error() call, replace it with an error message and assert that the FS is in error state (so that no partial updates are committed due to a transaction that was not aborted). Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b855e2cec2ec..e6574456a306 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2023,9 +2023,9 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); btrfs_put_root(log_tree_root); - if (ret) { - btrfs_handle_fs_error(fs_info, ret, - "Failed to recover log tree"); + if (unlikely(ret)) { + ASSERT(BTRFS_FS_ERROR(fs_info) != 0); + btrfs_err(fs_info, "failed to recover log trees with error: %d", ret); return ret; } -- cgit v1.2.3 From 912db40655684a0a427c8b118dd0c36eb5a61fe4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Feb 2026 09:49:49 +0000 Subject: btrfs: convert log messages to error level in btrfs_replay_log() We are logging messages as warnings but they should really have an error level instead, as if the respective conditions are met the mount will fail. So convert them to error level and also log the error code returned by read_tree_block(). 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e6574456a306..028ba86bcf73 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1994,7 +1994,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, int level = btrfs_super_log_root_level(disk_super); if (unlikely(fs_devices->rw_devices == 0)) { - btrfs_warn(fs_info, "log replay required on RO media"); + btrfs_err(fs_info, "log replay required on RO media"); return -EIO; } @@ -2008,9 +2008,9 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, check.owner_root = BTRFS_TREE_LOG_OBJECTID; log_tree_root->node = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(log_tree_root->node)) { - btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); log_tree_root->node = NULL; + btrfs_err(fs_info, "failed to read log tree with error: %d", ret); btrfs_put_root(log_tree_root); return ret; } -- cgit v1.2.3 From 8ac7fad32b93044f4350ab35f11fee1a61286723 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 10 Feb 2026 14:23:29 +0000 Subject: btrfs: remove pointless WARN_ON() in cache_save_setup() This WARN_ON(ret) is never executed since the previous if statement makes us jump into the 'out_put' label when ret is not zero. The existing transaction abort inside the if statement also gives us a stack trace, so we don't need to move the WARN_ON(ret) into the if statement either. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5f76683b3f21..77285ade3a0e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3341,7 +3341,6 @@ again: btrfs_abort_transaction(trans, ret); goto out_put; } - WARN_ON(ret); /* We've already setup this transaction, go ahead and exit */ if (block_group->cache_generation == trans->transid && -- cgit v1.2.3 From 2ab22446425c2c72b44926bc964c263e06e7e137 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 11 Feb 2026 11:01:25 -0800 Subject: btrfs: fix referenced/exclusive check in squota_check_parent_usage() We compared rfer_cmpr against excl_cmpr_sum instead of rfer_cmpr_sum which is confusing. I expect that rfer_cmpr == excl_cmpr in squota, but it is much better to be consistent in case of any surprises or bugs. Reported-by: Chris Mason Link: https://lore.kernel.org/linux-btrfs/cover.1764796022.git.boris@bur.io/T/#mccb231643ffd290b44a010d4419474d280be5537 Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 38adadb936dc..19edd25ff5d1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -370,7 +370,7 @@ static bool squota_check_parent_usage(struct btrfs_fs_info *fs_info, struct btrf nr_members++; } mismatch = (parent->excl != excl_sum || parent->rfer != rfer_sum || - parent->excl_cmpr != excl_cmpr_sum || parent->rfer_cmpr != excl_cmpr_sum); + parent->excl_cmpr != excl_cmpr_sum || parent->rfer_cmpr != rfer_cmpr_sum); WARN(mismatch, "parent squota qgroup %hu/%llu has mismatched usage from its %d members. 
" -- cgit v1.2.3 From 3f501412f2079ca14bf68a18d80a2b7a823f1f64 Mon Sep 17 00:00:00 2001 From: Miquel Sabaté Solà Date: Mon, 16 Feb 2026 22:12:15 +0100 Subject: btrfs: free pages on error in btrfs_uring_read_extent() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this function the 'pages' object is never freed in the hopes that it is picked up by btrfs_uring_read_finished() whenever that executes in the future. But that's just the happy path. Along the way previous allocations might have gone wrong, or we might not get -EIOCBQUEUED from btrfs_encoded_read_regular_fill_pages(). In all these cases, we go to a cleanup section that frees all memory allocated by this function without assuming any deferred execution, and this also needs to happen for the 'pages' allocation. Fixes: 34310c442e17 ("btrfs: add io_uring command for encoded reads (ENCODED_READ ioctl)") Signed-off-by: Miquel Sabaté Solà Reviewed-by: Filipe Manana Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f1b56be6f8f4..dadf9bf30f08 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4651,7 +4651,7 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; - struct page **pages; + struct page **pages = NULL; struct btrfs_uring_priv *priv = NULL; unsigned long nr_pages; int ret; @@ -4709,6 +4709,11 @@ out_fail: btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); kfree(priv); + for (int i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); return ret; } -- cgit v1.2.3 From a7526533128fd3210e15cee1e14c7a0fe97087c1 Mon Sep 17 00:00:00 2001 From: Miquel Sabaté Solà Date: Mon, 16 Feb 2026 01:22:52 +0100 Subject: btrfs: don't commit the super block when unmounting a shutdown filesystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When unmounting a filesystem we will try, among many other things, to commit the super block. On a filesystem that was shutdown, though, this will always fail with -EROFS as writes are forbidden on this context; and an error will be reported. Don't commit the super block on this situation, which should be fine as the filesystem is frozen before shutdown and, therefore, it should be at a consistent state. Signed-off-by: Miquel Sabaté Solà Reviewed-by: Filipe Manana Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 028ba86bcf73..6a38489a4e18 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4397,9 +4397,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ btrfs_flush_workqueue(fs_info->delayed_workers); - ret = btrfs_commit_super(fs_info); - if (ret) - btrfs_err(fs_info, "commit super ret %d", ret); + /* + * If the filesystem is shutdown, then an attempt to commit the + * super block (or any write) will just fail. Since we freeze + * the filesystem before shutting it down, the filesystem is in + * a consistent state and we don't need to commit super blocks. 
+ */ + if (!btrfs_is_shutdown(fs_info)) { + ret = btrfs_commit_super(fs_info); + if (ret) + btrfs_err(fs_info, "commit super block returned %d", ret); + } } kthread_stop(fs_info->transaction_kthread); -- cgit v1.2.3 From 3cf0f35779d364cf2003c617bb7f3f3e41023372 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 17 Feb 2026 18:25:42 +0000 Subject: btrfs: fix error message order of parameters in btrfs_delete_delayed_dir_index() Fix the error message in btrfs_delete_delayed_dir_index() if __btrfs_add_delayed_item() fails: the message says root, inode, index, error, but we're actually passing index, root, inode, error. Fixes: adc1ef55dc04 ("btrfs: add details to error messages at btrfs_delete_delayed_dir_index()") Signed-off-by: Mark Harmstone Reviewed-by: Filipe Manana Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1739a0b29c49..2746841c167d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1657,7 +1657,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (unlikely(ret)) { btrfs_err(trans->fs_info, "failed to add delayed dir index item, root: %llu, inode: %llu, index: %llu, error: %d", - index, btrfs_root_id(node->root), node->inode_id, ret); + btrfs_root_id(node->root), node->inode_id, index, ret); btrfs_delayed_item_release_metadata(dir->root, item); btrfs_release_delayed_item(item); } -- cgit v1.2.3 From 511dc8912ae3e929c1a182f5e6b2326516fd42a0 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 17 Feb 2026 10:21:44 +0000 Subject: btrfs: fix incorrect key offset in error message in check_dev_extent_item() Fix the error message in check_dev_extent_item(), when an overlapping stripe is encountered. For dev extents, objectid is the disk number and offset the physical address, so prev_key->objectid should actually be prev_key->offset. (I can't take any credit for this one - this was discovered by Chris and his friend Claude.) Reported-by: Chris Mason Fixes: 008e2512dc56 ("btrfs: tree-checker: add dev extent item checks") Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 452394b34d01..9774779f060b 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1921,7 +1921,7 @@ static int check_dev_extent_item(const struct extent_buffer *leaf, if (unlikely(prev_key->offset + prev_len > key->offset)) { generic_err(leaf, slot, "dev extent overlap, prev offset %llu len %llu current offset %llu", - prev_key->objectid, prev_len, key->offset); + prev_key->offset, prev_len, key->offset); return -EUCLEAN; } } -- cgit v1.2.3 From a10172780526c2002e062102ad4f2aabac495889 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 17 Feb 2026 14:39:46 +0000 Subject: btrfs: fix objectid value in error message in check_extent_data_ref() Fix a copy-paste error in check_extent_data_ref(): we're printing root as in the message above, we should be printing objectid. 
Fixes: f333a3c7e832 ("btrfs: tree-checker: validate dref root and objectid") Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 9774779f060b..ac4c4573ee39 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1740,7 +1740,7 @@ static int check_extent_data_ref(struct extent_buffer *leaf, objectid > BTRFS_LAST_FREE_OBJECTID)) { extent_err(leaf, slot, "invalid extent data backref objectid value %llu", - root); + objectid); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { -- cgit v1.2.3 From 44e2fda66427a0442d8d2c0e6443256fb458ab6b Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 17 Feb 2026 17:46:13 +0000 Subject: btrfs: fix warning in scrub_verify_one_metadata() Commit b471965fdb2d ("btrfs: fix replace/scrub failure with metadata_uuid") fixed the comparison in scrub_verify_one_metadata() to use metadata_uuid rather than fsid, but left the warning as it was. Fix it so it matches what we're doing. Fixes: b471965fdb2d ("btrfs: fix replace/scrub failure with metadata_uuid") Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2a64e2d50ced..052d83feea9a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -744,7 +744,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr btrfs_warn_rl(fs_info, "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, - header->fsid, fs_info->fs_devices->fsid); + header->fsid, fs_info->fs_devices->metadata_uuid); return; } if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, -- cgit v1.2.3 From 1c7e9111f4e6d6d42bc47759c9af1ef91f03ac2c Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 17 Feb 2026 17:32:39 +0000 Subject: btrfs: print correct subvol num if active swapfile prevents deletion Fix the error message in btrfs_delete_subvolume() if we can't delete a subvolume because it has an active swapfile: we were printing the number of the parent rather than the target. Fixes: 60021bd754c6 ("btrfs: prevent subvol with swapfile from being deleted") Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ade590707793..d28d55beaacd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4779,7 +4779,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", - btrfs_root_id(root)); + btrfs_root_id(dest)); ret = -EPERM; goto out_up_write; } -- cgit v1.2.3 From 587bb33b10bda645a1028c1737ad3992b3d7cf61 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 17 Feb 2026 17:46:41 +0000 Subject: btrfs: fix compat mask in error messages in btrfs_check_features() Commit d7f67ac9a928 ("btrfs: relax block-group-tree feature dependency checks") introduced a regression when it comes to handling unsupported incompat or compat_ro flags. 
Beforehand we only printed the flags that we didn't recognize, afterwards we printed them all, which is less useful. Fix the error handling so it behaves like it used to. Fixes: d7f67ac9a928 ("btrfs: relax block-group-tree feature dependency checks") Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6a38489a4e18..49987334dd15 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3187,7 +3187,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) { btrfs_err(fs_info, "cannot mount because of unknown incompat features (0x%llx)", - incompat); + incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP); return -EINVAL; } @@ -3219,7 +3219,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) if (compat_ro_unsupp && is_rw_mount) { btrfs_err(fs_info, "cannot mount read-write because of unknown compat_ro features (0x%llx)", - compat_ro); + compat_ro_unsupp); return -EINVAL; } @@ -3232,7 +3232,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) !btrfs_test_opt(fs_info, NOLOGREPLAY)) { btrfs_err(fs_info, "cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", - compat_ro); + compat_ro_unsupp); return -EINVAL; } -- cgit v1.2.3 From f15fb3d41543244d1179f423da4a4832a55bc050 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 20 Feb 2026 12:53:17 +0000 Subject: btrfs: fix chunk map leak in btrfs_map_block() after btrfs_chunk_map_num_copies() Fix a chunk map leak in btrfs_map_block(): if we return early with -EINVAL, we're not freeing the chunk map that we've just looked up. Fixes: 0ae653fbec2b ("btrfs: reduce chunk_map lookups in btrfs_map_block()") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Filipe Manana Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 50f7aae70418..b8cbd3ecb94d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6921,8 +6921,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } num_copies = btrfs_chunk_map_num_copies(map); - if (io_geom.mirror_num > num_copies) - return -EINVAL; + if (io_geom.mirror_num > num_copies) { + ret = -EINVAL; + goto out; + } map_offset = logical - map->start; io_geom.raid56_full_stripe_start = (u64)-1; -- cgit v1.2.3 From 54b9395b186a60d1655186c561a505c590654395 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 20 Feb 2026 12:52:56 +0000 Subject: btrfs: fix chunk map leak in btrfs_map_block() after btrfs_translate_remap() If the call to btrfs_translate_remap() in btrfs_map_block() returns an error code, we were leaking the chunk map. Fix it by jumping to out rather than returning directly. 
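The shape of the fix is the usual goto-out cleanup idiom, sketched below in generic C with invented names (not the btrfs code): once the resource is held, every error path must funnel through the label that drops it, because a bare return leaks it.

#include <stdio.h>
#include <stdlib.h>

struct map { long start; };

static struct map *lookup_map(void) { return calloc(1, sizeof(struct map)); }
static void free_map(struct map *m) { free(m); }
static int translate(long *addr)    { (void)addr; return -22; } /* stand-in failure */

static int map_block(void)
{
    struct map *map = lookup_map();
    long addr = 0;
    int ret;

    if (!map)
        return -12;

    ret = translate(&addr);
    if (ret)
        goto out;    /* a bare "return ret;" here would leak map */

    /* ... use map ... */
    ret = 0;
out:
    free_map(map);
    return ret;
}

int main(void)
{
    printf("map_block() = %d\n", map_block());
    return 0;
}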
Reported-by: Chris Mason Link: https://lore.kernel.org/linux-btrfs/20260125125830.2352988-1-clm@meta.com/ Fixes: 18ba64992871 ("btrfs: redirect I/O for remapped block groups") Reviewed-by: Filipe Manana Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b8cbd3ecb94d..3c37c5d2267b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6907,7 +6907,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, ret = btrfs_translate_remap(fs_info, &new_logical, length); if (ret) - return ret; + goto out; if (new_logical != logical) { btrfs_free_chunk_map(map); -- cgit v1.2.3 From 7885ca40c305c64ffa444e1fc55edd6acb7a6e5b Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Thu, 19 Feb 2026 14:16:02 +0000 Subject: btrfs: fix transaction handle leaks in btrfs_last_identity_remap_gone() btrfs_abort_transaction(), unlike btrfs_commit_transaction(), doesn't also free the transaction handle. Fix the instances in btrfs_last_identity_remap_gone() where we're also leaking the transaction on abort. Reported-by: Chris Mason Link: https://lore.kernel.org/linux-btrfs/20260125125129.2245240-1-clm@meta.com/ Fixes: 979e1dc3d69e ("btrfs: handle deletions from remapped block group") Reviewed-by: Filipe Manana Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fcd0a2ba3554..2119dddd6f8e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4723,6 +4723,7 @@ int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map, ret = btrfs_remove_dev_extents(trans, chunk_map); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); return ret; } @@ -4732,6 +4733,7 @@ int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map, if (unlikely(ret)) { mutex_unlock(&trans->fs_info->chunk_mutex); btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); return ret; } } @@ -4750,6 +4752,7 @@ int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map, ret = remove_chunk_stripes(trans, chunk_map, path); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); return ret; } -- cgit v1.2.3 From f8db8009ea65297dba7786668d4561f6dbd99678 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Thu, 19 Feb 2026 14:30:59 +0000 Subject: btrfs: check block group lookup in remove_range_from_remap_tree() Add a check in remove_range_from_remap_tree() after we call btrfs_lookup_block_group(), to check if it is NULL. This shouldn't happen, but if it does we at least get an error rather than a segfault. 
Reported-by: Chris Mason Link: https://lore.kernel.org/linux-btrfs/20260125125129.2245240-1-clm@meta.com/ Fixes: 979e1dc3d69e ("btrfs: handle deletions from remapped block group") Reviewed-by: Filipe Manana Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2119dddd6f8e..cdb53c0b26ec 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -5985,6 +5985,9 @@ static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *dest_bg; dest_bg = btrfs_lookup_block_group(fs_info, new_addr); + if (unlikely(!dest_bg)) + return -EUCLEAN; + adjust_block_group_remap_bytes(trans, dest_bg, -overlap_length); btrfs_put_block_group(dest_bg); ret = btrfs_add_to_free_space_tree(trans, -- cgit v1.2.3 From 8f153eb745463b0715f1aad41e765cd83e9da8c0 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 29 Jan 2026 10:24:05 +0800 Subject: wifi: ath12k: use correct pdev id when requesting firmware stats To get firmware statistics, currently ar->pdev->pdev_id is passed as an argument to ath12k_mac_get_fw_stats() in ath12k_mac_op_sta_statistics(). For single pdev device like WCN7850, its value is 0 which represents the SoC pdev id. As a result, WCN7850 firmware sends the same reply to host twice, which further results in memory leak: unreferenced object 0xffff88812e286000 (size 192): comm "softirq", pid 0, jiffies 4294981997 hex dump (first 32 bytes): 10 a5 40 11 81 88 ff ff 10 a5 40 11 81 88 ff ff ..@.......@..... 00 00 00 00 00 00 00 00 80 ff ff ff 33 05 00 00 ............3... backtrace (crc cecc8c82): __kmalloc_cache_noprof ath12k_wmi_tlv_fw_stats_parse ath12k_wmi_tlv_iter ath12k_wmi_op_rx ath12k_htc_rx_completion_handler ath12k_ce_per_engine_service ath12k_pci_ce_workqueue process_one_work bh_worker tasklet_action handle_softirqs Detailed explanation is: 1. ath12k_mac_get_fw_stats() called in ath12k_mac_op_sta_statistics() to get vdev statistics, making the caller thread wait. 2. firmware sends the first reply, ath12k_wmi_tlv_fw_stats_data_parse() allocates buffers to cache necessary information. Following that, in ath12k_wmi_fw_stats_process() if events of all started vdev haved been received, is_end flag is set hence the waiting thread gets waken up by the ar->fw_stats_done/->fw_stats_complete signals. 3. ath12k_mac_get_fw_stats() wakes up and returns successfully. ath12k_mac_op_sta_statistics() saves required parameters and calls ath12k_fw_stats_reset() to free buffers allocated earlier. 4. firmware sends the second reply. As usual, buffers are allocated and attached to the ar->fw_stats.vdevs list. Note this time there is no thread waiting, therefore no chance to free those buffers. 5. ath12k module gets unloaded. If there has been no more firmware statistics request made since step 4, or if the request fails (see the example in the following patch), there is no chance to call ath12k_fw_stats_reset(). Consequently those buffers leak. Actually for single pdev device, using SoC pdev id in ath12k_mac_op_sta_statistics() is wrong, because the purpose is to get statistics of a specific station, which is mapped to a specific pdev. That said, the id of actual individual pdev should be fetched and used instead. The helper ath12k_mac_get_target_pdev_id() serves for this purpose, hence use it to fix this issue. Note it also works for other devices as well due to the single_pdev_only check inside. 
The same applies to ath12k_mac_op_get_txpower() and ath12k_mac_op_link_sta_statistics() as well. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00302-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.115823.3 Fixes: 79e7b04b5388 ("wifi: ath12k: report station mode signal strength") Fixes: e92c658b056b ("wifi: ath12k: add get_txpower mac ops") Fixes: ebebe66ec208 ("wifi: ath12k: fill link station statistics for MLO") Signed-off-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20260129-ath12k-fw-stats-fixes-v1-1-55d66064f4d5@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 68431a0e128e..5287fa2f66e5 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -5430,7 +5430,7 @@ int ath12k_mac_op_get_txpower(struct ieee80211_hw *hw, ar->last_tx_power_update)) goto send_tx_power; - params.pdev_id = ar->pdev->pdev_id; + params.pdev_id = ath12k_mac_get_target_pdev_id(ar); params.vdev_id = arvif->vdev_id; params.stats_id = WMI_REQUEST_PDEV_STAT; ret = ath12k_mac_get_fw_stats(ar, ¶ms); @@ -13452,7 +13452,7 @@ void ath12k_mac_op_sta_statistics(struct ieee80211_hw *hw, /* TODO: Use real NF instead of default one. */ signal = rate_info.rssi_comb; - params.pdev_id = ar->pdev->pdev_id; + params.pdev_id = ath12k_mac_get_target_pdev_id(ar); params.vdev_id = 0; params.stats_id = WMI_REQUEST_VDEV_STAT; @@ -13580,7 +13580,7 @@ void ath12k_mac_op_link_sta_statistics(struct ieee80211_hw *hw, spin_unlock_bh(&ar->ab->dp->dp_lock); if (!signal && ahsta->ahvif->vdev_type == WMI_VDEV_TYPE_STA) { - params.pdev_id = ar->pdev->pdev_id; + params.pdev_id = ath12k_mac_get_target_pdev_id(ar); params.vdev_id = 0; params.stats_id = WMI_REQUEST_VDEV_STAT; -- cgit v1.2.3 From 7259b1a0e54c2d3051ac8f1eb01de121b11118ea Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 29 Jan 2026 10:24:06 +0800 Subject: wifi: ath12k: fix station lookup failure when disconnecting from AP In ath12k_wmi_tlv_fw_stats_data_parse() and ath12k_wmi_tlv_rssi_chain_parse(), the driver uses ieee80211_find_sta_by_ifaddr() to look up the station associated with the incoming firmware statistics. This works under normal conditions but fails during AP disconnection, resulting in log messages like: wlan0: deauthenticating from xxxxxx by local choice (Reason: 3=DEAUTH_LEAVING) wlan0: moving STA xxxxxx to state 3 wlan0: moving STA xxxxxx to state 2 wlan0: moving STA xxxxxx to state 1 ath12k_pci 0000:02:00.0: not found station bssid xxxxxx for vdev stat ath12k_pci 0000:02:00.0: not found station of bssid xxxxxx for rssi chain ath12k_pci 0000:02:00.0: failed to pull fw stats: -71 ath12k_pci 0000:02:00.0: time out while waiting for get fw stats wlan0: Removed STA xxxxxx wlan0: Destroyed STA xxxxxx The failure happens because the station has already been removed from ieee80211_local::sta_hash by the time firmware statistics are requested through drv_sta_statistics(). Switch the lookup to ath12k_link_sta_find_by_addr(), which searches the driver's link station hash table that still has the station recorded at that time. This also implicitly fixes another issue: the current code always uses deflink regardless of which link the statistics belong to, which is incorrect in MLO scenarios. The new helper returns the correct link station. Additionally, raise the log level on lookup failures. 
With the updated helper, such failures should no longer occur under normal conditions. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00302-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.115823.3 Fixes: 79e7b04b5388 ("wifi: ath12k: report station mode signal strength") Fixes: 6af5bc381b36 ("wifi: ath12k: report station mode per-chain signal strength") Signed-off-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20260129-ath12k-fw-stats-fixes-v1-2-55d66064f4d5@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 36 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 7617fc3a2479..404f031a3c87 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -8241,8 +8241,6 @@ static int ath12k_wmi_tlv_fw_stats_data_parse(struct ath12k_base *ab, struct ath12k_fw_stats *stats = parse->stats; struct ath12k *ar; struct ath12k_link_vif *arvif; - struct ieee80211_sta *sta; - struct ath12k_sta *ahsta; struct ath12k_link_sta *arsta; int i, ret = 0; const void *data = ptr; @@ -8278,21 +8276,19 @@ static int ath12k_wmi_tlv_fw_stats_data_parse(struct ath12k_base *ab, arvif = ath12k_mac_get_arvif(ar, le32_to_cpu(src->vdev_id)); if (arvif) { - sta = ieee80211_find_sta_by_ifaddr(ath12k_ar_to_hw(ar), - arvif->bssid, - NULL); - if (sta) { - ahsta = ath12k_sta_to_ahsta(sta); - arsta = &ahsta->deflink; + spin_lock_bh(&ab->base_lock); + arsta = ath12k_link_sta_find_by_addr(ab, arvif->bssid); + if (arsta) { arsta->rssi_beacon = le32_to_cpu(src->beacon_snr); ath12k_dbg(ab, ATH12K_DBG_WMI, "wmi stats vdev id %d snr %d\n", src->vdev_id, src->beacon_snr); } else { - ath12k_dbg(ab, ATH12K_DBG_WMI, - "not found station bssid %pM for vdev stat\n", - arvif->bssid); + ath12k_warn(ab, + "not found link sta with bssid %pM for vdev stat\n", + arvif->bssid); } + spin_unlock_bh(&ab->base_lock); } data += sizeof(*src); @@ -8363,8 +8359,6 @@ static int ath12k_wmi_tlv_rssi_chain_parse(struct ath12k_base *ab, struct ath12k_fw_stats *stats = parse->stats; struct ath12k_link_vif *arvif; struct ath12k_link_sta *arsta; - struct ieee80211_sta *sta; - struct ath12k_sta *ahsta; struct ath12k *ar; int vdev_id; int j; @@ -8400,19 +8394,15 @@ static int ath12k_wmi_tlv_rssi_chain_parse(struct ath12k_base *ab, "stats bssid %pM vif %p\n", arvif->bssid, arvif->ahvif->vif); - sta = ieee80211_find_sta_by_ifaddr(ath12k_ar_to_hw(ar), - arvif->bssid, - NULL); - if (!sta) { - ath12k_dbg(ab, ATH12K_DBG_WMI, - "not found station of bssid %pM for rssi chain\n", - arvif->bssid); + guard(spinlock_bh)(&ab->base_lock); + arsta = ath12k_link_sta_find_by_addr(ab, arvif->bssid); + if (!arsta) { + ath12k_warn(ab, + "not found link sta with bssid %pM for rssi chain\n", + arvif->bssid); return -EPROTO; } - ahsta = ath12k_sta_to_ahsta(sta); - arsta = &ahsta->deflink; - BUILD_BUG_ON(ARRAY_SIZE(arsta->chain_signal) > ARRAY_SIZE(stats_rssi->rssi_avg_beacon)); -- cgit v1.2.3 From 3540cc453f5679d8c4d5ccc9834a1f5f8184af3a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 26 Feb 2026 16:43:48 +0100 Subject: ALSA: usb-audio: Drop superfluous kernel-doc markers We don't process USB-audio driver code for kernel-doc, and the "/**" marker leads to warnings with W=1 builds. Drop the superfluous markers. 
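To illustrate the distinction (a generic example, not taken from the driver): "/**" tells scripts/kernel-doc that a structured comment follows, so W=1 builds parse it and warn if it does not document a symbol in the expected format, whereas a plain "/*" comment is never parsed:

	/**
	 * struct foo - short description parsed by kernel-doc
	 * @bar: every member needs an @line like this, or W=1 warns
	 */
	struct foo {
		int bar;
	};

	/*
	 * A plain comment is never fed to kernel-doc, so it can contain
	 * free-form text without triggering warnings.
	 */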
Link: https://patch.msgid.link/20260226154414.1081568-3-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/mixer_s1810c.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sound/usb/mixer_s1810c.c b/sound/usb/mixer_s1810c.c index 473cb29efa7f..7eac7d1bce64 100644 --- a/sound/usb/mixer_s1810c.c +++ b/sound/usb/mixer_s1810c.c @@ -71,7 +71,7 @@ * * e I guess the same as with mixer * */ -/** struct s1810c_ctl_packet - basic vendor request +/* struct s1810c_ctl_packet - basic vendor request * @selector: device/mixer/output * @b: request-dependant field b * @tag: fixed value identifying type of request @@ -94,14 +94,14 @@ struct s1810c_ctl_packet { __le32 e; }; -/** selectors for CMD request +/* selectors for CMD request */ #define SC1810C_SEL_DEVICE 0 #define SC1810C_SEL_MIXER 0x64 #define SC1810C_SEL_OUTPUT 0x65 -/** control ids */ +/* control ids */ #define SC1810C_CTL_LINE_SW 0 #define SC1810C_CTL_MUTE_SW 1 #define SC1824C_CTL_MONO_SW 2 @@ -127,7 +127,7 @@ struct s1810c_ctl_packet { #define SC1810C_GET_STATE_TAG SC1810C_SET_STATE_TAG #define SC1810C_GET_STATE_LEN SC1810C_SET_STATE_LEN -/** Mixer levels normally range from 0 (off) to 0x0100 0000 (0 dB). +/* Mixer levels normally range from 0 (off) to 0x0100 0000 (0 dB). * raw_level = 2^24 * 10^(db_level / 20), thus * -3dB = 0xb53bf0 (technically, half-power -3.01...dB would be 0xb504f3) * -96dB = 0x109 @@ -145,7 +145,7 @@ struct s1810c_ctl_packet { #define MIXER_LEVEL_N3DB 0xb53bf0 #define MIXER_LEVEL_0DB 0x1000000 -/** +/* * This packet includes mixer volumes and * various other fields, it's an extended * version of ctl_packet, with a and b @@ -155,7 +155,7 @@ struct s1810c_state_packet { __le32 fields[63]; }; -/** indices into s1810c_state_packet.fields[] +/* indices into s1810c_state_packet.fields[] */ #define SC1810C_STATE_TAG_IDX 2 #define SC1810C_STATE_LEN_IDX 3 -- cgit v1.2.3 From 1d6452a0ce78cd3f4e48943b5ba21d273a658298 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 26 Feb 2026 16:43:49 +0100 Subject: ALSA: usb: qcom: Correct parameter comment for uaudio_transfer_buffer_setup() At fixing the memory leak of xfer buffer, we forgot to update the corresponding comment, too. This resulted in a kernel-doc warning with W=1. Let's correct it. Fixes: 5c7ef5001292 ("ALSA: qc_audio_offload: avoid leaking xfer_buf allocation") Link: https://patch.msgid.link/20260226154414.1081568-4-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/qcom/qc_audio_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/qcom/qc_audio_offload.c b/sound/usb/qcom/qc_audio_offload.c index 01e6063c2207..510b68cced33 100644 --- a/sound/usb/qcom/qc_audio_offload.c +++ b/sound/usb/qcom/qc_audio_offload.c @@ -1007,7 +1007,7 @@ put_suspend: /** * uaudio_transfer_buffer_setup() - fetch and populate xfer buffer params * @subs: usb substream - * @xfer_buf: xfer buf to be allocated + * @xfer_buf_cpu: xfer buf to be allocated * @xfer_buf_len: size of allocation * @mem_info: QMI response info * -- cgit v1.2.3 From dc9786a06d53291a5af824e854dd0769b1a97dbe Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 26 Feb 2026 16:54:45 +0100 Subject: ALSA: us144mkii: Drop kernel-doc markers We don't process this driver code for kernel-doc, and the "/**" marker leads to warnings with W=1 builds. Drop the superfluous markers, and also fix the invalid mark up, too. 
Link: https://patch.msgid.link/20260226155456.1092186-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/usx2y/us144mkii.c | 14 ++++++------ sound/usb/usx2y/us144mkii_capture.c | 12 +++++------ sound/usb/usx2y/us144mkii_controls.c | 42 ++++++++++++++++++------------------ sound/usb/usx2y/us144mkii_midi.c | 22 +++++++++---------- sound/usb/usx2y/us144mkii_playback.c | 10 ++++----- 5 files changed, 50 insertions(+), 50 deletions(-) diff --git a/sound/usb/usx2y/us144mkii.c b/sound/usb/usx2y/us144mkii.c index bc71968df8e2..0cf4fa74e210 100644 --- a/sound/usb/usx2y/us144mkii.c +++ b/sound/usb/usx2y/us144mkii.c @@ -10,8 +10,8 @@ MODULE_AUTHOR("Šerif Rami "); MODULE_DESCRIPTION("ALSA Driver for TASCAM US-144MKII"); MODULE_LICENSE("GPL"); -/** - * @brief Module parameters for ALSA card instantiation. +/* + * Module parameters for ALSA card instantiation. * * These parameters allow users to configure how the ALSA sound card * for the TASCAM US-144MKII is instantiated. @@ -269,7 +269,7 @@ void tascam_stop_work_handler(struct work_struct *work) atomic_set(&tascam->active_urbs, 0); } -/** +/* * tascam_card_private_free() - Frees private data associated with the sound * card. * @card: Pointer to the ALSA sound card instance. @@ -291,7 +291,7 @@ static void tascam_card_private_free(struct snd_card *card) } } -/** +/* * tascam_suspend() - Handles device suspension. * @intf: The USB interface being suspended. * @message: Power management message. @@ -332,7 +332,7 @@ static int tascam_suspend(struct usb_interface *intf, pm_message_t message) return 0; } -/** +/* * tascam_resume() - Handles device resumption from suspend. * @intf: The USB interface being resumed. * @@ -390,7 +390,7 @@ static void tascam_error_timer(struct timer_list *t) schedule_work(&tascam->midi_out_work); } -/** +/* * tascam_probe() - Probes for the TASCAM US-144MKII device. * @intf: The USB interface being probed. * @usb_id: The USB device ID. @@ -565,7 +565,7 @@ free_card: return err; } -/** +/* * tascam_disconnect() - Disconnects the TASCAM US-144MKII device. * @intf: The USB interface being disconnected. * diff --git a/sound/usb/usx2y/us144mkii_capture.c b/sound/usb/usx2y/us144mkii_capture.c index 00188ff6cd51..af120bf62173 100644 --- a/sound/usb/usx2y/us144mkii_capture.c +++ b/sound/usb/usx2y/us144mkii_capture.c @@ -3,7 +3,7 @@ #include "us144mkii.h" -/** +/* * tascam_capture_open() - Opens the PCM capture substream. * @substream: The ALSA PCM substream to open. * @@ -23,7 +23,7 @@ static int tascam_capture_open(struct snd_pcm_substream *substream) return 0; } -/** +/* * tascam_capture_close() - Closes the PCM capture substream. * @substream: The ALSA PCM substream to close. * @@ -41,7 +41,7 @@ static int tascam_capture_close(struct snd_pcm_substream *substream) return 0; } -/** +/* * tascam_capture_prepare() - Prepares the PCM capture substream for use. * @substream: The ALSA PCM substream to prepare. * @@ -62,7 +62,7 @@ static int tascam_capture_prepare(struct snd_pcm_substream *substream) return 0; } -/** +/* * tascam_capture_pointer() - Returns the current capture pointer position. * @substream: The ALSA PCM substream. * @@ -91,7 +91,7 @@ tascam_capture_pointer(struct snd_pcm_substream *substream) return do_div(pos, runtime->buffer_size); } -/** +/* * tascam_capture_ops - ALSA PCM operations for capture. 
* * This structure defines the callback functions for capture stream operations, @@ -109,7 +109,7 @@ const struct snd_pcm_ops tascam_capture_ops = { .pointer = tascam_capture_pointer, }; -/** +/* * decode_tascam_capture_block() - Decodes a raw 512-byte block from the device. * @src_block: Pointer to the 512-byte raw source block. * @dst_block: Pointer to the destination buffer for decoded audio frames. diff --git a/sound/usb/usx2y/us144mkii_controls.c b/sound/usb/usx2y/us144mkii_controls.c index 62055fb8e7ba..81ded11e3709 100644 --- a/sound/usb/usx2y/us144mkii_controls.c +++ b/sound/usb/usx2y/us144mkii_controls.c @@ -3,8 +3,8 @@ #include "us144mkii.h" -/** - * @brief Text descriptions for playback output source options. +/* + * Text descriptions for playback output source options. * * Used by ALSA kcontrol elements to provide user-friendly names for * the playback routing options (e.g., "Playback 1-2", "Playback 3-4"). @@ -12,15 +12,15 @@ static const char *const playback_source_texts[] = { "Playback 1-2", "Playback 3-4" }; -/** - * @brief Text descriptions for capture input source options. +/* + * Text descriptions for capture input source options. * * Used by ALSA kcontrol elements to provide user-friendly names for * the capture routing options (e.g., "Analog In", "Digital In"). */ static const char *const capture_source_texts[] = { "Analog In", "Digital In" }; -/** +/* * tascam_playback_source_info() - ALSA control info callback for playback * source. * @kcontrol: The ALSA kcontrol instance. @@ -38,7 +38,7 @@ static int tascam_playback_source_info(struct snd_kcontrol *kcontrol, return snd_ctl_enum_info(uinfo, 1, 2, playback_source_texts); } -/** +/* * tascam_line_out_get() - ALSA control get callback for Line Outputs Source. * @kcontrol: The ALSA kcontrol instance. * @ucontrol: The ALSA control element value structure to fill. @@ -60,7 +60,7 @@ static int tascam_line_out_get(struct snd_kcontrol *kcontrol, return 0; } -/** +/* * tascam_line_out_put() - ALSA control put callback for Line Outputs Source. * @kcontrol: The ALSA kcontrol instance. * @ucontrol: The ALSA control element value structure containing the new value. @@ -89,7 +89,7 @@ static int tascam_line_out_put(struct snd_kcontrol *kcontrol, return changed; } -/** +/* * tascam_line_out_control - ALSA kcontrol definition for Line Outputs Source. * * This defines a new ALSA mixer control named "Line OUTPUTS Source" that allows @@ -106,7 +106,7 @@ static const struct snd_kcontrol_new tascam_line_out_control = { .put = tascam_line_out_put, }; -/** +/* * tascam_digital_out_get() - ALSA control get callback for Digital Outputs * Source. * @kcontrol: The ALSA kcontrol instance. @@ -129,7 +129,7 @@ static int tascam_digital_out_get(struct snd_kcontrol *kcontrol, return 0; } -/** +/* * tascam_digital_out_put() - ALSA control put callback for Digital Outputs * Source. * @kcontrol: The ALSA kcontrol instance. @@ -159,7 +159,7 @@ static int tascam_digital_out_put(struct snd_kcontrol *kcontrol, return changed; } -/** +/* * tascam_digital_out_control - ALSA kcontrol definition for Digital Outputs * Source. * @@ -177,7 +177,7 @@ static const struct snd_kcontrol_new tascam_digital_out_control = { .put = tascam_digital_out_put, }; -/** +/* * tascam_capture_source_info() - ALSA control info callback for capture source. * @kcontrol: The ALSA kcontrol instance. * @uinfo: The ALSA control element info structure to fill. 
@@ -194,7 +194,7 @@ static int tascam_capture_source_info(struct snd_kcontrol *kcontrol, return snd_ctl_enum_info(uinfo, 1, 2, capture_source_texts); } -/** +/* * tascam_capture_12_get() - ALSA control get callback for Capture channels 1 * and 2 Source. * @kcontrol: The ALSA kcontrol instance. @@ -217,7 +217,7 @@ static int tascam_capture_12_get(struct snd_kcontrol *kcontrol, return 0; } -/** +/* * tascam_capture_12_put() - ALSA control put callback for Capture channels 1 * and 2 Source. * @kcontrol: The ALSA kcontrol instance. @@ -247,7 +247,7 @@ static int tascam_capture_12_put(struct snd_kcontrol *kcontrol, return changed; } -/** +/* * tascam_capture_12_control - ALSA kcontrol definition for Capture channels 1 * and 2 Source. * @@ -265,7 +265,7 @@ static const struct snd_kcontrol_new tascam_capture_12_control = { .put = tascam_capture_12_put, }; -/** +/* * tascam_capture_34_get() - ALSA control get callback for Capture channels 3 * and 4 Source. * @kcontrol: The ALSA kcontrol instance. @@ -288,7 +288,7 @@ static int tascam_capture_34_get(struct snd_kcontrol *kcontrol, return 0; } -/** +/* * tascam_capture_34_put() - ALSA control put callback for Capture channels 3 * and 4 Source. * @kcontrol: The ALSA kcontrol instance. @@ -318,7 +318,7 @@ static int tascam_capture_34_put(struct snd_kcontrol *kcontrol, return changed; } -/** +/* * tascam_capture_34_control - ALSA kcontrol definition for Capture channels 3 * and 4 Source. * @@ -336,7 +336,7 @@ static const struct snd_kcontrol_new tascam_capture_34_control = { .put = tascam_capture_34_put, }; -/** +/* * tascam_samplerate_info() - ALSA control info callback for Sample Rate. * @kcontrol: The ALSA kcontrol instance. * @uinfo: The ALSA control element info structure to fill. @@ -356,7 +356,7 @@ static int tascam_samplerate_info(struct snd_kcontrol *kcontrol, return 0; } -/** +/* * tascam_samplerate_get() - ALSA control get callback for Sample Rate. * @kcontrol: The ALSA kcontrol instance. * @ucontrol: The ALSA control element value structure to fill. @@ -400,7 +400,7 @@ static int tascam_samplerate_get(struct snd_kcontrol *kcontrol, return 0; } -/** +/* * tascam_samplerate_control - ALSA kcontrol definition for Sample Rate. * * This defines a new ALSA mixer control named "Sample Rate" that displays diff --git a/sound/usb/usx2y/us144mkii_midi.c b/sound/usb/usx2y/us144mkii_midi.c index ed2afec2a89a..4871797b1670 100644 --- a/sound/usb/usx2y/us144mkii_midi.c +++ b/sound/usb/usx2y/us144mkii_midi.c @@ -3,7 +3,7 @@ #include "us144mkii.h" -/** +/* * tascam_midi_in_work_handler() - Deferred work for processing MIDI input. * @work: The work_struct instance. * @@ -75,7 +75,7 @@ out: usb_put_urb(urb); } -/** +/* * tascam_midi_in_open() - Opens the MIDI input substream. * @substream: The ALSA rawmidi substream to open. * @@ -92,7 +92,7 @@ static int tascam_midi_in_open(struct snd_rawmidi_substream *substream) return 0; } -/** +/* * tascam_midi_in_close() - Closes the MIDI input substream. * @substream: The ALSA rawmidi substream to close. * @@ -103,7 +103,7 @@ static int tascam_midi_in_close(struct snd_rawmidi_substream *substream) return 0; } -/** +/* * tascam_midi_in_trigger() - Triggers MIDI input stream activity. * @substream: The ALSA rawmidi substream. * @up: Boolean indicating whether to start (1) or stop (0) the stream. @@ -150,7 +150,7 @@ static void tascam_midi_in_trigger(struct snd_rawmidi_substream *substream, } } -/** +/* * tascam_midi_in_ops - ALSA rawmidi operations for MIDI input. 
* * This structure defines the callback functions for MIDI input stream @@ -205,7 +205,7 @@ out: usb_put_urb(urb); } -/** +/* * tascam_midi_out_work_handler() - Deferred work for sending MIDI data * @work: The work_struct instance. * @@ -282,7 +282,7 @@ static void tascam_midi_out_work_handler(struct work_struct *work) } } -/** +/* * tascam_midi_out_open() - Opens the MIDI output substream. * @substream: The ALSA rawmidi substream to open. * @@ -301,7 +301,7 @@ static int tascam_midi_out_open(struct snd_rawmidi_substream *substream) return 0; } -/** +/* * tascam_midi_out_close() - Closes the MIDI output substream. * @substream: The ALSA rawmidi substream to close. * @@ -312,7 +312,7 @@ static int tascam_midi_out_close(struct snd_rawmidi_substream *substream) return 0; } -/** +/* * tascam_midi_out_drain() - Drains the MIDI output stream. * @substream: The ALSA rawmidi substream. * @@ -340,7 +340,7 @@ static void tascam_midi_out_drain(struct snd_rawmidi_substream *substream) usb_kill_anchored_urbs(&tascam->midi_out_anchor); } -/** +/* * tascam_midi_out_trigger() - Triggers MIDI output stream activity. * @substream: The ALSA rawmidi substream. * @up: Boolean indicating whether to start (1) or stop (0) the stream. @@ -361,7 +361,7 @@ static void tascam_midi_out_trigger(struct snd_rawmidi_substream *substream, } } -/** +/* * tascam_midi_out_ops - ALSA rawmidi operations for MIDI output. * * This structure defines the callback functions for MIDI output stream diff --git a/sound/usb/usx2y/us144mkii_playback.c b/sound/usb/usx2y/us144mkii_playback.c index 0cb9699ec211..7efaca0a6489 100644 --- a/sound/usb/usx2y/us144mkii_playback.c +++ b/sound/usb/usx2y/us144mkii_playback.c @@ -3,7 +3,7 @@ #include "us144mkii.h" -/** +/* * tascam_playback_open() - Opens the PCM playback substream. * @substream: The ALSA PCM substream to open. * @@ -23,7 +23,7 @@ static int tascam_playback_open(struct snd_pcm_substream *substream) return 0; } -/** +/* * tascam_playback_close() - Closes the PCM playback substream. * @substream: The ALSA PCM substream to close. * @@ -41,7 +41,7 @@ static int tascam_playback_close(struct snd_pcm_substream *substream) return 0; } -/** +/* * tascam_playback_prepare() - Prepares the PCM playback substream for use. * @substream: The ALSA PCM substream to prepare. * @@ -108,7 +108,7 @@ static int tascam_playback_prepare(struct snd_pcm_substream *substream) return 0; } -/** +/* * tascam_playback_pointer() - Returns the current playback pointer position. * @substream: The ALSA PCM substream. * @@ -137,7 +137,7 @@ tascam_playback_pointer(struct snd_pcm_substream *substream) return do_div(pos, runtime->buffer_size); } -/** +/* * tascam_playback_ops - ALSA PCM operations for playback. * * This structure defines the callback functions for playback stream operations, -- cgit v1.2.3 From 021ca6b670bebebc409d43845efcfe8c11c1dd54 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Mon, 23 Feb 2026 22:33:22 +0900 Subject: mm/slab: pass __GFP_NOWARN to refill_sheaf() if fallback is available When refill_sheaf() is called, failing to refill the sheaf doesn't necessarily mean the allocation will fail because a fallback path might be available and serve the allocation request. Suppress spurious warnings by passing __GFP_NOWARN along with __GFP_NOMEMALLOC whenever a fallback path is available. When the caller is alloc_full_sheaf() or __pcs_replace_empty_main(), the kernel always falls back to the slowpath (__slab_alloc_node()). 
For __prefill_sheaf_pfmemalloc(), the fallback path is available only when gfp_pfmemalloc_allowed() returns true. Reported-and-tested-by: Chris Bainbridge Closes: https://lore.kernel.org/linux-mm/aZt2-oS9lkmwT7Ch@debian.local Fixes: 1ce20c28eafd ("slab: handle pfmemalloc slabs properly with sheaves") Link: https://lore.kernel.org/linux-mm/aZwSreGj9-HHdD-j@hyeyoo Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260223133322.16705-1-harry.yoo@oracle.com Tested-by: Mikhail Gavrilov Signed-off-by: Vlastimil Babka (SUSE) --- mm/slub.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 862642c165ed..4ce24d9ee7e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2822,7 +2822,7 @@ static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) if (!sheaf) return NULL; - if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) { + if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) { free_empty_sheaf(s, sheaf); return NULL; } @@ -4575,7 +4575,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, return NULL; if (empty) { - if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) { + if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) { full = empty; } else { /* @@ -4890,9 +4890,14 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, struct slab_sheaf *sheaf, gfp_t gfp) { - int ret = 0; + gfp_t gfp_nomemalloc; + int ret; + + gfp_nomemalloc = gfp | __GFP_NOMEMALLOC; + if (gfp_pfmemalloc_allowed(gfp)) + gfp_nomemalloc |= __GFP_NOWARN; - ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC); + ret = refill_sheaf(s, sheaf, gfp_nomemalloc); if (likely(!ret || !gfp_pfmemalloc_allowed(gfp))) return ret; -- cgit v1.2.3 From f3ec502b6755a3bfb12c1c47025ef989ff9efc72 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 25 Feb 2026 08:34:07 -0800 Subject: mm/slab: mark alloc tags empty for sheaves allocated with __GFP_NO_OBJ_EXT alloc_empty_sheaf() allocates sheaves from SLAB_KMALLOC caches using __GFP_NO_OBJ_EXT to avoid recursion, however it does not mark their allocation tags empty before freeing, which results in a warning when CONFIG_MEM_ALLOC_PROFILING_DEBUG is set. Fix this by marking allocation tags for such sheaves as empty. The problem was technically introduced in commit 4c0a17e28340 but only becomes possible to hit with commit 913ffd3a1bf5. Fixes: 4c0a17e28340 ("slab: prevent recursive kmalloc() in alloc_empty_sheaf()") Fixes: 913ffd3a1bf5 ("slab: handle kmalloc sheaves bootstrap") Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/all/20260223155128.3849-1-00107082@163.com/ Analyzed-by: Harry Yoo Signed-off-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Tested-by: Harry Yoo Tested-by: David Wang <00107082@163.com> Link: https://patch.msgid.link/20260225163407.2218712-1-surenb@google.com Signed-off-by: Vlastimil Babka (SUSE) --- include/linux/gfp_types.h | 2 ++ mm/slab.h | 4 ++-- mm/slub.c | 33 +++++++++++++++++++++++---------- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 814bb2892f99..6c75df30a281 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -139,6 +139,8 @@ enum { * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. * * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension. 
+ * mark_obj_codetag_empty() should be called upon freeing for objects allocated + * with this flag to indicate that their NULL tags are expected and normal. */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) diff --git a/mm/slab.h b/mm/slab.h index 71c7261bf822..f6ef862b60ef 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -290,14 +290,14 @@ static inline void *nearest_obj(struct kmem_cache *cache, /* Determine object index from a given position */ static inline unsigned int __obj_to_index(const struct kmem_cache *cache, - void *addr, void *obj) + void *addr, const void *obj) { return reciprocal_divide(kasan_reset_tag(obj) - addr, cache->reciprocal_size); } static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) + const struct slab *slab, const void *obj) { if (is_kfence_address(obj)) return 0; diff --git a/mm/slub.c b/mm/slub.c index 4ce24d9ee7e4..52f021711744 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2041,18 +2041,18 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) +static inline void mark_obj_codetag_empty(const void *obj) { - struct slab *obj_exts_slab; + struct slab *obj_slab; unsigned long slab_exts; - obj_exts_slab = virt_to_slab(obj_exts); - slab_exts = slab_obj_exts(obj_exts_slab); + obj_slab = virt_to_slab(obj); + slab_exts = slab_obj_exts(obj_slab); if (slab_exts) { get_slab_obj_exts(slab_exts); - unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, - obj_exts_slab, obj_exts); - struct slabobj_ext *ext = slab_obj_ext(obj_exts_slab, + unsigned int offs = obj_to_index(obj_slab->slab_cache, + obj_slab, obj); + struct slabobj_ext *ext = slab_obj_ext(obj_slab, slab_exts, offs); if (unlikely(is_codetag_empty(&ext->ref))) { @@ -2090,7 +2090,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {} +static inline void mark_obj_codetag_empty(const void *obj) {} static inline bool mark_failed_objexts_alloc(struct slab *slab) { return false; } static inline void handle_failed_objexts_alloc(unsigned long obj_exts, struct slabobj_ext *vec, unsigned int objects) {} @@ -2211,7 +2211,7 @@ retry: * assign slabobj_exts in parallel. In this case the existing * objcg vector should be reused. */ - mark_objexts_empty(vec); + mark_obj_codetag_empty(vec); if (unlikely(!allow_spin)) kfree_nolock(vec); else @@ -2254,7 +2254,7 @@ static inline void free_slab_obj_exts(struct slab *slab, bool allow_spin) * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that * the extension for obj_exts is expected to be NULL. 
*/ - mark_objexts_empty(obj_exts); + mark_obj_codetag_empty(obj_exts); if (allow_spin) kfree(obj_exts); else @@ -2312,6 +2312,10 @@ static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab) #else /* CONFIG_SLAB_OBJ_EXT */ +static inline void mark_obj_codetag_empty(const void *obj) +{ +} + static inline void init_slab_obj_exts(struct slab *slab) { } @@ -2783,6 +2787,15 @@ static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) { + /* + * If the sheaf was created with __GFP_NO_OBJ_EXT flag then its + * corresponding extension is NULL and alloc_tag_sub() will throw a + * warning, therefore replace NULL with CODETAG_EMPTY to indicate + * that the extension for this sheaf is expected to be NULL. + */ + if (s->flags & SLAB_KMALLOC) + mark_obj_codetag_empty(sheaf); + kfree(sheaf); stat(s, SHEAF_FREE); -- cgit v1.2.3 From 2b351ea42820a7ecc2e8305724536512984f4419 Mon Sep 17 00:00:00 2001 From: Sanjay Chitroda Date: Thu, 26 Feb 2026 11:17:12 +0530 Subject: mm/slub: drop duplicate kernel-doc for ksize() The implementation of ksize() was updated with kernel-doc by commit fab0694646d7 ("mm/slab: move [__]ksize and slab_ksize() to mm/slub.c") However, the public header still contains a kernel-doc comment attached to the ksize() prototype. Having documentation both in the header and next to the implementation causes Sphinx to treat the function as being documented twice, resulting in the warning: WARNING: Duplicate C declaration, also defined at core-api/mm-api:521 Declaration is '.. c:function:: size_t ksize(const void *objp)' Kernel-doc guidelines recommend keeping the documentation with the function implementation. Therefore remove the redundant kernel-doc block from include/linux/slab.h so that the implementation in slub.c remains the canonical source for documentation. No functional change. Fixes: fab0694646d7 ("mm/slab: move [__]ksize and slab_ksize() to mm/slub.c") Signed-off-by: Sanjay Chitroda Link: https://patch.msgid.link/20260226054712.3610744-1-sanjayembedded@gmail.com Signed-off-by: Vlastimil Babka (SUSE) --- include/linux/slab.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index a5a5e4108ae5..15a60b501b95 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -517,18 +517,6 @@ void kfree_sensitive(const void *objp); DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T)) DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T)) -/** - * ksize - Report actual allocation size of associated object - * - * @objp: Pointer returned from a prior kmalloc()-family allocation. - * - * This should not be used for writing beyond the originally requested - * allocation size. Either use krealloc() or round up the allocation size - * with kmalloc_size_roundup() prior to allocation. If this is used to - * access beyond the originally requested allocation size, UBSAN_BOUNDS - * and/or FORTIFY_SOURCE may trip, since they only know about the - * originally allocated size via the __alloc_size attribute. - */ size_t ksize(const void *objp); #ifdef CONFIG_PRINTK -- cgit v1.2.3 From 71c1978ab6d2c6d48c31311855f1a85377c152ae Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 26 Feb 2026 16:47:52 +0100 Subject: ASoC: SDCA: Fix comments for sdca_irq_request() The kernel-doc comments for sdca_irq_request() contained some typos that lead to build warnings with W=1. Let's correct them. 
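The rule being enforced here is that a kernel-doc block must name the function exactly as declared and carry one @tag per real parameter name. A sketch of a well-formed block (hypothetical function, not the SDCA code):

	/**
	 * foo_irq_request - request an interrupt for foo
	 * @dev: device to allocate against
	 * @info: interrupt information structure
	 *
	 * Return: 0 on success, negative errno otherwise.
	 */
	int foo_irq_request(struct device *dev, struct foo_info *info);

If the name after "* " or an @tag drifts from the prototype (as happened with sdca_request_irq/@interrupt_info), kernel-doc flags it under W=1.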
Fixes: b126394d9ec6 ("ASoC: SDCA: Generic interrupt support") Acked-by: Mark Brown Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260226154753.1083320-1-tiwai@suse.de --- sound/soc/sdca/sdca_interrupts.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/sdca/sdca_interrupts.c b/sound/soc/sdca/sdca_interrupts.c index d9e22cf40f77..95b1ab4ba1b0 100644 --- a/sound/soc/sdca/sdca_interrupts.c +++ b/sound/soc/sdca/sdca_interrupts.c @@ -265,9 +265,9 @@ static int sdca_irq_request_locked(struct device *dev, } /** - * sdca_request_irq - request an individual SDCA interrupt + * sdca_irq_request - request an individual SDCA interrupt * @dev: Pointer to the struct device against which things should be allocated. - * @interrupt_info: Pointer to the interrupt information structure. + * @info: Pointer to the interrupt information structure. * @sdca_irq: SDCA interrupt position. * @name: Name to be given to the IRQ. * @handler: A callback thread function to be called for the IRQ. -- cgit v1.2.3 From 795469820c638b4449f3bb90ee5e98ebccfbc480 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 23 Feb 2026 14:01:56 -0800 Subject: kcsan: test: Adjust "expect" allocation type for kmalloc_obj The call to kmalloc_obj(observed.lines) returns "char (*)[3][512]", a pointer to the whole 2D array. But "expect" wants to be "char (*)[512]", the decayed pointer type, as if it were observed.lines itself (though without the "3" bounds). This produces the following build error: ../kernel/kcsan/kcsan_test.c: In function '__report_matches': ../kernel/kcsan/kcsan_test.c:171:16: error: assignment to 'char (*)[512]' from incompatible pointer type 'char (*)[3][512]' [-Wincompatible-pointer-types] 171 | expect = kmalloc_obj(observed.lines); | ^ Instead of changing the "expect" type to "char (*)[3][512]" and requiring a dereference at each use (e.g. "(expect*)[0]"), just explicitly cast the return to the desired type. Note that I'm intentionally not switching back to byte-based "kmalloc" here because I cannot find a way for the Coccinelle script (which will be used going forward to catch future conversions) to exclude this case. Tested with: $ ./tools/testing/kunit/kunit.py run \ --kconfig_add CONFIG_DEBUG_KERNEL=y \ --kconfig_add CONFIG_KCSAN=y \ --kconfig_add CONFIG_KCSAN_KUNIT_TEST=y \ --arch=x86_64 --qemu_args '-smp 2' kcsan Reported-by: Nathan Chancellor Fixes: 69050f8d6d07 ("treewide: Replace kmalloc with kmalloc_obj for non-scalar types") Signed-off-by: Kees Cook --- kernel/kcsan/kcsan_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 79e655ea4ca1..ae758150ccb9 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -168,7 +168,7 @@ static bool __report_matches(const struct expect_report *r) if (!report_available()) return false; - expect = kmalloc_obj(observed.lines); + expect = (typeof(expect))kmalloc_obj(observed.lines); if (WARN_ON(!expect)) return false; -- cgit v1.2.3 From e5cb94ba5f96d691d8885175d4696d6ae6bc5ec9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 26 Feb 2026 08:22:32 +0000 Subject: arm64: Fix sampling the "stable" virtual counter in preemptible section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ben reports that when running with CONFIG_DEBUG_PREEMPT, using __arch_counter_get_cntvct_stable() results in well deserves warnings, as we access a per-CPU variable without preemption disabled. 
Fix the issue by disabling preemption on reading the counter. We can probably do a lot better by not disabling preemption on systems that do not require horrible workarounds to return a valid counter value, but this plugs the issue for the time being. Fixes: 29cc0f3aa7c6 ("arm64: Force the use of CNTVCT_EL0 in __delay()") Reported-by: Ben Horgan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/aZw3EGs4rbQvbAzV@e134344.arm.com Tested-by: Ben Horgan Tested-by: André Draszik Signed-off-by: Will Deacon --- arch/arm64/lib/delay.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c index d02341303899..e278e060e78a 100644 --- a/arch/arm64/lib/delay.c +++ b/arch/arm64/lib/delay.c @@ -32,7 +32,11 @@ static inline unsigned long xloops_to_cycles(unsigned long xloops) * Note that userspace cannot change the offset behind our back either, * as the vcpu mutex is held as long as KVM_RUN is in progress. */ -#define __delay_cycles() __arch_counter_get_cntvct_stable() +static cycles_t notrace __delay_cycles(void) +{ + guard(preempt_notrace)(); + return __arch_counter_get_cntvct_stable(); +} void __delay(unsigned long cycles) { -- cgit v1.2.3 From df6e4ab654dc482c1d45776257a62ac10e14086c Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Thu, 26 Feb 2026 17:29:11 +0530 Subject: arm64: topology: Fix false warning in counters_read_on_cpu() for same-CPU reads The counters_read_on_cpu() function warns when called with IRQs disabled to prevent deadlock in smp_call_function_single(). However, this warning is spurious when reading counters on the current CPU, since no IPI is needed for same CPU reads. Commit 12eb8f4fff24 ("cpufreq: CPPC: Update FIE arch_freq_scale in ticks for non-PCC regs") changed the CPPC Frequency Invariance Engine to read AMU counters directly from the scheduler tick for non-PCC register spaces (like FFH), instead of deferring to a kthread. This means counters_read_on_cpu() is now called with IRQs disabled from the tick handler, triggering the warning. Fix this by restructuring the logic: when IRQs are disabled (tick context), call the function directly for same-CPU reads. Otherwise use smp_call_function_single(). Fixes: 997c021abc6e ("cpufreq: CPPC: Update FIE arch_freq_scale in ticks for non-PCC regs") Suggested-by: Will Deacon Signed-off-by: Sumit Gupta Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 3fe1faab0362..b32f13358fbb 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -400,16 +400,25 @@ static inline int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val) { /* - * Abort call on counterless CPU or when interrupts are - * disabled - can lead to deadlock in smp sync call. + * Abort call on counterless CPU. */ if (!cpu_has_amu_feat(cpu)) return -EOPNOTSUPP; - if (WARN_ON_ONCE(irqs_disabled())) - return -EPERM; - - smp_call_function_single(cpu, func, val, 1); + if (irqs_disabled()) { + /* + * When IRQs are disabled (tick path: sched_tick -> + * topology_scale_freq_tick or cppc_scale_freq_tick), only local + * CPU counter reads are allowed. Remote CPU counter read would + * require smp_call_function_single() which is unsafe with IRQs + * disabled. 
+ */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) + return -EPERM; + func(val); + } else { + smp_call_function_single(cpu, func, val, 1); + } return 0; } -- cgit v1.2.3 From ef06fd16d48704eac868441d98d4ef083d8f3d07 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 26 Feb 2026 07:55:25 +0000 Subject: bpf, arm64: Force 8-byte alignment for JIT buffer to prevent atomic tearing struct bpf_plt contains a u64 target field. Currently, the BPF JIT allocator requests an alignment of 4 bytes (sizeof(u32)) for the JIT buffer. Because the base address of the JIT buffer can be 4-byte aligned (e.g., ending in 0x4 or 0xc), the relative padding logic in build_plt() fails to ensure that target lands on an 8-byte boundary. This leads to two issues: 1. UBSAN reports misaligned-access warnings when dereferencing the structure. 2. More critically, target is updated concurrently via WRITE_ONCE() in bpf_arch_text_poke() while the JIT'd code executes ldr. On arm64, 64-bit loads/stores are only guaranteed to be single-copy atomic if they are 64-bit aligned. A misaligned target risks a torn read, causing the JIT to jump to a corrupted address. Fix this by increasing the allocation alignment requirement to 8 bytes (sizeof(u64)) in bpf_jit_binary_pack_alloc(). This anchors the base of the JIT buffer to an 8-byte boundary, allowing the relative padding math in build_plt() to correctly align the target field. Fixes: b2ad54e1533e ("bpf, arm64: Implement bpf_arch_text_poke() for arm64") Signed-off-by: Fuad Tabba Acked-by: Will Deacon Link: https://lore.kernel.org/r/20260226075525.233321-1-tabba@google.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 356d33c7a4ae..adf84962d579 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2119,7 +2119,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) extable_offset = round_up(prog_size + PLT_TARGET_SIZE, extable_align); image_size = extable_offset + extable_size; ro_header = bpf_jit_binary_pack_alloc(image_size, &ro_image_ptr, - sizeof(u32), &header, &image_ptr, + sizeof(u64), &header, &image_ptr, jit_fill_hole); if (!ro_header) { prog = orig_prog; -- cgit v1.2.3 From ad6fface76da42721c15e8fb281570aaa44a2c01 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 25 Feb 2026 12:12:49 +0100 Subject: bpf: Fix kprobe_multi cookies access in show_fdinfo callback We don't check if cookies are available on the kprobe_multi link before accessing them in show_fdinfo callback, we should. 
Cc: stable@vger.kernel.org Fixes: da7e9c0a7fbc ("bpf: Add show_fdinfo for kprobe_multi") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260225111249.186230-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9bc0dfd235af..0b040a417442 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2454,8 +2454,10 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_kprobe_multi_link *kmulti_link; + bool has_cookies; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + has_cookies = !!kmulti_link->cookies; seq_printf(seq, "kprobe_cnt:\t%u\n" @@ -2467,7 +2469,7 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, for (int i = 0; i < kmulti_link->cnt; i++) { seq_printf(seq, "%llu\t %pS\n", - kmulti_link->cookies[i], + has_cookies ? kmulti_link->cookies[i] : 0, (void *)kmulti_link->addrs[i]); } } -- cgit v1.2.3 From b7bf516c3ecd9a2aae2dc2635178ab87b734fef1 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Wed, 25 Feb 2026 05:34:44 +0000 Subject: bpf: Fix stack-out-of-bounds write in devmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_upper_ifindexes() iterates over all upper devices and writes their indices into an array without checking bounds. Also the callers assume that the max number of upper devices is MAX_NEST_DEV and allocate excluded_devices[1+MAX_NEST_DEV] on the stack, but that assumption is not correct and the number of upper devices could be larger than MAX_NEST_DEV (e.g., many macvlans), causing a stack-out-of-bounds write. Add a max parameter to get_upper_ifindexes() to avoid the issue. When there are too many upper devices, return -EOVERFLOW and abort the redirect. To reproduce, create more than MAX_NEST_DEV(8) macvlans on a device with an XDP program attached using BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS. Then send a packet to the device to trigger the XDP redirect path. Reported-by: syzbot+10cc7f13760b31bd2e61@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/698c4ce3.050a0220.340abe.000b.GAE@google.com/T/ Fixes: aeea1b86f936 ("bpf, devmap: Exclude XDP broadcast to master device") Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Kohei Enju Link: https://lore.kernel.org/r/20260225053506.4738-1-kohei@enjuk.jp Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2625601de76e..2984e938f94d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -588,18 +588,22 @@ static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifin } /* Get ifindex of each upper device. 'indexes' must be able to hold at - * least MAX_NEST_DEV elements. - * Returns the number of ifindexes added. + * least 'max' elements. + * Returns the number of ifindexes added, or -EOVERFLOW if there are too + * many upper devices. 
*/ -static int get_upper_ifindexes(struct net_device *dev, int *indexes) +static int get_upper_ifindexes(struct net_device *dev, int *indexes, int max) { struct net_device *upper; struct list_head *iter; int n = 0; netdev_for_each_upper_dev_rcu(dev, upper, iter) { + if (n >= max) + return -EOVERFLOW; indexes[n++] = upper->ifindex; } + return n; } @@ -615,7 +619,11 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, int err; if (exclude_ingress) { - num_excluded = get_upper_ifindexes(dev_rx, excluded_devices); + num_excluded = get_upper_ifindexes(dev_rx, excluded_devices, + ARRAY_SIZE(excluded_devices) - 1); + if (num_excluded < 0) + return num_excluded; + excluded_devices[num_excluded++] = dev_rx->ifindex; } @@ -733,7 +741,11 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, int err; if (exclude_ingress) { - num_excluded = get_upper_ifindexes(dev, excluded_devices); + num_excluded = get_upper_ifindexes(dev, excluded_devices, + ARRAY_SIZE(excluded_devices) - 1); + if (num_excluded < 0) + return num_excluded; + excluded_devices[num_excluded++] = dev->ifindex; } -- cgit v1.2.3 From 60e3cbef3500ad6101bc91946e645f69b1bb9556 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 24 Feb 2026 16:33:51 -0800 Subject: selftests/bpf: Fix a memory leak in xdp_flowtable test test_progs run with ASAN reported [1]: ==126==ERROR: LeakSanitizer: detected memory leaks Direct leak of 32 byte(s) in 1 object(s) allocated from: #0 0x7f1ff3cfa340 in calloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:77 #1 0x5610c15bb520 in bpf_program_attach_fd /codebuild/output/src685977285/src/actions-runner/_work/vmtest/vmtest/src/tools/lib/bpf/libbpf.c:13164 #2 0x5610c15bb740 in bpf_program__attach_xdp /codebuild/output/src685977285/src/actions-runner/_work/vmtest/vmtest/src/tools/lib/bpf/libbpf.c:13204 #3 0x5610c14f91d3 in test_xdp_flowtable /codebuild/output/src685977285/src/actions-runner/_work/vmtest/vmtest/src/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c:138 #4 0x5610c1533566 in run_one_test /codebuild/output/src685977285/src/actions-runner/_work/vmtest/vmtest/src/tools/testing/selftests/bpf/test_progs.c:1406 #5 0x5610c1537fb0 in main /codebuild/output/src685977285/src/actions-runner/_work/vmtest/vmtest/src/tools/testing/selftests/bpf/test_progs.c:2097 #6 0x7f1ff25df1c9 (/lib/x86_64-linux-gnu/libc.so.6+0x2a1c9) (BuildId: 8e9fd827446c24067541ac5390e6f527fb5947bb) #7 0x7f1ff25df28a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2a28a) (BuildId: 8e9fd827446c24067541ac5390e6f527fb5947bb) #8 0x5610c0bd3180 in _start (/tmp/work/vmtest/vmtest/selftests/bpf/test_progs+0x593180) (BuildId: cdf9f103f42307dc0a2cd6cfc8afcbc1366cf8bd) Fix by properly destroying bpf_link on exit in xdp_flowtable test. 
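The shape of the fix, roughly (trimmed, with most error handling elided; prog stands in for the skeleton's XDP program): initialising the link to NULL lets a single cleanup label run on every path, since bpf_link__destroy() is NULL-safe:

	struct bpf_link *link = NULL;
	...
	link = bpf_program__attach_xdp(prog, iifindex);
	if (!ASSERT_OK_PTR(link, "attach prog"))
		goto out;
	...
out:
	bpf_link__destroy(link);	/* safe even when attach never happened */
	xdp_flowtable__destroy(skel);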
[1] https://github.com/kernel-patches/vmtest/actions/runs/22361085418/job/64716490680 Signed-off-by: Ihor Solodrai Reviewed-by: Subbaraya Sundeep Link: https://lore.kernel.org/r/20260225003351.465104-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c b/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c index 3f9146d83d79..325e0b64dc35 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c @@ -67,7 +67,7 @@ void test_xdp_flowtable(void) struct nstoken *tok = NULL; int iifindex, stats_fd; __u32 value, key = 0; - struct bpf_link *link; + struct bpf_link *link = NULL; if (SYS_NOFAIL("nft -v")) { fprintf(stdout, "Missing required nft tool\n"); @@ -160,6 +160,7 @@ void test_xdp_flowtable(void) ASSERT_GE(value, N_PACKETS - 2, "bpf_xdp_flow_lookup failed"); out: + bpf_link__destroy(link); xdp_flowtable__destroy(skel); if (tok) close_netns(tok); -- cgit v1.2.3 From 6881af27f9ea0f5ca8f606f573ef5cc25ca31fe4 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Tue, 24 Feb 2026 16:33:48 -0800 Subject: selftests/bpf: Fix OOB read in dmabuf_collector Dmabuf name allocations can be less than DMA_BUF_NAME_LEN characters, but bpf_probe_read_kernel always tries to read exactly that many bytes. If a name is less than DMA_BUF_NAME_LEN characters, bpf_probe_read_kernel will read past the end. bpf_probe_read_kernel_str stops at the first NUL terminator so use it instead, like iter_dmabuf_for_each already does. Fixes: ae5d2c59ecd7 ("selftests/bpf: Add test for dmabuf_iter") Reported-by: Jerome Lee Signed-off-by: T.J. Mercier Link: https://lore.kernel.org/r/20260225003349.113746-1-tjmercier@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/dmabuf_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/dmabuf_iter.c b/tools/testing/selftests/bpf/progs/dmabuf_iter.c index 13cdb11fdeb2..9cbb7442646e 100644 --- a/tools/testing/selftests/bpf/progs/dmabuf_iter.c +++ b/tools/testing/selftests/bpf/progs/dmabuf_iter.c @@ -48,7 +48,7 @@ int dmabuf_collector(struct bpf_iter__dmabuf *ctx) /* Buffers are not required to be named */ if (pname) { - if (bpf_probe_read_kernel(name, sizeof(name), pname)) + if (bpf_probe_read_kernel_str(name, sizeof(name), pname) < 0) return 1; /* Name strings can be provided by userspace */ -- cgit v1.2.3 From 749989b2d90ddc7dd253ad3b11a77cf882721acf Mon Sep 17 00:00:00 2001 From: David Carlier Date: Thu, 26 Feb 2026 12:45:17 +0000 Subject: sched_ext: Fix SCX_EFLAG_INITIALIZED being a no-op flag SCX_EFLAG_INITIALIZED is the sole member of enum scx_exit_flags with no explicit value, so the compiler assigns it 0. This makes the bitwise OR in scx_ops_init() a no-op: sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; /* |= 0 */ As a result, BPF schedulers cannot distinguish whether ops.init() completed successfully by inspecting exit_info->flags. Assign the value 1LLU << 0 so the flag is actually set. 
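The pitfall is easy to reproduce in isolation (a standalone illustration, not the sched_ext code):

	enum flags { FLAG_READY };		/* no initializer: FLAG_READY == 0 */

	unsigned long f = 0;
	f |= FLAG_READY;			/* still 0 - OR with 0 changes nothing */

	enum flags_fixed { FLAG_READY_FIXED = 1 << 0 };
	f |= FLAG_READY_FIXED;			/* now bit 0 is actually set */

The first enumerator of a C enum defaults to 0, so a "flag" defined that way can never be observed through bitwise tests.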
Fixes: f3aec2adce8d ("sched_ext: Add SCX_EFLAG_INITIALIZED to indicate successful ops.init()") Signed-off-by: David Carlier Signed-off-by: Tejun Heo --- kernel/sched/ext_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 386c677e4c9a..11ebb744d893 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -74,7 +74,7 @@ enum scx_exit_flags { * info communication. The following flag indicates whether ops.init() * finished successfully. */ - SCX_EFLAG_INITIALIZED, + SCX_EFLAG_INITIALIZED = 1LLU << 0, }; /* -- cgit v1.2.3 From b6c3af46c26f2d07c10a1452adc34b821719327e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 23 Feb 2026 19:06:51 +0100 Subject: pinctrl: cy8c95x0: Don't miss reading the last bank registers When the code was changed to use for_each_set_clump8(), it mistakenly switched from chip->nport to chip->tpin, which is wrong because the cy8c9540 and cy8c9560 have a 4-pin gap. In particular, this meant the last bank interrupt status register was never read, and hence interrupts on those pins were missed. Restore the upper limit in for_each_set_clump8() so that it takes that gap into account. Fixes: 83e29a7a1fdf ("pinctrl: cy8c95x0; Switch to use for_each_set_clump8()") Cc: stable@vger.kernel.org Signed-off-by: Andy Shevchenko Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index a4b04bf6d081..5c055d344ac9 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -627,7 +627,7 @@ static int cy8c95x0_write_regs_mask(struct cy8c95x0_pinctrl *chip, int reg, bitmap_scatter(tmask, mask, chip->map, MAX_LINE); bitmap_scatter(tval, val, chip->map, MAX_LINE); - for_each_set_clump8(offset, bits, tmask, chip->tpin) { + for_each_set_clump8(offset, bits, tmask, chip->nport * BANK_SZ) { unsigned int i = offset / 8; write_val = bitmap_get_value8(tval, offset); @@ -655,7 +655,7 @@ static int cy8c95x0_read_regs_mask(struct cy8c95x0_pinctrl *chip, int reg, bitmap_scatter(tmask, mask, chip->map, MAX_LINE); bitmap_scatter(tval, val, chip->map, MAX_LINE); - for_each_set_clump8(offset, bits, tmask, chip->tpin) { + for_each_set_clump8(offset, bits, tmask, chip->nport * BANK_SZ) { unsigned int i = offset / 8; ret = cy8c95x0_regmap_read_bits(chip, reg, i, bits, &read_val); -- cgit v1.2.3 From e96493229a6399e902062213c6381162464cdd50 Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Tue, 24 Feb 2026 16:09:22 +0100 Subject: spi: stm32: fix missing pointer assignment in case of dma chaining Commit c4f2c05ab029 ("spi: stm32: fix pointer-to-pointer variables usage") introduced a regression: the DMA descriptors generated in the stm32_spi_prepare_rx_dma_mdma_chaining function are no longer propagated back to the caller, so MDMA-DMA chaining no longer works.
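The underlying issue is a plain output-parameter mistake: the helper prepares descriptors in local variables but must store them through its pointer-to-pointer arguments before returning, otherwise the caller's pointers stay NULL. A self-contained sketch of that pattern is below; the struct and function names are placeholders, not the driver's code:

	#include <stdlib.h>

	struct dma_desc { int dummy; };	/* stand-in for a real descriptor */

	/* Helper fills two caller-provided slots; returns 0 on success. */
	int prepare_chaining(struct dma_desc **rx_dma_desc,
			     struct dma_desc **rx_mdma_desc)
	{
		struct dma_desc *_dma_desc = malloc(sizeof(*_dma_desc));
		struct dma_desc *_mdma_desc = malloc(sizeof(*_mdma_desc));

		if (!_dma_desc || !_mdma_desc) {
			free(_dma_desc);
			free(_mdma_desc);
			return -1;
		}

		/* The regression: preparing the descriptors but omitting these
		 * two stores leaves the caller's pointers NULL, so the chaining
		 * set up here is never used. */
		*rx_dma_desc = _dma_desc;
		*rx_mdma_desc = _mdma_desc;
		return 0;
	}

	int main(void)
	{
		struct dma_desc *rx_dma = NULL, *rx_mdma = NULL;

		if (prepare_chaining(&rx_dma, &rx_mdma))
			return 1;
		/* rx_dma and rx_mdma now point at the prepared descriptors. */
		free(rx_dma);
		free(rx_mdma);
		return 0;
	}

In the driver, the equivalent stores are the two *rx_mdma_desc / *rx_dma_desc assignments added by the patch below.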
Fixes: c4f2c05ab029 ("spi: stm32: fix pointer-to-pointer variables usage") Signed-off-by: Alain Volmat Acked-by: Antonio Quartulli Link: https://patch.msgid.link/20260224-spi-stm32-chaining-fix-v1-1-5da7a4851b66@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index b99de8c4cc99..33f211e159ef 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -1625,6 +1625,9 @@ static int stm32_spi_prepare_rx_dma_mdma_chaining(struct stm32_spi *spi, return -EINVAL; } + *rx_mdma_desc = _mdma_desc; + *rx_dma_desc = _dma_desc; + return 0; } -- cgit v1.2.3 From 4fc3a433c13944ee5766ec5b9bf6f1eb4d29b880 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 23 Feb 2026 13:34:35 -0300 Subject: smb: client: use atomic_t for mnt_cifs_flags Use atomic_t for cifs_sb_info::mnt_cifs_flags as it's currently accessed locklessly and may be changed concurrently in mount/remount and reconnect paths. Signed-off-by: Paulo Alcantara (Red Hat) Reviewed-by: David Howells Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 2 +- fs/smb/client/cifs_fs_sb.h | 2 +- fs/smb/client/cifs_ioctl.h | 8 --- fs/smb/client/cifs_unicode.c | 14 ---- fs/smb/client/cifs_unicode.h | 14 +++- fs/smb/client/cifsacl.c | 17 ++--- fs/smb/client/cifsfs.c | 84 ++++++++++++------------ fs/smb/client/cifsglob.h | 61 ++++++++++++++---- fs/smb/client/connect.c | 65 +++++++++++-------- fs/smb/client/dfs_cache.c | 2 +- fs/smb/client/dir.c | 53 ++++++++------- fs/smb/client/file.c | 90 +++++++++++++------------- fs/smb/client/fs_context.c | 149 +++++++++++++++++++++---------------------- fs/smb/client/fs_context.h | 2 +- fs/smb/client/inode.c | 142 +++++++++++++++++++++-------------------- fs/smb/client/ioctl.c | 2 +- fs/smb/client/link.c | 14 ++-- fs/smb/client/misc.c | 16 +++-- fs/smb/client/readdir.c | 39 +++++------ fs/smb/client/reparse.c | 27 ++++---- fs/smb/client/reparse.h | 4 +- fs/smb/client/smb1ops.c | 22 ++++--- fs/smb/client/smb2file.c | 2 +- fs/smb/client/smb2misc.c | 18 ++---- fs/smb/client/smb2ops.c | 8 +-- fs/smb/client/smb2pdu.c | 13 ++-- fs/smb/client/xattr.c | 6 +- 27 files changed, 462 insertions(+), 414 deletions(-) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index c327c246a9b4..04bb95091f49 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -118,7 +118,7 @@ static const char *path_no_prefix(struct cifs_sb_info *cifs_sb, if (!*path) return path; - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && + if ((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_USE_PREFIX_PATH) && cifs_sb->prepath) { len = strlen(cifs_sb->prepath) + 1; if (unlikely(len > strlen(path))) diff --git a/fs/smb/client/cifs_fs_sb.h b/fs/smb/client/cifs_fs_sb.h index 5e8d163cb5f8..84e7e366b0ff 100644 --- a/fs/smb/client/cifs_fs_sb.h +++ b/fs/smb/client/cifs_fs_sb.h @@ -55,7 +55,7 @@ struct cifs_sb_info { struct nls_table *local_nls; struct smb3_fs_context *ctx; atomic_t active; - unsigned int mnt_cifs_flags; + atomic_t mnt_cifs_flags; struct delayed_work prune_tlinks; struct rcu_head rcu; diff --git a/fs/smb/client/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h index b51ce64fcccf..147496ac9f9f 100644 --- a/fs/smb/client/cifs_ioctl.h +++ b/fs/smb/client/cifs_ioctl.h @@ -122,11 +122,3 @@ struct smb3_notify_info { #define CIFS_GOING_FLAGS_DEFAULT 0x0 /* going down */ #define CIFS_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define 
CIFS_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ - -static inline bool cifs_forced_shutdown(struct cifs_sb_info *sbi) -{ - if (CIFS_MOUNT_SHUTDOWN & sbi->mnt_cifs_flags) - return true; - else - return false; -} diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c index e7891b4406f2..e2edc207cef2 100644 --- a/fs/smb/client/cifs_unicode.c +++ b/fs/smb/client/cifs_unicode.c @@ -11,20 +11,6 @@ #include "cifsglob.h" #include "cifs_debug.h" -int cifs_remap(struct cifs_sb_info *cifs_sb) -{ - int map_type; - - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) - map_type = SFM_MAP_UNI_RSVD; - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) - map_type = SFU_MAP_UNI_RSVD; - else - map_type = NO_MAP_UNI_RSVD; - - return map_type; -} - /* Convert character using the SFU - "Services for Unix" remapping range */ static bool convert_sfu_char(const __u16 src_char, char *target) diff --git a/fs/smb/client/cifs_unicode.h b/fs/smb/client/cifs_unicode.h index 9249db3b78c3..3e9cd9acf0a9 100644 --- a/fs/smb/client/cifs_unicode.h +++ b/fs/smb/client/cifs_unicode.h @@ -22,6 +22,7 @@ #include #include #include "../../nls/nls_ucs2_utils.h" +#include "cifsglob.h" /* * Macs use an older "SFM" mapping of the symbols above. Fortunately it does @@ -65,10 +66,21 @@ char *cifs_strndup_from_utf16(const char *src, const int maxlen, const struct nls_table *codepage); int cifsConvertToUTF16(__le16 *target, const char *source, int srclen, const struct nls_table *cp, int map_chars); -int cifs_remap(struct cifs_sb_info *cifs_sb); __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len, const struct nls_table *cp, int remap); wchar_t cifs_toupper(wchar_t in); +static inline int cifs_remap(const struct cifs_sb_info *cifs_sb) +{ + unsigned int sbflags = cifs_sb_flags(cifs_sb); + + if (sbflags & CIFS_MOUNT_MAP_SFM_CHR) + return SFM_MAP_UNI_RSVD; + if (sbflags & CIFS_MOUNT_MAP_SPECIAL_CHR) + return SFU_MAP_UNI_RSVD; + + return NO_MAP_UNI_RSVD; +} + #endif /* _CIFS_UNICODE_H */ diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 6fa12c901c14..f4cb3018a358 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -356,7 +356,7 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct smb_sid *psid, psid->num_subauth, SID_MAX_SUB_AUTHORITIES); } - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) || + if ((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_UID_FROM_ACL) || (cifs_sb_master_tcon(cifs_sb)->posix_extensions)) { uint32_t unix_id; bool is_group; @@ -1612,7 +1612,8 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, struct smb_acl *dacl_ptr = NULL; struct smb_ntsd *pntsd = NULL; /* acl obtained from server */ struct smb_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); + unsigned int sbflags; struct tcon_link *tlink; struct smb_version_operations *ops; bool mode_from_sid, id_from_sid; @@ -1643,15 +1644,9 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, return rc; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) - mode_from_sid = true; - else - mode_from_sid = false; - - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) - id_from_sid = true; - else - id_from_sid = false; + sbflags = cifs_sb_flags(cifs_sb); + mode_from_sid = sbflags & CIFS_MOUNT_MODE_FROM_SID; + id_from_sid = sbflags & CIFS_MOUNT_UID_FROM_ACL; /* Potentially, five new ACEs can be added to the 
ACL for U,G,O mapping */ if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 99b04234a08e..427558404aa5 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -226,16 +226,18 @@ cifs_sb_deactive(struct super_block *sb) static int cifs_read_super(struct super_block *sb) { - struct inode *inode; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; + unsigned int sbflags; struct timespec64 ts; + struct inode *inode; int rc = 0; cifs_sb = CIFS_SB(sb); tcon = cifs_sb_master_tcon(cifs_sb); + sbflags = cifs_sb_flags(cifs_sb); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) + if (sbflags & CIFS_MOUNT_POSIXACL) sb->s_flags |= SB_POSIXACL; if (tcon->snapshot_time) @@ -311,7 +313,7 @@ cifs_read_super(struct super_block *sb) } #ifdef CONFIG_CIFS_NFSD_EXPORT - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + if (sbflags & CIFS_MOUNT_SERVER_INUM) { cifs_dbg(FYI, "export ops supported\n"); sb->s_export_op = &cifs_export_ops; } @@ -389,8 +391,7 @@ statfs_out: static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) { - struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(CIFS_SB(file)); struct TCP_Server_Info *server = tcon->ses->server; struct inode *inode = file_inode(file); int rc; @@ -418,11 +419,9 @@ out_unlock: static int cifs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { - struct cifs_sb_info *cifs_sb; + unsigned int sbflags = cifs_sb_flags(CIFS_SB(inode)); - cifs_sb = CIFS_SB(inode->i_sb); - - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { + if (sbflags & CIFS_MOUNT_NO_PERM) { if ((mask & MAY_EXEC) && !execute_ok(inode)) return -EACCES; else @@ -568,15 +567,17 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses) static void cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb) { + unsigned int sbflags = cifs_sb_flags(cifs_sb); + seq_puts(s, ",cache="); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + if (sbflags & CIFS_MOUNT_STRICT_IO) seq_puts(s, "strict"); - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) + else if (sbflags & CIFS_MOUNT_DIRECT_IO) seq_puts(s, "none"); - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE) + else if (sbflags & CIFS_MOUNT_RW_CACHE) seq_puts(s, "singleclient"); /* assume only one client access */ - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) + else if (sbflags & CIFS_MOUNT_RO_CACHE) seq_puts(s, "ro"); /* read only caching assumed */ else seq_puts(s, "loose"); @@ -637,6 +638,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct sockaddr *srcaddr; + unsigned int sbflags; + srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; seq_show_option(s, "vers", tcon->ses->server->vals->version_string); @@ -670,16 +673,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root) (int)(srcaddr->sa_family)); } + sbflags = cifs_sb_flags(cifs_sb); seq_printf(s, ",uid=%u", from_kuid_munged(&init_user_ns, cifs_sb->ctx->linux_uid)); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) + if (sbflags & CIFS_MOUNT_OVERR_UID) seq_puts(s, ",forceuid"); else seq_puts(s, ",noforceuid"); seq_printf(s, ",gid=%u", from_kgid_munged(&init_user_ns, cifs_sb->ctx->linux_gid)); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) + if (sbflags & 
CIFS_MOUNT_OVERR_GID) seq_puts(s, ",forcegid"); else seq_puts(s, ",noforcegid"); @@ -722,53 +726,53 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",unix"); else seq_puts(s, ",nounix"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) + if (sbflags & CIFS_MOUNT_NO_DFS) seq_puts(s, ",nodfs"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + if (sbflags & CIFS_MOUNT_POSIX_PATHS) seq_puts(s, ",posixpaths"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) + if (sbflags & CIFS_MOUNT_SET_UID) seq_puts(s, ",setuids"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) + if (sbflags & CIFS_MOUNT_UID_FROM_ACL) seq_puts(s, ",idsfromsid"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + if (sbflags & CIFS_MOUNT_SERVER_INUM) seq_puts(s, ",serverino"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + if (sbflags & CIFS_MOUNT_RWPIDFORWARD) seq_puts(s, ",rwpidforward"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) + if (sbflags & CIFS_MOUNT_NOPOSIXBRL) seq_puts(s, ",forcemand"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + if (sbflags & CIFS_MOUNT_NO_XATTR) seq_puts(s, ",nouser_xattr"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) + if (sbflags & CIFS_MOUNT_MAP_SPECIAL_CHR) seq_puts(s, ",mapchars"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) + if (sbflags & CIFS_MOUNT_MAP_SFM_CHR) seq_puts(s, ",mapposix"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) + if (sbflags & CIFS_MOUNT_UNX_EMUL) seq_puts(s, ",sfu"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + if (sbflags & CIFS_MOUNT_NO_BRL) seq_puts(s, ",nobrl"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_HANDLE_CACHE) + if (sbflags & CIFS_MOUNT_NO_HANDLE_CACHE) seq_puts(s, ",nohandlecache"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) + if (sbflags & CIFS_MOUNT_MODE_FROM_SID) seq_puts(s, ",modefromsid"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) + if (sbflags & CIFS_MOUNT_CIFS_ACL) seq_puts(s, ",cifsacl"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + if (sbflags & CIFS_MOUNT_DYNPERM) seq_puts(s, ",dynperm"); if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(s, ",acl"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + if (sbflags & CIFS_MOUNT_MF_SYMLINKS) seq_puts(s, ",mfsymlinks"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + if (sbflags & CIFS_MOUNT_FSCACHE) seq_puts(s, ",fsc"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) + if (sbflags & CIFS_MOUNT_NOSSYNC) seq_puts(s, ",nostrictsync"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + if (sbflags & CIFS_MOUNT_NO_PERM) seq_puts(s, ",noperm"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) + if (sbflags & CIFS_MOUNT_CIFS_BACKUPUID) seq_printf(s, ",backupuid=%u", from_kuid_munged(&init_user_ns, cifs_sb->ctx->backupuid)); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) + if (sbflags & CIFS_MOUNT_CIFS_BACKUPGID) seq_printf(s, ",backupgid=%u", from_kgid_munged(&init_user_ns, cifs_sb->ctx->backupgid)); @@ -909,10 +913,10 @@ static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc) static int cifs_drop_inode(struct inode *inode) { - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + unsigned int sbflags = cifs_sb_flags(CIFS_SB(inode)); /* no serverino => unconditional eviction */ - return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || + return !(sbflags & CIFS_MOUNT_SERVER_INUM) || inode_generic_drop(inode); } @@ -950,7 +954,7 @@ cifs_get_root(struct smb3_fs_context 
*ctx, struct super_block *sb) char *s, *p; char sep; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_USE_PREFIX_PATH) return dget(sb->s_root); full_path = cifs_build_path_to_root(ctx, cifs_sb, diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 080ea601c209..6f9b6c72962b 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1580,24 +1580,59 @@ CIFS_I(struct inode *inode) return container_of(inode, struct cifsInodeInfo, netfs.inode); } -static inline struct cifs_sb_info * -CIFS_SB(struct super_block *sb) +static inline void *cinode_to_fsinfo(struct cifsInodeInfo *cinode) +{ + return cinode->netfs.inode.i_sb->s_fs_info; +} + +static inline void *super_to_fsinfo(struct super_block *sb) { return sb->s_fs_info; } -static inline struct cifs_sb_info * -CIFS_FILE_SB(struct file *file) +static inline void *inode_to_fsinfo(struct inode *inode) +{ + return inode->i_sb->s_fs_info; +} + +static inline void *file_to_fsinfo(struct file *file) +{ + return file_inode(file)->i_sb->s_fs_info; +} + +static inline void *dentry_to_fsinfo(struct dentry *dentry) +{ + return dentry->d_sb->s_fs_info; +} + +static inline void *const_dentry_to_fsinfo(const struct dentry *dentry) { - return CIFS_SB(file_inode(file)->i_sb); + return dentry->d_sb->s_fs_info; +} + +#define CIFS_SB(_ptr) \ + ((struct cifs_sb_info *) \ + _Generic((_ptr), \ + struct cifsInodeInfo * : cinode_to_fsinfo, \ + const struct dentry * : const_dentry_to_fsinfo, \ + struct super_block * : super_to_fsinfo, \ + struct dentry * : dentry_to_fsinfo, \ + struct inode * : inode_to_fsinfo, \ + struct file * : file_to_fsinfo)(_ptr)) + +/* + * Use atomic_t for @cifs_sb->mnt_cifs_flags as it is currently accessed + * locklessly and may be changed concurrently by mount/remount and reconnect + * paths. + */ +static inline unsigned int cifs_sb_flags(const struct cifs_sb_info *cifs_sb) +{ + return atomic_read(&cifs_sb->mnt_cifs_flags); } static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) - return '/'; - else - return '\\'; + return (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_POSIX_PATHS) ? 
'/' : '\\'; } static inline void @@ -2314,9 +2349,8 @@ static inline bool __cifs_cache_state_check(struct cifsInodeInfo *cinode, unsigned int oplock_flags, unsigned int sb_flags) { - struct cifs_sb_info *cifs_sb = CIFS_SB(cinode->netfs.inode.i_sb); + unsigned int sflags = cifs_sb_flags(CIFS_SB(cinode)); unsigned int oplock = READ_ONCE(cinode->oplock); - unsigned int sflags = cifs_sb->mnt_cifs_flags; return (oplock & oplock_flags) || (sflags & sb_flags); } @@ -2336,4 +2370,9 @@ static inline void cifs_reset_oplock(struct cifsInodeInfo *cinode) WRITE_ONCE(cinode->oplock, 0); } +static inline bool cifs_forced_shutdown(const struct cifs_sb_info *sbi) +{ + return cifs_sb_flags(sbi) & CIFS_MOUNT_SHUTDOWN; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 33dfe116ca52..87686c804057 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2915,8 +2915,8 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) { struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; - unsigned int oldflags = old->mnt_cifs_flags & CIFS_MOUNT_MASK; - unsigned int newflags = new->mnt_cifs_flags & CIFS_MOUNT_MASK; + unsigned int oldflags = cifs_sb_flags(old) & CIFS_MOUNT_MASK; + unsigned int newflags = cifs_sb_flags(new) & CIFS_MOUNT_MASK; if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK)) return 0; @@ -2971,9 +2971,9 @@ static int match_prepath(struct super_block *sb, struct smb3_fs_context *ctx = mnt_data->ctx; struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; - bool old_set = (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && + bool old_set = (cifs_sb_flags(old) & CIFS_MOUNT_USE_PREFIX_PATH) && old->prepath; - bool new_set = (new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && + bool new_set = (cifs_sb_flags(new) & CIFS_MOUNT_USE_PREFIX_PATH) && new->prepath; if (tcon->origin_fullpath && @@ -3004,7 +3004,7 @@ cifs_match_super(struct super_block *sb, void *data) cifs_sb = CIFS_SB(sb); /* We do not want to use a superblock that has been shutdown */ - if (CIFS_MOUNT_SHUTDOWN & cifs_sb->mnt_cifs_flags) { + if (cifs_forced_shutdown(cifs_sb)) { spin_unlock(&cifs_tcp_ses_lock); return 0; } @@ -3469,6 +3469,8 @@ ip_connect(struct TCP_Server_Info *server) int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb) { struct smb3_fs_context *ctx = cifs_sb->ctx; + unsigned int sbflags; + int rc = 0; INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); INIT_LIST_HEAD(&cifs_sb->tcon_sb_link); @@ -3493,17 +3495,16 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb) } ctx->local_nls = cifs_sb->local_nls; - smb3_update_mnt_flags(cifs_sb); + sbflags = smb3_update_mnt_flags(cifs_sb); if (ctx->direct_io) cifs_dbg(FYI, "mounting share using direct i/o\n"); if (ctx->cache_ro) { cifs_dbg(VFS, "mounting share with read only caching. Ensure that the share will not be modified while in use.\n"); - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RO_CACHE; + sbflags |= CIFS_MOUNT_RO_CACHE; } else if (ctx->cache_rw) { cifs_dbg(VFS, "mounting share in single client RW caching mode. 
Ensure that no other systems will be accessing the share.\n"); - cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_RO_CACHE | - CIFS_MOUNT_RW_CACHE); + sbflags |= CIFS_MOUNT_RO_CACHE | CIFS_MOUNT_RW_CACHE; } if ((ctx->cifs_acl) && (ctx->dynperm)) @@ -3512,16 +3513,19 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb) if (ctx->prepath) { cifs_sb->prepath = kstrdup(ctx->prepath, GFP_KERNEL); if (cifs_sb->prepath == NULL) - return -ENOMEM; - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + rc = -ENOMEM; + else + sbflags |= CIFS_MOUNT_USE_PREFIX_PATH; } - return 0; + atomic_set(&cifs_sb->mnt_cifs_flags, sbflags); + return rc; } /* Release all succeed connections */ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx) { + struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; int rc = 0; if (mnt_ctx->tcon) @@ -3533,7 +3537,7 @@ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx) mnt_ctx->ses = NULL; mnt_ctx->tcon = NULL; mnt_ctx->server = NULL; - mnt_ctx->cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS; + atomic_andnot(CIFS_MOUNT_POSIX_PATHS, &cifs_sb->mnt_cifs_flags); free_xid(mnt_ctx->xid); } @@ -3587,19 +3591,23 @@ out: int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx) { struct TCP_Server_Info *server; + struct cifs_tcon *tcon = NULL; struct cifs_sb_info *cifs_sb; struct smb3_fs_context *ctx; - struct cifs_tcon *tcon = NULL; + unsigned int sbflags; int rc = 0; - if (WARN_ON_ONCE(!mnt_ctx || !mnt_ctx->server || !mnt_ctx->ses || !mnt_ctx->fs_ctx || - !mnt_ctx->cifs_sb)) { - rc = -EINVAL; - goto out; + if (WARN_ON_ONCE(!mnt_ctx)) + return -EINVAL; + if (WARN_ON_ONCE(!mnt_ctx->server || !mnt_ctx->ses || + !mnt_ctx->fs_ctx || !mnt_ctx->cifs_sb)) { + mnt_ctx->tcon = NULL; + return -EINVAL; } server = mnt_ctx->server; ctx = mnt_ctx->fs_ctx; cifs_sb = mnt_ctx->cifs_sb; + sbflags = cifs_sb_flags(cifs_sb); /* search for existing tcon to this server share */ tcon = cifs_get_tcon(mnt_ctx->ses, ctx); @@ -3614,9 +3622,9 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx) * path (i.e., do not remap / and \ and do not map any special characters) */ if (tcon->posix_extensions) { - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; - cifs_sb->mnt_cifs_flags &= ~(CIFS_MOUNT_MAP_SFM_CHR | - CIFS_MOUNT_MAP_SPECIAL_CHR); + sbflags |= CIFS_MOUNT_POSIX_PATHS; + sbflags &= ~(CIFS_MOUNT_MAP_SFM_CHR | + CIFS_MOUNT_MAP_SPECIAL_CHR); } #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY @@ -3643,12 +3651,11 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx) /* do not care if a following call succeed - informational */ if (!tcon->pipe && server->ops->qfs_tcon) { server->ops->qfs_tcon(mnt_ctx->xid, tcon, cifs_sb); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) { + if (sbflags & CIFS_MOUNT_RO_CACHE) { if (tcon->fsDevInfo.DeviceCharacteristics & cpu_to_le32(FILE_READ_ONLY_DEVICE)) cifs_dbg(VFS, "mounted to read only share\n"); - else if ((cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_RW_CACHE) == 0) + else if (!(sbflags & CIFS_MOUNT_RW_CACHE)) cifs_dbg(VFS, "read only mount of RW share\n"); /* no need to log a RW mount of a typical RW share */ } @@ -3660,11 +3667,12 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx) * Inside cifs_fscache_get_super_cookie it checks * that we do not get super cookie twice. 
*/ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + if (sbflags & CIFS_MOUNT_FSCACHE) cifs_fscache_get_super_cookie(tcon); out: mnt_ctx->tcon = tcon; + atomic_set(&cifs_sb->mnt_cifs_flags, sbflags); return rc; } @@ -3783,7 +3791,8 @@ int cifs_is_path_remote(struct cifs_mount_ctx *mnt_ctx) cifs_sb, full_path, tcon->Flags & SMB_SHARE_IS_IN_DFS); if (rc != 0) { cifs_server_dbg(VFS, "cannot query dirs between root and final path, enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + atomic_or(CIFS_MOUNT_USE_PREFIX_PATH, + &cifs_sb->mnt_cifs_flags); rc = 0; } } @@ -3863,7 +3872,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) * Force the use of prefix path to support failover on DFS paths that resolve to targets * that have different prefix paths. */ - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + atomic_or(CIFS_MOUNT_USE_PREFIX_PATH, &cifs_sb->mnt_cifs_flags); kfree(cifs_sb->prepath); cifs_sb->prepath = ctx->prepath; ctx->prepath = NULL; @@ -4357,7 +4366,7 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb) kuid_t fsuid = current_fsuid(); int err; - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + if (!(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_MULTIUSER)) return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); spin_lock(&cifs_sb->tlink_tree_lock); diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 983132735d72..83f8cf2f8d2b 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1333,7 +1333,7 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb) * Force the use of prefix path to support failover on DFS paths that resolve to targets * that have different prefix paths. */ - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + atomic_or(CIFS_MOUNT_USE_PREFIX_PATH, &cifs_sb->mnt_cifs_flags); refresh_tcon_referral(tcon, true); return 0; diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index cb10088197d2..953f1fee8cb8 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -82,10 +82,11 @@ char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *pa const char *tree, int tree_len, bool prefix) { - int dfsplen; - int pplen = 0; - struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry); + unsigned int sbflags = cifs_sb_flags(cifs_sb); char dirsep = CIFS_DIR_SEP(cifs_sb); + int pplen = 0; + int dfsplen; char *s; if (unlikely(!page)) @@ -96,7 +97,7 @@ char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *pa else dfsplen = 0; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + if (sbflags & CIFS_MOUNT_USE_PREFIX_PATH) pplen = cifs_sb->prepath ? 
strlen(cifs_sb->prepath) + 1 : 0; s = dentry_path_raw(direntry, page, PATH_MAX); @@ -123,7 +124,7 @@ char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *pa if (dfsplen) { s -= dfsplen; memcpy(s, tree, dfsplen); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { + if (sbflags & CIFS_MOUNT_POSIX_PATHS) { int i; for (i = 0; i < dfsplen; i++) { if (s[i] == '\\') @@ -152,7 +153,7 @@ char *build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page static int check_name(struct dentry *direntry, struct cifs_tcon *tcon) { - struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry); int i; if (unlikely(tcon->fsAttrInfo.MaxPathNameComponentLength && @@ -160,7 +161,7 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon) le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength))) return -ENAMETOOLONG; - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { + if (!(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_POSIX_PATHS)) { for (i = 0; i < direntry->d_name.len; i++) { if (direntry->d_name.name[i] == '\\') { cifs_dbg(FYI, "Invalid file name\n"); @@ -181,11 +182,12 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int rc = -ENOENT; int create_options = CREATE_NOT_DIR; int desired_access; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); struct cifs_tcon *tcon = tlink_tcon(tlink); const char *full_path; void *page = alloc_dentry_path(); struct inode *newinode = NULL; + unsigned int sbflags; int disposition; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; @@ -365,6 +367,7 @@ retry_open: * If Open reported that we actually created a file then we now have to * set the mode if possible. 
*/ + sbflags = cifs_sb_flags(cifs_sb); if ((tcon->unix_ext) && (*oplock & CIFS_CREATE_ACTION)) { struct cifs_unix_set_info_args args = { .mode = mode, @@ -374,7 +377,7 @@ retry_open: .device = 0, }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + if (sbflags & CIFS_MOUNT_SET_UID) { args.uid = current_fsuid(); if (inode->i_mode & S_ISGID) args.gid = inode->i_gid; @@ -411,9 +414,9 @@ cifs_create_get_file_info: if (server->ops->set_lease_key) server->ops->set_lease_key(newinode, fid); if ((*oplock & CIFS_CREATE_ACTION) && S_ISREG(newinode->i_mode)) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + if (sbflags & CIFS_MOUNT_DYNPERM) newinode->i_mode = mode; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + if (sbflags & CIFS_MOUNT_SET_UID) { newinode->i_uid = current_fsuid(); if (inode->i_mode & S_ISGID) newinode->i_gid = inode->i_gid; @@ -458,18 +461,20 @@ int cifs_atomic_open(struct inode *inode, struct dentry *direntry, struct file *file, unsigned int oflags, umode_t mode) { - int rc; - unsigned int xid; - struct tcon_link *tlink; - struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); + struct cifs_open_info_data buf = {}; struct TCP_Server_Info *server; - struct cifs_fid fid = {}; + struct cifsFileInfo *file_info; struct cifs_pending_open open; + struct cifs_fid fid = {}; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + unsigned int sbflags; + unsigned int xid; __u32 oplock; - struct cifsFileInfo *file_info; - struct cifs_open_info_data buf = {}; + int rc; - if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) + if (unlikely(cifs_forced_shutdown(cifs_sb))) return smb_EIO(smb_eio_trace_forced_shutdown); /* @@ -499,7 +504,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n", inode, direntry, direntry); - tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { rc = PTR_ERR(tlink); goto out_free_xid; @@ -536,13 +541,13 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, goto out; } - if (file->f_flags & O_DIRECT && - CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { - if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + sbflags = cifs_sb_flags(cifs_sb); + if ((file->f_flags & O_DIRECT) && (sbflags & CIFS_MOUNT_STRICT_IO)) { + if (sbflags & CIFS_MOUNT_NO_BRL) file->f_op = &cifs_file_direct_nobrl_ops; else file->f_op = &cifs_file_direct_ops; - } + } file_info = cifs_new_fileinfo(&fid, file, tlink, oplock, buf.symlink_target); if (file_info == NULL) { diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 18f31d4eb98d..f3ddcdf406c8 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -270,7 +270,7 @@ static void cifs_begin_writeback(struct netfs_io_request *wreq) static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) { struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq); - struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode); struct cifsFileInfo *open_file = NULL; rreq->rsize = cifs_sb->ctx->rsize; @@ -281,7 +281,7 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_RWPIDFORWARD) req->pid = req->cfile->pid; } else if 
(rreq->origin != NETFS_WRITEBACK) { WARN_ON_ONCE(1); @@ -906,7 +906,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, * close because it may cause a error when we open this file * again and get at least level II oplock. */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_STRICT_IO) set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags); cifs_set_oplock_level(cifsi, 0); } @@ -955,11 +955,11 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, int cifs_file_flush(const unsigned int xid, struct inode *inode, struct cifsFileInfo *cfile) { - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); struct cifs_tcon *tcon; int rc; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NOSSYNC) return 0; if (cfile && (OPEN_FMODE(cfile->f_flags) & FMODE_WRITE)) { @@ -1015,24 +1015,24 @@ static int cifs_do_truncate(const unsigned int xid, struct dentry *dentry) int cifs_open(struct inode *inode, struct file *file) { + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); + struct cifs_open_info_data data = {}; + struct cifsFileInfo *cfile = NULL; + struct TCP_Server_Info *server; + struct cifs_pending_open open; + bool posix_open_ok = false; + struct cifs_fid fid = {}; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + const char *full_path; + unsigned int sbflags; int rc = -EACCES; unsigned int xid; __u32 oplock; - struct cifs_sb_info *cifs_sb; - struct TCP_Server_Info *server; - struct cifs_tcon *tcon; - struct tcon_link *tlink; - struct cifsFileInfo *cfile = NULL; void *page; - const char *full_path; - bool posix_open_ok = false; - struct cifs_fid fid = {}; - struct cifs_pending_open open; - struct cifs_open_info_data data = {}; xid = get_xid(); - cifs_sb = CIFS_SB(inode->i_sb); if (unlikely(cifs_forced_shutdown(cifs_sb))) { free_xid(xid); return smb_EIO(smb_eio_trace_forced_shutdown); @@ -1056,9 +1056,9 @@ int cifs_open(struct inode *inode, struct file *file) cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", inode, file->f_flags, full_path); - if (file->f_flags & O_DIRECT && - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + sbflags = cifs_sb_flags(cifs_sb); + if ((file->f_flags & O_DIRECT) && (sbflags & CIFS_MOUNT_STRICT_IO)) { + if (sbflags & CIFS_MOUNT_NO_BRL) file->f_op = &cifs_file_direct_nobrl_ops; else file->f_op = &cifs_file_direct_ops; @@ -1209,7 +1209,7 @@ cifs_relock_file(struct cifsFileInfo *cfile) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(cinode); #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ down_read_nested(&cinode->lock_sem, SINGLE_DEPTH_NESTING); @@ -1222,7 +1222,7 @@ cifs_relock_file(struct cifsFileInfo *cfile) #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + ((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NOPOSIXBRL) == 0)) rc = cifs_push_posix_locks(cfile); else #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ @@ -2011,7 +2011,7 @@ cifs_push_locks(struct cifsFileInfo *cfile) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifs_sb_info *cifs_sb = 
CIFS_SB(cinode); #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* we are going to update can_cache_brlcks here - need a write access */ @@ -2024,7 +2024,7 @@ cifs_push_locks(struct cifsFileInfo *cfile) #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + ((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NOPOSIXBRL) == 0)) rc = cifs_push_posix_locks(cfile); else #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ @@ -2428,11 +2428,11 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) cifs_read_flock(fl, &type, &lock, &unlock, &wait_flag, tcon->ses->server); - cifs_sb = CIFS_FILE_SB(file); + cifs_sb = CIFS_SB(file); if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + ((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NOPOSIXBRL) == 0)) posix_lck = true; if (!lock && !unlock) { @@ -2455,14 +2455,14 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) int cifs_lock(struct file *file, int cmd, struct file_lock *flock) { - int rc, xid; + struct cifs_sb_info *cifs_sb = CIFS_SB(file); + struct cifsFileInfo *cfile; int lock = 0, unlock = 0; bool wait_flag = false; bool posix_lck = false; - struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - struct cifsFileInfo *cfile; __u32 type; + int rc, xid; rc = -EACCES; xid = get_xid(); @@ -2477,12 +2477,11 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag, tcon->ses->server); - cifs_sb = CIFS_FILE_SB(file); set_bit(CIFS_INO_CLOSE_ON_LOCK, &CIFS_I(d_inode(cfile->dentry))->flags); if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + ((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NOPOSIXBRL) == 0)) posix_lck = true; /* * BB add code here to normalize offset and length to account for @@ -2532,11 +2531,11 @@ void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { + struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode); struct cifsFileInfo *open_file = NULL; - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb); /* only filter by fsuid on multiuser mounts */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + if (!(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; spin_lock(&cifs_inode->open_file_lock); @@ -2589,10 +2588,10 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, return rc; } - cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb); + cifs_sb = CIFS_SB(cifs_inode); /* only filter by fsuid on multiuser mounts */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) + if (!(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; spin_lock(&cifs_inode->open_file_lock); @@ -2787,7 +2786,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct TCP_Server_Info *server; struct cifsFileInfo *smbfile = file->private_data; struct inode *inode = file_inode(file); - struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct cifs_sb_info *cifs_sb = CIFS_SB(file); rc = file_write_and_wait_range(file, start, end); if (rc) { @@ -2801,7 +2800,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) file, datasync); 
tcon = tlink_tcon(smbfile->tlink); - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { + if (!(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NOSSYNC)) { server = tcon->ses->server; if (server->ops->flush == NULL) { rc = -ENOSYS; @@ -2853,7 +2852,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); ssize_t rc; rc = netfs_start_io_write(inode); @@ -2870,7 +2869,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) if (rc <= 0) goto out; - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) && + if ((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NOPOSIXBRL) && (cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), server->vals->exclusive_lock_type, 0, NULL, CIFS_WRITE_OP))) { @@ -2893,7 +2892,7 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct cifsInodeInfo *cinode = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); struct cifsFileInfo *cfile = (struct cifsFileInfo *) iocb->ki_filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); @@ -2906,7 +2905,7 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from) if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + ((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NOPOSIXBRL) == 0)) { written = netfs_file_write_iter(iocb, from); goto out; } @@ -2994,7 +2993,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); struct cifsInodeInfo *cinode = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); struct cifsFileInfo *cfile = (struct cifsFileInfo *) iocb->ki_filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); @@ -3011,7 +3010,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) if (!CIFS_CACHE_READ(cinode)) return netfs_unbuffered_read_iter(iocb, to); - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) { + if ((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NOPOSIXBRL) == 0) { if (iocb->ki_flags & IOCB_DIRECT) return netfs_unbuffered_read_iter(iocb, to); return netfs_buffered_read_iter(iocb, to); @@ -3130,10 +3129,9 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file, if (is_inode_writable(cifsInode) || ((cifsInode->oplock & CIFS_CACHE_RW_FLG) != 0 && from_readdir)) { /* This inode is open for write at least once */ - struct cifs_sb_info *cifs_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(cifsInode); - cifs_sb = CIFS_SB(cifsInode->netfs.inode.i_sb); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_DIRECT_IO) { /* since no page cache to corrupt on directio we can change size safely */ return true; @@ -3181,7 +3179,7 @@ void cifs_oplock_break(struct work_struct *work) server = tcon->ses->server; scoped_guard(spinlock, &cinode->open_file_lock) { - unsigned int sbflags = cifs_sb->mnt_cifs_flags; + unsigned int sbflags = cifs_sb_flags(cifs_sb); server->ops->downgrade_oplock(server, cinode, cfile->oplock_level, cfile->oplock_epoch, &purge_cache); diff --git 
a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 09fe749e97ee..54090739535f 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -2062,161 +2062,160 @@ smb3_cleanup_fs_context(struct smb3_fs_context *ctx) kfree(ctx); } -void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb) +unsigned int smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb) { + unsigned int sbflags = cifs_sb_flags(cifs_sb); struct smb3_fs_context *ctx = cifs_sb->ctx; if (ctx->nodfs) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_DFS; + sbflags |= CIFS_MOUNT_NO_DFS; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_DFS; + sbflags &= ~CIFS_MOUNT_NO_DFS; if (ctx->noperm) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; + sbflags |= CIFS_MOUNT_NO_PERM; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_PERM; + sbflags &= ~CIFS_MOUNT_NO_PERM; if (ctx->setuids) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; + sbflags |= CIFS_MOUNT_SET_UID; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SET_UID; + sbflags &= ~CIFS_MOUNT_SET_UID; if (ctx->setuidfromacl) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UID_FROM_ACL; + sbflags |= CIFS_MOUNT_UID_FROM_ACL; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_UID_FROM_ACL; + sbflags &= ~CIFS_MOUNT_UID_FROM_ACL; if (ctx->server_ino) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; + sbflags |= CIFS_MOUNT_SERVER_INUM; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; + sbflags &= ~CIFS_MOUNT_SERVER_INUM; if (ctx->remap) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SFM_CHR; + sbflags |= CIFS_MOUNT_MAP_SFM_CHR; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MAP_SFM_CHR; + sbflags &= ~CIFS_MOUNT_MAP_SFM_CHR; if (ctx->sfu_remap) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR; + sbflags |= CIFS_MOUNT_MAP_SPECIAL_CHR; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MAP_SPECIAL_CHR; + sbflags &= ~CIFS_MOUNT_MAP_SPECIAL_CHR; if (ctx->no_xattr) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR; + sbflags |= CIFS_MOUNT_NO_XATTR; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_XATTR; + sbflags &= ~CIFS_MOUNT_NO_XATTR; if (ctx->sfu_emul) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; + sbflags |= CIFS_MOUNT_UNX_EMUL; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_UNX_EMUL; + sbflags &= ~CIFS_MOUNT_UNX_EMUL; if (ctx->nobrl) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; + sbflags |= CIFS_MOUNT_NO_BRL; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_BRL; + sbflags &= ~CIFS_MOUNT_NO_BRL; if (ctx->nohandlecache) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_HANDLE_CACHE; + sbflags |= CIFS_MOUNT_NO_HANDLE_CACHE; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_HANDLE_CACHE; + sbflags &= ~CIFS_MOUNT_NO_HANDLE_CACHE; if (ctx->nostrictsync) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC; + sbflags |= CIFS_MOUNT_NOSSYNC; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NOSSYNC; + sbflags &= ~CIFS_MOUNT_NOSSYNC; if (ctx->mand_lock) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL; + sbflags |= CIFS_MOUNT_NOPOSIXBRL; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NOPOSIXBRL; + sbflags &= ~CIFS_MOUNT_NOPOSIXBRL; if (ctx->rwpidforward) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; + sbflags |= CIFS_MOUNT_RWPIDFORWARD; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_RWPIDFORWARD; + sbflags &= ~CIFS_MOUNT_RWPIDFORWARD; if (ctx->mode_ace) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MODE_FROM_SID; + sbflags |= CIFS_MOUNT_MODE_FROM_SID; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MODE_FROM_SID; + sbflags &= ~CIFS_MOUNT_MODE_FROM_SID; if (ctx->cifs_acl) - cifs_sb->mnt_cifs_flags 
|= CIFS_MOUNT_CIFS_ACL; + sbflags |= CIFS_MOUNT_CIFS_ACL; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_CIFS_ACL; + sbflags &= ~CIFS_MOUNT_CIFS_ACL; if (ctx->backupuid_specified) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; + sbflags |= CIFS_MOUNT_CIFS_BACKUPUID; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_CIFS_BACKUPUID; + sbflags &= ~CIFS_MOUNT_CIFS_BACKUPUID; if (ctx->backupgid_specified) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; + sbflags |= CIFS_MOUNT_CIFS_BACKUPGID; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_CIFS_BACKUPGID; + sbflags &= ~CIFS_MOUNT_CIFS_BACKUPGID; if (ctx->override_uid) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; + sbflags |= CIFS_MOUNT_OVERR_UID; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_OVERR_UID; + sbflags &= ~CIFS_MOUNT_OVERR_UID; if (ctx->override_gid) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; + sbflags |= CIFS_MOUNT_OVERR_GID; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_OVERR_GID; + sbflags &= ~CIFS_MOUNT_OVERR_GID; if (ctx->dynperm) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; + sbflags |= CIFS_MOUNT_DYNPERM; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_DYNPERM; + sbflags &= ~CIFS_MOUNT_DYNPERM; if (ctx->fsc) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; + sbflags |= CIFS_MOUNT_FSCACHE; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_FSCACHE; + sbflags &= ~CIFS_MOUNT_FSCACHE; if (ctx->multiuser) - cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | - CIFS_MOUNT_NO_PERM); + sbflags |= CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_NO_PERM; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MULTIUSER; + sbflags &= ~CIFS_MOUNT_MULTIUSER; if (ctx->strict_io) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO; + sbflags |= CIFS_MOUNT_STRICT_IO; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_STRICT_IO; + sbflags &= ~CIFS_MOUNT_STRICT_IO; if (ctx->direct_io) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; + sbflags |= CIFS_MOUNT_DIRECT_IO; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_DIRECT_IO; + sbflags &= ~CIFS_MOUNT_DIRECT_IO; if (ctx->mfsymlinks) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; + sbflags |= CIFS_MOUNT_MF_SYMLINKS; else - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MF_SYMLINKS; - if (ctx->mfsymlinks) { - if (ctx->sfu_emul) { - /* - * Our SFU ("Services for Unix") emulation allows now - * creating new and reading existing SFU symlinks. - * Older Linux kernel versions were not able to neither - * read existing nor create new SFU symlinks. But - * creating and reading SFU style mknod and FIFOs was - * supported for long time. When "mfsymlinks" and - * "sfu" are both enabled at the same time, it allows - * reading both types of symlinks, but will only create - * them with mfsymlinks format. This allows better - * Apple compatibility, compatibility with older Linux - * kernel clients (probably better for Samba too) - * while still recognizing old Windows style symlinks. - */ - cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n"); - } - } - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SHUTDOWN; + sbflags &= ~CIFS_MOUNT_MF_SYMLINKS; - return; + if (ctx->mfsymlinks && ctx->sfu_emul) { + /* + * Our SFU ("Services for Unix") emulation allows now + * creating new and reading existing SFU symlinks. + * Older Linux kernel versions were not able to neither + * read existing nor create new SFU symlinks. But + * creating and reading SFU style mknod and FIFOs was + * supported for long time. 
When "mfsymlinks" and + * "sfu" are both enabled at the same time, it allows + * reading both types of symlinks, but will only create + * them with mfsymlinks format. This allows better + * Apple compatibility, compatibility with older Linux + * kernel clients (probably better for Samba too) + * while still recognizing old Windows style symlinks. + */ + cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n"); + } + sbflags &= ~CIFS_MOUNT_SHUTDOWN; + atomic_set(&cifs_sb->mnt_cifs_flags, sbflags); + return sbflags; } diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 49b2a6f09ca2..0b64fcb5d302 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -374,7 +374,7 @@ int smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx); int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); -void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb); +unsigned int smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb); /* * max deferred close timeout (jiffies) - 2^30 diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index d4d3cfeb6c90..3e844c55ab8a 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -40,32 +40,33 @@ static void cifs_set_netfs_context(struct inode *inode) static void cifs_set_ops(struct inode *inode) { - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct netfs_inode *ictx = netfs_inode(inode); + unsigned int sbflags = cifs_sb_flags(cifs_sb); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &cifs_file_inode_ops; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { + if (sbflags & CIFS_MOUNT_DIRECT_IO) { set_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + if (sbflags & CIFS_MOUNT_NO_BRL) inode->i_fop = &cifs_file_direct_nobrl_ops; else inode->i_fop = &cifs_file_direct_ops; - } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + } else if (sbflags & CIFS_MOUNT_STRICT_IO) { + if (sbflags & CIFS_MOUNT_NO_BRL) inode->i_fop = &cifs_file_strict_nobrl_ops; else inode->i_fop = &cifs_file_strict_ops; - } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + } else if (sbflags & CIFS_MOUNT_NO_BRL) inode->i_fop = &cifs_file_nobrl_ops; else { /* not direct, send byte range locks */ inode->i_fop = &cifs_file_ops; } /* check if server can support readahead */ - if (cifs_sb_master_tcon(cifs_sb)->ses->server->max_read < - PAGE_SIZE + MAX_CIFS_HDR_SIZE) + if (tcon->ses->server->max_read < PAGE_SIZE + MAX_CIFS_HDR_SIZE) inode->i_data.a_ops = &cifs_addr_ops_smallbuf; else inode->i_data.a_ops = &cifs_addr_ops; @@ -194,8 +195,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, inode->i_gid = fattr->cf_gid; /* if dynperm is set, don't clobber existing mode */ - if (inode_state_read(inode) & I_NEW || - !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) + if ((inode_state_read(inode) & I_NEW) || + !(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_DYNPERM)) inode->i_mode = fattr->cf_mode; cifs_i->cifsAttrs = fattr->cf_cifsattrs; @@ -248,10 +249,8 @@ cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - return; - - fattr->cf_uniqueid = iunique(sb, ROOT_I); + if (!(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_SERVER_INUM)) + 
fattr->cf_uniqueid = iunique(sb, ROOT_I); } /* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */ @@ -259,6 +258,8 @@ void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, struct cifs_sb_info *cifs_sb) { + unsigned int sbflags; + memset(fattr, 0, sizeof(*fattr)); fattr->cf_uniqueid = le64_to_cpu(info->UniqueId); fattr->cf_bytes = le64_to_cpu(info->NumOfBytes); @@ -317,8 +318,9 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, break; } + sbflags = cifs_sb_flags(cifs_sb); fattr->cf_uid = cifs_sb->ctx->linux_uid; - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) { + if (!(sbflags & CIFS_MOUNT_OVERR_UID)) { u64 id = le64_to_cpu(info->Uid); if (id < ((uid_t)-1)) { kuid_t uid = make_kuid(&init_user_ns, id); @@ -328,7 +330,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, } fattr->cf_gid = cifs_sb->ctx->linux_gid; - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) { + if (!(sbflags & CIFS_MOUNT_OVERR_GID)) { u64 id = le64_to_cpu(info->Gid); if (id < ((gid_t)-1)) { kgid_t gid = make_kgid(&init_user_ns, id); @@ -382,7 +384,7 @@ static int update_inode_info(struct super_block *sb, * * If file type or uniqueid is different, return error. */ - if (unlikely((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + if (unlikely((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_SERVER_INUM) && CIFS_I(*inode)->uniqueid != fattr->cf_uniqueid)) { CIFS_I(*inode)->time = 0; /* force reval */ return -ESTALE; @@ -468,7 +470,7 @@ static int cifs_get_unix_fattr(const unsigned char *full_path, cifs_fill_uniqueid(sb, fattr); /* check for Minshall+French symlinks */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_MF_SYMLINKS) { tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path); cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } @@ -1081,7 +1083,7 @@ cifs_backup_query_path_info(int xid, else if ((tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find) == 0) info.info_level = SMB_FIND_FILE_INFO_STANDARD; - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + else if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_SERVER_INUM) info.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; else /* no srvino useful for fallback to some netapp */ info.info_level = SMB_FIND_FILE_DIRECTORY_INFO; @@ -1109,7 +1111,7 @@ static void cifs_set_fattr_ino(int xid, struct cifs_tcon *tcon, struct super_blo struct TCP_Server_Info *server = tcon->ses->server; int rc; - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + if (!(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_SERVER_INUM)) { if (*inode) fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid; else @@ -1263,14 +1265,15 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, struct inode **inode, const char *full_path) { + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_open_info_data tmp_data = {}; - struct cifs_tcon *tcon; + void *smb1_backup_rsp_buf = NULL; struct TCP_Server_Info *server; + struct cifs_tcon *tcon; struct tcon_link *tlink; - struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - void *smb1_backup_rsp_buf = NULL; - int rc = 0; + unsigned int sbflags; int tmprc = 0; + int rc = 0; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1370,16 +1373,17 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY handle_mnt_opt: #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ + sbflags = cifs_sb_flags(cifs_sb); /* query for SFU type info if supported and needed */ 
if ((fattr->cf_cifsattrs & ATTR_SYSTEM) && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { + (sbflags & CIFS_MOUNT_UNX_EMUL)) { tmprc = cifs_sfu_type(fattr, full_path, cifs_sb, xid); if (tmprc) cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc); } /* fill in 0777 bits from ACL */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) { + if (sbflags & CIFS_MOUNT_MODE_FROM_SID) { rc = cifs_acl_to_fattr(cifs_sb, fattr, *inode, true, full_path, fid); if (rc == -EREMOTE) @@ -1389,7 +1393,7 @@ handle_mnt_opt: __func__, rc); goto out; } - } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + } else if (sbflags & CIFS_MOUNT_CIFS_ACL) { rc = cifs_acl_to_fattr(cifs_sb, fattr, *inode, false, full_path, fid); if (rc == -EREMOTE) @@ -1399,7 +1403,7 @@ handle_mnt_opt: __func__, rc); goto out; } - } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) + } else if (sbflags & CIFS_MOUNT_UNX_EMUL) /* fill in remaining high mode bits e.g. SUID, VTX */ cifs_sfu_mode(fattr, full_path, cifs_sb, xid); else if (!(tcon->posix_extensions)) @@ -1409,7 +1413,7 @@ handle_mnt_opt: /* check for Minshall+French symlinks */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + if (sbflags & CIFS_MOUNT_MF_SYMLINKS) { tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path); cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } @@ -1509,7 +1513,7 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, * 3. Tweak fattr based on mount options */ /* check for Minshall+French symlinks */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_MF_SYMLINKS) { tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path); cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } @@ -1660,7 +1664,7 @@ struct inode *cifs_root_iget(struct super_block *sb) int len; int rc; - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + if ((cifs_sb_flags(cifs_sb) & CIFS_MOUNT_USE_PREFIX_PATH) && cifs_sb->prepath) { len = strlen(cifs_sb->prepath); path = kzalloc(len + 2 /* leading sep + null */, GFP_KERNEL); @@ -2098,8 +2102,9 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, const char *full_path, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, const unsigned int xid) { - int rc = 0; struct inode *inode = NULL; + unsigned int sbflags; + int rc = 0; if (tcon->posix_extensions) { rc = smb311_posix_get_inode_info(&inode, full_path, @@ -2139,6 +2144,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, if (parent->i_mode & S_ISGID) mode |= S_ISGID; + sbflags = cifs_sb_flags(cifs_sb); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (tcon->unix_ext) { struct cifs_unix_set_info_args args = { @@ -2148,7 +2154,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, .mtime = NO_CHANGE_64, .device = 0, }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + if (sbflags & CIFS_MOUNT_SET_UID) { args.uid = current_fsuid(); if (parent->i_mode & S_ISGID) args.gid = parent->i_gid; @@ -2166,14 +2172,14 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, { #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ struct TCP_Server_Info *server = tcon->ses->server; - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && + if (!(sbflags & CIFS_MOUNT_CIFS_ACL) && (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo) server->ops->mkdir_setinfo(inode, full_path, cifs_sb, tcon, xid); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + if (sbflags & CIFS_MOUNT_DYNPERM) 
inode->i_mode = (mode | S_IFDIR); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + if (sbflags & CIFS_MOUNT_SET_UID) { inode->i_uid = current_fsuid(); if (inode->i_mode & S_ISGID) inode->i_gid = parent->i_gid; @@ -2686,7 +2692,7 @@ cifs_dentry_needs_reval(struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifs_i = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct cached_fid *cfid = NULL; @@ -2727,7 +2733,7 @@ cifs_dentry_needs_reval(struct dentry *dentry) } /* hardlinked files w/ noserverino get "special" treatment */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + if (!(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_SERVER_INUM) && S_ISREG(inode->i_mode) && inode->i_nlink != 1) return true; @@ -2752,10 +2758,10 @@ cifs_wait_bit_killable(struct wait_bit_key *key, int mode) int cifs_revalidate_mapping(struct inode *inode) { - int rc; struct cifsInodeInfo *cifs_inode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); unsigned long *flags = &cifs_inode->flags; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + int rc; /* swapfiles are not supposed to be shared */ if (IS_SWAPFILE(inode)) @@ -2768,7 +2774,7 @@ cifs_revalidate_mapping(struct inode *inode) if (test_and_clear_bit(CIFS_INO_INVALID_MAPPING, flags)) { /* for cache=singleclient, do not invalidate */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_RW_CACHE) goto skip_invalidate; cifs_inode->netfs.zero_point = cifs_inode->netfs.remote_i_size; @@ -2892,10 +2898,11 @@ int cifs_revalidate_dentry(struct dentry *dentry) int cifs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { - struct dentry *dentry = path->dentry; - struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(path->dentry); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); + unsigned int sbflags; int rc; if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) @@ -2952,12 +2959,13 @@ int cifs_getattr(struct mnt_idmap *idmap, const struct path *path, * enabled, and the admin hasn't overridden them, set the ownership * to the fsuid/fsgid of the current process. 
*/ - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && - !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && + sbflags = cifs_sb_flags(cifs_sb); + if ((sbflags & CIFS_MOUNT_MULTIUSER) && + !(sbflags & CIFS_MOUNT_CIFS_ACL) && !tcon->unix_ext) { - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) + if (!(sbflags & CIFS_MOUNT_OVERR_UID)) stat->uid = current_fsuid(); - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) + if (!(sbflags & CIFS_MOUNT_OVERR_GID)) stat->gid = current_fsgid(); } return 0; @@ -3102,7 +3110,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) void *page = alloc_dentry_path(); struct inode *inode = d_inode(direntry); struct cifsInodeInfo *cifsInode = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); struct tcon_link *tlink; struct cifs_tcon *pTcon; struct cifs_unix_set_info_args *args = NULL; @@ -3113,7 +3121,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) xid = get_xid(); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; rc = setattr_prepare(&nop_mnt_idmap, direntry, attrs); @@ -3266,26 +3274,26 @@ out: static int cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) { - unsigned int xid; - kuid_t uid = INVALID_UID; - kgid_t gid = INVALID_GID; struct inode *inode = d_inode(direntry); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); + unsigned int sbflags = cifs_sb_flags(cifs_sb); struct cifsFileInfo *cfile = NULL; - const char *full_path; void *page = alloc_dentry_path(); - int rc = -EACCES; - __u32 dosattr = 0; __u64 mode = NO_CHANGE_64; - bool posix = cifs_sb_master_tcon(cifs_sb)->posix_extensions; + kuid_t uid = INVALID_UID; + kgid_t gid = INVALID_GID; + const char *full_path; + __u32 dosattr = 0; + int rc = -EACCES; + unsigned int xid; xid = get_xid(); cifs_dbg(FYI, "setattr on file %pd attrs->ia_valid 0x%x\n", direntry, attrs->ia_valid); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + if (sbflags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; rc = setattr_prepare(&nop_mnt_idmap, direntry, attrs); @@ -3346,8 +3354,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_GID) gid = attrs->ia_gid; - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) || - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)) { + if (sbflags & (CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_MODE_FROM_SID)) { if (uid_valid(uid) || gid_valid(gid)) { mode = NO_CHANGE_64; rc = id_mode_to_cifs_acl(inode, full_path, &mode, @@ -3358,9 +3365,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) goto cifs_setattr_exit; } } - } else - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) + } else if (!(sbflags & CIFS_MOUNT_SET_UID)) { attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); + } /* skip mode change if it's just for clearing setuid/setgid */ if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) @@ -3369,9 +3376,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_MODE) { mode = attrs->ia_mode; rc = 0; - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) || - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) || - posix) { + if ((sbflags & (CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_MODE_FROM_SID)) || + cifs_sb_master_tcon(cifs_sb)->posix_extensions) { rc = id_mode_to_cifs_acl(inode, 
full_path, &mode, INVALID_UID, INVALID_GID); if (rc) { @@ -3393,7 +3399,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) dosattr = cifsInode->cifsAttrs | ATTR_READONLY; /* fix up mode if we're not using dynperm */ - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) + if ((sbflags & CIFS_MOUNT_DYNPERM) == 0) attrs->ia_mode = inode->i_mode & ~S_IWUGO; } else if ((mode & S_IWUGO) && (cifsInode->cifsAttrs & ATTR_READONLY)) { @@ -3404,7 +3410,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) dosattr |= ATTR_NORMAL; /* reset local inode permissions to normal */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + if (!(sbflags & CIFS_MOUNT_DYNPERM)) { attrs->ia_mode &= ~(S_IALLUGO); if (S_ISDIR(inode->i_mode)) attrs->ia_mode |= @@ -3413,7 +3419,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_mode |= cifs_sb->ctx->file_mode; } - } else if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + } else if (!(sbflags & CIFS_MOUNT_DYNPERM)) { /* ignore mode change - ATTR_READONLY hasn't changed */ attrs->ia_valid &= ~ATTR_MODE; } diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 8dc2651e237f..9afab3237e54 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -216,7 +216,7 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg) */ case CIFS_GOING_FLAGS_LOGFLUSH: case CIFS_GOING_FLAGS_NOLOGFLUSH: - sbi->mnt_cifs_flags |= CIFS_MOUNT_SHUTDOWN; + atomic_or(CIFS_MOUNT_SHUTDOWN, &sbi->mnt_cifs_flags); goto shutdown_good; default: rc = -EINVAL; diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index a2f7bfa8ad1e..434e8fe74080 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -544,14 +544,15 @@ int cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, const char *symname) { - int rc = -EOPNOTSUPP; - unsigned int xid; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); + struct inode *newinode = NULL; struct tcon_link *tlink; struct cifs_tcon *pTcon; const char *full_path; + int rc = -EOPNOTSUPP; + unsigned int sbflags; + unsigned int xid; void *page; - struct inode *newinode = NULL; if (unlikely(cifs_forced_shutdown(cifs_sb))) return smb_EIO(smb_eio_trace_forced_shutdown); @@ -580,6 +581,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, cifs_dbg(FYI, "symname is %s\n", symname); /* BB what if DFS and this volume is on different share? 
BB */ + sbflags = cifs_sb_flags(cifs_sb); rc = -EOPNOTSUPP; switch (cifs_symlink_type(cifs_sb)) { case CIFS_SYMLINK_TYPE_UNIX: @@ -594,14 +596,14 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, break; case CIFS_SYMLINK_TYPE_MFSYMLINKS: - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { + if (sbflags & CIFS_MOUNT_MF_SYMLINKS) { rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); } break; case CIFS_SYMLINK_TYPE_SFU: - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + if (sbflags & CIFS_MOUNT_UNX_EMUL) { rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon, full_path, S_IFLNK, 0, symname); diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 22cde46309fe..bc24c92b8b95 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -275,13 +275,15 @@ dump_smb(void *buf, int smb_buf_length) void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + unsigned int sbflags = cifs_sb_flags(cifs_sb); + + if (sbflags & CIFS_MOUNT_SERVER_INUM) { struct cifs_tcon *tcon = NULL; if (cifs_sb->master_tlink) tcon = cifs_sb_master_tcon(cifs_sb); - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; + atomic_andnot(CIFS_MOUNT_SERVER_INUM, &cifs_sb->mnt_cifs_flags); cifs_sb->mnt_cifs_serverino_autodisabled = true; cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s\n", tcon ? tcon->tree_name : "new server"); @@ -382,11 +384,13 @@ void cifs_done_oplock_break(struct cifsInodeInfo *cinode) bool backup_cred(struct cifs_sb_info *cifs_sb) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { + unsigned int sbflags = cifs_sb_flags(cifs_sb); + + if (sbflags & CIFS_MOUNT_CIFS_BACKUPUID) { if (uid_eq(cifs_sb->ctx->backupuid, current_fsuid())) return true; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { + if (sbflags & CIFS_MOUNT_CIFS_BACKUPGID) { if (in_group_p(cifs_sb->ctx->backupgid)) return true; } @@ -955,7 +959,7 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix) convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb)); } - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + atomic_or(CIFS_MOUNT_USE_PREFIX_PATH, &cifs_sb->mnt_cifs_flags); return 0; } @@ -984,7 +988,7 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, * look up or tcon is not DFS. */ if (strlen(full_path) < 2 || !cifs_sb || - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) || + (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NO_DFS) || !is_tcon_dfs(tcon)) return 0; diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 8615a8747b7f..be22bbc4a65a 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -121,7 +121,7 @@ retry: * want to clobber the existing one with the one that * the readdir code created. */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) + if (!(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_SERVER_INUM)) fattr->cf_uniqueid = CIFS_I(inode)->uniqueid; /* @@ -177,6 +177,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) struct cifs_open_info_data data = { .reparse = { .tag = fattr->cf_cifstag, }, }; + unsigned int sbflags; fattr->cf_uid = cifs_sb->ctx->linux_uid; fattr->cf_gid = cifs_sb->ctx->linux_gid; @@ -215,12 +216,12 @@ out_reparse: * may look wrong since the inodes may not have timed out by the time * "ls" does a stat() call on them. 
*/ - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) || - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)) + sbflags = cifs_sb_flags(cifs_sb); + if (sbflags & (CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_MODE_FROM_SID)) fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && - fattr->cf_cifsattrs & ATTR_SYSTEM) { + if ((sbflags & CIFS_MOUNT_UNX_EMUL) && + (fattr->cf_cifsattrs & ATTR_SYSTEM)) { if (fattr->cf_eof == 0) { fattr->cf_mode &= ~S_IFMT; fattr->cf_mode |= S_IFIFO; @@ -345,13 +346,14 @@ static int _initiate_cifs_search(const unsigned int xid, struct file *file, const char *full_path) { - __u16 search_flags; - int rc = 0; - struct cifsFileInfo *cifsFile; - struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct cifs_sb_info *cifs_sb = CIFS_SB(file); struct tcon_link *tlink = NULL; - struct cifs_tcon *tcon; struct TCP_Server_Info *server; + struct cifsFileInfo *cifsFile; + struct cifs_tcon *tcon; + unsigned int sbflags; + __u16 search_flags; + int rc = 0; if (file->private_data == NULL) { tlink = cifs_sb_tlink(cifs_sb); @@ -385,6 +387,7 @@ _initiate_cifs_search(const unsigned int xid, struct file *file, cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos); ffirst_retry: + sbflags = cifs_sb_flags(cifs_sb); /* test for Unix extensions */ /* but now check for them on the share/mount not on the SMB session */ /* if (cap_unix(tcon->ses) { */ @@ -395,7 +398,7 @@ ffirst_retry: else if ((tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find) == 0) { cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD; - } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + } else if (sbflags & CIFS_MOUNT_SERVER_INUM) { cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; } else /* not srvinos - BB fixme add check for backlevel? 
*/ { cifsFile->srch_inf.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO; @@ -411,8 +414,7 @@ ffirst_retry: if (rc == 0) { cifsFile->invalidHandle = false; - } else if ((rc == -EOPNOTSUPP) && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + } else if (rc == -EOPNOTSUPP && (sbflags & CIFS_MOUNT_SERVER_INUM)) { cifs_autodisable_serverino(cifs_sb); goto ffirst_retry; } @@ -690,7 +692,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, loff_t first_entry_in_buffer; loff_t index_to_find = pos; struct cifsFileInfo *cfile = file->private_data; - struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct cifs_sb_info *cifs_sb = CIFS_SB(file); struct TCP_Server_Info *server = tcon->ses->server; /* check if index in the buffer */ @@ -955,6 +957,7 @@ static int cifs_filldir(char *find_entry, struct file *file, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_dirent de = { NULL, }; struct cifs_fattr fattr; + unsigned int sbflags; struct qstr name; int rc = 0; @@ -1019,15 +1022,15 @@ static int cifs_filldir(char *find_entry, struct file *file, break; } - if (de.ino && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + sbflags = cifs_sb_flags(cifs_sb); + if (de.ino && (sbflags & CIFS_MOUNT_SERVER_INUM)) { fattr.cf_uniqueid = de.ino; } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); cifs_autodisable_serverino(cifs_sb); } - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && - couldbe_mf_symlink(&fattr)) + if ((sbflags & CIFS_MOUNT_MF_SYMLINKS) && couldbe_mf_symlink(&fattr)) /* * trying to get the type and mode can be slow, * so just call those regular files for now, and mark @@ -1058,7 +1061,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) const char *full_path; void *page = alloc_dentry_path(); struct cached_fid *cfid = NULL; - struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); + struct cifs_sb_info *cifs_sb = CIFS_SB(file); xid = get_xid(); diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index ce9b923498b5..cd1e1eaee67a 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -55,17 +55,18 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, const char *full_path, const char *symname) { struct reparse_symlink_data_buffer *buf = NULL; - struct cifs_open_info_data data = {}; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); const char *symroot = cifs_sb->ctx->symlinkroot; + struct cifs_open_info_data data = {}; + char sep = CIFS_DIR_SEP(cifs_sb); + char *symlink_target = NULL; + u16 len, plen, poff, slen; + unsigned int sbflags; + __le16 *path = NULL; struct inode *new; + char *sym = NULL; struct kvec iov; - __le16 *path = NULL; bool directory; - char *symlink_target = NULL; - char *sym = NULL; - char sep = CIFS_DIR_SEP(cifs_sb); - u16 len, plen, poff, slen; int rc = 0; if (strlen(symname) > REPARSE_SYM_PATH_MAX) @@ -83,8 +84,8 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, .symlink_target = symlink_target, }; - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && - symroot && symname[0] == '/') { + sbflags = cifs_sb_flags(cifs_sb); + if (!(sbflags & CIFS_MOUNT_POSIX_PATHS) && symroot && symname[0] == '/') { /* * This is a request to create an absolute symlink on the server * which does not support POSIX paths, and expects symlink in @@ -164,7 +165,7 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, * mask these characters in NT object prefix by '_' and 
then change * them back. */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') + if (!(sbflags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') sym[0] = sym[1] = sym[2] = sym[5] = '_'; path = cifs_convert_path_to_utf16(sym, cifs_sb); @@ -173,7 +174,7 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, goto out; } - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { + if (!(sbflags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { sym[0] = '\\'; sym[1] = sym[2] = '?'; sym[5] = ':'; @@ -197,7 +198,7 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, slen = 2 * UniStrnlen((wchar_t *)path, REPARSE_SYM_PATH_MAX); poff = 0; plen = slen; - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { + if (!(sbflags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { /* * For absolute NT symlinks skip leading "\\??\\" in PrintName as * PrintName is user visible location in DOS/Win32 format (not in NT format). @@ -824,7 +825,7 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, goto out; } - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && + if (!(cifs_sb_flags(cifs_sb) & CIFS_MOUNT_POSIX_PATHS) && symroot && !relative) { /* * This is an absolute symlink from the server which does not diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index 570b0d25aeba..0164dc47bdfd 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -33,7 +33,7 @@ static inline kuid_t wsl_make_kuid(struct cifs_sb_info *cifs_sb, { u32 uid = le32_to_cpu(*(__le32 *)ptr); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_OVERR_UID) return cifs_sb->ctx->linux_uid; return make_kuid(current_user_ns(), uid); } @@ -43,7 +43,7 @@ static inline kgid_t wsl_make_kgid(struct cifs_sb_info *cifs_sb, { u32 gid = le32_to_cpu(*(__le32 *)ptr); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_OVERR_GID) return cifs_sb->ctx->linux_gid; return make_kgid(current_user_ns(), gid); } diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index aed49aaef8c4..9643eca0cb70 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -49,6 +49,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, if (!CIFSSMBQFSUnixInfo(xid, tcon)) { __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + unsigned int sbflags; cifs_dbg(FYI, "unix caps which server supports %lld\n", cap); /* @@ -75,14 +76,16 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) cifs_dbg(VFS, "per-share encryption not supported yet\n"); + if (cifs_sb) + sbflags = cifs_sb_flags(cifs_sb); + cap &= CIFS_UNIX_CAP_MASK; if (ctx && ctx->no_psx_acl) cap &= ~CIFS_UNIX_POSIX_ACL_CAP; else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { cifs_dbg(FYI, "negotiated posix acl support\n"); if (cifs_sb) - cifs_sb->mnt_cifs_flags |= - CIFS_MOUNT_POSIXACL; + sbflags |= CIFS_MOUNT_POSIXACL; } if (ctx && ctx->posix_paths == 0) @@ -90,10 +93,12 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { cifs_dbg(FYI, "negotiate posix pathnames\n"); if (cifs_sb) - cifs_sb->mnt_cifs_flags |= - CIFS_MOUNT_POSIX_PATHS; + sbflags |= CIFS_MOUNT_POSIX_PATHS; } + if (cifs_sb) + atomic_set(&cifs_sb->mnt_cifs_flags, sbflags); + cifs_dbg(FYI, "Negotiate caps 0x%x\n", (int)cap); 
#ifdef CONFIG_CIFS_DEBUG2 if (cap & CIFS_UNIX_FCNTL_CAP) @@ -1147,7 +1152,7 @@ static int cifs_oplock_response(struct cifs_tcon *tcon, __u64 persistent_fid, __u64 volatile_fid, __u16 net_fid, struct cifsInodeInfo *cinode, unsigned int oplock) { - unsigned int sbflags = CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags; + unsigned int sbflags = cifs_sb_flags(CIFS_SB(cinode)); __u8 op; op = !!((oplock & CIFS_CACHE_READ_FLG) || (sbflags & CIFS_MOUNT_RO_CACHE)); @@ -1282,7 +1287,8 @@ cifs_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev) { - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode); + unsigned int sbflags = cifs_sb_flags(cifs_sb); struct inode *newinode = NULL; int rc; @@ -1298,7 +1304,7 @@ cifs_make_node(unsigned int xid, struct inode *inode, .mtime = NO_CHANGE_64, .device = dev, }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + if (sbflags & CIFS_MOUNT_SET_UID) { args.uid = current_fsuid(); args.gid = current_fsgid(); } else { @@ -1317,7 +1323,7 @@ cifs_make_node(unsigned int xid, struct inode *inode, if (rc == 0) d_instantiate(dentry, newinode); return rc; - } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + } else if (sbflags & CIFS_MOUNT_UNX_EMUL) { /* * Check if mounted with mount parm 'sfu' mount parm. * SFU emulation should work with all servers diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index 1ab41de2b634..ed651c946251 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -72,7 +72,7 @@ int smb2_fix_symlink_target_type(char **target, bool directory, struct cifs_sb_i * POSIX server does not distinguish between symlinks to file and * symlink directory. So nothing is needed to fix on the client side. 
*/ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_POSIX_PATHS) return 0; if (!*target) diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index e19674d9b92b..973fce3c959c 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -455,17 +455,8 @@ calc_size_exit: __le16 * cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) { - int len; const char *start_of_path; - __le16 *to; - int map_type; - - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) - map_type = SFM_MAP_UNI_RSVD; - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) - map_type = SFU_MAP_UNI_RSVD; - else - map_type = NO_MAP_UNI_RSVD; + int len; /* Windows doesn't allow paths beginning with \ */ if (from[0] == '\\') @@ -479,14 +470,13 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) } else start_of_path = from; - to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len, - cifs_sb->local_nls, map_type); - return to; + return cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len, + cifs_sb->local_nls, cifs_remap(cifs_sb)); } __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode, unsigned int oplock) { - unsigned int sbflags = CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags; + unsigned int sbflags = cifs_sb_flags(CIFS_SB(cinode)); __le32 lease = 0; if ((oplock & CIFS_CACHE_WRITE_FLG) || (sbflags & CIFS_MOUNT_RW_CACHE)) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index fea9a35caa57..7f2d3459cbf9 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -986,7 +986,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, rc = -EREMOTE; } if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)) + (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NO_DFS)) rc = -EOPNOTSUPP; goto out; } @@ -2691,7 +2691,7 @@ static int smb2_oplock_response(struct cifs_tcon *tcon, __u64 persistent_fid, __u64 volatile_fid, __u16 net_fid, struct cifsInodeInfo *cinode, unsigned int oplock) { - unsigned int sbflags = CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags; + unsigned int sbflags = cifs_sb_flags(CIFS_SB(cinode)); __u8 op; if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) @@ -5332,7 +5332,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev) { - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + unsigned int sbflags = cifs_sb_flags(CIFS_SB(inode)); int rc = -EOPNOTSUPP; /* @@ -5341,7 +5341,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, * supports block and char device, socket & fifo, * and was used by default in earlier versions of Windows */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + if (sbflags & CIFS_MOUNT_UNX_EMUL) { rc = cifs_sfu_make_node(xid, inode, dentry, tcon, full_path, mode, dev); } else if (CIFS_REPARSE_SUPPORT(tcon)) { diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index ef655acf673d..a2a96d817717 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -3182,22 +3182,19 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, } if ((oparms->disposition != FILE_OPEN) && (oparms->cifs_sb)) { + unsigned int sbflags = cifs_sb_flags(oparms->cifs_sb); bool set_mode; bool set_owner; - if ((oparms->cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) && - (oparms->mode != ACL_NO_MODE)) + if ((sbflags & 
CIFS_MOUNT_MODE_FROM_SID) && + oparms->mode != ACL_NO_MODE) { set_mode = true; - else { + } else { set_mode = false; oparms->mode = ACL_NO_MODE; } - if (oparms->cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) - set_owner = true; - else - set_owner = false; - + set_owner = sbflags & CIFS_MOUNT_UID_FROM_ACL; if (set_owner | set_mode) { cifs_dbg(FYI, "add sd with mode 0x%x\n", oparms->mode); rc = add_sd_context(iov, &n_iov, oparms->mode, set_owner); diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index e1a7d9a10a53..23227f2f9428 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -149,7 +149,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, break; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NO_XATTR) goto out; if (pTcon->ses->server->ops->set_EA) { @@ -309,7 +309,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, break; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NO_XATTR) goto out; if (pTcon->ses->server->ops->query_all_EAs) @@ -398,7 +398,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) if (unlikely(cifs_forced_shutdown(cifs_sb))) return smb_EIO(smb_eio_trace_forced_shutdown); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_NO_XATTR) return -EOPNOTSUPP; tlink = cifs_sb_tlink(cifs_sb); -- cgit v1.2.3 From d9d1e319b39ea685ede59319002d567c159d23c3 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 25 Feb 2026 21:34:55 -0300 Subject: smb: client: fix broken multichannel with krb5+signing When mounting a share with 'multichannel,max_channels=n,sec=krb5i', the client was duplicating signing key for all secondary channels, thus making the server fail all commands sent from secondary channels due to bad signatures. Every channel has its own signing key, so when establishing a new channel with krb5 auth, make sure to use the new session key as the derived key to generate channel's signing key in SMB2_auth_kerberos(). Repro: $ mount.cifs //srv/share /mnt -o multichannel,max_channels=4,sec=krb5i $ sleep 5 $ umount /mnt $ dmesg ... 
CIFS: VFS: sign fail cmd 0x5 message id 0x2 CIFS: VFS: \\srv SMB signature verification returned error = -13 CIFS: VFS: sign fail cmd 0x5 message id 0x2 CIFS: VFS: \\srv SMB signature verification returned error = -13 CIFS: VFS: sign fail cmd 0x4 message id 0x2 CIFS: VFS: \\srv SMB signature verification returned error = -13 Reported-by: Xiaoli Feng Reviewed-by: Enzo Matsumiya Signed-off-by: Paulo Alcantara (Red Hat) Cc: David Howells Cc: linux-cifs@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index a2a96d817717..04e361ed2356 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1714,19 +1714,17 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) is_binding = (ses->ses_status == SES_GOOD); spin_unlock(&ses->ses_lock); - /* keep session key if binding */ - if (!is_binding) { - kfree_sensitive(ses->auth_key.response); - ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory\n", - msg->sesskey_len); - rc = -ENOMEM; - goto out_put_spnego_key; - } - ses->auth_key.len = msg->sesskey_len; + kfree_sensitive(ses->auth_key.response); + ses->auth_key.response = kmemdup(msg->data, + msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, "%s: can't allocate (%u bytes) memory\n", + __func__, msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; } + ses->auth_key.len = msg->sesskey_len; sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; sess_data->iov[1].iov_len = msg->secblob_len; -- cgit v1.2.3 From 2f37dc436d4e61ff7ae0b0353cf91b8c10396e4d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 26 Feb 2026 22:28:45 +0100 Subject: smb: client: Don't log plaintext credentials in cifs_set_cifscreds When debug logging is enabled, cifs_set_cifscreds() logs the key payload and exposes the plaintext username and password. Remove the debug log to avoid exposing credentials. Fixes: 8a8798a5ff90 ("cifs: fetch credentials out of keyring for non-krb5 auth multiuser mounts") Cc: stable@vger.kernel.org Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Thorsten Blum Signed-off-by: Steve French --- fs/smb/client/connect.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 87686c804057..4c34d9603e05 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2236,7 +2236,6 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) /* find first : in payload */ payload = upayload->data; delim = strnchr(payload, upayload->datalen, ':'); - cifs_dbg(FYI, "payload=%s\n", payload); if (!delim) { cifs_dbg(FYI, "Unable to find ':' in payload (datalen=%d)\n", upayload->datalen); -- cgit v1.2.3 From e35626f610f3d2b7953ccddf6a77453da22b3a9e Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 24 Feb 2026 21:28:32 +0100 Subject: net/sched: ets: fix divide by zero in the offload path Offloading ETS requires computing each class' WRR weight: this is done by averaging over the sums of quanta as 'q_sum' and 'q_psum'. 
Using unsigned int, the same integer size as the individual DRR quanta, can overflow and even cause division by zero, like it happened in the following splat: Oops: divide error: 0000 [#1] SMP PTI CPU: 13 UID: 0 PID: 487 Comm: tc Tainted: G E 6.19.0-virtme #45 PREEMPT(full) Tainted: [E]=UNSIGNED_MODULE Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: 0010:ets_offload_change+0x11f/0x290 [sch_ets] Code: e4 45 31 ff eb 03 41 89 c7 41 89 cb 89 ce 83 f9 0f 0f 87 b7 00 00 00 45 8b 08 31 c0 45 01 cc 45 85 c9 74 09 41 6b c4 64 31 d2 <41> f7 f2 89 c2 44 29 fa 45 89 df 41 83 fb 0f 0f 87 c7 00 00 00 44 RSP: 0018:ffffd0a180d77588 EFLAGS: 00010246 RAX: 00000000ffffff38 RBX: ffff8d3d482ca000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffd0a180d77660 RBP: ffffd0a180d77690 R08: ffff8d3d482ca2d8 R09: 00000000fffffffe R10: 0000000000000000 R11: 0000000000000000 R12: 00000000fffffffe R13: ffff8d3d472f2000 R14: 0000000000000003 R15: 0000000000000000 FS: 00007f440b6c2740(0000) GS:ffff8d3dc9803000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000003cdd2000 CR3: 0000000007b58002 CR4: 0000000000172ef0 Call Trace: ets_qdisc_change+0x870/0xf40 [sch_ets] qdisc_create+0x12b/0x540 tc_modify_qdisc+0x6d7/0xbd0 rtnetlink_rcv_msg+0x168/0x6b0 netlink_rcv_skb+0x5c/0x110 netlink_unicast+0x1d6/0x2b0 netlink_sendmsg+0x22e/0x470 ____sys_sendmsg+0x38a/0x3c0 ___sys_sendmsg+0x99/0xe0 __sys_sendmsg+0x8a/0xf0 do_syscall_64+0x111/0xf80 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f440b81c77e Code: 4d 89 d8 e8 d4 bc 00 00 4c 8b 5d f8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 11 c9 c3 0f 1f 80 00 00 00 00 48 8b 45 10 0f 05 c3 83 e2 39 83 fa 08 75 e7 e8 13 ff ff ff 0f 1f 00 f3 0f 1e fa RSP: 002b:00007fff951e4c10 EFLAGS: 00000202 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000481820 RCX: 00007f440b81c77e RDX: 0000000000000000 RSI: 00007fff951e4cd0 RDI: 0000000000000003 RBP: 00007fff951e4c20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000202 R12: 00007fff951f4fa8 R13: 00000000699ddede R14: 00007f440bb01000 R15: 0000000000486980 Modules linked in: sch_ets(E) netdevsim(E) ---[ end trace 0000000000000000 ]--- RIP: 0010:ets_offload_change+0x11f/0x290 [sch_ets] Code: e4 45 31 ff eb 03 41 89 c7 41 89 cb 89 ce 83 f9 0f 0f 87 b7 00 00 00 45 8b 08 31 c0 45 01 cc 45 85 c9 74 09 41 6b c4 64 31 d2 <41> f7 f2 89 c2 44 29 fa 45 89 df 41 83 fb 0f 0f 87 c7 00 00 00 44 RSP: 0018:ffffd0a180d77588 EFLAGS: 00010246 RAX: 00000000ffffff38 RBX: ffff8d3d482ca000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffd0a180d77660 RBP: ffffd0a180d77690 R08: ffff8d3d482ca2d8 R09: 00000000fffffffe R10: 0000000000000000 R11: 0000000000000000 R12: 00000000fffffffe R13: ffff8d3d472f2000 R14: 0000000000000003 R15: 0000000000000000 FS: 00007f440b6c2740(0000) GS:ffff8d3dc9803000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000003cdd2000 CR3: 0000000007b58002 CR4: 0000000000172ef0 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x30000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]--- Fix this using 64-bit integers for 'q_sum' and 'q_psum'. 
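A minimal, stand-alone sketch (user-space C, with made-up quanta values, not part of the patch) of why the 32-bit accumulation is fragile: two large DRR quanta are enough to wrap the 32-bit sum to zero, which is exactly the divisor used for the weight computation, while a 64-bit accumulator keeps the sum exact.

/* Sketch only: quanta values are hypothetical. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t quanta[2] = { 0x80000000u, 0x80000000u }; /* made-up large quanta */
	uint32_t q_sum32 = 0;
	uint64_t q_sum64 = 0;

	for (int i = 0; i < 2; i++) {
		q_sum32 += quanta[i]; /* wraps to 0 -> q_psum * 100 / q_sum divides by zero */
		q_sum64 += quanta[i]; /* stays exact: 0x100000000 */
	}
	printf("32-bit q_sum=%u, 64-bit q_sum=%llu\n",
	       q_sum32, (unsigned long long)q_sum64);
	return 0;
}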
Cc: stable@vger.kernel.org Fixes: d35eb52bd2ac ("net: sch_ets: Make the ETS qdisc offloadable") Signed-off-by: Davide Caratti Reviewed-by: Jamal Hadi Salim Reviewed-by: Petr Machata Link: https://patch.msgid.link/28504887df314588c7255e9911769c36f751edee.1771964872.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_ets.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 306e046276d4..a4b07b661b77 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -115,12 +115,12 @@ static void ets_offload_change(struct Qdisc *sch) struct ets_sched *q = qdisc_priv(sch); struct tc_ets_qopt_offload qopt; unsigned int w_psum_prev = 0; - unsigned int q_psum = 0; - unsigned int q_sum = 0; unsigned int quantum; unsigned int w_psum; unsigned int weight; unsigned int i; + u64 q_psum = 0; + u64 q_sum = 0; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; @@ -138,8 +138,12 @@ static void ets_offload_change(struct Qdisc *sch) for (i = 0; i < q->nbands; i++) { quantum = q->classes[i].quantum; - q_psum += quantum; - w_psum = quantum ? q_psum * 100 / q_sum : 0; + if (quantum) { + q_psum += quantum; + w_psum = div64_u64(q_psum * 100, q_sum); + } else { + w_psum = 0; + } weight = w_psum - w_psum_prev; w_psum_prev = w_psum; -- cgit v1.2.3 From 2ef2b20cf4e04ac8a6ba68493f8780776ff84300 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Feb 2026 13:15:47 +0000 Subject: net: annotate data-races around sk->sk_{data_ready,write_space} skmsg (and probably other layers) are changing these pointers while other cpus might read them concurrently. Add corresponding READ_ONCE()/WRITE_ONCE() annotations for UDP, TCP and AF_UNIX. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: syzbot+87f770387a9e5dc6b79b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/699ee9fc.050a0220.1cd54b.0009.GAE@google.com/ Signed-off-by: Eric Dumazet Cc: Daniel Borkmann Cc: John Fastabend Cc: Jakub Sitnicki Cc: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260225131547.1085509-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/skmsg.c | 14 +++++++------- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_bpf.c | 2 +- net/ipv4/tcp_input.c | 14 ++++++++------ net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv4/udp_bpf.c | 2 +- net/unix/af_unix.c | 8 ++++---- 8 files changed, 25 insertions(+), 23 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2e26174c9919..3261793abe83 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1205,8 +1205,8 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) return; psock->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_strp_data_ready; - sk->sk_write_space = sk_psock_write_space; + WRITE_ONCE(sk->sk_data_ready, sk_psock_strp_data_ready); + WRITE_ONCE(sk->sk_write_space, sk_psock_write_space); } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) @@ -1216,8 +1216,8 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) if (!psock->saved_data_ready) return; - sk->sk_data_ready = psock->saved_data_ready; - psock->saved_data_ready = NULL; + WRITE_ONCE(sk->sk_data_ready, psock->saved_data_ready); + WRITE_ONCE(psock->saved_data_ready, NULL); strp_stop(&psock->strp); } @@ -1296,8 +1296,8 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) return; psock->saved_data_ready = sk->sk_data_ready; - 
sk->sk_data_ready = sk_psock_verdict_data_ready; - sk->sk_write_space = sk_psock_write_space; + WRITE_ONCE(sk->sk_data_ready, sk_psock_verdict_data_ready); + WRITE_ONCE(sk->sk_write_space, sk_psock_write_space); } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) @@ -1308,6 +1308,6 @@ void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) if (!psock->saved_data_ready) return; - sk->sk_data_ready = psock->saved_data_ready; + WRITE_ONCE(sk->sk_data_ready, psock->saved_data_ready); psock->saved_data_ready = NULL; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f84d9a45cc9d..8cdc26e8ad68 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1446,7 +1446,7 @@ out_err: err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { - sk->sk_write_space(sk); + READ_ONCE(sk->sk_write_space)(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } if (binding) @@ -4181,7 +4181,7 @@ ao_parse: break; case TCP_NOTSENT_LOWAT: WRITE_ONCE(tp->notsent_lowat, val); - sk->sk_write_space(sk); + READ_ONCE(sk->sk_write_space)(sk); break; case TCP_INQ: if (val > 1 || val < 0) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index c449a044895e..813d2e498c93 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -725,7 +725,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash); tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); } else { - sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_write_space, psock->saved_write_space); /* Pairs with lockless read in sk_clone_lock() */ sock_replace_proto(sk, psock->sk_proto); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 41c57efd125c..6404e53382ca 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5425,7 +5425,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM); return; } @@ -5635,7 +5635,7 @@ err: void tcp_data_ready(struct sock *sk) { if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE)) - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) @@ -5691,7 +5691,7 @@ queue_and_out: inet_csk(sk)->icsk_ack.pending |= (ICSK_ACK_NOMEM | ICSK_ACK_NOW); inet_csk_schedule_ack(sk); - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); if (skb_queue_len(&sk->sk_receive_queue) && skb->len) { reason = SKB_DROP_REASON_PROTO_MEM; @@ -6114,7 +6114,9 @@ static void tcp_new_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_jiffies32; } - INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); + INDIRECT_CALL_1(READ_ONCE(sk->sk_write_space), + sk_stream_write_space, + sk); } /* Caller made space either from: @@ -6325,7 +6327,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t BUG(); WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); } } } @@ -7792,7 +7794,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, sock_put(fastopen_sk); goto drop_and_free; } - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); 
bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); } else { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d9c5a43bd281..dafb63b923d0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -1004,7 +1004,7 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, reason = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) - parent->sk_data_ready(parent); + READ_ONCE(parent->sk_data_ready)(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6c6b68a66dcd..014fdfdd331b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1787,7 +1787,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * using prepare_to_wait_exclusive(). */ while (nb) { - INDIRECT_CALL_1(sk->sk_data_ready, + INDIRECT_CALL_1(READ_ONCE(sk->sk_data_ready), sock_def_readable, sk); nb--; } diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 91233e37cd97..779a3a03762f 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -158,7 +158,7 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; if (restore) { - sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_write_space, psock->saved_write_space); sock_replace_proto(sk, psock->sk_proto); return 0; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3756a93dc63a..7eaa5b187fef 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1785,7 +1785,7 @@ restart: __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); sock_put(other); return 0; @@ -2278,7 +2278,7 @@ restart_locked: scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); sock_put(other); scm_destroy(&scm); return len; @@ -2351,7 +2351,7 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, sk_send_sigurg(other); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); return 0; out_unlock: @@ -2477,7 +2477,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); sent += size; } -- cgit v1.2.3 From 93c9475c04acad2457a7e7ea4e3ec40a6e6d94a7 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Wed, 25 Feb 2026 16:39:55 +0200 Subject: bridge: Check relevant per-VLAN options in VLAN range grouping The br_vlan_opts_eq_range() function determines if consecutive VLANs can be grouped together in a range for compact netlink notifications. It currently checks state, tunnel info, and multicast router configuration, but misses two categories of per-VLAN options that affect the output: 1. User-visible priv_flags (neigh_suppress, mcast_enabled) 2. Port multicast context (mcast_max_groups, mcast_n_groups) When VLANs have different settings for these options, they are incorrectly grouped into ranges, causing netlink notifications to report only one VLAN's settings for the entire range. 
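A small stand-alone sketch (user-space C, with hypothetical flag bit values) of the comparison pattern the fix described next relies on: XOR-ing two priv_flags words and masking with only the user-visible bits is non-zero exactly when the VLANs differ in an option that affects the dump, while differences in unrelated internal flags do not prevent range grouping.

/* Sketch only: flag bit values are hypothetical, not the kernel's. */
#include <stdbool.h>
#include <stdio.h>

#define VLFLAG_NEIGH_SUPPRESS	0x1	/* affects netlink output */
#define VLFLAG_MCAST_ENABLED	0x2	/* affects netlink output */
#define VLFLAG_INTERNAL		0x4	/* internal only, must not split ranges */

static bool opts_can_enter_range(unsigned int curr_flags, unsigned int end_flags)
{
	return !((curr_flags ^ end_flags) &
		 (VLFLAG_NEIGH_SUPPRESS | VLFLAG_MCAST_ENABLED));
}

int main(void)
{
	/* differs only in an internal flag: still groupable */
	printf("%d\n", opts_can_enter_range(VLFLAG_INTERNAL, 0));	/* 1 */
	/* differs in neigh_suppress: must be dumped separately */
	printf("%d\n", opts_can_enter_range(VLFLAG_NEIGH_SUPPRESS, 0));	/* 0 */
	return 0;
}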
Fix by checking priv_flags equality, but only for flags that affect netlink output (BR_VLFLAG_NEIGH_SUPPRESS_ENABLED and BR_VLFLAG_MCAST_ENABLED), and comparing multicast context (mcast_max_groups and mcast_n_groups). Example showing the bugs before the fix: $ bridge vlan set vid 10 dev dummy1 neigh_suppress on $ bridge vlan set vid 11 dev dummy1 neigh_suppress off $ bridge -d vlan show dev dummy1 port vlan-id dummy1 10-11 ... neigh_suppress on $ bridge vlan set vid 10 dev dummy1 mcast_max_groups 100 $ bridge vlan set vid 11 dev dummy1 mcast_max_groups 200 $ bridge -d vlan show dev dummy1 port vlan-id dummy1 10-11 ... mcast_max_groups 100 After the fix, VLANs 10 and 11 are shown as separate entries with their correct individual settings. Fixes: a1aee20d5db2 ("net: bridge: Add netlink knobs for number / maximum MDB entries") Fixes: 83f6d600796c ("bridge: vlan: Allow setting VLAN neighbor suppression state") Signed-off-by: Danielle Ratson Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260225143956.3995415-2-danieller@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_private.h | 10 ++++++++++ net/bridge/br_vlan_options.c | 26 +++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b9b2981c4841..9b55d38ea9ed 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1344,6 +1344,16 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, true; } +static inline bool +br_multicast_port_ctx_options_equal(const struct net_bridge_mcast_port *pmctx1, + const struct net_bridge_mcast_port *pmctx2) +{ + return br_multicast_ngroups_get(pmctx1) == + br_multicast_ngroups_get(pmctx2) && + br_multicast_ngroups_get_max(pmctx1) == + br_multicast_ngroups_get_max(pmctx2); +} + static inline bool br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) { diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 8fa89b04ee94..5514e1fc8d1f 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -43,9 +43,29 @@ bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, u8 range_mc_rtr = br_vlan_multicast_router(range_end); u8 curr_mc_rtr = br_vlan_multicast_router(v_curr); - return v_curr->state == range_end->state && - __vlan_tun_can_enter_range(v_curr, range_end) && - curr_mc_rtr == range_mc_rtr; + if (v_curr->state != range_end->state) + return false; + + if (!__vlan_tun_can_enter_range(v_curr, range_end)) + return false; + + if (curr_mc_rtr != range_mc_rtr) + return false; + + /* Check user-visible priv_flags that affect output */ + if ((v_curr->priv_flags ^ range_end->priv_flags) & + (BR_VLFLAG_NEIGH_SUPPRESS_ENABLED | BR_VLFLAG_MCAST_ENABLED)) + return false; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (!br_vlan_is_master(v_curr) && + !br_multicast_port_ctx_vlan_disabled(&v_curr->port_mcast_ctx) && + !br_multicast_port_ctx_options_equal(&v_curr->port_mcast_ctx, + &range_end->port_mcast_ctx)) + return false; +#endif + + return true; } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, -- cgit v1.2.3 From 13540021be228dcda63d02b2245ce8dad01d8473 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Wed, 25 Feb 2026 16:39:56 +0200 Subject: selftests: net: Add bridge VLAN range grouping tests Add a new test file bridge_vlan_dump.sh with four test cases that verify VLANs with different per-VLAN options are not incorrectly grouped into ranges in the dump 
output. The tests verify the kernel's br_vlan_opts_eq_range() function correctly prevents VLAN range grouping when neigh_suppress, mcast_max_groups, mcast_n_groups, or mcast_enabled options differ. Each test verifies that VLANs with different option values appear as individual entries rather than ranges, and that VLANs with matching values are properly grouped together. Example output: $ ./bridge_vlan_dump.sh TEST: VLAN range grouping with neigh_suppress [ OK ] TEST: VLAN range grouping with mcast_max_groups [ OK ] TEST: VLAN range grouping with mcast_n_groups [ OK ] TEST: VLAN range grouping with mcast_enabled [ OK ] Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260225143956.3995415-3-danieller@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/bridge_vlan_dump.sh | 204 ++++++++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100755 tools/testing/selftests/net/bridge_vlan_dump.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index afdea6d95bde..605c54c0e8a3 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -15,6 +15,7 @@ TEST_PROGS := \ big_tcp.sh \ bind_bhash.sh \ bpf_offload.py \ + bridge_vlan_dump.sh \ broadcast_ether_dst.sh \ broadcast_pmtu.sh \ busy_poll_test.sh \ diff --git a/tools/testing/selftests/net/bridge_vlan_dump.sh b/tools/testing/selftests/net/bridge_vlan_dump.sh new file mode 100755 index 000000000000..ad66731d2a6f --- /dev/null +++ b/tools/testing/selftests/net/bridge_vlan_dump.sh @@ -0,0 +1,204 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test bridge VLAN range grouping. VLANs are collapsed into a range entry in +# the dump if they have the same per-VLAN options. These tests verify that +# VLANs with different per-VLAN option values are not grouped together. + +# shellcheck disable=SC1091,SC2034,SC2154,SC2317 +source lib.sh + +ALL_TESTS=" + vlan_range_neigh_suppress + vlan_range_mcast_max_groups + vlan_range_mcast_n_groups + vlan_range_mcast_enabled +" + +setup_prepare() +{ + setup_ns NS + defer cleanup_all_ns + + ip -n "$NS" link add name br0 type bridge vlan_filtering 1 \ + vlan_default_pvid 0 mcast_snooping 1 mcast_vlan_snooping 1 + ip -n "$NS" link set dev br0 up + + ip -n "$NS" link add name dummy0 type dummy + ip -n "$NS" link set dev dummy0 master br0 + ip -n "$NS" link set dev dummy0 up +} + +vlan_range_neigh_suppress() +{ + RET=0 + + # Add two new consecutive VLANs for range grouping test + bridge -n "$NS" vlan add vid 10 dev dummy0 + defer bridge -n "$NS" vlan del vid 10 dev dummy0 + + bridge -n "$NS" vlan add vid 11 dev dummy0 + defer bridge -n "$NS" vlan del vid 11 dev dummy0 + + # Configure different neigh_suppress values and verify no range grouping + bridge -n "$NS" vlan set vid 10 dev dummy0 neigh_suppress on + check_err $? "Failed to set neigh_suppress for VLAN 10" + + bridge -n "$NS" vlan set vid 11 dev dummy0 neigh_suppress off + check_err $? "Failed to set neigh_suppress for VLAN 11" + + # Verify VLANs are not shown as a range, but individual entries exist + bridge -n "$NS" -d vlan show dev dummy0 | grep -q "10-11" + check_fail $? "VLANs with different neigh_suppress incorrectly grouped" + + bridge -n "$NS" -d vlan show dev dummy0 | grep -Eq "^\S+\s+10$|^\s+10$" + check_err $? "VLAN 10 individual entry not found" + + bridge -n "$NS" -d vlan show dev dummy0 | grep -Eq "^\S+\s+11$|^\s+11$" + check_err $? 
"VLAN 11 individual entry not found" + + # Configure same neigh_suppress value and verify range grouping + bridge -n "$NS" vlan set vid 11 dev dummy0 neigh_suppress on + check_err $? "Failed to set neigh_suppress for VLAN 11" + + bridge -n "$NS" -d vlan show dev dummy0 | grep -q "10-11" + check_err $? "VLANs with same neigh_suppress not grouped" + + log_test "VLAN range grouping with neigh_suppress" +} + +vlan_range_mcast_max_groups() +{ + RET=0 + + # Add two new consecutive VLANs for range grouping test + bridge -n "$NS" vlan add vid 10 dev dummy0 + defer bridge -n "$NS" vlan del vid 10 dev dummy0 + + bridge -n "$NS" vlan add vid 11 dev dummy0 + defer bridge -n "$NS" vlan del vid 11 dev dummy0 + + # Configure different mcast_max_groups values and verify no range grouping + bridge -n "$NS" vlan set vid 10 dev dummy0 mcast_max_groups 100 + check_err $? "Failed to set mcast_max_groups for VLAN 10" + + bridge -n "$NS" vlan set vid 11 dev dummy0 mcast_max_groups 200 + check_err $? "Failed to set mcast_max_groups for VLAN 11" + + # Verify VLANs are not shown as a range, but individual entries exist + bridge -n "$NS" -d vlan show dev dummy0 | grep -q "10-11" + check_fail $? "VLANs with different mcast_max_groups incorrectly grouped" + + bridge -n "$NS" -d vlan show dev dummy0 | grep -Eq "^\S+\s+10$|^\s+10$" + check_err $? "VLAN 10 individual entry not found" + + bridge -n "$NS" -d vlan show dev dummy0 | grep -Eq "^\S+\s+11$|^\s+11$" + check_err $? "VLAN 11 individual entry not found" + + # Configure same mcast_max_groups value and verify range grouping + bridge -n "$NS" vlan set vid 11 dev dummy0 mcast_max_groups 100 + check_err $? "Failed to set mcast_max_groups for VLAN 11" + + bridge -n "$NS" -d vlan show dev dummy0 | grep -q "10-11" + check_err $? "VLANs with same mcast_max_groups not grouped" + + log_test "VLAN range grouping with mcast_max_groups" +} + +vlan_range_mcast_n_groups() +{ + RET=0 + + # Add two new consecutive VLANs for range grouping test + bridge -n "$NS" vlan add vid 10 dev dummy0 + defer bridge -n "$NS" vlan del vid 10 dev dummy0 + + bridge -n "$NS" vlan add vid 11 dev dummy0 + defer bridge -n "$NS" vlan del vid 11 dev dummy0 + + # Add different numbers of multicast groups to each VLAN + bridge -n "$NS" mdb add dev br0 port dummy0 grp 239.1.1.1 vid 10 + check_err $? "Failed to add mdb entry to VLAN 10" + defer bridge -n "$NS" mdb del dev br0 port dummy0 grp 239.1.1.1 vid 10 + + bridge -n "$NS" mdb add dev br0 port dummy0 grp 239.1.1.2 vid 10 + check_err $? "Failed to add second mdb entry to VLAN 10" + defer bridge -n "$NS" mdb del dev br0 port dummy0 grp 239.1.1.2 vid 10 + + bridge -n "$NS" mdb add dev br0 port dummy0 grp 239.1.1.1 vid 11 + check_err $? "Failed to add mdb entry to VLAN 11" + defer bridge -n "$NS" mdb del dev br0 port dummy0 grp 239.1.1.1 vid 11 + + # Verify VLANs are not shown as a range due to different mcast_n_groups + bridge -n "$NS" -d vlan show dev dummy0 | grep -q "10-11" + check_fail $? "VLANs with different mcast_n_groups incorrectly grouped" + + bridge -n "$NS" -d vlan show dev dummy0 | grep -Eq "^\S+\s+10$|^\s+10$" + check_err $? "VLAN 10 individual entry not found" + + bridge -n "$NS" -d vlan show dev dummy0 | grep -Eq "^\S+\s+11$|^\s+11$" + check_err $? "VLAN 11 individual entry not found" + + # Add another group to VLAN 11 to match VLAN 10's count + bridge -n "$NS" mdb add dev br0 port dummy0 grp 239.1.1.2 vid 11 + check_err $? 
"Failed to add second mdb entry to VLAN 11" + defer bridge -n "$NS" mdb del dev br0 port dummy0 grp 239.1.1.2 vid 11 + + bridge -n "$NS" -d vlan show dev dummy0 | grep -q "10-11" + check_err $? "VLANs with same mcast_n_groups not grouped" + + log_test "VLAN range grouping with mcast_n_groups" +} + +vlan_range_mcast_enabled() +{ + RET=0 + + # Add two new consecutive VLANs for range grouping test + bridge -n "$NS" vlan add vid 10 dev br0 self + defer bridge -n "$NS" vlan del vid 10 dev br0 self + + bridge -n "$NS" vlan add vid 11 dev br0 self + defer bridge -n "$NS" vlan del vid 11 dev br0 self + + bridge -n "$NS" vlan add vid 10 dev dummy0 + defer bridge -n "$NS" vlan del vid 10 dev dummy0 + + bridge -n "$NS" vlan add vid 11 dev dummy0 + defer bridge -n "$NS" vlan del vid 11 dev dummy0 + + # Configure different mcast_snooping for bridge VLANs + # Port VLANs inherit BR_VLFLAG_MCAST_ENABLED from bridge VLANs + bridge -n "$NS" vlan global set dev br0 vid 10 mcast_snooping 1 + bridge -n "$NS" vlan global set dev br0 vid 11 mcast_snooping 0 + + # Verify port VLANs are not grouped due to different mcast_enabled + bridge -n "$NS" -d vlan show dev dummy0 | grep -q "10-11" + check_fail $? "VLANs with different mcast_enabled incorrectly grouped" + + bridge -n "$NS" -d vlan show dev dummy0 | grep -Eq "^\S+\s+10$|^\s+10$" + check_err $? "VLAN 10 individual entry not found" + + bridge -n "$NS" -d vlan show dev dummy0 | grep -Eq "^\S+\s+11$|^\s+11$" + check_err $? "VLAN 11 individual entry not found" + + # Configure same mcast_snooping and verify range grouping + bridge -n "$NS" vlan global set dev br0 vid 11 mcast_snooping 1 + + bridge -n "$NS" -d vlan show dev dummy0 | grep -q "10-11" + check_err $? "VLANs with same mcast_enabled not grouped" + + log_test "VLAN range grouping with mcast_enabled" +} + +# Verify the newest tested option is supported +if ! bridge vlan help 2>&1 | grep -q "neigh_suppress"; then + echo "SKIP: iproute2 too old, missing per-VLAN neighbor suppression support" + exit "$ksft_skip" +fi + +trap defer_scopes_cleanup EXIT +setup_prepare +tests_run + +exit "$EXIT_STATUS" -- cgit v1.2.3 From be11a537224d72b906db6b98510619770298c8a4 Mon Sep 17 00:00:00 2001 From: Chintan Vankar Date: Tue, 24 Feb 2026 23:43:59 +0530 Subject: net: ethernet: ti: am65-cpsw-nuss/cpsw-ale: Fix multicast entry handling in ALE table In the current implementation, flushing multicast entries in MAC mode incorrectly deletes entries for all ports instead of only the target port, disrupting multicast traffic on other ports. The cause is adding multicast entries by setting only host port bit, and not setting the MAC port bits. Fix this by setting the MAC port's bit in the port mask while adding the multicast entry. Also fix the flush logic to preserve the host port bit during removal of MAC port and free ALE entries when mask contains only host port. 
Fixes: 5c50a856d550 ("drivers: net: ethernet: cpsw: add multicast address to ALE table") Signed-off-by: Chintan Vankar Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260224181359.2055322-1-c-vankar@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- drivers/net/ethernet/ti/cpsw_ale.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 5924db6be3fe..967918050433 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -391,7 +391,7 @@ static void am65_cpsw_nuss_ndo_slave_set_rx_mode(struct net_device *ndev) cpsw_ale_set_allmulti(common->ale, ndev->flags & IFF_ALLMULTI, port->port_id); - port_mask = ALE_PORT_HOST; + port_mask = BIT(port->port_id) | ALE_PORT_HOST; /* Clear all mcast from ALE */ cpsw_ale_flush_multicast(common->ale, port_mask, -1); diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c index bb969dd435b4..be7b69319221 100644 --- a/drivers/net/ethernet/ti/cpsw_ale.c +++ b/drivers/net/ethernet/ti/cpsw_ale.c @@ -450,14 +450,13 @@ static void cpsw_ale_flush_mcast(struct cpsw_ale *ale, u32 *ale_entry, ale->port_mask_bits); if ((mask & port_mask) == 0) return; /* ports dont intersect, not interested */ - mask &= ~port_mask; + mask &= (~port_mask | ALE_PORT_HOST); - /* free if only remaining port is host port */ - if (mask) + if (mask == 0x0 || mask == ALE_PORT_HOST) + cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE); + else cpsw_ale_set_port_mask(ale_entry, mask, ale->port_mask_bits); - else - cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE); } int cpsw_ale_flush_multicast(struct cpsw_ale *ale, int port_mask, int vid) -- cgit v1.2.3 From 24d87712727a5017ad142d63940589a36cd25647 Mon Sep 17 00:00:00 2001 From: Ariel Silver Date: Sat, 21 Feb 2026 15:26:00 +0100 Subject: media: dvb-net: fix OOB access in ULE extension header tables The ule_mandatory_ext_handlers[] and ule_optional_ext_handlers[] tables in handle_one_ule_extension() are declared with 255 elements (valid indices 0-254), but the index htype is derived from network-controlled data as (ule_sndu_type & 0x00FF), giving a range of 0-255. When htype equals 255, an out-of-bounds read occurs on the function pointer table, and the OOB value may be called as a function pointer. Add a bounds check on htype against the array size before either table is accessed. Out-of-range values now cause the SNDU to be discarded. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Ariel Silver Signed-off-by: Ariel Silver Cc: stable@vger.kernel.org Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb-core/dvb_net.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/media/dvb-core/dvb_net.c b/drivers/media/dvb-core/dvb_net.c index 8bb8dd34c223..a2159b2bc176 100644 --- a/drivers/media/dvb-core/dvb_net.c +++ b/drivers/media/dvb-core/dvb_net.c @@ -228,6 +228,9 @@ static int handle_one_ule_extension( struct dvb_net_priv *p ) unsigned char hlen = (p->ule_sndu_type & 0x0700) >> 8; unsigned char htype = p->ule_sndu_type & 0x00FF; + if (htype >= ARRAY_SIZE(ule_mandatory_ext_handlers)) + return -1; + /* Discriminate mandatory and optional extension headers. 
*/ if (hlen == 0) { /* Mandatory extension header */ -- cgit v1.2.3 From e9217ca77dc35b4978db0fe901685ddb3f1e223a Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Mon, 23 Feb 2026 16:58:09 +0900 Subject: mm/slab: initialize slab->stride early to avoid memory ordering issues When alloc_slab_obj_exts() is called later (instead of during slab allocation and initialization), slab->stride and slab->obj_exts are updated after the slab is already accessible by multiple CPUs. The current implementation does not enforce memory ordering between slab->stride and slab->obj_exts. For correctness, slab->stride must be visible before slab->obj_exts. Otherwise, concurrent readers may observe slab->obj_exts as non-zero while stride is still stale. With stale slab->stride, slab_obj_ext() could return the wrong obj_ext. This could cause two problems: - obj_cgroup_put() is called on the wrong objcg, leading to a use-after-free due to incorrect reference counting [1] by decrementing the reference count more than it was incremented. - refill_obj_stock() is called on the wrong objcg, leading to a page_counter overflow [2] by uncharging more memory than charged. Fix this by unconditionally initializing slab->stride in alloc_slab_obj_exts_early(), before the need_slab_obj_exts() check. In the case of SLAB_OBJ_EXT_IN_OBJ, it is overridden in the function. This ensures updates to slab->stride become visible before the slab can be accessed by other CPUs via the per-node partial slab list (protected by spinlock with acquire/release semantics). Thanks to Shakeel Butt for pointing out this issue [3]. [vbabka@kernel.org: the bug reports [1] and [2] are not yet fully fixed, with investigation ongoing, but it is nevertheless a step in the right direction to only set stride once after allocating the slab and not change it later ] Fixes: 7a8e71bc619d ("mm/slab: use stride to access slabobj_ext") Reported-by: Venkat Rao Bagalkote Link: https://lore.kernel.org/lkml/ca241daa-e7e7-4604-a48d-de91ec9184a5@linux.ibm.com [1] Link: https://lore.kernel.org/all/ddff7c7d-c0c3-4780-808f-9a83268bbf0c@linux.ibm.com [2] Link: https://lore.kernel.org/linux-mm/aZu9G9mVIVzSm6Ft@hyeyoo [3] Signed-off-by: Harry Yoo Signed-off-by: Vlastimil Babka (SUSE) --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 52f021711744..0c906fefc31b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2196,7 +2196,6 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, retry: old_exts = READ_ONCE(slab->obj_exts); handle_failed_objexts_alloc(old_exts, vec, objects); - slab_set_stride(slab, sizeof(struct slabobj_ext)); if (new_slab) { /* @@ -2272,6 +2271,9 @@ static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab) void *addr; unsigned long obj_exts; + /* Initialize stride early to avoid memory ordering issues */ + slab_set_stride(slab, sizeof(struct slabobj_ext)); + if (!need_slab_obj_exts(s)) return; @@ -2288,7 +2290,6 @@ static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab) obj_exts |= MEMCG_DATA_OBJEXTS; #endif slab->obj_exts = obj_exts; - slab_set_stride(slab, sizeof(struct slabobj_ext)); } else if (s->flags & SLAB_OBJ_EXT_IN_OBJ) { unsigned int offset = obj_exts_offset_in_object(s); -- cgit v1.2.3 From f505a45776d149632e3bd0b87f0da1609607161a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 26 Feb 2026 23:15:22 +0100 Subject: smb: client: Use snprintf in cifs_set_cifscreds Replace unbounded sprintf() calls with the safer snprintf(). 
Avoid using magic numbers and use strlen() to calculate the key descriptor buffer size. Save the size in a local variable and reuse it for the bounded snprintf() calls. Remove CIFSCREDS_DESC_SIZE. Signed-off-by: Thorsten Blum Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 4c34d9603e05..3bad2c5c523d 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2167,9 +2167,6 @@ void __cifs_put_smb_ses(struct cifs_ses *ses) #ifdef CONFIG_KEYS -/* strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1 */ -#define CIFSCREDS_DESC_SIZE (7 + CIFS_MAX_DOMAINNAME_LEN + 1) - /* Populate username and pw fields from keyring if possible */ static int cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) @@ -2177,6 +2174,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) int rc = 0; int is_domain = 0; const char *delim, *payload; + size_t desc_sz; char *desc; ssize_t len; struct key *key; @@ -2185,7 +2183,9 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) struct sockaddr_in6 *sa6; const struct user_key_payload *upayload; - desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL); + /* "cifs:a:" and "cifs:d:" are the same length; +1 for NUL terminator */ + desc_sz = strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1; + desc = kmalloc(desc_sz, GFP_KERNEL); if (!desc) return -ENOMEM; @@ -2193,11 +2193,11 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) switch (server->dstaddr.ss_family) { case AF_INET: sa = (struct sockaddr_in *)&server->dstaddr; - sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr); + snprintf(desc, desc_sz, "cifs:a:%pI4", &sa->sin_addr.s_addr); break; case AF_INET6: sa6 = (struct sockaddr_in6 *)&server->dstaddr; - sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr); + snprintf(desc, desc_sz, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr); break; default: cifs_dbg(FYI, "Bad ss_family (%hu)\n", @@ -2216,7 +2216,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) } /* didn't work, try to find a domain key */ - sprintf(desc, "cifs:d:%s", ses->domainName); + snprintf(desc, desc_sz, "cifs:d:%s", ses->domainName); cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); key = request_key(&key_type_logon, desc, ""); if (IS_ERR(key)) { -- cgit v1.2.3 From 39195990e4c093c9eecf88f29811c6de29265214 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 27 Feb 2026 06:10:08 -0600 Subject: PCI: Correct PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fb82437fdd8c ("PCI: Change capability register offsets to hex") incorrectly converted the PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 value from decimal 52 to hex 0x32: -#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 52 /* v2 endpoints with link end here */ +#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 0x32 /* end of v2 EPs w/ link */ This broke PCI capabilities in a VMM because subsequent ones weren't DWORD-aligned. Change PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 to the correct value of 0x34. fb82437fdd8c was from Baruch Siach , but this was not Baruch's fault; it's a mistake I made when applying the patch. 
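As a quick sanity check: decimal 52 is 3 * 16 + 4 = 0x34, while 0x32 is decimal 50; 50 is
not a multiple of 4, which is why the capabilities placed after it were no longer
DWORD-aligned.
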
Fixes: fb82437fdd8c ("PCI: Change capability register offsets to hex") Reported-by: David Woodhouse Closes: https://lore.kernel.org/all/3ae392a0158e9d9ab09a1d42150429dd8ca42791.camel@infradead.org Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- include/uapi/linux/pci_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index ec1c54b5a310..14f634ab9350 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -712,7 +712,7 @@ #define PCI_EXP_LNKCTL2_HASD 0x0020 /* HW Autonomous Speed Disable */ #define PCI_EXP_LNKSTA2 0x32 /* Link Status 2 */ #define PCI_EXP_LNKSTA2_FLIT 0x0400 /* Flit Mode Status */ -#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 0x32 /* end of v2 EPs w/ link */ +#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 0x34 /* end of v2 EPs w/ link */ #define PCI_EXP_SLTCAP2 0x34 /* Slot Capabilities 2 */ #define PCI_EXP_SLTCAP2_IBPD 0x00000001 /* In-band PD Disable Supported */ #define PCI_EXP_SLTCTL2 0x38 /* Slot Control 2 */ -- cgit v1.2.3 From 032e084f0d43fda78c33abfc704ac13a0891a6e7 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Fri, 27 Feb 2026 18:43:09 +0000 Subject: tools/sched_ext: fix strtoul() misuse in scx_hotplug_seq() scx_hotplug_seq() uses strtoul() but validates the result with a negative check (val < 0), which can never be true for an unsigned return value. Use the endptr mechanism to verify the entire string was consumed, and check errno == ERANGE for overflow detection. Signed-off-by: David Carlier Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/compat.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index 8b4897fc8b99..edccc99c7294 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -125,6 +125,7 @@ static inline long scx_hotplug_seq(void) { int fd; char buf[32]; + char *endptr; ssize_t len; long val; @@ -137,8 +138,10 @@ static inline long scx_hotplug_seq(void) buf[len] = 0; close(fd); - val = strtoul(buf, NULL, 10); - SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); + errno = 0; + val = strtoul(buf, &endptr, 10); + SCX_BUG_ON(errno == ERANGE || endptr == buf || + (*endptr != '\n' && *endptr != '\0'), "invalid num hotplug events: %ld", val); return val; } -- cgit v1.2.3 From e6b899f08066e744f89df16ceb782e06868bd148 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:09 +0100 Subject: nsfs: tighten permission checks for ns iteration ioctls Even privileged services should not necessarily be able to see other privileged service's namespaces so they can't leak information to each other. Use may_see_all_namespaces() helper that centralizes this policy until the nstree adapts. 
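Roughly, the observable effect for a caller that does not pass the new check (an
illustrative fragment modelled on the mount namespace selftests later in this series;
includes and error handling omitted):

	/* caller is outside the initial pid namespace, or lacks
	 * CAP_SYS_ADMIN over the initial user namespace */
	int fd = open("/proc/self/ns/mnt", O_RDONLY);
	struct mnt_ns_info info = {};
	int ret = ioctl(fd, NS_MNT_GET_NEXT, &info);
	/* ret == -1 and errno == EPERM: iteration is refused for such callers */

All other nsfs ioctls keep their existing permission behaviour.
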
Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-1-d2c2853313bd@kernel.org Fixes: a1d220d9dafa ("nsfs: iterate through mount namespaces") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.12+ Signed-off-by: Christian Brauner --- fs/nsfs.c | 13 +++++++++++++ include/linux/ns_common.h | 2 ++ kernel/nscommon.c | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/fs/nsfs.c b/fs/nsfs.c index db91de208645..be36c10c38cf 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -199,6 +199,17 @@ static bool nsfs_ioctl_valid(unsigned int cmd) return false; } +static bool may_use_nsfs_ioctl(unsigned int cmd) +{ + switch (_IOC_NR(cmd)) { + case _IOC_NR(NS_MNT_GET_NEXT): + fallthrough; + case _IOC_NR(NS_MNT_GET_PREV): + return may_see_all_namespaces(); + } + return true; +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -214,6 +225,8 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (!nsfs_ioctl_valid(ioctl)) return -ENOIOCTLCMD; + if (!may_use_nsfs_ioctl(ioctl)) + return -EPERM; ns = get_proc_ns(file_inode(filp)); switch (ioctl) { diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 825f5865bfc5..c8e227a3f9e2 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -55,6 +55,8 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +bool may_see_all_namespaces(void); + static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) { return atomic_read(&ns->__ns_ref_active); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index bdc3c86231d3..3166c1fd844a 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -309,3 +309,9 @@ void __ns_ref_active_get(struct ns_common *ns) return; } } + +bool may_see_all_namespaces(void) +{ + return (task_active_pid_ns(current) == &init_pid_ns) && + ns_capable_noaudit(init_pid_ns.user_ns, CAP_SYS_ADMIN); +} -- cgit v1.2.3 From d2324a9317f00013facb0ba00b00440e19d2af5e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:10 +0100 Subject: nsfs: tighten permission checks for handle opening Even privileged services should not necessarily be able to see other privileged service's namespaces so they can't leak information to each other. Use may_see_all_namespaces() helper that centralizes this policy until the nstree adapts. Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-2-d2c2853313bd@kernel.org Fixes: 5222470b2fbb ("nsfs: support file handles") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.18+ Signed-off-by: Christian Brauner --- fs/nsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index be36c10c38cf..c215878d55e8 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -627,7 +627,7 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return ERR_PTR(-EOPNOTSUPP); } - if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + if (owning_ns && !may_see_all_namespaces()) { ns->ops->put(ns); return ERR_PTR(-EPERM); } -- cgit v1.2.3 From 8d76afe84fa2babf604b3c173730d4d2b067e361 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:11 +0100 Subject: nstree: tighten permission checks for listing Even privileged services should not necessarily be able to see other privileged service's namespaces so they can't leak information to each other. Use may_see_all_namespaces() helper that centralizes this policy until the nstree adapts. 
Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-3-d2c2853313bd@kernel.org Fixes: 76b6f5dfb3fd ("nstree: add listns()") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.19+ Signed-off-by: Christian Brauner --- kernel/nstree.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/kernel/nstree.c b/kernel/nstree.c index f36c59e6951d..6d12e5900ac0 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -515,32 +515,11 @@ static inline bool __must_check ns_requested(const struct klistns *kls, static inline bool __must_check may_list_ns(const struct klistns *kls, struct ns_common *ns) { - if (kls->user_ns) { - if (kls->userns_capable) - return true; - } else { - struct ns_common *owner; - struct user_namespace *user_ns; - - owner = ns_owner(ns); - if (owner) - user_ns = to_user_ns(owner); - else - user_ns = &init_user_ns; - if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) - return true; - } - - if (is_current_namespace(ns)) + if (kls->user_ns && kls->userns_capable) return true; - - if (ns->ns_type != CLONE_NEWUSER) - return false; - - if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) + if (is_current_namespace(ns)) return true; - - return false; + return may_see_all_namespaces(); } static inline void ns_put(struct ns_common *ns) @@ -600,7 +579,7 @@ static ssize_t do_listns_userns(struct klistns *kls) ret = 0; head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head; - kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); + kls->userns_capable = may_see_all_namespaces(); rcu_read_lock(); -- cgit v1.2.3 From 4c7b2ec23cc5d880e3ffe35e8c2aad686b67723a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:12 +0100 Subject: selftests: fix mntns iteration selftests Now that we changed permission checking make sure that we reflect that in the selftests. Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-4-d2c2853313bd@kernel.org Fixes: 9d87b1067382 ("selftests: add tests for mntns iteration") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.14+ Signed-off-by: Christian Brauner --- .../selftests/filesystems/nsfs/iterate_mntns.c | 25 +++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c index 61e55dfbf121..e19ff8168baf 100644 --- a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c +++ b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c @@ -37,17 +37,20 @@ FIXTURE(iterate_mount_namespaces) { __u64 mnt_ns_id[MNT_NS_COUNT]; }; +static inline bool mntns_in_list(__u64 *mnt_ns_id, struct mnt_ns_info *info) +{ + for (int i = 0; i < MNT_NS_COUNT; i++) { + if (mnt_ns_id[i] == info->mnt_ns_id) + return true; + } + return false; +} + FIXTURE_SETUP(iterate_mount_namespaces) { for (int i = 0; i < MNT_NS_COUNT; i++) self->fd_mnt_ns[i] = -EBADF; - /* - * Creating a new user namespace let's us guarantee that we only see - * mount namespaces that we did actually create. 
- */ - ASSERT_EQ(unshare(CLONE_NEWUSER), 0); - for (int i = 0; i < MNT_NS_COUNT; i++) { struct mnt_ns_info info = {}; @@ -75,13 +78,15 @@ TEST_F(iterate_mount_namespaces, iterate_all_forward) fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[0], F_DUPFD_CLOEXEC); ASSERT_GE(fd_mnt_ns_cur, 0); - for (;; count++) { + for (;;) { struct mnt_ns_info info = {}; int fd_mnt_ns_next; fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info); if (fd_mnt_ns_next < 0 && errno == ENOENT) break; + if (mntns_in_list(self->mnt_ns_id, &info)) + count++; ASSERT_GE(fd_mnt_ns_next, 0); ASSERT_EQ(close(fd_mnt_ns_cur), 0); fd_mnt_ns_cur = fd_mnt_ns_next; @@ -96,13 +101,15 @@ TEST_F(iterate_mount_namespaces, iterate_all_backwards) fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[MNT_NS_LAST_INDEX], F_DUPFD_CLOEXEC); ASSERT_GE(fd_mnt_ns_cur, 0); - for (;; count++) { + for (;;) { struct mnt_ns_info info = {}; int fd_mnt_ns_prev; fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info); if (fd_mnt_ns_prev < 0 && errno == ENOENT) break; + if (mntns_in_list(self->mnt_ns_id, &info)) + count++; ASSERT_GE(fd_mnt_ns_prev, 0); ASSERT_EQ(close(fd_mnt_ns_cur), 0); fd_mnt_ns_cur = fd_mnt_ns_prev; @@ -125,7 +132,6 @@ TEST_F(iterate_mount_namespaces, iterate_forward) ASSERT_GE(fd_mnt_ns_next, 0); ASSERT_EQ(close(fd_mnt_ns_cur), 0); fd_mnt_ns_cur = fd_mnt_ns_next; - ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]); } } @@ -144,7 +150,6 @@ TEST_F(iterate_mount_namespaces, iterate_backward) ASSERT_GE(fd_mnt_ns_prev, 0); ASSERT_EQ(close(fd_mnt_ns_cur), 0); fd_mnt_ns_cur = fd_mnt_ns_prev; - ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]); } } -- cgit v1.2.3 From 1df97a7453eec80c1912c2d0360290a3970a7671 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:01 -0800 Subject: bpf: Register dtor for freeing special fields There is a race window where BPF hash map elements can leak special fields if the program with access to the map value recreates these special fields between the check_and_free_fields done on the map value and its eventual return to the memory allocator. Several ways were explored prior to this patch, most notably [0] tried to use a poison value to reject attempts to recreate special fields for map values that have been logically deleted but still accessible to BPF programs (either while sitting in the free list or when reused). While this approach works well for task work, timers, wq, etc., it is harder to apply the idea to kptrs, which have a similar race and failure mode. Instead, we change bpf_mem_alloc to allow registering destructor for allocated elements, such that when they are returned to the allocator, any special fields created while they were accessible to programs in the mean time will be freed. If these values get reused, we do not free the fields again before handing the element back. The special fields thus may remain initialized while the map value sits in a free list. When bpf_mem_alloc is retired in the future, a similar concept can be introduced to kmalloc_nolock-backed kmem_cache, paired with the existing idea of a constructor. Note that the destructor registration happens in map_check_btf, after the BTF record is populated and (at that point) avaiable for inspection and duplication. Duplication is necessary since the freeing of embedded bpf_mem_alloc can be decoupled from actual map lifetime due to logic introduced to reduce the cost of rcu_barrier()s in mem alloc free path in 9f2c6e96c65e ("bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc."). 
As such, once all callbacks are done, we must also free the duplicated record. To remove dependency on the bpf_map itself, also stash the key size of the map to obtain value from htab_elem long after the map is gone. [0]: https://lore.kernel.org/bpf/20260216131341.1285427-1-mykyta.yatsenko5@gmail.com Fixes: 14a324f6a67e ("bpf: Wire up freeing of referenced kptr") Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Reported-by: Alexei Starovoitov Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 6 +++ kernel/bpf/hashtab.c | 87 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/memalloc.c | 58 +++++++++++++++++++++++------ 3 files changed, 140 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index e45162ef59bb..4ce0d27f8ea2 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -14,6 +14,8 @@ struct bpf_mem_alloc { struct obj_cgroup *objcg; bool percpu; struct work_struct work; + void (*dtor_ctx_free)(void *ctx); + void *dtor_ctx; }; /* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects. @@ -32,6 +34,10 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg /* The percpu allocation with a specific unit size. */ int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); +void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, + void (*dtor)(void *obj, void *ctx), + void (*dtor_ctx_free)(void *ctx), + void *ctx); /* Check the allocation size for kmalloc equivalent allocator */ int bpf_mem_alloc_check_size(bool percpu, size_t size); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3b9d297a53be..582f0192b7e1 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -125,6 +125,11 @@ struct htab_elem { char key[] __aligned(8); }; +struct htab_btf_record { + struct btf_record *record; + u32 key_size; +}; + static inline bool htab_is_prealloc(const struct bpf_htab *htab) { return !(htab->map.map_flags & BPF_F_NO_PREALLOC); @@ -457,6 +462,84 @@ static int htab_map_alloc_check(union bpf_attr *attr) return 0; } +static void htab_mem_dtor(void *obj, void *ctx) +{ + struct htab_btf_record *hrec = ctx; + struct htab_elem *elem = obj; + void *map_value; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + map_value = htab_elem_value(elem, hrec->key_size); + bpf_obj_free_fields(hrec->record, map_value); +} + +static void htab_pcpu_mem_dtor(void *obj, void *ctx) +{ + void __percpu *pptr = *(void __percpu **)obj; + struct htab_btf_record *hrec = ctx; + int cpu; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + for_each_possible_cpu(cpu) + bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu)); +} + +static void htab_dtor_ctx_free(void *ctx) +{ + struct htab_btf_record *hrec = ctx; + + btf_record_free(hrec->record); + kfree(ctx); +} + +static int htab_set_dtor(const struct bpf_htab *htab, void (*dtor)(void *, void *)) +{ + u32 key_size = htab->map.key_size; + const struct bpf_mem_alloc *ma; + struct htab_btf_record *hrec; + int err; + + /* No need for dtors. 
*/ + if (IS_ERR_OR_NULL(htab->map.record)) + return 0; + + hrec = kzalloc(sizeof(*hrec), GFP_KERNEL); + if (!hrec) + return -ENOMEM; + hrec->key_size = key_size; + hrec->record = btf_record_dup(htab->map.record); + if (IS_ERR(hrec->record)) { + err = PTR_ERR(hrec->record); + kfree(hrec); + return err; + } + ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma; + /* Kinda sad, but cast away const-ness since we change ma->dtor. */ + bpf_mem_alloc_set_dtor((struct bpf_mem_alloc *)ma, dtor, htab_dtor_ctx_free, hrec); + return 0; +} + +static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, const struct btf_type *value_type) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + if (htab_is_prealloc(htab)) + return 0; + /* + * We must set the dtor using this callback, as map's BTF record is not + * populated in htab_map_alloc(), so it will always appear as NULL. + */ + if (htab_is_percpu(htab)) + return htab_set_dtor(htab, htab_pcpu_mem_dtor); + else + return htab_set_dtor(htab, htab_mem_dtor); +} + static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -2281,6 +2364,7 @@ const struct bpf_map_ops htab_map_ops = { .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], @@ -2303,6 +2387,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru), .map_btf_id = &htab_map_btf_ids[0], @@ -2482,6 +2567,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_percpu), .map_btf_id = &htab_map_btf_ids[0], @@ -2502,6 +2588,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru_percpu), .map_btf_id = &htab_map_btf_ids[0], diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index bd45dda9dc35..682a9f34214b 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -102,6 +102,8 @@ struct bpf_mem_cache { int percpu_size; bool draining; struct bpf_mem_cache *tgt; + void (*dtor)(void *obj, void *ctx); + void *dtor_ctx; /* list of objects to be freed after RCU GP */ struct llist_head free_by_rcu; @@ -260,12 +262,14 @@ static void free_one(void *obj, bool percpu) kfree(obj); } -static int free_all(struct llist_node *llnode, bool percpu) +static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu) { struct llist_node *pos, *t; int cnt = 0; llist_for_each_safe(pos, t, llnode) { + if (c->dtor) + c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx); free_one(pos, percpu); cnt++; } @@ -276,7 +280,7 @@ static void __free_rcu(struct rcu_head *head) { 
struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); - free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); + free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); atomic_set(&c->call_rcu_ttrace_in_progress, 0); } @@ -308,7 +312,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c) if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { if (unlikely(READ_ONCE(c->draining))) { llnode = llist_del_all(&c->free_by_rcu_ttrace); - free_all(llnode, !!c->percpu_size); + free_all(c, llnode, !!c->percpu_size); } return; } @@ -417,7 +421,7 @@ static void check_free_by_rcu(struct bpf_mem_cache *c) dec_active(c, &flags); if (unlikely(READ_ONCE(c->draining))) { - free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); + free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size); atomic_set(&c->call_rcu_in_progress, 0); } else { call_rcu_hurry(&c->rcu, __free_by_rcu); @@ -635,13 +639,13 @@ static void drain_mem_cache(struct bpf_mem_cache *c) * Except for waiting_for_gp_ttrace list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). */ - free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu); - free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); - free_all(__llist_del_all(&c->free_llist), percpu); - free_all(__llist_del_all(&c->free_llist_extra), percpu); - free_all(__llist_del_all(&c->free_by_rcu), percpu); - free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); - free_all(llist_del_all(&c->waiting_for_gp), percpu); + free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu); + free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu); + free_all(c, __llist_del_all(&c->free_llist), percpu); + free_all(c, __llist_del_all(&c->free_llist_extra), percpu); + free_all(c, __llist_del_all(&c->free_by_rcu), percpu); + free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu); + free_all(c, llist_del_all(&c->waiting_for_gp), percpu); } static void check_mem_cache(struct bpf_mem_cache *c) @@ -680,6 +684,9 @@ static void check_leaked_objs(struct bpf_mem_alloc *ma) static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) { + /* We can free dtor ctx only once all callbacks are done using it. */ + if (ma->dtor_ctx_free) + ma->dtor_ctx_free(ma->dtor_ctx); check_leaked_objs(ma); free_percpu(ma->cache); free_percpu(ma->caches); @@ -1014,3 +1021,32 @@ int bpf_mem_alloc_check_size(bool percpu, size_t size) return 0; } + +void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx), + void (*dtor_ctx_free)(void *ctx), void *ctx) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i; + + ma->dtor_ctx_free = dtor_ctx_free; + ma->dtor_ctx = ctx; + + if (ma->cache) { + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + c->dtor = dtor; + c->dtor_ctx = ctx; + } + } + if (ma->caches) { + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + c->dtor = dtor; + c->dtor_ctx = ctx; + } + } + } +} -- cgit v1.2.3 From ae51772b1e94ba1d76db19085957dbccac189c1c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:02 -0800 Subject: bpf: Lose const-ness of map in map_check_btf() BPF hash map may now use the map_check_btf() callback to decide whether to set a dtor on its bpf_mem_alloc or not. 
Unlike C++ where members can opt out of const-ness using mutable, we must lose the const qualifier on the callback such that we can avoid the ugly cast. Make the change and adjust all existing users, and lose the comment in hashtab.c. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/arena.c | 2 +- kernel/bpf/arraymap.c | 2 +- kernel/bpf/bloom_filter.c | 2 +- kernel/bpf/bpf_insn_array.c | 2 +- kernel/bpf/bpf_local_storage.c | 2 +- kernel/bpf/hashtab.c | 9 ++++----- kernel/bpf/local_storage.c | 2 +- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/syscall.c | 2 +- 11 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b78b53198a2e..05b34a6355b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -124,7 +124,7 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, + int (*map_check_btf)(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); @@ -656,7 +656,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map) map->ops->map_seq_show_elem; } -int map_check_no_btf(const struct bpf_map *map, +int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 85efa9772530..8157e8da61d4 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); -int bpf_local_storage_map_check_btf(const struct bpf_map *map, +int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 144f30e740e8..f355cf1c1a16 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -303,7 +303,7 @@ static long arena_map_update_elem(struct bpf_map *map, void *key, return -EOPNOTSUPP; } -static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { return 0; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 26763df6134a..33de68c95ad8 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -548,7 +548,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, +static int array_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 35e1ddca74d2..b73336c976b7 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -180,7 +180,7 @@ static long bloom_map_update_elem(struct bpf_map *map, void *key, return -EINVAL; } -static int bloom_map_check_btf(const struct bpf_map *map, +static int bloom_map_check_btf(struct bpf_map *map, const 
struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c index c0286f25ca3c..a2f84afe6f7c 100644 --- a/kernel/bpf/bpf_insn_array.c +++ b/kernel/bpf/bpf_insn_array.c @@ -98,7 +98,7 @@ static long insn_array_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } -static int insn_array_check_btf(const struct bpf_map *map, +static int insn_array_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b28f07d3a0db..2bf5ca5ae0df 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -797,7 +797,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -int bpf_local_storage_map_check_btf(const struct bpf_map *map, +int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 582f0192b7e1..bc6bc8bb871d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -496,10 +496,10 @@ static void htab_dtor_ctx_free(void *ctx) kfree(ctx); } -static int htab_set_dtor(const struct bpf_htab *htab, void (*dtor)(void *, void *)) +static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *)) { u32 key_size = htab->map.key_size; - const struct bpf_mem_alloc *ma; + struct bpf_mem_alloc *ma; struct htab_btf_record *hrec; int err; @@ -518,12 +518,11 @@ static int htab_set_dtor(const struct bpf_htab *htab, void (*dtor)(void *, void return err; } ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma; - /* Kinda sad, but cast away const-ness since we change ma->dtor. 
*/ - bpf_mem_alloc_set_dtor((struct bpf_mem_alloc *)ma, dtor, htab_dtor_ctx_free, hrec); + bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec); return 0; } -static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 1ccbf28b2ad9..8fca0c64f7b1 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -364,7 +364,7 @@ static long cgroup_storage_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } -static int cgroup_storage_check_btf(const struct bpf_map *map, +static int cgroup_storage_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1adeb4d3b8cf..0f57608b385d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -751,7 +751,7 @@ free_stack: return err; } -static int trie_check_btf(const struct bpf_map *map, +static int trie_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0378e83b4099..274039e36465 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1234,7 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) } EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); -int map_check_no_btf(const struct bpf_map *map, +int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) -- cgit v1.2.3 From f41deee082dc7b1c198de21710cb5cb9c604cae0 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:03 -0800 Subject: bpf: Delay freeing fields in local storage Currently, when use_kmalloc_nolock is false, the freeing of fields for a local storage selem is done eagerly before waiting for the RCU or RCU tasks trace grace period to elapse. This opens up a window where the program which has access to the selem can recreate the fields after the freeing of fields is done eagerly, causing memory leaks when the element is finally freed and returned to the kernel. Make a few changes to address this. First, delay the freeing of fields until after the grace periods have expired using a __bpf_selem_free_rcu wrapper which is eventually invoked after transitioning through the necessary number of grace period waits. Replace usage of the kfree_rcu with call_rcu to be able to take a custom callback. Finally, care needs to be taken to extend the rcu barriers for all cases, and not just when use_kmalloc_nolock is true, as RCU and RCU tasks trace callbacks can be in flight for either case and access the smap field, which is used to obtain the BTF record to walk over special fields in the map value. While we're at it, drop migrate_disable() from bpf_selem_free_rcu, since migration should be disabled for RCU callbacks already. 
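For selems that do not use kmalloc_nolock and with reuse_now == false, the resulting free
path looks roughly like this (a sketch of the callback chain, not verbatim code):

	bpf_selem_free(selem, reuse_now = false)
	  -> call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu)
	       -> __bpf_selem_free_rcu()	/* directly, or via call_rcu() */
	            -> bpf_obj_free_fields(smap->map.record, SDATA(selem)->data)
	            -> kfree(selem)

so any fields recreated while the selem was still reachable are freed before the memory
is returned.
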
Fixes: 9bac675e6368 ("bpf: Postpone bpf_obj_free_fields to the rcu callback") Reviewed-by: Amery Hung Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 2bf5ca5ae0df..d7db1ada2dad 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -164,16 +164,28 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage, bpf_local_storage_free_trace_rcu); } -/* rcu tasks trace callback for use_kmalloc_nolock == false */ -static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) +/* rcu callback for use_kmalloc_nolock == false */ +static void __bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + /* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */ + smap = rcu_dereference_check(SDATA(selem)->smap, 1); + + if (smap) + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + kfree(selem); +} + +/* rcu tasks trace callback for use_kmalloc_nolock == false */ +static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) +{ if (rcu_trace_implies_rcu_gp()) - kfree(selem); + __bpf_selem_free_rcu(rcu); else - kfree_rcu(selem, rcu); + call_rcu(rcu, __bpf_selem_free_rcu); } /* Handle use_kmalloc_nolock == false */ @@ -181,7 +193,7 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, bool vanilla_rcu) { if (vanilla_rcu) - kfree_rcu(selem, rcu); + call_rcu(&selem->rcu, __bpf_selem_free_rcu); else call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu); } @@ -195,11 +207,8 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); - if (smap) { - migrate_disable(); + if (smap) bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - migrate_enable(); - } kfree_nolock(selem); } @@ -214,18 +223,12 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) void bpf_selem_free(struct bpf_local_storage_elem *selem, bool reuse_now) { - struct bpf_local_storage_map *smap; - - smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - if (!selem->use_kmalloc_nolock) { /* * No uptr will be unpin even when reuse_now == false since uptr * is only supported in task local storage, where * smap->use_kmalloc_nolock == true. */ - if (smap) - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } @@ -958,10 +961,9 @@ restart: */ synchronize_rcu(); - if (smap->use_kmalloc_nolock) { - rcu_barrier_tasks_trace(); - rcu_barrier(); - } + /* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */ + rcu_barrier_tasks_trace(); + rcu_barrier(); kvfree(smap->buckets); bpf_map_area_free(smap); } -- cgit v1.2.3 From baa35b3cb6b642b903162aacff13181e170c4ecc Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:04 -0800 Subject: bpf: Retire rcu_trace_implies_rcu_gp() from local storage This assumption will always hold going forward, hence just remove the various checks and assume it is true with a comment for the uninformed reader. Reviewed-by: Paul E. 
McKenney Reviewed-by: Amery Hung Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index d7db1ada2dad..9c96a4477f81 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -107,14 +107,12 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; - /* If RCU Tasks Trace grace period implies RCU grace period, do - * kfree(), else do kfree_rcu(). + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. */ local_storage = container_of(rcu, struct bpf_local_storage, rcu); - if (rcu_trace_implies_rcu_gp()) - kfree(local_storage); - else - kfree_rcu(local_storage, rcu); + kfree(local_storage); } /* Handle use_kmalloc_nolock == false */ @@ -138,10 +136,11 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu) static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { - if (rcu_trace_implies_rcu_gp()) - bpf_local_storage_free_rcu(rcu); - else - call_rcu(rcu, bpf_local_storage_free_rcu); + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. + */ + bpf_local_storage_free_rcu(rcu); } static void bpf_local_storage_free(struct bpf_local_storage *local_storage, @@ -182,10 +181,11 @@ static void __bpf_selem_free_rcu(struct rcu_head *rcu) /* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { - if (rcu_trace_implies_rcu_gp()) - __bpf_selem_free_rcu(rcu); - else - call_rcu(rcu, __bpf_selem_free_rcu); + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. + */ + __bpf_selem_free_rcu(rcu); } /* Handle use_kmalloc_nolock == false */ @@ -214,10 +214,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { - if (rcu_trace_implies_rcu_gp()) - bpf_selem_free_rcu(rcu); - else - call_rcu(rcu, bpf_selem_free_rcu); + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. + */ + bpf_selem_free_rcu(rcu); } void bpf_selem_free(struct bpf_local_storage_elem *selem, -- cgit v1.2.3 From 2939d7b3b0e5f35359ce8f69dbbad0bfc4e920b6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:05 -0800 Subject: selftests/bpf: Add tests for special fields races Add a couple of tests to ensure that the refcount drops to zero when we exercise the race where creation of a special field succeeds the logical bpf_obj_free_fields done when deleting an element. Prior to previous changes, the fields would be freed eagerly and repopulate and end up leaking, causing the reference to not drop down correctly. Running this test on a kernel without fixes will cause a hang in delete_module, since the module reference stays active due to the leaked kptr not dropping it. After the fixes tests succeed as expected. 
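Condensed from the hash map test body below, the window being exercised is roughly
(illustrative only):

	v = bpf_map_lookup_elem(&race_hash_map, &key);	/* v points into the element */
	bpf_map_delete_elem(&race_hash_map, &key);	/* fields freed, element queued for reuse */
	p = bpf_kfunc_call_test_acquire(...);
	bpf_kptr_xchg(&v->ref_ptr, p);			/* kptr recreated after the logical free */
	/* without the dtor the element later returns to the allocator with
	 * ref_ptr still set, so the test module's reference is never dropped */
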
Reviewed-by: Amery Hung Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/map_kptr_race.c | 218 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/map_kptr_race.c | 197 +++++++++++++++++++ 2 files changed, 415 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/map_kptr_race.c create mode 100644 tools/testing/selftests/bpf/progs/map_kptr_race.c diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c b/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c new file mode 100644 index 000000000000..506ed55e8528 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include +#include + +#include "map_kptr_race.skel.h" + +static int get_map_id(int map_fd) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + + if (!ASSERT_OK(bpf_map_get_info_by_fd(map_fd, &info, &len), "get_map_info")) + return -1; + return info.id; +} + +static int read_refs(struct map_kptr_race *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &opts); + if (!ASSERT_OK(ret, "count_ref run")) + return -1; + if (!ASSERT_OK(opts.retval, "count_ref retval")) + return -1; + return skel->bss->num_of_refs; +} + +static void test_htab_leak(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + struct map_kptr_race *skel, *watcher; + int ret, map_id; + + skel = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_htab_leak), &opts); + if (!ASSERT_OK(ret, "test_htab_leak run")) + goto out_skel; + if (!ASSERT_OK(opts.retval, "test_htab_leak retval")) + goto out_skel; + + map_id = get_map_id(bpf_map__fd(skel->maps.race_hash_map)); + if (!ASSERT_GE(map_id, 0, "map_id")) + goto out_skel; + + watcher = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(watcher, "watcher open_and_load")) + goto out_skel; + + watcher->bss->target_map_id = map_id; + watcher->links.map_put = bpf_program__attach(watcher->progs.map_put); + if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry")) + goto out_watcher; + watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free); + if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit")) + goto out_watcher; + + map_kptr_race__destroy(skel); + skel = NULL; + + kern_sync_rcu(); + + while (!READ_ONCE(watcher->bss->map_freed)) + sched_yield(); + + ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed"); + ASSERT_EQ(read_refs(watcher), 2, "htab refcount"); + +out_watcher: + map_kptr_race__destroy(watcher); +out_skel: + map_kptr_race__destroy(skel); +} + +static void test_percpu_htab_leak(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + struct map_kptr_race *skel, *watcher; + int ret, map_id; + + skel = map_kptr_race__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + if (skel->rodata->nr_cpus > 16) + skel->rodata->nr_cpus = 16; + + ret = map_kptr_race__load(skel); + if (!ASSERT_OK(ret, "load")) + goto out_skel; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_percpu_htab_leak), 
&opts); + if (!ASSERT_OK(ret, "test_percpu_htab_leak run")) + goto out_skel; + if (!ASSERT_OK(opts.retval, "test_percpu_htab_leak retval")) + goto out_skel; + + map_id = get_map_id(bpf_map__fd(skel->maps.race_percpu_hash_map)); + if (!ASSERT_GE(map_id, 0, "map_id")) + goto out_skel; + + watcher = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(watcher, "watcher open_and_load")) + goto out_skel; + + watcher->bss->target_map_id = map_id; + watcher->links.map_put = bpf_program__attach(watcher->progs.map_put); + if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry")) + goto out_watcher; + watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free); + if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit")) + goto out_watcher; + + map_kptr_race__destroy(skel); + skel = NULL; + + kern_sync_rcu(); + + while (!READ_ONCE(watcher->bss->map_freed)) + sched_yield(); + + ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed"); + ASSERT_EQ(read_refs(watcher), 2, "percpu_htab refcount"); + +out_watcher: + map_kptr_race__destroy(watcher); +out_skel: + map_kptr_race__destroy(skel); +} + +static void test_sk_ls_leak(void) +{ + struct map_kptr_race *skel, *watcher; + int listen_fd = -1, client_fd = -1, map_id; + + skel = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (!ASSERT_OK(map_kptr_race__attach(skel), "attach")) + goto out_skel; + + listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(listen_fd, 0, "start_server")) + goto out_skel; + + client_fd = connect_to_fd(listen_fd, 0); + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) + goto out_skel; + + if (!ASSERT_EQ(skel->bss->sk_ls_leak_done, 1, "sk_ls_leak_done")) + goto out_skel; + + close(client_fd); + client_fd = -1; + close(listen_fd); + listen_fd = -1; + + map_id = get_map_id(bpf_map__fd(skel->maps.race_sk_ls_map)); + if (!ASSERT_GE(map_id, 0, "map_id")) + goto out_skel; + + watcher = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(watcher, "watcher open_and_load")) + goto out_skel; + + watcher->bss->target_map_id = map_id; + watcher->links.map_put = bpf_program__attach(watcher->progs.map_put); + if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry")) + goto out_watcher; + watcher->links.sk_map_free = bpf_program__attach(watcher->progs.sk_map_free); + if (!ASSERT_OK_PTR(watcher->links.sk_map_free, "attach fexit")) + goto out_watcher; + + map_kptr_race__destroy(skel); + skel = NULL; + + kern_sync_rcu(); + + while (!READ_ONCE(watcher->bss->map_freed)) + sched_yield(); + + ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed"); + ASSERT_EQ(read_refs(watcher), 2, "sk_ls refcount"); + +out_watcher: + map_kptr_race__destroy(watcher); +out_skel: + if (client_fd >= 0) + close(client_fd); + if (listen_fd >= 0) + close(listen_fd); + map_kptr_race__destroy(skel); +} + +void serial_test_map_kptr_race(void) +{ + if (test__start_subtest("htab_leak")) + test_htab_leak(); + if (test__start_subtest("percpu_htab_leak")) + test_percpu_htab_leak(); + if (test__start_subtest("sk_ls_leak")) + test_sk_ls_leak(); +} diff --git a/tools/testing/selftests/bpf/progs/map_kptr_race.c b/tools/testing/selftests/bpf/progs/map_kptr_race.c new file mode 100644 index 000000000000..f6f136cd8f60 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/map_kptr_race.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include +#include +#include "../test_kmods/bpf_testmod_kfunc.h" + +struct map_value { + struct prog_test_ref_kfunc __kptr *ref_ptr; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 1); +} race_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 1); +} race_percpu_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct map_value); +} race_sk_ls_map SEC(".maps"); + +int num_of_refs; +int sk_ls_leak_done; +int target_map_id; +int map_freed; +const volatile int nr_cpus; + +SEC("tc") +int test_htab_leak(struct __sk_buff *skb) +{ + struct prog_test_ref_kfunc *p, *old; + struct map_value val = {}; + struct map_value *v; + int key = 0; + + if (bpf_map_update_elem(&race_hash_map, &key, &val, BPF_ANY)) + return 1; + + v = bpf_map_lookup_elem(&race_hash_map, &key); + if (!v) + return 2; + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 3; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + bpf_map_delete_elem(&race_hash_map, &key); + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 4; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + return 0; +} + +static int fill_percpu_kptr(struct map_value *v) +{ + struct prog_test_ref_kfunc *p, *old; + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 1; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + return 0; +} + +SEC("tc") +int test_percpu_htab_leak(struct __sk_buff *skb) +{ + struct map_value *v, *arr[16] = {}; + struct map_value val = {}; + int key = 0; + int err = 0; + + if (bpf_map_update_elem(&race_percpu_hash_map, &key, &val, BPF_ANY)) + return 1; + + for (int i = 0; i < nr_cpus; i++) { + v = bpf_map_lookup_percpu_elem(&race_percpu_hash_map, &key, i); + if (!v) + return 2; + arr[i] = v; + } + + bpf_map_delete_elem(&race_percpu_hash_map, &key); + + for (int i = 0; i < nr_cpus; i++) { + v = arr[i]; + err = fill_percpu_kptr(v); + if (err) + return 3; + } + + return 0; +} + +SEC("tp_btf/inet_sock_set_state") +int BPF_PROG(test_sk_ls_leak, struct sock *sk, int oldstate, int newstate) +{ + struct prog_test_ref_kfunc *p, *old; + struct map_value *v; + + if (newstate != BPF_TCP_SYN_SENT) + return 0; + + if (sk_ls_leak_done) + return 0; + + v = bpf_sk_storage_get(&race_sk_ls_map, sk, NULL, + BPF_SK_STORAGE_GET_F_CREATE); + if (!v) + return 0; + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 0; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + bpf_sk_storage_delete(&race_sk_ls_map, sk); + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 0; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + sk_ls_leak_done = 1; + return 0; +} + +long target_map_ptr; + +SEC("fentry/bpf_map_put") +int BPF_PROG(map_put, struct bpf_map *map) +{ + if (target_map_id && map->id == (u32)target_map_id) + target_map_ptr = (long)map; + return 0; +} + +SEC("fexit/htab_map_free") +int BPF_PROG(htab_map_free, struct bpf_map *map) +{ + if (target_map_ptr && (long)map == target_map_ptr) + map_freed = 
1; + return 0; +} + +SEC("fexit/bpf_sk_storage_map_free") +int BPF_PROG(sk_map_free, struct bpf_map *map) +{ + if (target_map_ptr && (long)map == target_map_ptr) + map_freed = 1; + return 0; +} + +SEC("syscall") +int count_ref(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long arg = 0; + + p = bpf_kfunc_call_test_acquire(&arg); + if (!p) + return 1; + + num_of_refs = p->cnt.refs.counter; + + bpf_kfunc_call_test_release(p); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 869c63d5975d55e97f6b168e885452b3da20ea47 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 25 Feb 2026 20:14:55 +0800 Subject: bpf: Fix race in cpumap on PREEMPT_RT On PREEMPT_RT kernels, the per-CPU xdp_bulk_queue (bq) can be accessed concurrently by multiple preemptible tasks on the same CPU. The original code assumes bq_enqueue() and __cpu_map_flush() run atomically with respect to each other on the same CPU, relying on local_bh_disable() to prevent preemption. However, on PREEMPT_RT, local_bh_disable() only calls migrate_disable() (when PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable preemption, which allows CFS scheduling to preempt a task during bq_flush_to_queue(), enabling another task on the same CPU to enter bq_enqueue() and operate on the same per-CPU bq concurrently. This leads to several races: 1. Double __list_del_clearprev(): after bq->count is reset in bq_flush_to_queue(), a preempting task can call bq_enqueue() -> bq_flush_to_queue() on the same bq when bq->count reaches CPU_MAP_BULK_SIZE. Both tasks then call __list_del_clearprev() on the same bq->flush_node, the second call dereferences the prev pointer that was already set to NULL by the first. 2. bq->count and bq->q[] races: concurrent bq_enqueue() can corrupt the packet queue while bq_flush_to_queue() is processing it. The race between task A (__cpu_map_flush -> bq_flush_to_queue) and task B (bq_enqueue -> bq_flush_to_queue) on the same CPU: Task A (xdp_do_flush) Task B (cpu_map_enqueue) ---------------------- ------------------------ bq_flush_to_queue(bq) spin_lock(&q->producer_lock) /* flush bq->q[] to ptr_ring */ bq->count = 0 spin_unlock(&q->producer_lock) bq_enqueue(rcpu, xdpf) <-- CFS preempts Task A --> bq->q[bq->count++] = xdpf /* ... more enqueues until full ... */ bq_flush_to_queue(bq) spin_lock(&q->producer_lock) /* flush to ptr_ring */ spin_unlock(&q->producer_lock) __list_del_clearprev(flush_node) /* sets flush_node.prev = NULL */ <-- Task A resumes --> __list_del_clearprev(flush_node) flush_node.prev->next = ... /* prev is NULL -> kernel oops */ Fix this by adding a local_lock_t to xdp_bulk_queue and acquiring it in bq_enqueue() and __cpu_map_flush(). These paths already run under local_bh_disable(), so use local_lock_nested_bh() which on non-RT is a pure annotation with no overhead, and on PREEMPT_RT provides a per-CPU sleeping lock that serializes access to the bq. To reproduce, insert an mdelay(100) between bq->count = 0 and __list_del_clearprev() in bq_flush_to_queue(), then run reproducer provided by syzkaller. 
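The resulting pattern, reduced to a stand-alone sketch (names are illustrative, this is not the actual cpumap code), is to embed a local_lock_t in the per-CPU structure and take it around every access from the enqueue and flush paths:

  #include <linux/kernel.h>
  #include <linux/local_lock.h>
  #include <linux/percpu.h>

  struct example_bq {
          local_lock_t    lock;
          unsigned int    count;
          void            *q[8];
  };

  static DEFINE_PER_CPU(struct example_bq, example_bq) = {
          .lock = INIT_LOCAL_LOCK(lock),
  };

  static void example_enqueue(void *item)
  {
          struct example_bq *bq;

          /* Caller runs under local_bh_disable(); on PREEMPT_RT the
           * local_lock additionally serialises against a preempting
           * task flushing the same per-CPU queue.
           */
          local_lock_nested_bh(&example_bq.lock);
          bq = this_cpu_ptr(&example_bq);
          if (bq->count < ARRAY_SIZE(bq->q))
                  bq->q[bq->count++] = item;
          local_unlock_nested_bh(&example_bq.lock);
  }

On non-RT kernels local_lock_nested_bh() is a pure annotation (plus lockdep tracking), so the fast path is unchanged; on PREEMPT_RT it is a per-CPU sleeping lock that serialises the racing paths.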
Fixes: 3253cb49cbad ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") Reported-by: syzbot+2b3391f44313b3983e91@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69369331.a70a0220.38f243.009d.GAE@google.com/T/ Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Jiayuan Chen Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260225121459.183121-2-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 04171fbc39cb..32b43cb9061b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,7 @@ struct xdp_bulk_queue { struct list_head flush_node; struct bpf_cpu_map_entry *obj; unsigned int count; + local_lock_t bq_lock; }; /* Struct for every remote "destination" CPU in map */ @@ -451,6 +453,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, for_each_possible_cpu(i) { bq = per_cpu_ptr(rcpu->bulkq, i); bq->obj = rcpu; + local_lock_init(&bq->bq_lock); } /* Alloc queue */ @@ -722,6 +725,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq) struct ptr_ring *q; int i; + lockdep_assert_held(&bq->bq_lock); + if (unlikely(!bq->count)) return; @@ -749,11 +754,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq) } /* Runs under RCU-read-side, plus in softirq under NAPI protection. - * Thus, safe percpu variable access. + * Thus, safe percpu variable access. PREEMPT_RT relies on + * local_lock_nested_bh() to serialise access to the per-CPU bq. */ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); + struct xdp_bulk_queue *bq; + + local_lock_nested_bh(&rcpu->bulkq->bq_lock); + bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) bq_flush_to_queue(bq); @@ -774,6 +783,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) list_add(&bq->flush_node, flush_list); } + + local_unlock_nested_bh(&rcpu->bulkq->bq_lock); } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -810,7 +821,9 @@ void __cpu_map_flush(struct list_head *flush_list) struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + local_lock_nested_bh(&bq->obj->bulkq->bq_lock); bq_flush_to_queue(bq); + local_unlock_nested_bh(&bq->obj->bulkq->bq_lock); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); -- cgit v1.2.3 From 1872e75375c40add4a35990de3be77b5741c252c Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 25 Feb 2026 20:14:56 +0800 Subject: bpf: Fix race in devmap on PREEMPT_RT On PREEMPT_RT kernels, the per-CPU xdp_dev_bulk_queue (bq) can be accessed concurrently by multiple preemptible tasks on the same CPU. The original code assumes bq_enqueue() and __dev_flush() run atomically with respect to each other on the same CPU, relying on local_bh_disable() to prevent preemption. However, on PREEMPT_RT, local_bh_disable() only calls migrate_disable() (when PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable preemption, which allows CFS scheduling to preempt a task during bq_xmit_all(), enabling another task on the same CPU to enter bq_enqueue() and operate on the same per-CPU bq concurrently. This leads to several races: 1. 
Double-free / use-after-free on bq->q[]: bq_xmit_all() snapshots cnt = bq->count, then iterates bq->q[0..cnt-1] to transmit frames. If preempted after the snapshot, a second task can call bq_enqueue() -> bq_xmit_all() on the same bq, transmitting (and freeing) the same frames. When the first task resumes, it operates on stale pointers in bq->q[], causing use-after-free. 2. bq->count and bq->q[] corruption: concurrent bq_enqueue() modifying bq->count and bq->q[] while bq_xmit_all() is reading them. 3. dev_rx/xdp_prog teardown race: __dev_flush() clears bq->dev_rx and bq->xdp_prog after bq_xmit_all(). If preempted between bq_xmit_all() return and bq->dev_rx = NULL, a preempting bq_enqueue() sees dev_rx still set (non-NULL), skips adding bq to the flush_list, and enqueues a frame. When __dev_flush() resumes, it clears dev_rx and removes bq from the flush_list, orphaning the newly enqueued frame. 4. __list_del_clearprev() on flush_node: similar to the cpumap race, both tasks can call __list_del_clearprev() on the same flush_node, the second dereferences the prev pointer already set to NULL. The race between task A (__dev_flush -> bq_xmit_all) and task B (bq_enqueue -> bq_xmit_all) on the same CPU: Task A (xdp_do_flush) Task B (ndo_xdp_xmit redirect) ---------------------- -------------------------------- __dev_flush(flush_list) bq_xmit_all(bq) cnt = bq->count /* e.g. 16 */ /* start iterating bq->q[] */ <-- CFS preempts Task A --> bq_enqueue(dev, xdpf) bq->count == DEV_MAP_BULK_SIZE bq_xmit_all(bq, 0) cnt = bq->count /* same 16! */ ndo_xdp_xmit(bq->q[]) /* frames freed by driver */ bq->count = 0 <-- Task A resumes --> ndo_xdp_xmit(bq->q[]) /* use-after-free: frames already freed! */ Fix this by adding a local_lock_t to xdp_dev_bulk_queue and acquiring it in bq_enqueue() and __dev_flush(). These paths already run under local_bh_disable(), so use local_lock_nested_bh() which on non-RT is a pure annotation with no overhead, and on PREEMPT_RT provides a per-CPU sleeping lock that serializes access to the bq. Fixes: 3253cb49cbad ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") Reported-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Jiayuan Chen Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260225121459.183121-3-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2984e938f94d..3d619d01088e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -45,6 +45,7 @@ * types of devmap; only the lookup and insertion is different. 
*/ #include +#include #include #include #include @@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue { struct net_device *dev_rx; struct bpf_prog *xdp_prog; unsigned int count; + local_lock_t bq_lock; }; struct bpf_dtab_netdev { @@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) int to_send = cnt; int i; + lockdep_assert_held(&bq->bq_lock); + if (unlikely(!cnt)) return; @@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list) struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock); bq_xmit_all(bq, XDP_XMIT_FLUSH); bq->dev_rx = NULL; bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); + local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock); } } @@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu * variable access, and map elements stick around. See comment above - * xdp_do_flush() in filter.c. + * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh() + * to serialise access to the per-CPU bq. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { - struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); + struct xdp_dev_bulk_queue *bq; + + local_lock_nested_bh(&dev->xdp_bulkq->bq_lock); + bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); @@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, } bq->q[bq->count++] = xdpf; + + local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock); } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, @@ -1127,8 +1139,13 @@ static int dev_map_notification(struct notifier_block *notifier, if (!netdev->xdp_bulkq) return NOTIFY_BAD; - for_each_possible_cpu(cpu) - per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; + for_each_possible_cpu(cpu) { + struct xdp_dev_bulk_queue *bq; + + bq = per_cpu_ptr(netdev->xdp_bulkq, cpu); + bq->dev = netdev; + local_lock_init(&bq->bq_lock); + } break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because -- cgit v1.2.3 From 62413a9c3cb183afb9bb6e94dd68caf4e4145f4c Mon Sep 17 00:00:00 2001 From: Paul Moses Date: Mon, 23 Feb 2026 15:05:44 +0000 Subject: net/sched: act_gate: snapshot parameters with RCU on replace The gate action can be replaced while the hrtimer callback or dump path is walking the schedule list. Convert the parameters to an RCU-protected snapshot and swap updates under tcf_lock, freeing the previous snapshot via call_rcu(). When REPLACE omits the entry list, preserve the existing schedule so the effective state is unchanged. 
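The update side of the pattern looks roughly like the sketch below (type and field names are illustrative, not the actual act_gate ones): the updater builds a complete new parameter block, swaps the pointer under the action lock, and defers freeing the old block until readers are done.

  #include <linux/rcupdate.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>

  struct params {
          /* schedule entries, basetime, clockid, ... */
          struct rcu_head rcu;
  };

  struct gate_act {
          spinlock_t lock;
          struct params __rcu *param;
  };

  static void params_free_rcu(struct rcu_head *head)
  {
          kfree(container_of(head, struct params, rcu));
  }

  static void gate_replace_params(struct gate_act *g, struct params *new)
  {
          struct params *old;

          spin_lock_bh(&g->lock);
          old = rcu_replace_pointer(g->param, new,
                                    lockdep_is_held(&g->lock));
          spin_unlock_bh(&g->lock);

          /* Readers may still hold the old snapshot; free after a grace period. */
          if (old)
                  call_rcu(&old->rcu, params_free_rcu);
  }

The hrtimer callback and the dump path then dereference the parameter pointer under tcf_lock or rcu_read_lock() instead of walking a schedule list that can be freed underneath them.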
Fixes: a51c328df310 ("net: qos: introduce a gate control flow action") Cc: stable@vger.kernel.org Signed-off-by: Paul Moses Tested-by: Vladimir Oltean Acked-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Link: https://patch.msgid.link/20260223150512.2251594-2-p@1g4.org Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_gate.h | 33 ++++-- net/sched/act_gate.c | 265 ++++++++++++++++++++++++++++++------------- 2 files changed, 212 insertions(+), 86 deletions(-) diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h index b147a3bb1a46..e0fded18e18c 100644 --- a/include/net/tc_act/tc_gate.h +++ b/include/net/tc_act/tc_gate.h @@ -32,6 +32,7 @@ struct tcf_gate_params { s32 tcfg_clockid; size_t num_entries; struct list_head entries; + struct rcu_head rcu; }; #define GATE_ACT_GATE_OPEN BIT(0) @@ -39,7 +40,7 @@ struct tcf_gate_params { struct tcf_gate { struct tc_action common; - struct tcf_gate_params param; + struct tcf_gate_params __rcu *param; u8 current_gate_status; ktime_t current_close_time; u32 current_entry_octets; @@ -51,47 +52,65 @@ struct tcf_gate { #define to_gate(a) ((struct tcf_gate *)a) +static inline struct tcf_gate_params *tcf_gate_params_locked(const struct tc_action *a) +{ + struct tcf_gate *gact = to_gate(a); + + return rcu_dereference_protected(gact->param, + lockdep_is_held(&gact->tcf_lock)); +} + static inline s32 tcf_gate_prio(const struct tc_action *a) { + struct tcf_gate_params *p; s32 tcfg_prio; - tcfg_prio = to_gate(a)->param.tcfg_priority; + p = tcf_gate_params_locked(a); + tcfg_prio = p->tcfg_priority; return tcfg_prio; } static inline u64 tcf_gate_basetime(const struct tc_action *a) { + struct tcf_gate_params *p; u64 tcfg_basetime; - tcfg_basetime = to_gate(a)->param.tcfg_basetime; + p = tcf_gate_params_locked(a); + tcfg_basetime = p->tcfg_basetime; return tcfg_basetime; } static inline u64 tcf_gate_cycletime(const struct tc_action *a) { + struct tcf_gate_params *p; u64 tcfg_cycletime; - tcfg_cycletime = to_gate(a)->param.tcfg_cycletime; + p = tcf_gate_params_locked(a); + tcfg_cycletime = p->tcfg_cycletime; return tcfg_cycletime; } static inline u64 tcf_gate_cycletimeext(const struct tc_action *a) { + struct tcf_gate_params *p; u64 tcfg_cycletimeext; - tcfg_cycletimeext = to_gate(a)->param.tcfg_cycletime_ext; + p = tcf_gate_params_locked(a); + tcfg_cycletimeext = p->tcfg_cycletime_ext; return tcfg_cycletimeext; } static inline u32 tcf_gate_num_entries(const struct tc_action *a) { + struct tcf_gate_params *p; u32 num_entries; - num_entries = to_gate(a)->param.num_entries; + p = tcf_gate_params_locked(a); + num_entries = p->num_entries; return num_entries; } @@ -105,7 +124,7 @@ static inline struct action_gate_entry u32 num_entries; int i = 0; - p = &to_gate(a)->param; + p = tcf_gate_params_locked(a); num_entries = p->num_entries; list_for_each_entry(entry, &p->entries, list) diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 686eaed81b81..fdbfcaa3e2ab 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -32,9 +32,12 @@ static ktime_t gate_get_time(struct tcf_gate *gact) return KTIME_MAX; } -static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +static void tcf_gate_params_free_rcu(struct rcu_head *head); + +static void gate_get_start_time(struct tcf_gate *gact, + const struct tcf_gate_params *param, + ktime_t *start) { - struct tcf_gate_params *param = &gact->param; ktime_t now, base, cycle; u64 n; @@ -69,12 +72,14 @@ static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) { struct tcf_gate 
*gact = container_of(timer, struct tcf_gate, hitimer); - struct tcf_gate_params *p = &gact->param; struct tcfg_gate_entry *next; + struct tcf_gate_params *p; ktime_t close_time, now; spin_lock(&gact->tcf_lock); + p = rcu_dereference_protected(gact->param, + lockdep_is_held(&gact->tcf_lock)); next = gact->next_entry; /* cycle start, clear pending bit, clear total octets */ @@ -225,6 +230,35 @@ static void release_entry_list(struct list_head *entries) } } +static int tcf_gate_copy_entries(struct tcf_gate_params *dst, + const struct tcf_gate_params *src, + struct netlink_ext_ack *extack) +{ + struct tcfg_gate_entry *entry; + int i = 0; + + list_for_each_entry(entry, &src->entries, list) { + struct tcfg_gate_entry *new; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + return -ENOMEM; + } + + new->index = entry->index; + new->gate_state = entry->gate_state; + new->interval = entry->interval; + new->ipv = entry->ipv; + new->maxoctets = entry->maxoctets; + list_add_tail(&new->list, &dst->entries); + i++; + } + + dst->num_entries = i; + return 0; +} + static int parse_gate_list(struct nlattr *list_attr, struct tcf_gate_params *sched, struct netlink_ext_ack *extack) @@ -270,24 +304,44 @@ release_list: return err; } -static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, - enum tk_offsets tko, s32 clockid, - bool do_init) +static bool gate_timer_needs_cancel(u64 basetime, u64 old_basetime, + enum tk_offsets tko, + enum tk_offsets old_tko, + s32 clockid, s32 old_clockid) { - if (!do_init) { - if (basetime == gact->param.tcfg_basetime && - tko == gact->tk_offset && - clockid == gact->param.tcfg_clockid) - return; + return basetime != old_basetime || + clockid != old_clockid || + tko != old_tko; +} - spin_unlock_bh(&gact->tcf_lock); - hrtimer_cancel(&gact->hitimer); - spin_lock_bh(&gact->tcf_lock); +static int gate_clock_resolve(s32 clockid, enum tk_offsets *tko, + struct netlink_ext_ack *extack) +{ + switch (clockid) { + case CLOCK_REALTIME: + *tko = TK_OFFS_REAL; + return 0; + case CLOCK_MONOTONIC: + *tko = TK_OFFS_MAX; + return 0; + case CLOCK_BOOTTIME: + *tko = TK_OFFS_BOOT; + return 0; + case CLOCK_TAI: + *tko = TK_OFFS_TAI; + return 0; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + return -EINVAL; } - gact->param.tcfg_basetime = basetime; - gact->param.tcfg_clockid = clockid; - gact->tk_offset = tko; - hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT); +} + +static void gate_setup_timer(struct tcf_gate *gact, s32 clockid, + enum tk_offsets tko) +{ + WRITE_ONCE(gact->tk_offset, tko); + hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, + HRTIMER_MODE_ABS_SOFT); } static int tcf_gate_init(struct net *net, struct nlattr *nla, @@ -296,15 +350,22 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); - enum tk_offsets tk_offset = TK_OFFS_TAI; + u64 cycletime = 0, basetime = 0, cycletime_ext = 0; + struct tcf_gate_params *p = NULL, *old_p = NULL; + enum tk_offsets old_tk_offset = TK_OFFS_TAI; + const struct tcf_gate_params *cur_p = NULL; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; + enum tk_offsets tko = TK_OFFS_TAI; struct tcf_chain *goto_ch = NULL; - u64 cycletime = 0, basetime = 0; - struct tcf_gate_params *p; + s32 timer_clockid = CLOCK_TAI; + bool use_old_entries = false; + s32 old_clockid = CLOCK_TAI; + bool need_cancel = false; 
s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; + u64 old_basetime = 0; int ret = 0, err; u32 gflags = 0; s32 prio = -1; @@ -321,26 +382,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (!tb[TCA_GATE_PARMS]) return -EINVAL; - if (tb[TCA_GATE_CLOCKID]) { + if (tb[TCA_GATE_CLOCKID]) clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); - switch (clockid) { - case CLOCK_REALTIME: - tk_offset = TK_OFFS_REAL; - break; - case CLOCK_MONOTONIC: - tk_offset = TK_OFFS_MAX; - break; - case CLOCK_BOOTTIME: - tk_offset = TK_OFFS_BOOT; - break; - case CLOCK_TAI: - tk_offset = TK_OFFS_TAI; - break; - default: - NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); - return -EINVAL; - } - } parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; @@ -366,6 +409,60 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, return -EEXIST; } + gact = to_gate(*a); + + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + err = -ENOMEM; + goto chain_put; + } + INIT_LIST_HEAD(&p->entries); + + use_old_entries = !tb[TCA_GATE_ENTRY_LIST]; + if (!use_old_entries) { + err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); + if (err < 0) + goto err_free; + use_old_entries = !err; + } + + if (ret == ACT_P_CREATED && use_old_entries) { + NL_SET_ERR_MSG(extack, "The entry list is empty"); + err = -EINVAL; + goto err_free; + } + + if (ret != ACT_P_CREATED) { + rcu_read_lock(); + cur_p = rcu_dereference(gact->param); + + old_basetime = cur_p->tcfg_basetime; + old_clockid = cur_p->tcfg_clockid; + old_tk_offset = READ_ONCE(gact->tk_offset); + + basetime = old_basetime; + cycletime_ext = cur_p->tcfg_cycletime_ext; + prio = cur_p->tcfg_priority; + gflags = cur_p->tcfg_flags; + + if (!tb[TCA_GATE_CLOCKID]) + clockid = old_clockid; + + err = 0; + if (use_old_entries) { + err = tcf_gate_copy_entries(p, cur_p, extack); + if (!err && !tb[TCA_GATE_CYCLE_TIME]) + cycletime = cur_p->tcfg_cycletime; + } + rcu_read_unlock(); + if (err) + goto err_free; + } + if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); @@ -375,25 +472,26 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); - gact = to_gate(*a); - if (ret == ACT_P_CREATED) - INIT_LIST_HEAD(&gact->param.entries); + if (tb[TCA_GATE_CYCLE_TIME]) + cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); - err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); - if (err < 0) - goto release_idr; + if (tb[TCA_GATE_CYCLE_TIME_EXT]) + cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); - spin_lock_bh(&gact->tcf_lock); - p = &gact->param; + err = gate_clock_resolve(clockid, &tko, extack); + if (err) + goto err_free; + timer_clockid = clockid; - if (tb[TCA_GATE_CYCLE_TIME]) - cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); + need_cancel = ret != ACT_P_CREATED && + gate_timer_needs_cancel(basetime, old_basetime, + tko, old_tk_offset, + timer_clockid, old_clockid); - if (tb[TCA_GATE_ENTRY_LIST]) { - err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); - if (err < 0) - goto chain_put; - } + if (need_cancel) + hrtimer_cancel(&gact->hitimer); + + spin_lock_bh(&gact->tcf_lock); if (!cycletime) { struct tcfg_gate_entry *entry; @@ -402,22 +500,20 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); cycletime = cycle; - if (!cycletime) { - err 
= -EINVAL; - goto chain_put; - } } p->tcfg_cycletime = cycletime; + p->tcfg_cycletime_ext = cycletime_ext; - if (tb[TCA_GATE_CYCLE_TIME_EXT]) - p->tcfg_cycletime_ext = - nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); - - gate_setup_timer(gact, basetime, tk_offset, clockid, - ret == ACT_P_CREATED); + if (need_cancel || ret == ACT_P_CREATED) + gate_setup_timer(gact, timer_clockid, tko); p->tcfg_priority = prio; p->tcfg_flags = gflags; - gate_get_start_time(gact, &start); + p->tcfg_basetime = basetime; + p->tcfg_clockid = timer_clockid; + gate_get_start_time(gact, p, &start); + + old_p = rcu_replace_pointer(gact->param, p, + lockdep_is_held(&gact->tcf_lock)); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; @@ -434,11 +530,15 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); + if (old_p) + call_rcu(&old_p->rcu, tcf_gate_params_free_rcu); + return ret; +err_free: + release_entry_list(&p->entries); + kfree(p); chain_put: - spin_unlock_bh(&gact->tcf_lock); - if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: @@ -446,21 +546,29 @@ release_idr: * without taking tcf_lock. */ if (ret == ACT_P_CREATED) - gate_setup_timer(gact, gact->param.tcfg_basetime, - gact->tk_offset, gact->param.tcfg_clockid, - true); + gate_setup_timer(gact, timer_clockid, tko); + tcf_idr_release(*a, bind); return err; } +static void tcf_gate_params_free_rcu(struct rcu_head *head) +{ + struct tcf_gate_params *p = container_of(head, struct tcf_gate_params, rcu); + + release_entry_list(&p->entries); + kfree(p); +} + static void tcf_gate_cleanup(struct tc_action *a) { struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; - p = &gact->param; hrtimer_cancel(&gact->hitimer); - release_entry_list(&p->entries); + p = rcu_dereference_protected(gact->param, 1); + if (p) + call_rcu(&p->rcu, tcf_gate_params_free_rcu); } static int dumping_entry(struct sk_buff *skb, @@ -509,10 +617,9 @@ static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, struct nlattr *entry_list; struct tcf_t t; - spin_lock_bh(&gact->tcf_lock); - opt.action = gact->tcf_action; - - p = &gact->param; + rcu_read_lock(); + opt.action = READ_ONCE(gact->tcf_action); + p = rcu_dereference(gact->param); if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -552,12 +659,12 @@ static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) goto nla_put_failure; - spin_unlock_bh(&gact->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&gact->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 76e954155b45294c502e3d3a9e15757c858ca55e Mon Sep 17 00:00:00 2001 From: Harishankar Vishwanathan Date: Fri, 27 Feb 2026 22:32:21 +0100 Subject: bpf: Introduce tnum_step to step through tnum's members This commit introduces tnum_step(), a function that, when given t, and a number z returns the smallest member of t larger than z. The number z must be greater or equal to the smallest member of t and less than the largest member of t. The first step is to compute j, a number that keeps all of t's known bits, and matches all unknown bits to z's bits. Since j is a member of the t, it is already a candidate for result. However, we want our result to be (minimally) greater than z. There are only two possible cases: (1) Case j <= z. 
In this case, we want to increase the value of j and make it > z. (2) Case j > z. In this case, we want to decrease the value of j while keeping it > z.

(Case 1) j <= z

  t = xx11x0x0
  z = 10111101 (189)
  j = 10111000 (184)
           ^
           k

(Case 1.1) Let's first consider the case where j < z. We will address j == z later. Since z > j, there had to be a bit position that was 1 in z and a 0 in j, beyond which all positions of higher significance are equal in j and z. Further, this position could not have been unknown in t, because the unknown positions of t match z. This position had to be a 1 in z and a known 0 in t. Let k be the position of the most significant 1-to-0 flip. In our example, k = 3 (starting the count at 1 at the least significant bit). Setting (to 1) the unknown bits of t in positions of significance smaller than k will not produce a result > z. Hence, we must set/unset the unknown bits at positions of significance higher than k. Specifically, we look for the next larger combination of 1s and 0s to place in those positions, relative to the combination that exists in z. We can achieve this by concatenating bits at unknown positions of t into an integer, adding 1, and writing the bits of that result back into the corresponding bit positions previously extracted from z. From our example, considering only positions of significance greater than k:

  t = xx..x
  z = 10..1
    +     1
    -------
      11..0

This is the exact combination of 1s and 0s we need at the unknown bits of t in positions of significance greater than k. Further, our result must only increase the value minimally above z. Hence, unknown bits in positions of significance smaller than k should remain 0. We finally have:

  result = 11110000 (240)

(Case 1.2) Now consider the case when j = z, for example

  t = 1x1x0xxx
  z = 10110100 (180)
  j = 10110100 (180)

Matching the unknown bits of t to the bits of z yielded exactly z. To produce a number greater than z, we must set/unset the unknown bits in t, and *all* the unknown bits of t are candidates for being set/unset. We can do this as in Case 1.1, by adding 1 to the bits extracted from the masked bit positions of z. Essentially, this case is equivalent to Case 1.1, with k = 0.

  t = 1x1x0xxx
  z = .0.1.100
    +        1
    ----------
      .0.1.101

This is the exact combination of bits needed in the unknown positions of t. After recalling the known positions of t, we get:

  result = 10110101 (181)

(Case 2) j > z

  t = x00010x1
  z = 10000010 (130)
  j = 10001011 (139)
          ^
          k

Since j > z, there had to be a bit position which was 0 in z, and a 1 in j, beyond which all positions of higher significance are equal in j and z. This position had to be a 0 in z and a known 1 in t. Let k be the position of the most significant 0-to-1 flip. In our example, k = 4. Because of the 0-to-1 flip at position k, a member of t can become greater than z if the bits in positions greater than k are themselves >= the corresponding bits of z. To make that member *minimally* greater than z, the bits in positions greater than k must be exactly equal to those of z. Hence, we simply match all of t's unknown bits in positions more significant than k to z's bits. In positions less significant than k, we set all of t's unknown bits to 0 to retain minimality. In our example, in positions of greater significance than k (=4), t=x000. These positions are matched with z (1000) to produce 1000. In positions of lower significance than k, t=10x1. All unknown bits are set to 0 to produce 1001. The final result is:

  result = 10001001 (137)

This concludes the computation for a result > z that is a member of t.
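As a quick sanity check of the procedure above, here is a small stand-alone user-space sketch (not the in-kernel implementation; the names and the fls64() helper are local to the sketch) that follows the cases described and reproduces the worked examples:

  #include <assert.h>
  #include <stdint.h>

  /* 1-based position of the most significant set bit, 0 for x == 0 */
  static int fls64(uint64_t x)
  {
          int n = 0;

          while (x) {
                  n++;
                  x >>= 1;
          }
          return n;
  }

  /* value/mask describe the tnum t; returns the smallest member of t > z */
  static uint64_t step(uint64_t value, uint64_t mask, uint64_t z)
  {
          uint64_t tmax = value | mask, j, p, q, r, s, v;
          int k;

          if (z >= tmax)
                  return tmax;
          if (z < value)
                  return value;

          j = value | (z & mask);       /* match unknown bits of t to z */
          if (j > z) {                  /* Case 2 */
                  p = ~z & value & ~mask;
                  k = fls64(p);
                  q = ~0ULL << k;
                  return (q & z) | (~q & value);
          }
          /* Cases 1.1 and 1.2 */
          p = z & ~value & ~mask;
          k = fls64(p);
          q = ~0ULL << k;
          r = q & mask & z;             /* unknown positions > k, from z */
          s = q & ~mask;                /* known positions > k, set to 1 */
          v = (r | s) + (1ULL << k);    /* next larger combination */
          return (v & mask) | value;
  }

  int main(void)
  {
          assert(step(0x30, 0xca, 189) == 240); /* Case 1.1: t = xx11x0x0 */
          assert(step(0xa0, 0x57, 180) == 181); /* Case 1.2: t = 1x1x0xxx */
          assert(step(0x09, 0x82, 130) == 137); /* Case 2:   t = x00010x1 */
          return 0;
  }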
The procedure for tnum_step() in this commit implements the idea described above. As a proof of correctness, we verified the algorithm against a logical specification of tnum_step. The specification asserts the following about the inputs t, z and output res that: 1. res is a member of t, and 2. res is strictly greater than z, and 3. there does not exist another value res2 such that 3a. res2 is also a member of t, and 3b. res2 is greater than z 3c. res2 is smaller than res We checked the implementation against this logical specification using an SMT solver. The verification formula in SMTLIB format is available at [1]. The verification returned an "unsat": indicating that no input assignment exists for which the implementation and the specification produce different outputs. In addition, we also automatically generated the logical encoding of the C implementation using Agni [2] and verified it against the same specification. This verification also returned an "unsat", confirming that the implementation is equivalent to the specification. The formula for this check is also available at [3]. Link: https://pastebin.com/raw/2eRWbiit [1] Link: https://github.com/bpfverif/agni [2] Link: https://pastebin.com/raw/EztVbBJ2 [3] Co-developed-by: Srinivas Narayana Signed-off-by: Srinivas Narayana Co-developed-by: Santosh Nagarakatte Signed-off-by: Santosh Nagarakatte Signed-off-by: Harishankar Vishwanathan Link: https://lore.kernel.org/r/93fdf71910411c0f19e282ba6d03b4c65f9c5d73.1772225741.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 3 +++ kernel/bpf/tnum.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/include/linux/tnum.h b/include/linux/tnum.h index fa4654ffb621..ca2cfec8de08 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -131,4 +131,7 @@ static inline bool tnum_subreg_is_const(struct tnum a) return !(tnum_subreg(a)).mask; } +/* Returns the smallest member of t larger than z */ +u64 tnum_step(struct tnum t, u64 z); + #endif /* _LINUX_TNUM_H */ diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 26fbfbb01700..4abc359b3db0 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -269,3 +269,59 @@ struct tnum tnum_bswap64(struct tnum a) { return TNUM(swab64(a.value), swab64(a.mask)); } + +/* Given tnum t, and a number z such that tmin <= z < tmax, where tmin + * is the smallest member of the t (= t.value) and tmax is the largest + * member of t (= t.value | t.mask), returns the smallest member of t + * larger than z. + * + * For example, + * t = x11100x0 + * z = 11110001 (241) + * result = 11110010 (242) + * + * Note: if this function is called with z >= tmax, it just returns + * early with tmax; if this function is called with z < tmin, the + * algorithm already returns tmin. 
+ */ +u64 tnum_step(struct tnum t, u64 z) +{ + u64 tmax, j, p, q, r, s, v, u, w, res; + u8 k; + + tmax = t.value | t.mask; + + /* if z >= largest member of t, return largest member of t */ + if (z >= tmax) + return tmax; + + /* if z < smallest member of t, return smallest member of t */ + if (z < t.value) + return t.value; + + /* keep t's known bits, and match all unknown bits to z */ + j = t.value | (z & t.mask); + + if (j > z) { + p = ~z & t.value & ~t.mask; + k = fls64(p); /* k is the most-significant 0-to-1 flip */ + q = U64_MAX << k; + r = q & z; /* positions > k matched to z */ + s = ~q & t.value; /* positions <= k matched to t.value */ + v = r | s; + res = v; + } else { + p = z & ~t.value & ~t.mask; + k = fls64(p); /* k is the most-significant 1-to-0 flip */ + q = U64_MAX << k; + r = q & t.mask & z; /* unknown positions > k, matched to z */ + s = q & ~t.mask; /* known positions > k, set to 1 */ + v = r | s; + /* add 1 to unknown positions > k to make value greater than z */ + u = v + (1ULL << k); + /* extract bits in unknown positions > k from u, rest from t.value */ + w = (u & t.mask) | t.value; + res = w; + } + return res; +} -- cgit v1.2.3 From efc11a667878a1d655ff034a93a539debbfedb12 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 27 Feb 2026 22:35:02 +0100 Subject: bpf: Improve bounds when tnum has a single possible value We're hitting an invariant violation in Cilium that sometimes leads to BPF programs being rejected and Cilium failing to start [1]. The following extract from verifier logs shows what's happening: from 201 to 236: R1=0 R6=ctx() R7=1 R9=scalar(smin=umin=smin32=umin32=3584,smax=umax=smax32=umax32=3840,var_off=(0xe00; 0x100)) R10=fp0 236: R1=0 R6=ctx() R7=1 R9=scalar(smin=umin=smin32=umin32=3584,smax=umax=smax32=umax32=3840,var_off=(0xe00; 0x100)) R10=fp0 ; if (magic == MARK_MAGIC_HOST || magic == MARK_MAGIC_OVERLAY || magic == MARK_MAGIC_ENCRYPT) @ bpf_host.c:1337 236: (16) if w9 == 0xe00 goto pc+45 ; R9=scalar(smin=umin=smin32=umin32=3585,smax=umax=smax32=umax32=3840,var_off=(0xe00; 0x100)) 237: (16) if w9 == 0xf00 goto pc+1 verifier bug: REG INVARIANTS VIOLATION (false_reg1): range bounds violation u64=[0xe01, 0xe00] s64=[0xe01, 0xe00] u32=[0xe01, 0xe00] s32=[0xe01, 0xe00] var_off=(0xe00, 0x0) We reach instruction 236 with two possible values for R9, 0xe00 and 0xf00. This is perfectly reflected in the tnum, but of course the ranges are less accurate and cover [0xe00; 0xf00]. Taking the fallthrough path at instruction 236 allows the verifier to reduce the range to [0xe01; 0xf00]. The tnum is however not updated. With these ranges, at instruction 237, the verifier is not able to deduce that R9 is always equal to 0xf00. Hence the fallthrough pass is explored first, the verifier refines the bounds using the assumption that R9 != 0xf00, and ends up with an invariant violation. This pattern of impossible branch + bounds refinement is common to all invariant violations seen so far. The long-term solution is likely to rely on the refinement + invariant violation check to detect dead branches, as started by Eduard. To fix the current issue, we need something with less refactoring that we can backport. This patch uses the tnum_step helper introduced in the previous patch to detect the above situation. In particular, three cases are now detected in the bounds refinement: 1. The u64 range and the tnum only overlap in umin. u64: ---[xxxxxx]----- tnum: --xx----------x- 2. 
The u64 range and the tnum only overlap in the maximum value represented by the tnum, called tmax. u64: ---[xxxxxx]----- tnum: xx-----x-------- 3. The u64 range and the tnum only overlap in between umin (excluded) and umax. u64: ---[xxxxxx]----- tnum: xx----x-------x- To detect these three cases, we call tnum_step(tnum, umin), which returns the smallest member of the tnum greater than umin, called tnum_next here. We're in case (1) if umin is part of the tnum and tnum_next is greater than umax. We're in case (2) if umin is not part of the tnum and tnum_next is equal to tmax. Finally, we're in case (3) if umin is not part of the tnum, tnum_next is inferior or equal to umax, and calling tnum_step a second time gives us a value past umax. This change implements these three cases. With it, the above bytecode looks as follows: 0: (85) call bpf_get_prandom_u32#7 ; R0=scalar() 1: (47) r0 |= 3584 ; R0=scalar(smin=0x8000000000000e00,umin=umin32=3584,smin32=0x80000e00,var_off=(0xe00; 0xfffffffffffff1ff)) 2: (57) r0 &= 3840 ; R0=scalar(smin=umin=smin32=umin32=3584,smax=umax=smax32=umax32=3840,var_off=(0xe00; 0x100)) 3: (15) if r0 == 0xe00 goto pc+2 ; R0=3840 4: (15) if r0 == 0xf00 goto pc+1 4: R0=3840 6: (95) exit In addition to the new selftests, this change was also verified with Agni [3]. For the record, the raw SMT is available at [4]. The property it verifies is that: If a concrete value x is contained in all input abstract values, after __update_reg_bounds, it will continue to be contained in all output abstract values. Link: https://github.com/cilium/cilium/issues/44216 [1] Link: https://pchaigno.github.io/test-verifier-complexity.html [2] Link: https://github.com/bpfverif/agni [3] Link: https://pastebin.com/raw/naCfaqNx [4] Fixes: 0df1a55afa83 ("bpf: Warn on internal verifier errors") Acked-by: Eduard Zingerman Tested-by: Marco Schirrmeister Co-developed-by: Harishankar Vishwanathan Signed-off-by: Harishankar Vishwanathan Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/ef254c4f68be19bd393d450188946821c588565d.1772225741.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb12ba020649..401d6c4960ec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2379,6 +2379,9 @@ static void __update_reg32_bounds(struct bpf_reg_state *reg) static void __update_reg64_bounds(struct bpf_reg_state *reg) { + u64 tnum_next, tmax; + bool umin_in_tnum; + /* min signed is max(sign bit) | min(other bits) */ reg->smin_value = max_t(s64, reg->smin_value, reg->var_off.value | (reg->var_off.mask & S64_MIN)); @@ -2388,6 +2391,33 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) reg->umin_value = max(reg->umin_value, reg->var_off.value); reg->umax_value = min(reg->umax_value, reg->var_off.value | reg->var_off.mask); + + /* Check if u64 and tnum overlap in a single value */ + tnum_next = tnum_step(reg->var_off, reg->umin_value); + umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value; + tmax = reg->var_off.value | reg->var_off.mask; + if (umin_in_tnum && tnum_next > reg->umax_value) { + /* The u64 range and the tnum only overlap in umin. + * u64: ---[xxxxxx]----- + * tnum: --xx----------x- + */ + ___mark_reg_known(reg, reg->umin_value); + } else if (!umin_in_tnum && tnum_next == tmax) { + /* The u64 range and the tnum only overlap in the maximum value + * represented by the tnum, called tmax. 
+ * u64: ---[xxxxxx]----- + * tnum: xx-----x-------- + */ + ___mark_reg_known(reg, tmax); + } else if (!umin_in_tnum && tnum_next <= reg->umax_value && + tnum_step(reg->var_off, tnum_next) > reg->umax_value) { + /* The u64 range and the tnum only overlap in between umin + * (excluded) and umax. + * u64: ---[xxxxxx]----- + * tnum: xx----x-------x- + */ + ___mark_reg_known(reg, tnum_next); + } } static void __update_reg_bounds(struct bpf_reg_state *reg) -- cgit v1.2.3 From e6ad477d1bf8829973cddd9accbafa9d1a6cd15a Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 27 Feb 2026 22:36:30 +0100 Subject: selftests/bpf: Test refinement of single-value tnum This patch introduces selftests to cover the new bounds refinement logic introduced in the previous patch. Without the previous patch, the first two tests fail because of the invariant violation they trigger. The last test fails because the R10 access is not detected as dead code. In addition, all three tests fail because of R0 having a non-constant value in the verifier logs. In addition, the last two cases are covering the negative cases: when we shouldn't refine the bounds because the u64 and tnum overlap in at least two values. Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/90d880c8cf587b9f7dc715d8961cd1b8111d01a8.1772225741.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_bounds.c | 137 +++++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 560531404bce..97065a26cf70 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1863,4 +1863,141 @@ l1_%=: r0 = 1; \ : __clobber_all); } +/* This test covers the bounds deduction when the u64 range and the tnum + * overlap only at umax. After instruction 3, the ranges look as follows: + * + * 0 umin=0xe01 umax=0xf00 U64_MAX + * | [xxxxxxxxxxxxxx] | + * |----------------------------|------------------------------| + * | x x | tnum values + * + * The verifier can therefore deduce that the R0=0xf0=240. + */ +SEC("socket") +__description("bounds refinement with single-value tnum on umax") +__msg("3: (15) if r0 == 0xe0 {{.*}} R0=240") +__success __log_level(2) +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void bounds_refinement_tnum_umax(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + r0 |= 0xe0; \ + r0 &= 0xf0; \ + if r0 == 0xe0 goto +2; \ + if r0 == 0xf0 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* This test covers the bounds deduction when the u64 range and the tnum + * overlap only at umin. After instruction 3, the ranges look as follows: + * + * 0 umin=0xe00 umax=0xeff U64_MAX + * | [xxxxxxxxxxxxxx] | + * |----------------------------|------------------------------| + * | x x | tnum values + * + * The verifier can therefore deduce that the R0=0xe0=224. 
+ */ +SEC("socket") +__description("bounds refinement with single-value tnum on umin") +__msg("3: (15) if r0 == 0xf0 {{.*}} R0=224") +__success __log_level(2) +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void bounds_refinement_tnum_umin(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + r0 |= 0xe0; \ + r0 &= 0xf0; \ + if r0 == 0xf0 goto +2; \ + if r0 == 0xe0 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* This test covers the bounds deduction when the only possible tnum value is + * in the middle of the u64 range. After instruction 3, the ranges look as + * follows: + * + * 0 umin=0x7cf umax=0x7df U64_MAX + * | [xxxxxxxxxxxx] | + * |----------------------------|------------------------------| + * | x x x x x | tnum values + * | +--- 0x7e0 + * +--- 0x7d0 + * + * Since the lower four bits are zero, the tnum and the u64 range only overlap + * in R0=0x7d0=2000. Instruction 5 is therefore dead code. + */ +SEC("socket") +__description("bounds refinement with single-value tnum in middle of range") +__msg("3: (a5) if r0 < 0x7cf {{.*}} R0=2000") +__success __log_level(2) +__naked void bounds_refinement_tnum_middle(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + if r0 & 0x0f goto +4; \ + if r0 > 0x7df goto +3; \ + if r0 < 0x7cf goto +2; \ + if r0 == 0x7d0 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* This test cover the negative case for the tnum/u64 overlap. Since + * they contain the same two values (i.e., {0, 1}), we can't deduce + * anything more. + */ +SEC("socket") +__description("bounds refinement: several overlaps between tnum and u64") +__msg("2: (25) if r0 > 0x1 {{.*}} R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=1,var_off=(0x0; 0x1))") +__failure __log_level(2) +__naked void bounds_refinement_several_overlaps(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + if r0 < 0 goto +3; \ + if r0 > 1 goto +2; \ + if r0 == 1 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* This test cover the negative case for the tnum/u64 overlap. Since + * they overlap in the two values contained by the u64 range (i.e., + * {0xf, 0x10}), we can't deduce anything more. + */ +SEC("socket") +__description("bounds refinement: multiple overlaps between tnum and u64") +__msg("2: (25) if r0 > 0x10 {{.*}} R0=scalar(smin=umin=smin32=umin32=15,smax=umax=smax32=umax32=16,var_off=(0x0; 0x1f))") +__failure __log_level(2) +__naked void bounds_refinement_multiple_overlaps(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + if r0 < 0xf goto +3; \ + if r0 > 0x10 goto +2; \ + if r0 == 0x10 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 024cea2d647ed8ab942f19544b892d324dba42b4 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 27 Feb 2026 22:42:45 +0100 Subject: selftests/bpf: Avoid simplification of crafted bounds test The reg_bounds_crafted tests validate the verifier's range analysis logic. They focus on the actual ranges and thus ignore the tnum. As a consequence, they carry the assumption that the tested cases can be reproduced in userspace without using the tnum information. Unfortunately, the previous change the refinement logic breaks that assumption for one test case: (u64)2147483648 (u32) [4294967294; 0x100000000] The tested bytecode is shown below. 
Without our previous improvement, on the false branch of the condition, R7 is only known to have u64 range [0xfffffffe; 0x100000000]. With our improvement, and using the tnum information, we can deduce that R7 equals 0x100000000. 19: (bc) w0 = w6 ; R6=0x80000000 20: (bc) w0 = w7 ; R7=scalar(smin=umin=0xfffffffe,smax=umax=0x100000000,smin32=-2,smax32=0,var_off=(0x0; 0x1ffffffff)) 21: (be) if w6 <= w7 goto pc+3 ; R6=0x80000000 R7=0x100000000 R7's tnum is (0; 0x1ffffffff). On the false branch, regs_refine_cond_op refines R7's u32 range to [0; 0x7fffffff]. Then, __reg32_deduce_bounds refines the s32 range to 0 using u32 and finally also sets u32=0. From this, __reg_bound_offset improves the tnum to (0; 0x100000000). Finally, our previous patch uses this new tnum to deduce that it only intersect with u64=[0xfffffffe; 0x100000000] in a single value: 0x100000000. Because the verifier uses the tnum to reach this constant value, the selftest is unable to reproduce it by only simulating ranges. The solution implemented in this patch is to change the test case such that there is more than one overlap value between u64 and the tnum. The max. u64 value is thus changed from 0x100000000 to 0x300000000. Acked-by: Eduard Zingerman Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/50641c6a7ef39520595dcafa605692427c1006ec.1772225741.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/reg_bounds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index d93a0c7b1786..0322f817d07b 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -2091,7 +2091,7 @@ static struct subtest_case crafted_cases[] = { {U64, S64, {0, 0xffffffffULL}, {0x7fffffff, 0x7fffffff}}, {U64, U32, {0, 0x100000000}, {0, 0}}, - {U64, U32, {0xfffffffe, 0x100000000}, {0x80000000, 0x80000000}}, + {U64, U32, {0xfffffffe, 0x300000000}, {0x80000000, 0x80000000}}, {U64, S32, {0, 0xffffffff00000000ULL}, {0, 0}}, /* these are tricky cases where lower 32 bits allow to tighten 64 -- cgit v1.2.3 From 29252397bcc1e0a1f85e5c3bee59c325f5c26341 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Feb 2026 20:35:45 +0000 Subject: inet: annotate data-races around isk->inet_num UDP/TCP lookups are using RCU, thus isk->inet_num accesses should use READ_ONCE() and WRITE_ONCE() where needed. 
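Schematically, the change pairs every locked writer with the lockless RCU readers (condensed from the patch below):

  /* writer side, under the bind/hash locks */
  WRITE_ONCE(inet_sk(sk)->inet_num, port);

  /* RCU lookup side, no lock held */
  if (READ_ONCE(sk->sk_num) != hnum)
          return -1;

Without the annotations the compiler is free to tear, refetch, or cache these accesses while a lockless lookup reads them.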
Fixes: 3ab5aee7fe84 ("net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260225203545.1512417-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet6_hashtables.h | 2 +- include/net/inet_hashtables.h | 2 +- include/net/ip.h | 2 +- net/ipv4/inet_hashtables.c | 8 ++++---- net/ipv4/tcp_diag.c | 2 +- net/ipv6/inet6_hashtables.c | 3 ++- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 282e29237d93..c16de5b7963f 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -175,7 +175,7 @@ static inline bool inet6_match(const struct net *net, const struct sock *sk, { if (!net_eq(sock_net(sk), net) || sk->sk_family != AF_INET6 || - sk->sk_portpair != ports || + READ_ONCE(sk->sk_portpair) != ports || !ipv6_addr_equal(&sk->sk_v6_daddr, saddr) || !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return false; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ac05a52d9e13..5a979dcab538 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -345,7 +345,7 @@ static inline bool inet_match(const struct net *net, const struct sock *sk, int dif, int sdif) { if (!net_eq(sock_net(sk), net) || - sk->sk_portpair != ports || + READ_ONCE(sk->sk_portpair) != ports || sk->sk_addrpair != cookie) return false; diff --git a/include/net/ip.h b/include/net/ip.h index 69d5cef46004..7f9abd457e01 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -101,7 +101,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; - ipcm->protocol = inet->inet_num; + ipcm->protocol = READ_ONCE(inet->inet_num); } #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fca980772c81..9bfccc283fa6 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -200,7 +200,7 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port) { - inet_sk(sk)->inet_num = port; + WRITE_ONCE(inet_sk(sk)->inet_num, port); inet_csk(sk)->icsk_bind_hash = tb; inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); @@ -224,7 +224,7 @@ static void __inet_put_port(struct sock *sk) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; - inet_sk(sk)->inet_num = 0; + WRITE_ONCE(inet_sk(sk)->inet_num, 0); sk->sk_userlocks &= ~SOCK_CONNECT_BIND; spin_lock(&head2->lock); @@ -352,7 +352,7 @@ static inline int compute_score(struct sock *sk, const struct net *net, { int score = -1; - if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && + if (net_eq(sock_net(sk), net) && READ_ONCE(sk->sk_num) == hnum && !ipv6_only_sock(sk)) { if (sk->sk_rcv_saddr != daddr) return -1; @@ -1206,7 +1206,7 @@ error: sk->sk_hash = 0; inet_sk(sk)->inet_sport = 0; - inet_sk(sk)->inet_num = 0; + WRITE_ONCE(inet_sk(sk)->inet_num, 0); if (tw) inet_twsk_bind_unhash(tw, hinfo); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index d83efd91f461..7935702e394b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -509,7 +509,7 @@ next_chunk: if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next_normal; - if 
(r->id.idiag_sport != htons(sk->sk_num) && + if (r->id.idiag_sport != htons(READ_ONCE(sk->sk_num)) && r->id.idiag_sport) goto next_normal; if (r->id.idiag_dport != sk->sk_dport && diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 5e1da088d8e1..182d38e6d6d8 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -95,7 +95,8 @@ static inline int compute_score(struct sock *sk, const struct net *net, { int score = -1; - if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && + if (net_eq(sock_net(sk), net) && + READ_ONCE(inet_sk(sk)->inet_num) == hnum && sk->sk_family == PF_INET6) { if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; -- cgit v1.2.3 From 0b3cd139be565b85f4a3579e376152b9def6256a Mon Sep 17 00:00:00 2001 From: Jonas Köppeler Date: Thu, 26 Feb 2026 12:40:15 +0100 Subject: net/sched: sch_cake: avoid sync overhead when unlimited MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skip inter-instance sync when no rate limit is configured, as it serves no purpose and only adds overhead. Fixes: 1bddd758bac2 ("net/sched: sch_cake: share shaper state across sub-instances of cake_mq") Signed-off-by: Jonas Köppeler Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260226-cake-mq-skip-sync-bandwidth-unlimited-v1-1-01830bb4db87@tu-berlin.de Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index a01f14b1c216..b23fc7d9fafe 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2013,7 +2013,8 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) u64 delay; u32 len; - if (q->config->is_shared && now - q->last_checked_active >= q->config->sync_time) { + if (q->config->is_shared && q->rate_ns && + now - q->last_checked_active >= q->config->sync_time) { struct net_device *dev = qdisc_dev(sch); struct cake_sched_data *other_priv; u64 new_rate = q->config->rate_bps; -- cgit v1.2.3 From 15c2715a52645fd8e6e18b7abc3449292d118c7c Mon Sep 17 00:00:00 2001 From: Jonas Köppeler Date: Thu, 26 Feb 2026 12:40:16 +0100 Subject: net/sched: sch_cake: fixup cake_mq rate adjustment for diffserv config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_mq's rate adjustment during the sync periods did not adjust the rates for every tin in a diffserv config. This lead to inconsistencies of rates between the tins. Fix this by setting the rates for all tins during synchronization. 
Fixes: 1bddd758bac2 ("net/sched: sch_cake: share shaper state across sub-instances of cake_mq") Signed-off-by: Jonas Köppeler Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260226-cake-mq-skip-sync-bandwidth-unlimited-v1-2-01830bb4db87@tu-berlin.de Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index b23fc7d9fafe..9efe23f8371b 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -391,8 +391,8 @@ static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { 1239850263, 1191209601, 1147878294, 1108955788 }; -static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, - u64 target_ns, u64 rtt_est_ns); +static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust); + /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) * @@ -2040,12 +2040,9 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) if (num_active_qs > 1) new_rate = div64_u64(q->config->rate_bps, num_active_qs); - /* mtu = 0 is used to only update the rate and not mess with cobalt params */ - cake_set_rate(b, new_rate, 0, 0, 0); + cake_configure_rates(sch, new_rate, true); q->last_checked_active = now; q->active_queues = num_active_qs; - q->rate_ns = b->tin_rate_ns; - q->rate_shft = b->tin_rate_shft; } begin: @@ -2362,12 +2359,10 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, b->cparams.p_dec = 1 << 20; /* 1/4096 */ } -static int cake_config_besteffort(struct Qdisc *sch) +static int cake_config_besteffort(struct Qdisc *sch, u64 rate, u32 mtu) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[0]; - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->config->rate_bps; q->tin_cnt = 1; @@ -2381,12 +2376,10 @@ static int cake_config_besteffort(struct Qdisc *sch) return 0; } -static int cake_config_precedence(struct Qdisc *sch) +static int cake_config_precedence(struct Qdisc *sch, u64 rate, u32 mtu) { /* convert high-level (user visible) parameters into internal format */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->config->rate_bps; u32 quantum = 256; u32 i; @@ -2457,7 +2450,7 @@ static int cake_config_precedence(struct Qdisc *sch) * Total 12 traffic classes. 
*/ -static int cake_config_diffserv8(struct Qdisc *sch) +static int cake_config_diffserv8(struct Qdisc *sch, u64 rate, u32 mtu) { /* Pruned list of traffic classes for typical applications: * @@ -2474,8 +2467,6 @@ static int cake_config_diffserv8(struct Qdisc *sch) */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->config->rate_bps; u32 quantum = 256; u32 i; @@ -2505,7 +2496,7 @@ static int cake_config_diffserv8(struct Qdisc *sch) return 0; } -static int cake_config_diffserv4(struct Qdisc *sch) +static int cake_config_diffserv4(struct Qdisc *sch, u64 rate, u32 mtu) { /* Further pruned list of traffic classes for four-class system: * @@ -2518,8 +2509,6 @@ static int cake_config_diffserv4(struct Qdisc *sch) */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->config->rate_bps; u32 quantum = 1024; q->tin_cnt = 4; @@ -2547,7 +2536,7 @@ static int cake_config_diffserv4(struct Qdisc *sch) return 0; } -static int cake_config_diffserv3(struct Qdisc *sch) +static int cake_config_diffserv3(struct Qdisc *sch, u64 rate, u32 mtu) { /* Simplified Diffserv structure with 3 tins. * Latency Sensitive (CS7, CS6, EF, VA, TOS4) @@ -2555,8 +2544,6 @@ static int cake_config_diffserv3(struct Qdisc *sch) * Low Priority (LE, CS1) */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->config->rate_bps; u32 quantum = 1024; q->tin_cnt = 3; @@ -2581,32 +2568,33 @@ static int cake_config_diffserv3(struct Qdisc *sch) return 0; } -static void cake_reconfigure(struct Qdisc *sch) +static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust) { + u32 mtu = likely(rate_adjust) ? 0 : psched_mtu(qdisc_dev(sch)); struct cake_sched_data *qd = qdisc_priv(sch); struct cake_sched_config *q = qd->config; int c, ft; switch (q->tin_mode) { case CAKE_DIFFSERV_BESTEFFORT: - ft = cake_config_besteffort(sch); + ft = cake_config_besteffort(sch, rate, mtu); break; case CAKE_DIFFSERV_PRECEDENCE: - ft = cake_config_precedence(sch); + ft = cake_config_precedence(sch, rate, mtu); break; case CAKE_DIFFSERV_DIFFSERV8: - ft = cake_config_diffserv8(sch); + ft = cake_config_diffserv8(sch, rate, mtu); break; case CAKE_DIFFSERV_DIFFSERV4: - ft = cake_config_diffserv4(sch); + ft = cake_config_diffserv4(sch, rate, mtu); break; case CAKE_DIFFSERV_DIFFSERV3: default: - ft = cake_config_diffserv3(sch); + ft = cake_config_diffserv3(sch, rate, mtu); break; } @@ -2617,6 +2605,14 @@ static void cake_reconfigure(struct Qdisc *sch) qd->rate_ns = qd->tins[ft].tin_rate_ns; qd->rate_shft = qd->tins[ft].tin_rate_shft; +} + +static void cake_reconfigure(struct Qdisc *sch) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = qd->config; + + cake_configure_rates(sch, qd->config->rate_bps, false); if (q->buffer_config_limit) { qd->buffer_limit = q->buffer_config_limit; -- cgit v1.2.3 From ba14798653bb815b4dcd116c5265a9f748bc0c7f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Feb 2026 17:19:17 +0100 Subject: selftests: netfilter: nft_queue.sh: avoid flakes on debug kernels Jakub reports test flakes on debug kernels: FAIL: test_udp_gro_ct: Expected software segmentation to occur, had 23 and 17 This test assumes that the kernels nfnetlink_queue module sees N GSO packets, segments them into M skbs and queues them to userspace for reinjection. Hence, if M >= N, no segmentation occurred. 
However, its possible that this happens: - nfnetlink_queue gets GSO packet - segments that into n skbs - userspace buffer is full, kernel drops the segmented skbs -> "toqueue" counter incremented by 1, "fromqueue" is unchanged. If this happens often enough in a single run, M >= N check triggers incorrectly. To solve this, allow the nf_queue.c test program to set the FAIL_OPEN flag so that the segmented skbs bypass the queueing step in the kernel if the receive buffer is full. Also, reduce number of sending socat instances, decrease their priority and increase nice value for the nf_queue program itself to reduce the probability of overruns happening in the first place. Fixes: 59ecffa3995e ("selftests: netfilter: nft_queue.sh: add udp fraglist gro test case") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20260218184114.0b405b72@kernel.org/ Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260226161920.1205-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netfilter/nf_queue.c | 10 ++++++++-- tools/testing/selftests/net/netfilter/nft_queue.sh | 13 +++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/netfilter/nf_queue.c b/tools/testing/selftests/net/netfilter/nf_queue.c index 9e56b9d47037..116c0ca0eabb 100644 --- a/tools/testing/selftests/net/netfilter/nf_queue.c +++ b/tools/testing/selftests/net/netfilter/nf_queue.c @@ -18,6 +18,7 @@ struct options { bool count_packets; bool gso_enabled; + bool failopen; int verbose; unsigned int queue_num; unsigned int timeout; @@ -30,7 +31,7 @@ static struct options opts; static void help(const char *p) { - printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n", p); + printf("Usage: %s [-c|-v [-vv] ] [-o] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n", p); } static int parse_attr_cb(const struct nlattr *attr, void *data) @@ -236,6 +237,8 @@ struct mnl_socket *open_queue(void) flags = opts.gso_enabled ? NFQA_CFG_F_GSO : 0; flags |= NFQA_CFG_F_UID_GID; + if (opts.failopen) + flags |= NFQA_CFG_F_FAIL_OPEN; mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags)); mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags)); @@ -329,7 +332,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "chvt:q:Q:d:G")) != -1) { + while ((c = getopt(argc, argv, "chvot:q:Q:d:G")) != -1) { switch (c) { case 'c': opts.count_packets = true; @@ -366,6 +369,9 @@ static void parse_opts(int argc, char **argv) case 'G': opts.gso_enabled = false; break; + case 'o': + opts.failopen = true; + break; case 'v': opts.verbose++; break; diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index 139bc1211878..ea766bdc5d04 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -591,6 +591,7 @@ EOF test_udp_gro_ct() { local errprefix="FAIL: test_udp_gro_ct:" + local timeout=5 ip netns exec "$nsrouter" conntrack -F 2>/dev/null @@ -630,10 +631,10 @@ table inet udpq { } } EOF - timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12346,fork,pf=ipv4 OPEN:"$TMPFILE1",trunc & + timeout "$timeout" ip netns exec "$ns2" socat UDP-LISTEN:12346,fork,pf=ipv4 OPEN:"$TMPFILE1",trunc & local rpid=$! - ip netns exec "$nsrouter" ./nf_queue -G -c -q 1 -t 2 > "$TMPFILE2" & + ip netns exec "$nsrouter" nice -n -19 ./nf_queue -G -c -q 1 -o -t 2 > "$TMPFILE2" & local nfqpid=$! 
ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding on rx-gro-list on generic-receive-offload on @@ -643,8 +644,12 @@ EOF local bs=512 local count=$(((32 * 1024 * 1024) / bs)) - dd if=/dev/zero bs="$bs" count="$count" 2>/dev/null | for i in $(seq 1 16); do - timeout 5 ip netns exec "$ns1" \ + + local nprocs=$(nproc) + [ $nprocs -gt 1 ] && nprocs=$((nprocs - 1)) + + dd if=/dev/zero bs="$bs" count="$count" 2>/dev/null | for i in $(seq 1 $nprocs); do + timeout "$timeout" nice -n 19 ip netns exec "$ns1" \ socat -u -b 512 STDIN UDP-DATAGRAM:10.0.2.99:12346,reuseport,bind=0.0.0.0:55221 & done -- cgit v1.2.3 From 11cb63b0d1a0685e0831ae3c77223e002ef18189 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Wed, 25 Feb 2026 10:43:48 -0300 Subject: net/sched: Only allow act_ct to bind to clsact/ingress qdiscs and shared blocks As Paolo said earlier [1]: "Since the blamed commit below, classify can return TC_ACT_CONSUMED while the current skb being held by the defragmentation engine. As reported by GangMin Kim, if such packet is that may cause a UaF when the defrag engine later on tries to tuch again such packet." act_ct was never meant to be used in the egress path, however some users are attaching it to egress today [2]. Attempting to reach a middle ground, we noticed that, while most qdiscs are not handling TC_ACT_CONSUMED, clsact/ingress qdiscs are. With that in mind, we address the issue by only allowing act_ct to bind to clsact/ingress qdiscs and shared blocks. That way it's still possible to attach act_ct to egress (albeit only with clsact). [1] https://lore.kernel.org/netdev/674b8cbfc385c6f37fb29a1de08d8fe5c2b0fbee.1771321118.git.pabeni@redhat.com/ [2] https://lore.kernel.org/netdev/cc6bfb4a-4a2b-42d8-b9ce-7ef6644fb22b@ovn.org/ Reported-by: GangMin Kim Fixes: 3f14b377d01d ("net/sched: act_ct: fix skb leak and crash on ooo frags") CC: stable@vger.kernel.org Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260225134349.1287037-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/act_api.h | 1 + net/sched/act_ct.c | 6 ++++++ net/sched/cls_api.c | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/include/net/act_api.h b/include/net/act_api.h index e1e8f0f7dacb..d11b79107930 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -70,6 +70,7 @@ struct tc_action { #define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2)) #define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3)) #define TCA_ACT_FLAGS_AT_INGRESS (1U << (TCA_ACT_FLAGS_USER_BITS + 4)) +#define TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT (1U << (TCA_ACT_FLAGS_USER_BITS + 5)) /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. 
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index bd51522c7953..7d5e50c921a0 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1360,6 +1360,12 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, return -EINVAL; } + if (bind && !(flags & TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT)) { + NL_SET_ERR_MSG_MOD(extack, + "Attaching ct to a non ingress/clsact qdisc is unsupported"); + return -EOPNOTSUPP; + } + err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack); if (err < 0) return err; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 343309e9009b..4829c27446e3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2228,6 +2228,11 @@ static bool is_qdisc_ingress(__u32 classid) return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS)); } +static bool is_ingress_or_clsact(struct tcf_block *block, struct Qdisc *q) +{ + return tcf_block_shared(block) || (q && !!(q->flags & TCQ_F_INGRESS)); +} + static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -2420,6 +2425,8 @@ replay: flags |= TCA_ACT_FLAGS_NO_RTNL; if (is_qdisc_ingress(parent)) flags |= TCA_ACT_FLAGS_AT_INGRESS; + if (is_ingress_or_clsact(block, q)) + flags |= TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, flags, extack); if (err == 0) { -- cgit v1.2.3 From b14e82abf78affa50f4cabe8493e17f9adcfcec7 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Wed, 25 Feb 2026 10:43:49 -0300 Subject: selftests/tc-testing: Create tests to exercise act_ct binding restrictions Add 4 test cases to exercise new act_ct binding restrictions: - Try to attach act_ct to an ets qdisc - Attach act_ct to an ingress qdisc - Attach act_ct to a clsact/egress qdisc - Attach act_ct to a shared block Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260225134349.1287037-2-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- .../selftests/tc-testing/tc-tests/actions/ct.json | 159 +++++++++++++++++++++ 1 file changed, 159 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json index 7d07c55bb668..33bb8f3ff8ed 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json @@ -505,5 +505,164 @@ "teardown": [ "$TC qdisc del dev $DEV1 ingress" ] + }, + { + "id": "8883", + "name": "Try to attach act_ct to an ets qdisc", + "category": [ + "actions", + "ct" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + [ + "$TC actions flush action ct", + 0, + 1, + 255 + ], + "$TC qdisc add dev $DEV1 root handle 1: ets bands 2" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent 1: prio 1 protocol ip matchall action ct index 42", + "expExitCode": "2", + "verifyCmd": "$TC -j filter ls dev $DEV1 parent 1: prio 1 protocol ip", + "matchJSON": [], + "teardown": [ + "$TC qdisc del dev $DEV1 root" + ] + }, + { + "id": "3b10", + "name": "Attach act_ct to an ingress qdisc", + "category": [ + "actions", + "ct" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + [ + "$TC actions flush action ct", + 0, + 1, + 255 + ], + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 ingress prio 1 protocol ip matchall action ct index 42", + "expExitCode": "0", + "verifyCmd": "$TC -j filter ls dev $DEV1 ingress prio 1 protocol ip", + "matchJSON": [ + { + "kind": "matchall" + }, + { + "options": { + 
"actions": [ + { + "order": 1, + "kind": "ct", + "index": 42, + "ref": 1, + "bind": 1 + } + ] + } + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "0337", + "name": "Attach act_ct to a clsact/egress qdisc", + "category": [ + "actions", + "ct" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + [ + "$TC actions flush action ct", + 0, + 1, + 255 + ], + "$TC qdisc add dev $DEV1 clsact" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 egress prio 1 protocol ip matchall action ct index 42", + "expExitCode": "0", + "verifyCmd": "$TC -j filter ls dev $DEV1 egress prio 1 protocol ip", + "matchJSON": [ + { + "kind": "matchall" + }, + { + "options": { + "actions": [ + { + "order": 1, + "kind": "ct", + "index": 42, + "ref": 1, + "bind": 1 + } + ] + } + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact" + ] + }, + { + "id": "4f60", + "name": "Attach act_ct to a shared block", + "category": [ + "actions", + "ct" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + [ + "$TC actions flush action ct", + 0, + 1, + 255 + ], + "$TC qdisc add dev $DEV1 ingress_block 21 clsact" + ], + "cmdUnderTest": "$TC filter add block 21 prio 1 protocol ip matchall action ct index 42", + "expExitCode": "0", + "verifyCmd": "$TC -j filter ls block 21 prio 1 protocol ip", + "matchJSON": [ + { + "kind": "matchall" + }, + { + "options": { + "actions": [ + { + "order": 1, + "kind": "ct", + "index": 42, + "ref": 1, + "bind": 1 + } + ] + } + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 ingress_block 21 clsact" + ] } ] -- cgit v1.2.3 From 15fba71533bcdfaa8eeba69a5a5a2927afdf664a Mon Sep 17 00:00:00 2001 From: Valentin Spreckels Date: Thu, 26 Feb 2026 20:54:09 +0100 Subject: net: usb: r8152: add TRENDnet TUC-ET2G The TRENDnet TUC-ET2G is a RTL8156 based usb ethernet adapter. Add its vendor and product IDs. Signed-off-by: Valentin Spreckels Link: https://patch.msgid.link/20260226195409.7891-2-valentin@spreckels.dev Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 1 + include/linux/usb/r8152.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 4af85728ac4f..0c83bbbea2e7 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -10054,6 +10054,7 @@ static const struct usb_device_id rtl8152_table[] = { { USB_DEVICE(VENDOR_ID_DLINK, 0xb301) }, { USB_DEVICE(VENDOR_ID_DELL, 0xb097) }, { USB_DEVICE(VENDOR_ID_ASUS, 0x1976) }, + { USB_DEVICE(VENDOR_ID_TRENDNET, 0xe02b) }, {} }; diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h index 2ca60828f28b..1502b2a355f9 100644 --- a/include/linux/usb/r8152.h +++ b/include/linux/usb/r8152.h @@ -32,6 +32,7 @@ #define VENDOR_ID_DLINK 0x2001 #define VENDOR_ID_DELL 0x413c #define VENDOR_ID_ASUS 0x0b05 +#define VENDOR_ID_TRENDNET 0x20f4 #if IS_REACHABLE(CONFIG_USB_RTL8152) extern u8 rtl8152_get_version(struct usb_interface *intf); -- cgit v1.2.3 From dabffd08545ffa1d7183bc45e387860984025291 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 26 Feb 2026 11:28:33 -0800 Subject: net: mana: Ring doorbell at 4 CQ wraparounds MANA hardware requires at least one doorbell ring every 8 wraparounds of the CQ. The driver rings the doorbell as a form of flow control to inform hardware that CQEs have been consumed. The NAPI poll functions mana_poll_tx_cq() and mana_poll_rx_cq() can poll up to CQE_POLLING_BUFFER (512) completions per call. If the CQ has fewer than 512 entries, a single poll call can process more than 4 wraparounds without ringing the doorbell. 
The doorbell threshold check also uses ">" instead of ">=", delaying the ring by one extra CQE beyond 4 wraparounds. Combined, these issues can cause the driver to exceed the 8-wraparound hardware limit, leading to missed completions and stalled queues. Fix this by capping the number of CQEs polled per call to 4 wraparounds of the CQ in both TX and RX paths. Also change the doorbell threshold from ">" to ">=" so the doorbell is rung as soon as 4 wraparounds are reached. Cc: stable@vger.kernel.org Fixes: 58a63729c957 ("net: mana: Fix doorbell out of order violation and avoid unnecessary doorbell rings") Signed-off-by: Long Li Reviewed-by: Haiyang Zhang Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20260226192833.1050807-1-longli@microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/mana_en.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 933e9d681ded..9017e806ecda 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1770,8 +1770,14 @@ static void mana_poll_tx_cq(struct mana_cq *cq) ndev = txq->ndev; apc = netdev_priv(ndev); + /* Limit CQEs polled to 4 wraparounds of the CQ to ensure the + * doorbell can be rung in time for the hardware's requirement + * of at least one doorbell ring every 8 wraparounds. + */ comp_read = mana_gd_poll_cq(cq->gdma_cq, completions, - CQE_POLLING_BUFFER); + min((cq->gdma_cq->queue_size / + COMP_ENTRY_SIZE) * 4, + CQE_POLLING_BUFFER)); if (comp_read < 1) return; @@ -2156,7 +2162,14 @@ static void mana_poll_rx_cq(struct mana_cq *cq) struct mana_rxq *rxq = cq->rxq; int comp_read, i; - comp_read = mana_gd_poll_cq(cq->gdma_cq, comp, CQE_POLLING_BUFFER); + /* Limit CQEs polled to 4 wraparounds of the CQ to ensure the + * doorbell can be rung in time for the hardware's requirement + * of at least one doorbell ring every 8 wraparounds. + */ + comp_read = mana_gd_poll_cq(cq->gdma_cq, comp, + min((cq->gdma_cq->queue_size / + COMP_ENTRY_SIZE) * 4, + CQE_POLLING_BUFFER)); WARN_ON_ONCE(comp_read > CQE_POLLING_BUFFER); rxq->xdp_flush = false; @@ -2201,11 +2214,11 @@ static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue) mana_gd_ring_cq(gdma_queue, SET_ARM_BIT); cq->work_done_since_doorbell = 0; napi_complete_done(&cq->napi, w); - } else if (cq->work_done_since_doorbell > - cq->gdma_cq->queue_size / COMP_ENTRY_SIZE * 4) { + } else if (cq->work_done_since_doorbell >= + (cq->gdma_cq->queue_size / COMP_ENTRY_SIZE) * 4) { /* MANA hardware requires at least one doorbell ring every 8 * wraparounds of CQ even if there is no need to arm the CQ. - * This driver rings the doorbell as soon as we have exceeded + * This driver rings the doorbell as soon as it has processed * 4 wraparounds. */ mana_gd_ring_cq(gdma_queue, 0); -- cgit v1.2.3 From 407fd8b8d8cce03856aa67329715de48b254b529 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 11 Feb 2026 19:03:03 +0100 Subject: KVM: remove CONFIG_KVM_GENERIC_MMU_NOTIFIER All architectures now use MMU notifier for KVM page table management. Remove the Kconfig symbol and the code that is used when it is disabled. 
Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/Kconfig | 1 - arch/loongarch/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 4 ---- arch/powerpc/kvm/powerpc.c | 1 - arch/riscv/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 2 -- arch/x86/kvm/Kconfig | 1 - include/linux/kvm_host.h | 7 +------ virt/kvm/Kconfig | 9 +-------- virt/kvm/kvm_main.c | 16 ---------------- 11 files changed, 2 insertions(+), 42 deletions(-) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 4f803fd1c99a..7d1f22fd490b 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -21,7 +21,6 @@ menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" select KVM_COMMON select KVM_GENERIC_HARDWARE_ENABLING - select KVM_GENERIC_MMU_NOTIFIER select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index ed4f724db774..8e5213609975 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -28,7 +28,6 @@ config KVM select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING - select KVM_GENERIC_MMU_NOTIFIER select KVM_MMIO select VIRT_XFER_TO_GUEST_WORK select SCHED_INFO diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index cc13cc35f208..b1b9a1d67758 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_MMIO - select KVM_GENERIC_MMU_NOTIFIER select KVM_GENERIC_HARDWARE_ENABLING select HAVE_KVM_READONLY_MEM help diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index c9a2d50ff1b0..9a0d1c1aca6c 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -38,7 +38,6 @@ config KVM_BOOK3S_64_HANDLER config KVM_BOOK3S_PR_POSSIBLE bool select KVM_MMIO - select KVM_GENERIC_MMU_NOTIFIER config KVM_BOOK3S_HV_POSSIBLE bool @@ -81,7 +80,6 @@ config KVM_BOOK3S_64_HV tristate "KVM for POWER7 and later using hypervisor mode in host" depends on KVM_BOOK3S_64 && PPC_POWERNV select KVM_BOOK3S_HV_POSSIBLE - select KVM_GENERIC_MMU_NOTIFIER select KVM_BOOK3S_HV_PMU select CMA help @@ -203,7 +201,6 @@ config KVM_E500V2 depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO - select KVM_GENERIC_MMU_NOTIFIER help Support running unmodified E500 guest kernels in virtual machines on E500v2 host processors. @@ -220,7 +217,6 @@ config KVM_E500MC select KVM select KVM_MMIO select KVM_BOOKE_HV - select KVM_GENERIC_MMU_NOTIFIER help Support running unmodified E500MC/E5500/E6500 guest kernels in virtual machines on E500MC/E5500/E6500 host processors. 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 9a89a6d98f97..3da40ea8c562 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -625,7 +625,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_SYNC_MMU: - BUILD_BUG_ON(!IS_ENABLED(CONFIG_KVM_GENERIC_MMU_NOTIFIER)); r = 1; break; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 77379f77840a..ec2cee0a39e0 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -30,7 +30,6 @@ config KVM select KVM_GENERIC_HARDWARE_ENABLING select KVM_MMIO select VIRT_XFER_TO_GUEST_WORK - select KVM_GENERIC_MMU_NOTIFIER select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS help diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 917ac740513e..5b835bc6a194 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -28,9 +28,7 @@ config KVM select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL select KVM_VFIO - select MMU_NOTIFIER select VIRT_XFER_TO_GUEST_WORK - select KVM_GENERIC_MMU_NOTIFIER select KVM_MMU_LOCKLESS_AGING help Support hosting paravirtualized guest machines using the SIE diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d916bd766c94..801bf9e520db 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM_X86 def_tristate KVM if (KVM_INTEL != n || KVM_AMD != n) select KVM_COMMON - select KVM_GENERIC_MMU_NOTIFIER select KVM_ELIDE_TLB_FLUSH_IF_YOUNG select KVM_MMU_LOCKLESS_AGING select HAVE_KVM_IRQCHIP diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index dde605cb894e..34759a262b28 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -253,7 +253,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { unsigned long attributes; }; @@ -275,7 +274,6 @@ struct kvm_gfn_range { bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -#endif enum { OUTSIDE_GUEST_MODE, @@ -849,13 +847,12 @@ struct kvm { struct hlist_head irq_ack_notifier_list; #endif -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; gfn_t mmu_invalidate_range_start; gfn_t mmu_invalidate_range_end; -#endif + struct list_head devices; u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; @@ -2118,7 +2115,6 @@ extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { if (unlikely(kvm->mmu_invalidate_in_progress)) @@ -2196,7 +2192,6 @@ static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm, return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq; } -#endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 267c7369c765..794976b88c6f 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -5,6 +5,7 @@ config KVM_COMMON bool select EVENTFD select INTERVAL_TREE + select MMU_NOTIFIER select PREEMPT_NOTIFIERS config HAVE_KVM_PFNCACHE @@ -93,24 +94,16 @@ config 
HAVE_KVM_PM_NOTIFIER config KVM_GENERIC_HARDWARE_ENABLING bool -config KVM_GENERIC_MMU_NOTIFIER - select MMU_NOTIFIER - bool - config KVM_ELIDE_TLB_FLUSH_IF_YOUNG - depends on KVM_GENERIC_MMU_NOTIFIER bool config KVM_MMU_LOCKLESS_AGING - depends on KVM_GENERIC_MMU_NOTIFIER bool config KVM_GENERIC_MEMORY_ATTRIBUTES - depends on KVM_GENERIC_MMU_NOTIFIER bool config KVM_GUEST_MEMFD - depends on KVM_GENERIC_MMU_NOTIFIER select XARRAY_MULTI bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 22f8a672e1fd..29ee01747347 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -502,7 +502,6 @@ void kvm_destroy_vcpus(struct kvm *kvm) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_destroy_vcpus); -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); @@ -901,15 +900,6 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } -#else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */ - -static int kvm_init_mmu_notifier(struct kvm *kvm) -{ - return 0; -} - -#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */ - #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_pm_notifier_call(struct notifier_block *bl, unsigned long state, @@ -1226,10 +1216,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) out_err_no_debugfs: kvm_coalesced_mmio_free(kvm); out_no_coalesced_mmio: -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER if (kvm->mmu_notifier.ops) mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); -#endif out_err_no_mmu_notifier: kvm_disable_virtualization(); out_err_no_disable: @@ -1292,7 +1280,6 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); -#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); /* * At this point, pending calls to invalidate_range_start() @@ -1311,9 +1298,6 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm->mn_active_invalidate_count = 0; else WARN_ON(kvm->mmu_invalidate_in_progress); -#else - kvm_flush_shadow_all(kvm); -#endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { -- cgit v1.2.3 From 70295a479da684905c18d96656d781823f418ec2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 11 Feb 2026 19:06:31 +0100 Subject: KVM: always define KVM_CAP_SYNC_MMU KVM_CAP_SYNC_MMU is provided by KVM's MMU notifiers, which are now always available. Move the definition from individual architectures to common code. Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 10 ++++------ arch/arm64/kvm/arm.c | 1 - arch/loongarch/kvm/vm.c | 1 - arch/mips/kvm/mips.c | 1 - arch/powerpc/kvm/powerpc.c | 5 ----- arch/riscv/kvm/vm.c | 1 - arch/s390/kvm/kvm-s390.c | 1 - arch/x86/kvm/x86.c | 1 - virt/kvm/kvm_main.c | 1 + 9 files changed, 5 insertions(+), 17 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index fc5736839edd..6f85e1b321dd 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1396,7 +1396,10 @@ or its flags may be modified, but it may not be resized. Memory for the region is taken starting at the address denoted by the field userspace_addr, which must point at user addressable memory for the entire memory slot size. Any object may back this memory, including -anonymous memory, ordinary files, and hugetlbfs. +anonymous memory, ordinary files, and hugetlbfs. 
Changes in the backing +of the memory region are automatically reflected into the guest. +For example, an mmap() that affects the region will be made visible +immediately. Another example is madvise(MADV_DROP). On architectures that support a form of address tagging, userspace_addr must be an untagged address. @@ -1412,11 +1415,6 @@ use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it, to make a new slot read-only. In this case, writes to this memory will be posted to userspace as KVM_EXIT_MMIO exits. -When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of -the memory region are automatically reflected into the guest. For example, an -mmap() that affects the region will be made visible immediately. Another -example is madvise(MADV_DROP). - For TDX guest, deleting/moving memory region loses guest memory contents. Read only region isn't supported. Only as-id 0 is supported. diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 29f0326f7e00..410ffd41fd73 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -358,7 +358,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_IOEVENTFD: case KVM_CAP_USER_MEMORY: - case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_ONE_REG: case KVM_CAP_ARM_PSCI: diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index 63fd40530aa9..5282158b8122 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -118,7 +118,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: case KVM_CAP_READONLY_MEM: - case KVM_CAP_SYNC_MMU: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_IOEVENTFD: case KVM_CAP_MP_STATE: diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index b0fb92fda4d4..29d9f630edfb 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1035,7 +1035,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: case KVM_CAP_READONLY_MEM: - case KVM_CAP_SYNC_MMU: case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3da40ea8c562..00302399fc37 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -623,11 +623,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(hv_enabled && kvmppc_hv_ops->enable_nested && !kvmppc_hv_ops->enable_nested(NULL)); break; -#endif - case KVM_CAP_SYNC_MMU: - r = 1; - break; -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_HTAB_FD: r = hv_enabled; break; diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 7cbd2340c190..58bce57dc55b 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -181,7 +181,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_IOEVENTFD: case KVM_CAP_USER_MEMORY: - case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_ONE_REG: case KVM_CAP_READONLY_MEM: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7a175d86cef0..bc7d6fa66eaf 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -601,7 +601,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_S390_PSW: case KVM_CAP_S390_GMAP: - case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_S390_UCONTROL case KVM_CAP_S390_UCONTROL: #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3fb64905d190..a03530795707 100644 --- a/arch/x86/kvm/x86.c +++ 
b/arch/x86/kvm/x86.c @@ -4805,7 +4805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #endif case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: - case KVM_CAP_SYNC_MMU: case KVM_CAP_USER_NMI: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_IOEVENTFD: diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 29ee01747347..1bc1da66b4b0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4870,6 +4870,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm, static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) { switch (arg) { + case KVM_CAP_SYNC_MMU: case KVM_CAP_USER_MEMORY: case KVM_CAP_USER_MEMORY2: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: -- cgit v1.2.3 From 6996a2d2d0a64808c19c98002aeb5d9d1b2df6a4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 27 Feb 2026 03:55:35 +0000 Subject: udp: Unhash auto-bound connected sk from 4-tuple hash table when disconnected. Let's say we bind() an UDP socket to the wildcard address with a non-zero port, connect() it to an address, and disconnect it from the address. bind() sets SOCK_BINDPORT_LOCK on sk->sk_userlocks (but not SOCK_BINDADDR_LOCK), and connect() calls udp_lib_hash4() to put the socket into the 4-tuple hash table. Then, __udp_disconnect() calls sk->sk_prot->rehash(sk). It computes a new hash based on the wildcard address and moves the socket to a new slot in the 4-tuple hash table, leaving a garbage in the chain that no packet hits. Let's remove such a socket from 4-tuple hash table when disconnected. Note that udp_sk(sk)->udp_portaddr_hash needs to be udpated after udp_hash4_dec(hslot2) in udp_unhash4(). Fixes: 78c91ae2c6de ("ipv4/udp: Add 4-tuple hash for connected socket") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260227035547.3321327-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 014fdfdd331b..b60fad393e18 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2287,7 +2287,6 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); - udp_sk(sk)->udp_portaddr_hash = newhash; if (hslot2 != nhslot2 || rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -2321,19 +2320,25 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) if (udp_hashed4(sk)) { spin_lock_bh(&hslot->lock); - udp_rehash4(udptable, sk, newhash4); - if (hslot2 != nhslot2) { - spin_lock(&hslot2->lock); - udp_hash4_dec(hslot2); - spin_unlock(&hslot2->lock); - - spin_lock(&nhslot2->lock); - udp_hash4_inc(nhslot2); - spin_unlock(&nhslot2->lock); + if (inet_rcv_saddr_any(sk)) { + udp_unhash4(udptable, sk); + } else { + udp_rehash4(udptable, sk, newhash4); + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + udp_hash4_inc(nhslot2); + spin_unlock(&nhslot2->lock); + } } spin_unlock_bh(&hslot->lock); } + + udp_sk(sk)->udp_portaddr_hash = newhash; } } EXPORT_IPV6_MOD(udp_lib_rehash); -- cgit v1.2.3 From 026dfef287c07f37d4d4eef7a0b5a4bfdb29b32d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 26 Feb 2026 16:33:59 -0800 Subject: tcp: give up on stronger sk_rcvbuf checks (for now) We hit another corner case which leads to TcpExtTCPRcvQDrop Connections which send RPCs in the 20-80kB 
range over loopback experience spurious drops. The exact conditions for most of the drops I investigated are that: - socket exchanged >1MB of data so its not completely fresh - rcvbuf is around 128kB (default, hasn't grown) - there is ~60kB of data in rcvq - skb > 64kB arrives The sum of skb->len (!) of both of the skbs (the one already in rcvq and the arriving one) is larger than rwnd. My suspicion is that this happens because __tcp_select_window() rounds the rwnd up to (1 << wscale) if less than half of the rwnd has been consumed. Eric suggests that given the number of Fixes we already have pointing to 1d2fbaad7cd8 it's probably time to give up on it, until a bigger revamp of rmem management. Also while we could risk tweaking the rwnd math, there are other drops on workloads I investigated, after the commit in question, not explained by this phenomenon. Suggested-by: Eric Dumazet Link: https://lore.kernel.org/20260225122355.585fd57b@kernel.org Fixes: 1d2fbaad7cd8 ("tcp: stronger sk_rcvbuf checks") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260227003359.2391017-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6404e53382ca..7b03f2460751 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5374,25 +5374,11 @@ static void tcp_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); -/* Check if this incoming skb can be added to socket receive queues - * while satisfying sk->sk_rcvbuf limit. - * - * In theory we should use skb->truesize, but this can cause problems - * when applications use too small SO_RCVBUF values. - * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio, - * allowing RWIN to be close to available space. - * Whenever the receive queue gets full, we can receive a small packet - * filling RWIN, but with a high skb->truesize, because most NIC use 4K page - * plus sk_buff metadata even when receiving less than 1500 bytes of payload. - * - * Note that we use skb->len to decide to accept or drop this packet, - * but sk->sk_rmem_alloc is the sum of all skb->truesize. - */ static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb) { unsigned int rmem = atomic_read(&sk->sk_rmem_alloc); - return rmem + skb->len <= sk->sk_rcvbuf; + return rmem <= sk->sk_rcvbuf; } static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, -- cgit v1.2.3 From 1cc93c48b5d7add5ea038860539893ce1310d72d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 26 Feb 2026 19:34:46 -0800 Subject: selftests/net: packetdrill: remove tests for tcp_rcv_*big Since commit 1d2fbaad7cd8 ("tcp: stronger sk_rcvbuf checks") has been reverted we need to remove the corresponding tests. 
Link: https://lore.kernel.org/20260227003359.2391017-1-kuba@kernel.org Link: https://patch.msgid.link/20260227033446.2596457-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../net/packetdrill/tcp_rcv_big_endseq.pkt | 44 ---------------------- .../selftests/net/packetdrill/tcp_rcv_toobig.pkt | 33 ---------------- 2 files changed, 77 deletions(-) delete mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt delete mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt deleted file mode 100644 index 3848b419e68c..000000000000 --- a/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - ---mss=1000 - -`./defaults.sh` - - 0 `nstat -n` - -// Establish a connection. - +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 - +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 - +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [10000], 4) = 0 - +0 bind(3, ..., ...) = 0 - +0 listen(3, 1) = 0 - - +0 < S 0:0(0) win 32792 - +0 > S. 0:0(0) ack 1 - +.1 < . 1:1(0) ack 1 win 257 - - +0 accept(3, ..., ...) = 4 - - +0 < P. 1:4001(4000) ack 1 win 257 - +0 > . 1:1(0) ack 4001 win 5000 - -// packet in sequence : SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE / LINUX_MIB_BEYOND_WINDOW - +0 < P. 4001:54001(50000) ack 1 win 257 - +0 > . 1:1(0) ack 4001 win 5000 - -// ooo packet. : SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE / LINUX_MIB_BEYOND_WINDOW - +1 < P. 5001:55001(50000) ack 1 win 257 - +0 > . 1:1(0) ack 4001 win 5000 - -// SKB_DROP_REASON_TCP_INVALID_SEQUENCE / LINUX_MIB_BEYOND_WINDOW - +0 < P. 70001:80001(10000) ack 1 win 257 - +0 > . 1:1(0) ack 4001 win 5000 - - +0 read(4, ..., 100000) = 4000 - -// If queue is empty, accept a packet even if its end_seq is above wup + rcv_wnd - +0 < P. 4001:54001(50000) ack 1 win 257 - +0 > . 1:1(0) ack 54001 win 0 - -// Check LINUX_MIB_BEYOND_WINDOW has been incremented 3 times. -+0 `nstat | grep TcpExtBeyondWindow | grep -q " 3 "` diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt deleted file mode 100644 index f575c0ff89da..000000000000 --- a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - ---mss=1000 - -`./defaults.sh` - - 0 `nstat -n` - -// Establish a connection. - +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 - +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 - +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [20000], 4) = 0 - +0 bind(3, ..., ...) = 0 - +0 listen(3, 1) = 0 - - +0 < S 0:0(0) win 32792 - +0 > S. 0:0(0) ack 1 win 18980 - +.1 < . 1:1(0) ack 1 win 257 - - +0 accept(3, ..., ...) = 4 - - +0 < P. 1:20001(20000) ack 1 win 257 - +.04 > . 1:1(0) ack 20001 win 18000 - - +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [12000], 4) = 0 - +0 < P. 20001:80001(60000) ack 1 win 257 - +0 > . 1:1(0) ack 20001 win 18000 - - +0 read(4, ..., 20000) = 20000 -// A too big packet is accepted if the receive queue is empty - +0 < P. 20001:80001(60000) ack 1 win 257 - +0 > . 1:1(0) ack 80001 win 0 - -- cgit v1.2.3 From 60abb0ac11dccd6b98fd9182bc5f85b621688861 Mon Sep 17 00:00:00 2001 From: "Nikhil P. 
Rao" Date: Wed, 25 Feb 2026 00:00:26 +0000 Subject: xsk: Fix fragment node deletion to prevent buffer leak After commit b692bf9a7543 ("xsk: Get rid of xdp_buff_xsk::xskb_list_node"), the list_node field is reused for both the xskb pool list and the buffer free list, this causes a buffer leak as described below. xp_free() checks if a buffer is already on the free list using list_empty(&xskb->list_node). When list_del() is used to remove a node from the xskb pool list, it doesn't reinitialize the node pointers. This means list_empty() will return false even after the node has been removed, causing xp_free() to incorrectly skip adding the buffer to the free list. Fix this by using list_del_init() instead of list_del() in all fragment handling paths, this ensures the list node is reinitialized after removal, allowing the list_empty() to work correctly. Fixes: b692bf9a7543 ("xsk: Get rid of xdp_buff_xsk::xskb_list_node") Acked-by: Maciej Fijalkowski Signed-off-by: Nikhil P. Rao Link: https://patch.msgid.link/20260225000456.107806-2-nikhil.rao@amd.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 6 +++--- net/xdp/xsk.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 242e34f771cc..aefc368449d5 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -122,7 +122,7 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) goto out; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { - list_del(&pos->list_node); + list_del_init(&pos->list_node); xp_free(pos); } @@ -157,7 +157,7 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) frag = list_first_entry_or_null(&xskb->pool->xskb_list, struct xdp_buff_xsk, list_node); if (frag) { - list_del(&frag->list_node); + list_del_init(&frag->list_node); ret = &frag->xdp; } @@ -168,7 +168,7 @@ static inline void xsk_buff_del_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); - list_del(&xskb->list_node); + list_del_init(&xskb->list_node); } static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3b46bc635c43..be882a8d473c 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -186,7 +186,7 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; - list_del(&pos->list_node); + list_del_init(&pos->list_node); } return 0; -- cgit v1.2.3 From f7387d6579d65efd490a864254101cb665f2e7a7 Mon Sep 17 00:00:00 2001 From: "Nikhil P. Rao" Date: Wed, 25 Feb 2026 00:00:27 +0000 Subject: xsk: Fix zero-copy AF_XDP fragment drop AF_XDP should ensure that only a complete packet is sent to application. In the zero-copy case, if the Rx queue gets full as fragments are being enqueued, the remaining fragments are dropped. For the multi-buffer case, add a check to ensure that the Rx queue has enough space for all fragments of a packet before starting to enqueue them. Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Signed-off-by: Nikhil P. 
Rao Link: https://patch.msgid.link/20260225000456.107806-3-nikhil.rao@amd.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index be882a8d473c..6149f6a79897 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -167,25 +167,31 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) struct xdp_buff_xsk *pos, *tmp; struct list_head *xskb_list; u32 contd = 0; + u32 num_desc; int err; - if (frags) - contd = XDP_PKT_CONTD; + if (likely(!frags)) { + err = __xsk_rcv_zc(xs, xskb, len, contd); + if (err) + goto err; + return 0; + } - err = __xsk_rcv_zc(xs, xskb, len, contd); - if (err) + contd = XDP_PKT_CONTD; + num_desc = xdp_get_shared_info_from_buff(xdp)->nr_frags + 1; + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + err = -ENOBUFS; goto err; - if (likely(!frags)) - return 0; + } + __xsk_rcv_zc(xs, xskb, len, contd); xskb_list = &xskb->pool->xskb_list; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; - err = __xsk_rcv_zc(xs, pos, len, contd); - if (err) - goto err; + __xsk_rcv_zc(xs, pos, len, contd); list_del_init(&pos->list_node); } -- cgit v1.2.3 From 74badb9c20b1a9c02a95c735c6d3cd6121679c93 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 26 Feb 2026 21:58:12 -0800 Subject: dpaa2-switch: Fix interrupt storm after receiving bad if_id in IRQ handler Commit 31a7a0bbeb00 ("dpaa2-switch: add bounds check for if_id in IRQ handler") introduces a range check for if_id to avoid an out-of-bounds access. If an out-of-bounds if_id is detected, the interrupt status is not cleared. This may result in an interrupt storm. Clear the interrupt status after detecting an out-of-bounds if_id to avoid the problem. Found by an experimental AI code review agent at Google. Fixes: 31a7a0bbeb00 ("dpaa2-switch: add bounds check for if_id in IRQ handler") Cc: Junrui Luo Signed-off-by: Guenter Roeck Reviewed-by: Ioana Ciornei Link: https://patch.msgid.link/20260227055812.1777915-1-linux@roeck-us.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index fc0e0f36186e..52c1cb9cb7e0 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -1533,7 +1533,7 @@ static irqreturn_t dpaa2_switch_irq0_handler_thread(int irq_num, void *arg) if_id = (status & 0xFFFF0000) >> 16; if (if_id >= ethsw->sw_attr.num_ifs) { dev_err(dev, "Invalid if_id %d in IRQ status\n", if_id); - goto out; + goto out_clear; } port_priv = ethsw->ports[if_id]; @@ -1553,6 +1553,7 @@ static irqreturn_t dpaa2_switch_irq0_handler_thread(int irq_num, void *arg) dpaa2_switch_port_connect_mac(port_priv); } +out_clear: err = dpsw_clear_irq_status(ethsw->mc_io, 0, ethsw->dpsw_handle, DPSW_IRQ_INDEX_IF, status); if (err) -- cgit v1.2.3 From 101bacb303e89dc2e0640ae6a5e0fb97c4eb45bb Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 25 Feb 2026 20:32:40 +0800 Subject: atm: lec: fix null-ptr-deref in lec_arp_clear_vccs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzkaller reported a null-ptr-deref in lec_arp_clear_vccs(). 
This issue can be easily reproduced using the syzkaller reproducer. In the ATM LANE (LAN Emulation) module, the same atm_vcc can be shared by multiple lec_arp_table entries (e.g., via entry->vcc or entry->recv_vcc). When the underlying VCC is closed, lec_vcc_close() iterates over all ARP entries and calls lec_arp_clear_vccs() for each matched entry. For example, when lec_vcc_close() iterates through the hlists in priv->lec_arp_empty_ones or other ARP tables: 1. In the first iteration, for the first matched ARP entry sharing the VCC, lec_arp_clear_vccs() frees the associated vpriv (which is vcc->user_back) and sets vcc->user_back to NULL. 2. In the second iteration, for the next matched ARP entry sharing the same VCC, lec_arp_clear_vccs() is called again. It obtains a NULL vpriv from vcc->user_back (via LEC_VCC_PRIV(vcc)) and then attempts to dereference it via `vcc->pop = vpriv->old_pop`, leading to a null-ptr-deref crash. Fix this by adding a null check for vpriv before dereferencing it. If vpriv is already NULL, it means the VCC has been cleared by a previous call, so we can safely skip the cleanup and just clear the entry's vcc/recv_vcc pointers. The entire cleanup block (including vcc_release_async()) is placed inside the vpriv guard because a NULL vpriv indicates the VCC has already been fully released by a prior iteration — repeating the teardown would redundantly set flags and trigger callbacks on an already-closing socket. The Fixes tag points to the initial commit because the entry->vcc path has been vulnerable since the original code. The entry->recv_vcc path was later added by commit 8d9f73c0ad2f ("atm: fix a memory leak of vcc->user_back") with the same pattern, and both paths are fixed here. Reported-by: syzbot+72e3ea390c305de0e259@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68c95a83.050a0220.3c6139.0e5c.GAE@google.com/T/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Dan Carpenter Reviewed-by: Simon Horman Signed-off-by: Jiayuan Chen Link: https://patch.msgid.link/20260225123250.189289-1-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- net/atm/lec.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/net/atm/lec.c b/net/atm/lec.c index a107375c0629..fb93c6e1c329 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1260,24 +1260,28 @@ static void lec_arp_clear_vccs(struct lec_arp_table *entry) struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); struct net_device *dev = (struct net_device *)vcc->proto_data; - vcc->pop = vpriv->old_pop; - if (vpriv->xoff) - netif_wake_queue(dev); - kfree(vpriv); - vcc->user_back = NULL; - vcc->push = entry->old_push; - vcc_release_async(vcc, -EPIPE); + if (vpriv) { + vcc->pop = vpriv->old_pop; + if (vpriv->xoff) + netif_wake_queue(dev); + kfree(vpriv); + vcc->user_back = NULL; + vcc->push = entry->old_push; + vcc_release_async(vcc, -EPIPE); + } entry->vcc = NULL; } if (entry->recv_vcc) { struct atm_vcc *vcc = entry->recv_vcc; struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); - kfree(vpriv); - vcc->user_back = NULL; + if (vpriv) { + kfree(vpriv); + vcc->user_back = NULL; - entry->recv_vcc->push = entry->old_recv_push; - vcc_release_async(entry->recv_vcc, -EPIPE); + entry->recv_vcc->push = entry->old_recv_push; + vcc_release_async(entry->recv_vcc, -EPIPE); + } entry->recv_vcc = NULL; } } -- cgit v1.2.3 From 9197e5949a41cfb5d44a6b8a860766266340d558 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sat, 28 Feb 2026 11:56:03 +0900 Subject: firewire: ohci: initialize page array 
to use alloc_pages_bulk() correctly The call of alloc_pages_bulk() skips to fill entries of page array when the entries already have values. While, 1394 OHCI PCI driver passes the page array without initializing. It could cause invalid state at PFN validation in vmap(). Fixes: f2ae92780ab9 ("firewire: ohci: split page allocation from dma mapping") Reported-by: John Ogness Reported-and-tested-by: Harald Arnesen Reported-and-tested-by: David Gow Closes: https://lore.kernel.org/lkml/87tsv1vig5.fsf@jogness.linutronix.de/ Signed-off-by: Takashi Sakamoto Signed-off-by: Linus Torvalds --- drivers/firewire/ohci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 1c868c1e4a49..8153d62c58f0 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -848,7 +848,7 @@ static int ar_context_init(struct ar_context *ctx, struct fw_ohci *ohci, { struct device *dev = ohci->card.device; unsigned int i; - struct page *pages[AR_BUFFERS + AR_WRAPAROUND_PAGES]; + struct page *pages[AR_BUFFERS + AR_WRAPAROUND_PAGES] = { NULL }; dma_addr_t dma_addrs[AR_BUFFERS]; void *vaddr; struct descriptor *d; -- cgit v1.2.3 From 147792c395db870756a0dc87ce656c75ae7ab7e8 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Thu, 26 Feb 2026 15:53:56 +0530 Subject: net: ti: icssg-prueth: Fix ping failure after offload mode setup when link speed is not 1G When both eth interfaces with links up are added to a bridge or hsr interface, ping fails if the link speed is not 1Gbps (e.g., 100Mbps). The issue is seen because when switching to offload (bridge/hsr) mode, prueth_emac_restart() restarts the firmware and clears DRAM with memset_io(), setting all memory to 0. This includes PORT_LINK_SPEED_OFFSET which firmware reads for link speed. The value 0 corresponds to FW_LINK_SPEED_1G (0x00), so for 1Gbps links the default value is correct and ping works. For 100Mbps links, the firmware needs FW_LINK_SPEED_100M (0x01) but gets 0 instead, causing ping to fail. The function emac_adjust_link() is called to reconfigure, but it detects no state change (emac->link is still 1, speed/duplex match PHY) so new_state remains false and icssg_config_set_speed() is never called to correct the firmware speed value. The fix resets emac->link to 0 before calling emac_adjust_link() in prueth_emac_common_start(). This forces new_state=true, ensuring icssg_config_set_speed() is called to write the correct speed value to firmware memory. Fixes: 06feac15406f ("net: ti: icssg-prueth: Fix emac link speed handling") Signed-off-by: MD Danish Anwar Link: https://patch.msgid.link/20260226102356.2141871-1-danishanwar@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 0939994c932f..42a881bee109 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -273,6 +273,14 @@ static int prueth_emac_common_start(struct prueth *prueth) if (ret) goto disable_class; + /* Reset link state to force reconfiguration in + * emac_adjust_link(). Without this, if the link was already up + * before restart, emac_adjust_link() won't detect any state + * change and will skip critical configuration like writing + * speed to firmware. 
+ */ + emac->link = 0; + mutex_lock(&emac->ndev->phydev->lock); emac_adjust_link(emac->ndev); mutex_unlock(&emac->ndev->phydev->lock); -- cgit v1.2.3 From 9439a661c2e80485406ce2c90b107ca17858382d Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Thu, 26 Feb 2026 22:37:53 +0530 Subject: amd-xgbe: fix MAC_TCR_SS register width for 2.5G and 10M speeds Extend the MAC_TCR_SS (Speed Select) register field width from 2 bits to 3 bits to properly support all speed settings. The MAC_TCR register's SS field encoding requires 3 bits to represent all supported speeds: - 0x00: 10Gbps (XGMII) - 0x02: 2.5Gbps (GMII) / 100Mbps - 0x03: 1Gbps / 10Mbps - 0x06: 2.5Gbps (XGMII) - P100a only With only 2 bits, values 0x04-0x07 cannot be represented, which breaks 2.5G XGMII mode on newer platforms and causes incorrect speed select values to be programmed. Fixes: 07445f3c7ca1 ("amd-xgbe: Add support for 10 Mbps speed") Co-developed-by: Guruvendra Punugupati Signed-off-by: Guruvendra Punugupati Signed-off-by: Raju Rangoju Link: https://patch.msgid.link/20260226170753.250312-1-Raju.Rangoju@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h index 711f295eb777..80c2c27ac9dc 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h @@ -431,7 +431,7 @@ #define MAC_SSIR_SSINC_INDEX 16 #define MAC_SSIR_SSINC_WIDTH 8 #define MAC_TCR_SS_INDEX 29 -#define MAC_TCR_SS_WIDTH 2 +#define MAC_TCR_SS_WIDTH 3 #define MAC_TCR_TE_INDEX 0 #define MAC_TCR_TE_WIDTH 1 #define MAC_TCR_VNE_INDEX 24 -- cgit v1.2.3 From 11439c4635edd669ae435eec308f4ab8a0804808 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Mar 2026 15:39:31 -0800 Subject: Linux 7.0-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e944c6e71e81..2446085983f7 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 7 PATCHLEVEL = 0 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* -- cgit v1.2.3 From c35636e91e392e1540949bbc67932167cb48bc3a Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 18 Feb 2026 11:58:06 +0100 Subject: can: bcm: fix locking for bcm_op runtime updates Commit c2aba69d0c36 ("can: bcm: add locking for bcm_op runtime updates") added a locking for some variables that can be modified at runtime when updating the sending bcm_op with a new TX_SETUP command in bcm_tx_setup(). Usually the RX_SETUP only handles and filters incoming traffic with one exception: When the RX_RTR_FRAME flag is set a predefined CAN frame is sent when a specific RTR frame is received. Therefore the rx bcm_op uses bcm_can_tx() which uses the bcm_tx_lock that was only initialized in bcm_tx_setup(). Add the missing spin_lock_init() when allocating the bcm_op in bcm_rx_setup() to handle the RTR case properly. 
Fixes: c2aba69d0c36 ("can: bcm: add locking for bcm_op runtime updates") Reported-by: syzbot+5b11eccc403dd1cea9f8@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-can/699466e4.a70a0220.2c38d7.00ff.GAE@google.com/ Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20260218-bcm_spin_lock_init-v1-1-592634c8a5b5@hartkopp.net Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/can/bcm.c b/net/can/bcm.c index b7324e9c955b..fd9fa072881e 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1176,6 +1176,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!op) return -ENOMEM; + spin_lock_init(&op->bcm_tx_lock); op->can_id = msg_head->can_id; op->nframes = msg_head->nframes; op->cfsiz = CFSIZ(msg_head->flags); -- cgit v1.2.3 From c77bfbdd6aac31b152ee81522cd90ad1de18738f Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 26 Jan 2026 11:45:40 +0100 Subject: can: dummy_can: dummy_can_init(): fix packet statistics The former implementation was only counting the tx_packets value but not the tx_bytes as the skb was dropped at the driver layer. Enable CAN echo support (IFF_ECHO) in dummy_can_init(), which activates the code for setting and retrieving the echo SKB and counts the tx_bytes correctly. Fixes: 816cf430e84b ("can: add dummy_can driver") Cc: Vincent Mailhol Signed-off-by: Oliver Hartkopp Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20260126104540.21024-1-socketcan@hartkopp.net [mkl: make commit message imperative] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dummy_can.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/can/dummy_can.c b/drivers/net/can/dummy_can.c index 41953655e3d3..cd23de488edc 100644 --- a/drivers/net/can/dummy_can.c +++ b/drivers/net/can/dummy_can.c @@ -241,6 +241,7 @@ static int __init dummy_can_init(void) dev->netdev_ops = &dummy_can_netdev_ops; dev->ethtool_ops = &dummy_can_ethtool_ops; + dev->flags |= IFF_ECHO; /* enable echo handling */ priv = netdev_priv(dev); priv->can.bittiming_const = &dummy_can_bittiming_const; priv->can.bitrate_max = 20 * MEGA /* BPS */; -- cgit v1.2.3 From ab3f894de216f4a62adc3b57e9191888cbf26885 Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Mon, 9 Feb 2026 15:47:05 +0100 Subject: can: mcp251x: fix deadlock in error path of mcp251x_open The mcp251x_open() function calls free_irq() in its error path with the mcp_lock mutex held. But if an interrupt already occurred the interrupt handler will be waiting for the mcp_lock and free_irq() will deadlock waiting for the handler to finish. This issue is similar to the one fixed in commit 7dd9c26bd6cf ("can: mcp251x: fix deadlock if an interrupt occurs during mcp251x_open") but for the error path. To solve this issue move the call to free_irq() after the lock is released. Setting `priv->force_quit = 1` beforehand ensures that the IRQ handler will exit right away once it has acquired the lock. 
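For illustration, the fixed error path has roughly this shape (a minimal sketch with simplified surrounding code, not the literal driver change):

	priv->force_quit = 1;		/* threaded handler exits as soon as it gets the lock */
	...
	mutex_unlock(&priv->mcp_lock);
	free_irq(spi->irq, priv);	/* safe now: waits for the handler outside the lock */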
Signed-off-by: Alban Bedel Link: https://patch.msgid.link/20260209144706.2261954-1-alban.bedel@lht.dlh.de Fixes: bf66f3736a94 ("can: mcp251x: Move to threaded interrupts instead of workqueues.") Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251x.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index fa97adf25b73..bb7782582f40 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -1214,6 +1214,7 @@ static int mcp251x_open(struct net_device *net) { struct mcp251x_priv *priv = netdev_priv(net); struct spi_device *spi = priv->spi; + bool release_irq = false; unsigned long flags = 0; int ret; @@ -1257,12 +1258,24 @@ static int mcp251x_open(struct net_device *net) return 0; out_free_irq: - free_irq(spi->irq, priv); + /* The IRQ handler might be running, and if so it will be waiting + * for the lock. But free_irq() must wait for the handler to finish + * so calling it here would deadlock. + * + * Setting priv->force_quit will let the handler exit right away + * without any access to the hardware. This make it safe to call + * free_irq() after the lock is released. + */ + priv->force_quit = 1; + release_irq = true; + mcp251x_hw_sleep(spi); out_close: mcp251x_power_enable(priv->transceiver, 0); close_candev(net); mutex_unlock(&priv->mcp_lock); + if (release_irq) + free_irq(spi->irq, priv); return ret; } -- cgit v1.2.3 From 968b098220e393a10488b6a5dddb302b2eaedf66 Mon Sep 17 00:00:00 2001 From: Ziyi Guo Date: Fri, 13 Feb 2026 20:39:27 +0000 Subject: can: esd_usb: add endpoint type validation esd_usb_probe() constructs bulk pipes for two endpoints without verifying their transfer types: - usb_rcvbulkpipe(dev->udev, 1) for RX (version reply, async RX data) - usb_sndbulkpipe(dev->udev, 2) for TX (version query, CAN frames) A malformed USB device can present these endpoints with transfer types that differ from what the driver assumes, triggering the WARNING in usb_submit_urb(). Use usb_find_common_endpoints() to discover and validate the first bulk IN and bulk OUT endpoints at probe time, before any allocation. Found pipes are saved to struct esd_usb and code uses them directly instead of making pipes in place. Similar to - commit 136bed0bfd3b ("can: mcba_usb: properly check endpoint type") which established the usb_find_common_endpoints() + stored pipes pattern for CAN USB drivers. 
Fixes: 96d8e90382dc ("can: Add driver for esd CAN-USB/2 device") Suggested-by: Vincent Mailhol Signed-off-by: Ziyi Guo Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20260213203927.599163-1-n7l8m4@u.northwestern.edu Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/esd_usb.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/net/can/usb/esd_usb.c b/drivers/net/can/usb/esd_usb.c index 2892a68f510a..d257440fa01f 100644 --- a/drivers/net/can/usb/esd_usb.c +++ b/drivers/net/can/usb/esd_usb.c @@ -272,6 +272,9 @@ struct esd_usb { struct usb_anchor rx_submitted; + unsigned int rx_pipe; + unsigned int tx_pipe; + int net_count; u32 version; int rxinitdone; @@ -537,7 +540,7 @@ static void esd_usb_read_bulk_callback(struct urb *urb) } resubmit_urb: - usb_fill_bulk_urb(urb, dev->udev, usb_rcvbulkpipe(dev->udev, 1), + usb_fill_bulk_urb(urb, dev->udev, dev->rx_pipe, urb->transfer_buffer, ESD_USB_RX_BUFFER_SIZE, esd_usb_read_bulk_callback, dev); @@ -626,9 +629,7 @@ static int esd_usb_send_msg(struct esd_usb *dev, union esd_usb_msg *msg) { int actual_length; - return usb_bulk_msg(dev->udev, - usb_sndbulkpipe(dev->udev, 2), - msg, + return usb_bulk_msg(dev->udev, dev->tx_pipe, msg, msg->hdr.len * sizeof(u32), /* convert to # of bytes */ &actual_length, 1000); @@ -639,12 +640,8 @@ static int esd_usb_wait_msg(struct esd_usb *dev, { int actual_length; - return usb_bulk_msg(dev->udev, - usb_rcvbulkpipe(dev->udev, 1), - msg, - sizeof(*msg), - &actual_length, - 1000); + return usb_bulk_msg(dev->udev, dev->rx_pipe, msg, + sizeof(*msg), &actual_length, 1000); } static int esd_usb_setup_rx_urbs(struct esd_usb *dev) @@ -677,8 +674,7 @@ static int esd_usb_setup_rx_urbs(struct esd_usb *dev) urb->transfer_dma = buf_dma; - usb_fill_bulk_urb(urb, dev->udev, - usb_rcvbulkpipe(dev->udev, 1), + usb_fill_bulk_urb(urb, dev->udev, dev->rx_pipe, buf, ESD_USB_RX_BUFFER_SIZE, esd_usb_read_bulk_callback, dev); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; @@ -903,7 +899,7 @@ static netdev_tx_t esd_usb_start_xmit(struct sk_buff *skb, /* hnd must not be 0 - MSB is stripped in txdone handling */ msg->tx.hnd = BIT(31) | i; /* returned in TX done message */ - usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, 2), buf, + usb_fill_bulk_urb(urb, dev->udev, dev->tx_pipe, buf, msg->hdr.len * sizeof(u32), /* convert to # of bytes */ esd_usb_write_bulk_callback, context); @@ -1298,10 +1294,16 @@ done: static int esd_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { + struct usb_endpoint_descriptor *ep_in, *ep_out; struct esd_usb *dev; union esd_usb_msg *msg; int i, err; + err = usb_find_common_endpoints(intf->cur_altsetting, &ep_in, &ep_out, + NULL, NULL); + if (err) + return err; + dev = kzalloc_obj(*dev); if (!dev) { err = -ENOMEM; @@ -1309,6 +1311,8 @@ static int esd_usb_probe(struct usb_interface *intf, } dev->udev = interface_to_usbdev(intf); + dev->rx_pipe = usb_rcvbulkpipe(dev->udev, ep_in->bEndpointAddress); + dev->tx_pipe = usb_sndbulkpipe(dev->udev, ep_out->bEndpointAddress); init_usb_anchor(&dev->rx_submitted); -- cgit v1.2.3 From 38a01c9700b0dcafe97dfa9dc7531bf4a245deff Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 23 Feb 2026 17:51:17 +0100 Subject: can: ems_usb: ems_usb_read_bulk_callback(): check the proper length of a message When looking at the data in a USB urb, the actual_length is the size of the buffer passed to the driver, not the transfer_buffer_length which is set by the driver as the max size of 
the buffer. When parsing the messages in ems_usb_read_bulk_callback() properly check the size both at the beginning of parsing the message to make sure it is big enough for the expected structure, and at the end of the message to make sure we don't overflow past the end of the buffer for the next message. Cc: Vincent Mailhol Cc: Marc Kleine-Budde Cc: stable@kernel.org Assisted-by: gkh_clanker_2000 Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2026022316-answering-strainer-a5db@gregkh Fixes: 702171adeed3 ("ems_usb: Added support for EMS CPC-USB/ARM7 CAN/USB interface") Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/ems_usb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index 4c219a5b139b..9b25dda7c183 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -445,6 +445,11 @@ static void ems_usb_read_bulk_callback(struct urb *urb) start = CPC_HEADER_SIZE; while (msg_count) { + if (start + CPC_MSG_HEADER_LEN > urb->actual_length) { + netdev_err(netdev, "format error\n"); + break; + } + msg = (struct ems_cpc_msg *)&ibuf[start]; switch (msg->type) { @@ -474,7 +479,7 @@ static void ems_usb_read_bulk_callback(struct urb *urb) start += CPC_MSG_HEADER_LEN + msg->length; msg_count--; - if (start > urb->transfer_buffer_length) { + if (start > urb->actual_length) { netdev_err(netdev, "format error\n"); break; } -- cgit v1.2.3 From 1e446fd0582ad8be9f6dafb115fc2e7245f9bea7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 23 Feb 2026 17:30:20 +0100 Subject: can: ucan: Fix infinite loop from zero-length messages If a broken ucan device gets a message with the message length field set to 0, then the driver will loop for forever in ucan_read_bulk_callback(), hanging the system. If the length is 0, just skip the message and go on to the next one. This has been fixed in the kvaser_usb driver in the past in commit 0c73772cd2b8 ("can: kvaser_usb: leaf: Fix potential infinite loop in command parsers"), so there must be some broken devices out there like this somewhere. Cc: Marc Kleine-Budde Cc: Vincent Mailhol Cc: stable@kernel.org Assisted-by: gkh_clanker_2000 Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2026022319-huff-absurd-6a18@gregkh Fixes: 9f2d3eae88d2 ("can: ucan: add driver for Theobroma Systems UCAN devices") Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/ucan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c index c79508b1c43e..0ea0ac75e42f 100644 --- a/drivers/net/can/usb/ucan.c +++ b/drivers/net/can/usb/ucan.c @@ -748,7 +748,7 @@ static void ucan_read_bulk_callback(struct urb *urb) len = le16_to_cpu(m->len); /* check sanity (length of content) */ - if (urb->actual_length - pos < len) { + if ((len == 0) || (urb->actual_length - pos < len)) { netdev_warn(up->netdev, "invalid message (short; no data; l:%d)\n", urb->actual_length); -- cgit v1.2.3 From 5eaad4f768266f1f17e01232ffe2ef009f8129b7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 23 Feb 2026 17:39:20 +0100 Subject: can: usb: etas_es58x: correctly anchor the urb in the read bulk callback When submitting an urb, that is using the anchor pattern, it needs to be anchored before submitting it otherwise it could be leaked if usb_kill_anchored_urbs() is called. This logic is correctly done elsewhere in the driver, except in the read bulk callback so do that here also. 
Cc: Vincent Mailhol Cc: Marc Kleine-Budde Cc: stable@kernel.org Assisted-by: gkh_clanker_2000 Signed-off-by: Greg Kroah-Hartman Reviewed-by: Vincent Mailhol Tested-by: Vincent Mailhol Link: https://patch.msgid.link/2026022320-poser-stiffly-9d84@gregkh Fixes: 8537257874e9 ("can: etas_es58x: add core support for ETAS ES58X CAN USB interfaces") Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/etas_es58x/es58x_core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c index 2d248deb69dc..b259f6109808 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_core.c +++ b/drivers/net/can/usb/etas_es58x/es58x_core.c @@ -1461,12 +1461,18 @@ static void es58x_read_bulk_callback(struct urb *urb) } resubmit_urb: + usb_anchor_urb(urb, &es58x_dev->rx_urbs); ret = usb_submit_urb(urb, GFP_ATOMIC); + if (!ret) + return; + + usb_unanchor_urb(urb); + if (ret == -ENODEV) { for (i = 0; i < es58x_dev->num_can_ch; i++) if (es58x_dev->netdev[i]) netif_device_detach(es58x_dev->netdev[i]); - } else if (ret) + } else dev_err_ratelimited(dev, "Failed resubmitting read bulk urb: %pe\n", ERR_PTR(ret)); -- cgit v1.2.3 From 7299b1b39a255f6092ce4ec0b65f66e9d6a357af Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 23 Feb 2026 13:10:30 +0100 Subject: can: usb: f81604: handle short interrupt urb messages properly If an interrupt urb is received that is not the correct length, properly detect it and don't attempt to treat the data as valid. Cc: Ji-Ze Hong (Peter Hong) Cc: Marc Kleine-Budde Cc: Vincent Mailhol Cc: stable@kernel.org Assisted-by: gkh_clanker_2000 Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2026022331-opal-evaluator-a928@gregkh Fixes: 88da17436973 ("can: usb: f81604: add Fintek F81604 support") Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/f81604.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/can/usb/f81604.c b/drivers/net/can/usb/f81604.c index 76578063ac82..c61bd30d1765 100644 --- a/drivers/net/can/usb/f81604.c +++ b/drivers/net/can/usb/f81604.c @@ -620,6 +620,12 @@ static void f81604_read_int_callback(struct urb *urb) netdev_info(netdev, "%s: Int URB aborted: %pe\n", __func__, ERR_PTR(urb->status)); + if (urb->actual_length < sizeof(*data)) { + netdev_warn(netdev, "%s: short int URB: %u < %zu\n", + __func__, urb->actual_length, sizeof(*data)); + goto resubmit_urb; + } + switch (urb->status) { case 0: /* success */ break; -- cgit v1.2.3 From 51f94780720fa90c424f67e3e9784cb8ef8190e5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 23 Feb 2026 13:10:31 +0100 Subject: can: usb: f81604: handle bulk write errors properly If a write urb fails then more needs to be done other than just logging the message, otherwise the transmission could be stalled. Properly increment the error counters and wake up the queues so that data will continue to flow. 
Cc: Ji-Ze Hong (Peter Hong) Cc: Marc Kleine-Budde Cc: Vincent Mailhol Cc: stable@kernel.org Assisted-by: gkh_clanker_2000 Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2026022334-slackness-dynamic-9195@gregkh Fixes: 88da17436973 ("can: usb: f81604: add Fintek F81604 support") Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/f81604.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/usb/f81604.c b/drivers/net/can/usb/f81604.c index c61bd30d1765..1cc927d79b6a 100644 --- a/drivers/net/can/usb/f81604.c +++ b/drivers/net/can/usb/f81604.c @@ -880,9 +880,27 @@ static void f81604_write_bulk_callback(struct urb *urb) if (!netif_device_present(netdev)) return; - if (urb->status) - netdev_info(netdev, "%s: Tx URB error: %pe\n", __func__, - ERR_PTR(urb->status)); + if (!urb->status) + return; + + switch (urb->status) { + case -ENOENT: + case -ECONNRESET: + case -ESHUTDOWN: + return; + default: + break; + } + + if (net_ratelimit()) + netdev_err(netdev, "%s: Tx URB error: %pe\n", __func__, + ERR_PTR(urb->status)); + + can_free_echo_skb(netdev, 0, NULL); + netdev->stats.tx_dropped++; + netdev->stats.tx_errors++; + + netif_wake_queue(netdev); } static void f81604_clear_reg_work(struct work_struct *work) -- cgit v1.2.3 From 952caa5da10bed22be09612433964f6877ba0dde Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 23 Feb 2026 13:10:32 +0100 Subject: can: usb: f81604: correctly anchor the urb in the read bulk callback When submitting an urb, that is using the anchor pattern, it needs to be anchored before submitting it otherwise it could be leaked if usb_kill_anchored_urbs() is called. This logic is correctly done elsewhere in the driver, except in the read bulk callback so do that here also. 
Cc: Ji-Ze Hong (Peter Hong) Cc: Marc Kleine-Budde Cc: Vincent Mailhol Cc: stable@kernel.org Assisted-by: gkh_clanker_2000 Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2026022334-starlight-scaling-2cea@gregkh Fixes: 88da17436973 ("can: usb: f81604: add Fintek F81604 support") Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/f81604.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/usb/f81604.c b/drivers/net/can/usb/f81604.c index 1cc927d79b6a..f12318268e46 100644 --- a/drivers/net/can/usb/f81604.c +++ b/drivers/net/can/usb/f81604.c @@ -413,6 +413,7 @@ static void f81604_read_bulk_callback(struct urb *urb) { struct f81604_can_frame *frame = urb->transfer_buffer; struct net_device *netdev = urb->context; + struct f81604_port_priv *priv = netdev_priv(netdev); int ret; if (!netif_device_present(netdev)) @@ -445,10 +446,15 @@ static void f81604_read_bulk_callback(struct urb *urb) f81604_process_rx_packet(netdev, frame); resubmit_urb: + usb_anchor_urb(urb, &priv->urbs_anchor); ret = usb_submit_urb(urb, GFP_ATOMIC); + if (!ret) + return; + usb_unanchor_urb(urb); + if (ret == -ENODEV) netif_device_detach(netdev); - else if (ret) + else netdev_err(netdev, "%s: failed to resubmit read bulk urb: %pe\n", __func__, ERR_PTR(ret)); @@ -652,10 +658,15 @@ static void f81604_read_int_callback(struct urb *urb) f81604_handle_tx(priv, data); resubmit_urb: + usb_anchor_urb(urb, &priv->urbs_anchor); ret = usb_submit_urb(urb, GFP_ATOMIC); + if (!ret) + return; + usb_unanchor_urb(urb); + if (ret == -ENODEV) netif_device_detach(netdev); - else if (ret) + else netdev_err(netdev, "%s: failed to resubmit int urb: %pe\n", __func__, ERR_PTR(ret)); } -- cgit v1.2.3 From 2df6162785f31f1bbb598cfc3b08e4efc88f80b6 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 19 Feb 2026 13:57:34 +0100 Subject: can: gs_usb: gs_can_open(): always configure bitrates before starting device So far the driver populated the struct can_priv::do_set_bittiming() and struct can_priv::fd::do_set_data_bittiming() callbacks. Before bringing up the interface, user space has to configure the bitrates. With these callbacks the configuration is directly forwarded into the CAN hardware. Then the interface can be brought up. An ifdown-ifup cycle (without changing the bit rates) doesn't re-configure the bitrates in the CAN hardware. This leads to a problem with the CANable-2.5 [1] firmware, which resets the configured bit rates during ifdown. To fix the problem remove both bit timing callbacks and always configure the bitrates in the struct net_device_ops::ndo_open() callback. 
[1] https://github.com/Elmue/CANable-2.5-firmware-Slcan-and-Candlelight Cc: stable@vger.kernel.org Fixes: d08e973a77d1 ("can: gs_usb: Added support for the GS_USB CAN devices") Link: https://patch.msgid.link/20260219-gs_usb-always-configure-bitrates-v2-1-671f8ba5b0a5@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/gs_usb.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 9d27d6f0c0b5..ec9a7cbbbc69 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -772,9 +772,8 @@ device_detach: } } -static int gs_usb_set_bittiming(struct net_device *netdev) +static int gs_usb_set_bittiming(struct gs_can *dev) { - struct gs_can *dev = netdev_priv(netdev); struct can_bittiming *bt = &dev->can.bittiming; struct gs_device_bittiming dbt = { .prop_seg = cpu_to_le32(bt->prop_seg), @@ -791,9 +790,8 @@ static int gs_usb_set_bittiming(struct net_device *netdev) GFP_KERNEL); } -static int gs_usb_set_data_bittiming(struct net_device *netdev) +static int gs_usb_set_data_bittiming(struct gs_can *dev) { - struct gs_can *dev = netdev_priv(netdev); struct can_bittiming *bt = &dev->can.fd.data_bittiming; struct gs_device_bittiming dbt = { .prop_seg = cpu_to_le32(bt->prop_seg), @@ -1057,6 +1055,20 @@ static int gs_can_open(struct net_device *netdev) if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) flags |= GS_CAN_MODE_HW_TIMESTAMP; + rc = gs_usb_set_bittiming(dev); + if (rc) { + netdev_err(netdev, "failed to set bittiming: %pe\n", ERR_PTR(rc)); + goto out_usb_kill_anchored_urbs; + } + + if (ctrlmode & CAN_CTRLMODE_FD) { + rc = gs_usb_set_data_bittiming(dev); + if (rc) { + netdev_err(netdev, "failed to set data bittiming: %pe\n", ERR_PTR(rc)); + goto out_usb_kill_anchored_urbs; + } + } + /* finally start device */ dev->can.state = CAN_STATE_ERROR_ACTIVE; dm.flags = cpu_to_le32(flags); @@ -1370,7 +1382,6 @@ static struct gs_can *gs_make_candev(unsigned int channel, dev->can.state = CAN_STATE_STOPPED; dev->can.clock.freq = le32_to_cpu(bt_const.fclk_can); dev->can.bittiming_const = &dev->bt_const; - dev->can.do_set_bittiming = gs_usb_set_bittiming; dev->can.ctrlmode_supported = CAN_CTRLMODE_CC_LEN8_DLC; @@ -1394,7 +1405,6 @@ static struct gs_can *gs_make_candev(unsigned int channel, * GS_CAN_FEATURE_BT_CONST_EXT is set. */ dev->can.fd.data_bittiming_const = &dev->bt_const; - dev->can.fd.do_set_data_bittiming = gs_usb_set_data_bittiming; } if (feature & GS_CAN_FEATURE_TERMINATION) { -- cgit v1.2.3 From 7e1e6d6845329adb2da75110a061557e9c26d9b7 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 12 Feb 2026 11:30:00 -0500 Subject: dt-bindings: net: can: nxp,sja1000: add reference to mc-peripheral-props.yaml Add a reference to mc-peripheral-props.yaml to allow vendor-specific properties for memory access timings. 
Fix below CHECK_DTBS warings: arch/arm/boot/dts/nxp/imx/imx27-phytec-phycore-rdk.dtb: can@4,0 (nxp,sja1000): Unevaluated properties are not allowed ('fsl,weim-cs-timing' was unexpected) from schema $id: http://devicetree.org/schemas/net/can/nxp,sja1000.yaml Signed-off-by: Frank Li Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20260212163000.1195586-1-Frank.Li@nxp.com Signed-off-by: Marc Kleine-Budde --- Documentation/devicetree/bindings/net/can/nxp,sja1000.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/can/nxp,sja1000.yaml b/Documentation/devicetree/bindings/net/can/nxp,sja1000.yaml index ec0c2168e4b9..6bcfff970117 100644 --- a/Documentation/devicetree/bindings/net/can/nxp,sja1000.yaml +++ b/Documentation/devicetree/bindings/net/can/nxp,sja1000.yaml @@ -87,6 +87,7 @@ required: allOf: - $ref: can-controller.yaml# + - $ref: /schemas/memory-controllers/mc-peripheral-props.yaml - if: properties: compatible: -- cgit v1.2.3 From d973b1039ccde6b241b438d53297edce4de45b5c Mon Sep 17 00:00:00 2001 From: Sebastian Krzyszkowiak Date: Sat, 21 Feb 2026 17:28:04 +0100 Subject: wifi: rsi: Don't default to -EOPNOTSUPP in rsi_mac80211_config This triggers a WARN_ON in ieee80211_hw_conf_init and isn't the expected behavior from the driver - other drivers default to 0 too. Fixes: 0a44dfc07074 ("wifi: mac80211: simplify non-chanctx drivers") Signed-off-by: Sebastian Krzyszkowiak Link: https://patch.msgid.link/20260221-rsi-config-ret-v1-1-9a8f805e2f31@puri.sm Signed-off-by: Johannes Berg --- drivers/net/wireless/rsi/rsi_91x_mac80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c index 8c8e074a3a70..c7ae8031436a 100644 --- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c +++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c @@ -668,7 +668,7 @@ static int rsi_mac80211_config(struct ieee80211_hw *hw, struct rsi_hw *adapter = hw->priv; struct rsi_common *common = adapter->priv; struct ieee80211_conf *conf = &hw->conf; - int status = -EOPNOTSUPP; + int status = 0; mutex_lock(&common->mutex); -- cgit v1.2.3 From 9adfcef334bf9c6ef68eaecfca5f45d18614efe0 Mon Sep 17 00:00:00 2001 From: zhidao su Date: Mon, 2 Mar 2026 17:14:39 +0800 Subject: sched_ext: Use READ_ONCE() for the read side of dsq->nr update scx_bpf_dsq_nr_queued() reads dsq->nr via READ_ONCE() without holding any lock, making dsq->nr a lock-free concurrently accessed variable. However, dsq_mod_nr(), the sole writer of dsq->nr, only uses WRITE_ONCE() on the write side without the matching READ_ONCE() on the read side: WRITE_ONCE(dsq->nr, dsq->nr + delta); ^^^^^^^ plain read -- KCSAN data race The KCSAN documentation requires that if one accessor uses READ_ONCE() or WRITE_ONCE() on a variable to annotate lock-free access, all other accesses must also use the appropriate accessor. A plain read on the right-hand side of WRITE_ONCE() leaves the pair incomplete and will trigger KCSAN warnings. Fix by using READ_ONCE() for the read side of the update: WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta); This is consistent with scx_bpf_dsq_nr_queued() and makes the concurrent access annotation complete and KCSAN-clean. 
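The underlying KCSAN rule, as a generic sketch (illustrative only, not the kernel code itself): once one lockless access to a shared variable is marked, all of them should be marked:

	WRITE_ONCE(counter, READ_ONCE(counter) + delta);	/* lockless writer */
	n = READ_ONCE(counter);					/* lockless reader */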
Signed-off-by: zhidao su Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9280381f8923..4c2cf0d7d495 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -976,8 +976,12 @@ static bool scx_dsq_priq_less(struct rb_node *node_a, static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) { - /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ - WRITE_ONCE(dsq->nr, dsq->nr + delta); + /* + * scx_bpf_dsq_nr_queued() reads ->nr without locking. Use READ_ONCE() + * on the read side and WRITE_ONCE() on the write side to properly + * annotate the concurrent lockless access and avoid KCSAN warnings. + */ + WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta); } static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) -- cgit v1.2.3 From 494eaf4651975127d34d5ae6555c72dedba092c9 Mon Sep 17 00:00:00 2001 From: zhidao su Date: Mon, 2 Mar 2026 17:14:40 +0800 Subject: sched_ext: Replace naked scx_root dereferences in kobject callbacks scx_attr_ops_show() and scx_uevent() access scx_root->ops.name directly. This is problematic for two reasons: 1. The file-level comment explicitly identifies naked scx_root dereferences as a temporary measure that needs to be replaced with proper per-instance access. 2. scx_attr_events_show(), the neighboring sysfs show function in the same group, already uses the correct pattern: struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); Having inconsistent access patterns in the same sysfs/uevent group is error-prone. The kobject embedded in struct scx_sched is initialized as: kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); so container_of(kobj, struct scx_sched, kobj) correctly retrieves the owning scx_sched instance in both callbacks. Replace the naked scx_root dereferences with container_of()-based access, consistent with scx_attr_events_show() and in preparation for proper multi-instance scx_sched support. 
Signed-off-by: zhidao su Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4c2cf0d7d495..a566d2cc8a43 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3712,7 +3712,9 @@ static void scx_kobj_release(struct kobject *kobj) static ssize_t scx_attr_ops_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return sysfs_emit(buf, "%s\n", scx_root->ops.name); + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + + return sysfs_emit(buf, "%s\n", sch->ops.name); } SCX_ATTR(ops); @@ -3756,7 +3758,9 @@ static const struct kobj_type scx_ktype = { static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { - return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name); + const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + + return add_uevent_var(env, "SCXOPS=%s", sch->ops.name); } static const struct kset_uevent_ops scx_uevent_ops = { -- cgit v1.2.3 From af4e9ef3d78420feb8fe58cd9a1ab80c501b3c08 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 2 Mar 2026 13:27:51 +0000 Subject: uaccess: Fix scoped_user_read_access() for 'pointer to const' If a 'const struct foo __user *ptr' is used for the address passed to scoped_user_read_access() then you get a warning/error uaccess.h:691:1: error: initialization discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] for the void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl) assignment. Fix by using 'auto' for both _tmpptr and the redeclaration of uptr. Replace the CLASS() with explicit __cleanup() functions on uptr. Fixes: e497310b4ffb ("uaccess: Provide scoped user access regions") Signed-off-by: David Laight Reviewed-and-tested-by: Christophe Leroy (CS GROUP) Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 54 ++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1f3804245c06..809e4f7dfdbd 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -647,36 +647,22 @@ static inline void user_access_restore(unsigned long flags) { } /* Define RW variant so the below _mode macro expansion works */ #define masked_user_rw_access_begin(u) masked_user_access_begin(u) #define user_rw_access_begin(u, s) user_access_begin(u, s) -#define user_rw_access_end() user_access_end() /* Scoped user access */ -#define USER_ACCESS_GUARD(_mode) \ -static __always_inline void __user * \ -class_user_##_mode##_begin(void __user *ptr) \ -{ \ - return ptr; \ -} \ - \ -static __always_inline void \ -class_user_##_mode##_end(void __user *ptr) \ -{ \ - user_##_mode##_access_end(); \ -} \ - \ -DEFINE_CLASS(user_ ##_mode## _access, void __user *, \ - class_user_##_mode##_end(_T), \ - class_user_##_mode##_begin(ptr), void __user *ptr) \ - \ -static __always_inline class_user_##_mode##_access_t \ -class_user_##_mode##_access_ptr(void __user *scope) \ -{ \ - return scope; \ -} -USER_ACCESS_GUARD(read) -USER_ACCESS_GUARD(write) -USER_ACCESS_GUARD(rw) -#undef USER_ACCESS_GUARD +/* Cleanup wrapper functions */ +static __always_inline void __scoped_user_read_access_end(const void *p) +{ + user_read_access_end(); +}; +static __always_inline void __scoped_user_write_access_end(const void *p) +{ + user_write_access_end(); +}; +static __always_inline void __scoped_user_rw_access_end(const void *p) +{ + 
user_access_end(); +}; /** * __scoped_user_access_begin - Start a scoped user access @@ -750,13 +736,13 @@ USER_ACCESS_GUARD(rw) * * Don't use directly. Use scoped_masked_user_$MODE_access() instead. */ -#define __scoped_user_access(mode, uptr, size, elbl) \ -for (bool done = false; !done; done = true) \ - for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ - !done; done = true) \ - for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true) \ - /* Force modified pointer usage within the scope */ \ - for (const typeof(uptr) uptr = _tmpptr; !done; done = true) +#define __scoped_user_access(mode, uptr, size, elbl) \ +for (bool done = false; !done; done = true) \ + for (auto _tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ + !done; done = true) \ + /* Force modified pointer usage within the scope */ \ + for (const auto uptr __cleanup(__scoped_user_##mode##_access_end) = \ + _tmpptr; !done; done = true) /** * scoped_user_read_access_size - Start a scoped user read access with given size -- cgit v1.2.3 From 7cbe98f7bef965241a5908d50d557008cf998aee Mon Sep 17 00:00:00 2001 From: Mieczyslaw Nalewaj Date: Sun, 1 Mar 2026 18:13:14 -0300 Subject: net: dsa: realtek: rtl8365mb: fix rtl8365mb_phy_ocp_write return value Function rtl8365mb_phy_ocp_write() always returns 0, even when an error occurs during register access. This patch fixes the return value to propagate the actual error code from regmap operations. Link: https://lore.kernel.org/netdev/a2dfde3c-d46f-434b-9d16-1e251e449068@yahoo.com/ Fixes: 2796728460b8 ("net: dsa: realtek: rtl8365mb: serialize indirect PHY register access") Signed-off-by: Mieczyslaw Nalewaj Reviewed-by: Andrew Lunn Signed-off-by: Luiz Angelo Daros de Luca Reviewed-by: Linus Walleij Link: https://patch.msgid.link/20260301-realtek_namiltd_fix1-v1-1-43a6bb707f9c@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/realtek/rtl8365mb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/realtek/rtl8365mb.c b/drivers/net/dsa/realtek/rtl8365mb.c index c575e164368c..f938a3f701cc 100644 --- a/drivers/net/dsa/realtek/rtl8365mb.c +++ b/drivers/net/dsa/realtek/rtl8365mb.c @@ -769,7 +769,7 @@ static int rtl8365mb_phy_ocp_write(struct realtek_priv *priv, int phy, out: rtl83xx_unlock(priv); - return 0; + return ret; } static int rtl8365mb_phy_read(struct realtek_priv *priv, int phy, int regnum) -- cgit v1.2.3 From 3f10543c5bdd11568d1a54f5b1d955f5652e3095 Mon Sep 17 00:00:00 2001 From: Simon Baatz Date: Sun, 1 Mar 2026 09:41:33 +0100 Subject: selftests/net: packetdrill: restore tcp_rcv_big_endseq.pkt Commit 1cc93c48b5d7 ("selftests/net: packetdrill: remove tests for tcp_rcv_*big") removed the test for the reverted commit 1d2fbaad7cd8 ("tcp: stronger sk_rcvbuf checks") but also the one for commit 9ca48d616ed7 ("tcp: do not accept packets beyond window"). Restore the test with the necessary adaptation: expect a delayed ACK instead of an immediate one, since tcp_can_ingest() does not fail anymore for the last data packet. 
Signed-off-by: Simon Baatz Link: https://patch.msgid.link/20260301-tcp_rcv_big_endseq-v1-1-86ab7415ab58@gmail.com Signed-off-by: Jakub Kicinski --- .../net/packetdrill/tcp_rcv_big_endseq.pkt | 44 ++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt new file mode 100644 index 000000000000..6c0f32c40f19 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +--mss=1000 + +`./defaults.sh` + + 0 `nstat -n` + +// Establish a connection. + +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [10000], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + + +0 < P. 1:4001(4000) ack 1 win 257 + +0 > . 1:1(0) ack 4001 win 5000 + +// packet in sequence : SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE / LINUX_MIB_BEYOND_WINDOW + +0 < P. 4001:54001(50000) ack 1 win 257 + +0 > . 1:1(0) ack 4001 win 5000 + +// ooo packet. : SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE / LINUX_MIB_BEYOND_WINDOW + +1 < P. 5001:55001(50000) ack 1 win 257 + +0 > . 1:1(0) ack 4001 win 5000 + +// SKB_DROP_REASON_TCP_INVALID_SEQUENCE / LINUX_MIB_BEYOND_WINDOW + +0 < P. 70001:80001(10000) ack 1 win 257 + +0 > . 1:1(0) ack 4001 win 5000 + + +0 read(4, ..., 100000) = 4000 + +// If queue is empty, accept a packet even if its end_seq is above wup + rcv_wnd + +0 < P. 4001:54001(50000) ack 1 win 257 + * > . 1:1(0) ack 54001 win 0 + +// Check LINUX_MIB_BEYOND_WINDOW has been incremented 3 times. ++0 `nstat | grep TcpExtBeyondWindow | grep -q " 3 "` -- cgit v1.2.3 From 1939d9816dbfc057508267f0c5214f9478fbd6d1 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Sun, 1 Mar 2026 19:16:51 +0000 Subject: MAINTAINERS: ena: update AMAZON ETHERNET maintainers Remove Shay Agroskin and Saeed Bishara. Promote David Arinzon to maintainer. Signed-off-by: Arthur Kiyanovski Link: https://patch.msgid.link/20260301191652.5916-1-akiyano@amazon.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2265e2c9bfbe..7829ff2180ae 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -993,10 +993,8 @@ F: Documentation/devicetree/bindings/thermal/amazon,al-thermal.yaml F: drivers/thermal/thermal_mmio.c AMAZON ETHERNET DRIVERS -M: Shay Agroskin M: Arthur Kiyanovski -R: David Arinzon -R: Saeed Bishara +M: David Arinzon L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/device_drivers/ethernet/amazon/ena.rst -- cgit v1.2.3 From 8fb54c7307f6add04246d2f7845e42ecd41950fe Mon Sep 17 00:00:00 2001 From: MeiChia Chiu Date: Tue, 3 Mar 2026 13:47:25 +0800 Subject: wifi: mac80211: fix missing ieee80211_eml_params member initialization The missing initialization causes driver to misinterpret the EML control bitmap, resulting in incorrect link bitmap handling. 
Fixes: 0d95280a2d54e ("wifi: mac80211: Add eMLSR/eMLMR action frame parsing support") Signed-off-by: MeiChia Chiu Acked-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260303054725.471548-1-MeiChia.Chiu@mediatek.com Signed-off-by: Johannes Berg --- net/mac80211/eht.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c index 75096b2195d2..078e1e23d8d1 100644 --- a/net/mac80211/eht.c +++ b/net/mac80211/eht.c @@ -154,6 +154,7 @@ void ieee80211_rx_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, u8 *ptr = mgmt->u.action.u.eml_omn.variable; struct ieee80211_eml_params eml_params = { .link_id = status->link_id, + .control = control, }; struct sta_info *sta; int opt_len = 0; -- cgit v1.2.3 From 3f27958b729a2337336a6b50e0d9aee5fbbce816 Mon Sep 17 00:00:00 2001 From: zhidao su Date: Tue, 3 Mar 2026 14:09:58 +0800 Subject: sched_ext: Use READ_ONCE() for plain reads of scx_watchdog_timeout scx_watchdog_timeout is written with WRITE_ONCE() in scx_enable(): WRITE_ONCE(scx_watchdog_timeout, timeout); However, three read-side accesses use plain reads without the matching READ_ONCE(): /* check_rq_for_timeouts() - L2824 */ last_runnable + scx_watchdog_timeout /* scx_watchdog_workfn() - L2852 */ scx_watchdog_timeout / 2 /* scx_enable() - L5179 */ scx_watchdog_timeout / 2 The KCSAN documentation requires that if one accessor uses WRITE_ONCE() to annotate lock-free access, all other accesses must also use the appropriate accessor. Plain reads alongside WRITE_ONCE() leave the pair incomplete and can trigger KCSAN warnings. Note that scx_tick() already uses the correct READ_ONCE() annotation: last_check + READ_ONCE(scx_watchdog_timeout) Fix the three remaining plain reads to match, making all accesses to scx_watchdog_timeout consistently annotated and KCSAN-clean. Signed-off-by: zhidao su Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a566d2cc8a43..2ba69b302368 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2739,7 +2739,7 @@ static bool check_rq_for_timeouts(struct rq *rq) unsigned long last_runnable = p->scx.runnable_at; if (unlikely(time_after(jiffies, - last_runnable + scx_watchdog_timeout))) { + last_runnable + READ_ONCE(scx_watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, @@ -2767,7 +2767,7 @@ static void scx_watchdog_workfn(struct work_struct *work) cond_resched(); } queue_delayed_work(system_unbound_wq, to_delayed_work(work), - scx_watchdog_timeout / 2); + READ_ONCE(scx_watchdog_timeout) / 2); } void scx_tick(struct rq *rq) @@ -5081,7 +5081,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) WRITE_ONCE(scx_watchdog_timeout, timeout); WRITE_ONCE(scx_watchdog_timestamp, jiffies); queue_delayed_work(system_unbound_wq, &scx_watchdog_work, - scx_watchdog_timeout / 2); + READ_ONCE(scx_watchdog_timeout) / 2); /* * Once __scx_enabled is set, %current can be switched to SCX anytime. 
-- cgit v1.2.3 From 9af832c0a76eedce169c4c6360e4e20d8a0c9ab1 Mon Sep 17 00:00:00 2001 From: Zhao Mengmeng Date: Tue, 3 Mar 2026 15:23:15 +0800 Subject: tools/sched_ext: Add -fms-extensions to bpf build flags Similar to commit 835a50753579 ("selftests/bpf: Add -fms-extensions to bpf build flags") and commit 639f58a0f480 ("bpftool: Fix build warnings due to MS extensions") The kernel is now built with -fms-extensions, therefore generated vmlinux.h contains types like: struct aes_key { struct aes_enckey; union aes_invkey_arch inv_k; }; struct ns_common { ... union { struct ns_tree; struct callback_head ns_rcu; }; }; Which raise warning like below when building scx scheduler: tools/sched_ext/build/include/vmlinux.h:50533:3: warning: declaration does not declare anything [-Wmissing-declarations] 50533 | struct ns_tree; | ^ Fix it by using -fms-extensions and -Wno-microsoft-anon-tag flags to build bpf programs that #include "vmlinux.h" Signed-off-by: Zhao Mengmeng Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 47ad7444677e..21554f089692 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -122,6 +122,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -I../../include \ $(call get_sys_includes,$(CLANG)) \ -Wall -Wno-compare-distinct-pointer-types \ + -Wno-microsoft-anon-tag \ + -fms-extensions \ -O2 -mcpu=v3 # sort removes libbpf duplicates when not cross-building -- cgit v1.2.3 From 01a867c2e090cb440c8f27158e8650c8ddefec8e Mon Sep 17 00:00:00 2001 From: Zhao Mengmeng Date: Tue, 3 Mar 2026 15:23:16 +0800 Subject: selftests/sched_ext: Add -fms-extensions to bpf build flags Similar to commit 835a50753579 ("selftests/bpf: Add -fms-extensions to bpf build flags") and commit 639f58a0f480 ("bpftool: Fix build warnings due to MS extensions") Fix "declaration does not declare anything" warning by using -fms-extensions and -Wno-microsoft-anon-tag flags to build bpf programs that #include "vmlinux.h" Signed-off-by: Zhao Mengmeng Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile index 2c601a7eaff5..006300ac6dff 100644 --- a/tools/testing/selftests/sched_ext/Makefile +++ b/tools/testing/selftests/sched_ext/Makefile @@ -93,6 +93,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ $(CLANG_SYS_INCLUDES) \ -Wall -Wno-compare-distinct-pointer-types \ -Wno-incompatible-function-pointer-types \ + -Wno-microsoft-anon-tag \ + -fms-extensions \ -O2 -mcpu=v3 # sort removes libbpf duplicates when not cross-building -- cgit v1.2.3 From 75ad51825933575906394d0d5db04586b02db00a Mon Sep 17 00:00:00 2001 From: Zhao Mengmeng Date: Tue, 3 Mar 2026 15:23:17 +0800 Subject: selftests/sched_ext: Fix peek_dsq.bpf.c compile error for clang 17 When compiling sched_ext selftests using clang 17.0.6, it raised compiler crash and build error: Error at line 68: Unsupport signed division for DAG: 0x55b2f9a60240: i64 = sdiv 0x55b2f9a609b0, Constant:i64<100>, peek_dsq.bpf.c:68:25 @[ peek_dsq.bpf.c:95:4 @[ peek_dsq.bpf.c:169:8 @[ peek _dsq.bpf.c:140:6 ] ] ]Please convert to unsigned div/mod After digging, it's not a compiler error, clang supported Signed division only when using -mcpu=v4, while we use -mcpu=v3 currently, the better way is to use unsigned div, see [1] for details. 
[1] https://github.com/llvm/llvm-project/issues/70433 Signed-off-by: Zhao Mengmeng Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/peek_dsq.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c index a3faf5bb49d6..784f2f6c1af9 100644 --- a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c +++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c @@ -58,14 +58,14 @@ static void record_peek_result(long pid) { u32 slot_key; long *slot_pid_ptr; - int ix; + u32 ix; if (pid <= 0) return; /* Find an empty slot or one with the same PID */ bpf_for(ix, 0, 10) { - slot_key = (pid + ix) % MAX_SAMPLES; + slot_key = ((u64)pid + ix) % MAX_SAMPLES; slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key); if (!slot_pid_ptr) continue; -- cgit v1.2.3 From 479d589b40b836442bbdadc3fdb37f001bb67f26 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Thu, 26 Feb 2026 16:03:01 +0800 Subject: bpf/bonding: reject vlan+srcmac xmit_hash_policy change when XDP is loaded bond_option_mode_set() already rejects mode changes that would make a loaded XDP program incompatible via bond_xdp_check(). However, bond_option_xmit_hash_policy_set() has no such guard. For 802.3ad and balance-xor modes, bond_xdp_check() returns false when xmit_hash_policy is vlan+srcmac, because the 802.1q payload is usually absent due to hardware offload. This means a user can: 1. Attach a native XDP program to a bond in 802.3ad/balance-xor mode with a compatible xmit_hash_policy (e.g. layer2+3). 2. Change xmit_hash_policy to vlan+srcmac while XDP remains loaded. This leaves bond->xdp_prog set but bond_xdp_check() now returning false for the same device. When the bond is later destroyed, dev_xdp_uninstall() calls bond_xdp_set(dev, NULL, NULL) to remove the program, which hits the bond_xdp_check() guard and returns -EOPNOTSUPP, triggering: WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)) Fix this by rejecting xmit_hash_policy changes to vlan+srcmac when an XDP program is loaded on a bond in 802.3ad or balance-xor mode. commit 39a0876d595b ("net, bonding: Disallow vlan+srcmac with XDP") introduced bond_xdp_check() which returns false for 802.3ad/balance-xor modes when xmit_hash_policy is vlan+srcmac. The check was wired into bond_xdp_set() to reject XDP attachment with an incompatible policy, but the symmetric path -- preventing xmit_hash_policy from being changed to an incompatible value after XDP is already loaded -- was left unguarded in bond_option_xmit_hash_policy_set(). Note: commit 094ee6017ea0 ("bonding: check xdp prog when set bond mode") later added a similar guard to bond_option_mode_set(), but bond_option_xmit_hash_policy_set() remained unprotected. 
Reported-by: syzbot+5a287bcdc08104bc3132@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6995aff6.050a0220.2eeac1.014e.GAE@google.com/T/ Fixes: 39a0876d595b ("net, bonding: Disallow vlan+srcmac with XDP") Signed-off-by: Jiayuan Chen Link: https://patch.msgid.link/20260226080306.98766-2-jiayuan.chen@linux.dev Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 9 +++++++-- drivers/net/bonding/bond_options.c | 2 ++ include/net/bonding.h | 1 + 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index a1de08ee3815..14ed91391fcc 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -324,7 +324,7 @@ static bool bond_sk_check(struct bonding *bond) } } -bool bond_xdp_check(struct bonding *bond, int mode) +bool __bond_xdp_check(int mode, int xmit_policy) { switch (mode) { case BOND_MODE_ROUNDROBIN: @@ -335,7 +335,7 @@ bool bond_xdp_check(struct bonding *bond, int mode) /* vlan+srcmac is not supported with XDP as in most cases the 802.1q * payload is not in the packet due to hardware offload. */ - if (bond->params.xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC) + if (xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC) return true; fallthrough; default: @@ -343,6 +343,11 @@ bool bond_xdp_check(struct bonding *bond, int mode) } } +bool bond_xdp_check(struct bonding *bond, int mode) +{ + return __bond_xdp_check(mode, bond->params.xmit_policy); +} + /*---------------------------------- VLAN -----------------------------------*/ /* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid, diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index dcee384c2f06..7380cc4ee75a 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1575,6 +1575,8 @@ static int bond_option_fail_over_mac_set(struct bonding *bond, static int bond_option_xmit_hash_policy_set(struct bonding *bond, const struct bond_opt_value *newval) { + if (bond->xdp_prog && !__bond_xdp_check(BOND_MODE(bond), newval->value)) + return -EOPNOTSUPP; netdev_dbg(bond->dev, "Setting xmit hash policy to %s (%llu)\n", newval->string, newval->value); bond->params.xmit_policy = newval->value; diff --git a/include/net/bonding.h b/include/net/bonding.h index 4ad5521e7731..395c6e281c5f 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -699,6 +699,7 @@ void bond_debug_register(struct bonding *bond); void bond_debug_unregister(struct bonding *bond); void bond_debug_reregister(struct bonding *bond); const char *bond_mode_name(int mode); +bool __bond_xdp_check(int mode, int xmit_policy); bool bond_xdp_check(struct bonding *bond, int mode); void bond_setup(struct net_device *bond_dev); unsigned int bond_get_num_tx_queues(void); -- cgit v1.2.3 From 181cafbd8a01d22f3078a84f079c4a7cc0653068 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Thu, 26 Feb 2026 16:03:02 +0800 Subject: selftests/bpf: add test for xdp_bonding xmit_hash_policy compat Add a selftest to verify that changing xmit_hash_policy to vlan+srcmac is rejected when a native XDP program is loaded on a bond in 802.3ad mode. Without the fix in bond_option_xmit_hash_policy_set(), the change succeeds silently, creating an inconsistent state that triggers a kernel WARNING in dev_xdp_uninstall() when the bond is torn down. The test attaches native XDP to a bond0 (802.3ad, layer2+3), then attempts to switch xmit_hash_policy to vlan+srcmac and asserts the operation fails. 
It also verifies the change succeeds after XDP is detached, confirming the rejection is specific to the XDP-loaded state. Signed-off-by: Jiayuan Chen Link: https://patch.msgid.link/20260226080306.98766-3-jiayuan.chen@linux.dev Signed-off-by: Paolo Abeni --- .../testing/selftests/bpf/prog_tests/xdp_bonding.c | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c index fb952703653e..e8ea26464349 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c @@ -610,6 +610,61 @@ out: system("ip link del bond"); } +/* + * Test that changing xmit_hash_policy to vlan+srcmac is rejected when a + * native XDP program is loaded on a bond in 802.3ad or balance-xor mode. + * These modes support XDP only when xmit_hash_policy != vlan+srcmac; freely + * changing the policy creates an inconsistency that triggers a WARNING in + * dev_xdp_uninstall() during device teardown. + */ +static void test_xdp_bonding_xmit_policy_compat(struct skeletons *skeletons) +{ + struct nstoken *nstoken = NULL; + int bond_ifindex = -1; + int xdp_fd, err; + + SYS(out, "ip netns add ns_xmit_policy"); + nstoken = open_netns("ns_xmit_policy"); + if (!ASSERT_OK_PTR(nstoken, "open ns_xmit_policy")) + goto out; + + /* 802.3ad with layer2+3 policy: native XDP is supported */ + SYS(out, "ip link add bond0 type bond mode 802.3ad xmit_hash_policy layer2+3"); + SYS(out, "ip link add veth0 type veth peer name veth0p"); + SYS(out, "ip link set veth0 master bond0"); + SYS(out, "ip link set bond0 up"); + + bond_ifindex = if_nametoindex("bond0"); + if (!ASSERT_GT(bond_ifindex, 0, "bond0 ifindex")) + goto out; + + xdp_fd = bpf_program__fd(skeletons->xdp_dummy->progs.xdp_dummy_prog); + if (!ASSERT_GE(xdp_fd, 0, "xdp_dummy fd")) + goto out; + + err = bpf_xdp_attach(bond_ifindex, xdp_fd, XDP_FLAGS_DRV_MODE, NULL); + if (!ASSERT_OK(err, "attach XDP to bond0")) + goto out; + + /* With XDP loaded, switching to vlan+srcmac must be rejected */ + err = system("ip link set bond0 type bond xmit_hash_policy vlan+srcmac 2>/dev/null"); + ASSERT_NEQ(err, 0, "vlan+srcmac change with XDP loaded should fail"); + + /* Detach XDP first, then the same change must succeed */ + ASSERT_OK(bpf_xdp_detach(bond_ifindex, XDP_FLAGS_DRV_MODE, NULL), + "detach XDP from bond0"); + + bond_ifindex = -1; + err = system("ip link set bond0 type bond xmit_hash_policy vlan+srcmac 2>/dev/null"); + ASSERT_OK(err, "vlan+srcmac change without XDP should succeed"); + +out: + if (bond_ifindex > 0) + bpf_xdp_detach(bond_ifindex, XDP_FLAGS_DRV_MODE, NULL); + close_netns(nstoken); + SYS_NOFAIL("ip netns del ns_xmit_policy"); +} + static int libbpf_debug_print(enum libbpf_print_level level, const char *format, va_list args) { @@ -677,6 +732,9 @@ void serial_test_xdp_bonding(void) test_case->xmit_policy); } + if (test__start_subtest("xdp_bonding_xmit_policy_compat")) + test_xdp_bonding_xmit_policy_compat(&skeletons); + if (test__start_subtest("xdp_bonding_redirect_multi")) test_xdp_bonding_redirect_multi(&skeletons); -- cgit v1.2.3 From 18c04a808c436d629d5812ce883e3822a5f5a47f Mon Sep 17 00:00:00 2001 From: Vimlesh Kumar Date: Fri, 27 Feb 2026 09:13:57 +0000 Subject: octeon_ep: Relocate counter updates before NAPI Relocate IQ/OQ IN/OUT_CNTS updates to occur before NAPI completion, and replace napi_complete with napi_complete_done. Moving the IQ/OQ counter updates before napi_complete_done ensures 1. 
Counter registers are updated before re-enabling interrupts. 2. Prevents a race where new packets arrive but counters aren't properly synchronized. napi_complete_done (vs napi_complete) allows for better interrupt coalescing. Fixes: 37d79d0596062 ("octeon_ep: add Tx/Rx processing and interrupt support") Signed-off-by: Sathesh Edara Signed-off-by: Shinas Rasheed Signed-off-by: Vimlesh Kumar Link: https://patch.msgid.link/20260227091402.1773833-2-vimleshk@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeon_ep/octep_main.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index ec55eb2a6c04..bb5f21b33508 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -554,12 +554,12 @@ static void octep_clean_irqs(struct octep_device *oct) } /** - * octep_enable_ioq_irq() - Enable MSI-x interrupt of a Tx/Rx queue. + * octep_update_pkt() - Update IQ/OQ IN/OUT_CNT registers. * * @iq: Octeon Tx queue data structure. * @oq: Octeon Rx queue data structure. */ -static void octep_enable_ioq_irq(struct octep_iq *iq, struct octep_oq *oq) +static void octep_update_pkt(struct octep_iq *iq, struct octep_oq *oq) { u32 pkts_pend = oq->pkts_pending; @@ -575,7 +575,17 @@ static void octep_enable_ioq_irq(struct octep_iq *iq, struct octep_oq *oq) } /* Flush the previous wrties before writing to RESEND bit */ - wmb(); + smp_wmb(); +} + +/** + * octep_enable_ioq_irq() - Enable MSI-x interrupt of a Tx/Rx queue. + * + * @iq: Octeon Tx queue data structure. + * @oq: Octeon Rx queue data structure. + */ +static void octep_enable_ioq_irq(struct octep_iq *iq, struct octep_oq *oq) +{ writeq(1UL << OCTEP_OQ_INTR_RESEND_BIT, oq->pkts_sent_reg); writeq(1UL << OCTEP_IQ_INTR_RESEND_BIT, iq->inst_cnt_reg); } @@ -601,7 +611,8 @@ static int octep_napi_poll(struct napi_struct *napi, int budget) if (tx_pending || rx_done >= budget) return budget; - napi_complete(napi); + octep_update_pkt(ioq_vector->iq, ioq_vector->oq); + napi_complete_done(napi, rx_done); octep_enable_ioq_irq(ioq_vector->iq, ioq_vector->oq); return rx_done; } -- cgit v1.2.3 From 43b3160cb639079a15daeb5f080120afbfbfc918 Mon Sep 17 00:00:00 2001 From: Vimlesh Kumar Date: Fri, 27 Feb 2026 09:13:58 +0000 Subject: octeon_ep: avoid compiler and IQ/OQ reordering Utilize READ_ONCE and WRITE_ONCE APIs for IO queue Tx/Rx variable access to prevent compiler optimization and reordering. Additionally, ensure IO queue OUT/IN_CNT registers are flushed by performing a read-back after writing. The compiler could reorder reads/writes to pkts_pending, last_pkt_count, etc., causing stale values to be used when calculating packets to process or register updates to send to hardware. The Octeon hardware requires a read-back after writing to OUT_CNT/IN_CNT registers to ensure the write has been flushed through any posted write buffers before the interrupt resend bit is set. Without this, we have observed cases where the hardware didn't properly update its internal state. wmb/rmb only provides ordering guarantees but doesn't prevent the compiler from performing optimizations like caching in registers, load tearing etc. 
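[Editor's note] As background for the READ_ONCE()/WRITE_ONCE() conversion described above, here is a minimal sketch of the access pattern it enforces. The structure and field names are illustrative placeholders, not the driver's real layout:

    #include <linux/types.h>
    #include <linux/compiler.h>     /* READ_ONCE(), WRITE_ONCE() */

    struct demo_oq {
            u32 pkts_pending;       /* shared between poll path and refill path */
    };

    static u32 demo_consume(struct demo_oq *oq, u32 processed)
    {
            /* Snapshot the counter with a single, untorn load; the compiler
             * cannot split, repeat, or fuse this access.
             */
            u32 pending = READ_ONCE(oq->pkts_pending);

            /* Publish the update as a single, untorn store. */
            WRITE_ONCE(oq->pkts_pending, pending - processed);

            return pending;
    }

The barriers mentioned above (wmb()/rmb()) are still what orders CPU accesses against the device; READ_ONCE()/WRITE_ONCE() only constrain what the compiler may do with the individual accesses.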
Fixes: 37d79d0596062 ("octeon_ep: add Tx/Rx processing and interrupt support") Signed-off-by: Sathesh Edara Signed-off-by: Shinas Rasheed Signed-off-by: Vimlesh Kumar Link: https://patch.msgid.link/20260227091402.1773833-3-vimleshk@marvell.com Signed-off-by: Paolo Abeni --- .../net/ethernet/marvell/octeon_ep/octep_main.c | 21 ++++++++++------- drivers/net/ethernet/marvell/octeon_ep/octep_rx.c | 27 +++++++++++++++------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index bb5f21b33508..028dd55604ea 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -561,17 +561,22 @@ static void octep_clean_irqs(struct octep_device *oct) */ static void octep_update_pkt(struct octep_iq *iq, struct octep_oq *oq) { - u32 pkts_pend = oq->pkts_pending; + u32 pkts_pend = READ_ONCE(oq->pkts_pending); + u32 last_pkt_count = READ_ONCE(oq->last_pkt_count); + u32 pkts_processed = READ_ONCE(iq->pkts_processed); + u32 pkt_in_done = READ_ONCE(iq->pkt_in_done); netdev_dbg(iq->netdev, "enabling intr for Q-%u\n", iq->q_no); - if (iq->pkts_processed) { - writel(iq->pkts_processed, iq->inst_cnt_reg); - iq->pkt_in_done -= iq->pkts_processed; - iq->pkts_processed = 0; + if (pkts_processed) { + writel(pkts_processed, iq->inst_cnt_reg); + readl(iq->inst_cnt_reg); + WRITE_ONCE(iq->pkt_in_done, (pkt_in_done - pkts_processed)); + WRITE_ONCE(iq->pkts_processed, 0); } - if (oq->last_pkt_count - pkts_pend) { - writel(oq->last_pkt_count - pkts_pend, oq->pkts_sent_reg); - oq->last_pkt_count = pkts_pend; + if (last_pkt_count - pkts_pend) { + writel(last_pkt_count - pkts_pend, oq->pkts_sent_reg); + readl(oq->pkts_sent_reg); + WRITE_ONCE(oq->last_pkt_count, pkts_pend); } /* Flush the previous wrties before writing to RESEND bit */ diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c b/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c index f2a7c6a76c74..74de19166488 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c @@ -324,10 +324,16 @@ static int octep_oq_check_hw_for_pkts(struct octep_device *oct, struct octep_oq *oq) { u32 pkt_count, new_pkts; + u32 last_pkt_count, pkts_pending; pkt_count = readl(oq->pkts_sent_reg); - new_pkts = pkt_count - oq->last_pkt_count; + last_pkt_count = READ_ONCE(oq->last_pkt_count); + new_pkts = pkt_count - last_pkt_count; + if (pkt_count < last_pkt_count) { + dev_err(oq->dev, "OQ-%u pkt_count(%u) < oq->last_pkt_count(%u)\n", + oq->q_no, pkt_count, last_pkt_count); + } /* Clear the hardware packets counter register if the rx queue is * being processed continuously with-in a single interrupt and * reached half its max value. 
@@ -338,8 +344,9 @@ static int octep_oq_check_hw_for_pkts(struct octep_device *oct, pkt_count = readl(oq->pkts_sent_reg); new_pkts += pkt_count; } - oq->last_pkt_count = pkt_count; - oq->pkts_pending += new_pkts; + WRITE_ONCE(oq->last_pkt_count, pkt_count); + pkts_pending = READ_ONCE(oq->pkts_pending); + WRITE_ONCE(oq->pkts_pending, (pkts_pending + new_pkts)); return new_pkts; } @@ -414,7 +421,7 @@ static int __octep_oq_process_rx(struct octep_device *oct, u16 rx_ol_flags; u32 read_idx; - read_idx = oq->host_read_idx; + read_idx = READ_ONCE(oq->host_read_idx); rx_bytes = 0; desc_used = 0; for (pkt = 0; pkt < pkts_to_process; pkt++) { @@ -499,7 +506,7 @@ static int __octep_oq_process_rx(struct octep_device *oct, napi_gro_receive(oq->napi, skb); } - oq->host_read_idx = read_idx; + WRITE_ONCE(oq->host_read_idx, read_idx); oq->refill_count += desc_used; oq->stats->packets += pkt; oq->stats->bytes += rx_bytes; @@ -522,22 +529,26 @@ int octep_oq_process_rx(struct octep_oq *oq, int budget) { u32 pkts_available, pkts_processed, total_pkts_processed; struct octep_device *oct = oq->octep_dev; + u32 pkts_pending; pkts_available = 0; pkts_processed = 0; total_pkts_processed = 0; while (total_pkts_processed < budget) { /* update pending count only when current one exhausted */ - if (oq->pkts_pending == 0) + pkts_pending = READ_ONCE(oq->pkts_pending); + if (pkts_pending == 0) octep_oq_check_hw_for_pkts(oct, oq); + pkts_pending = READ_ONCE(oq->pkts_pending); pkts_available = min(budget - total_pkts_processed, - oq->pkts_pending); + pkts_pending); if (!pkts_available) break; pkts_processed = __octep_oq_process_rx(oct, oq, pkts_available); - oq->pkts_pending -= pkts_processed; + pkts_pending = READ_ONCE(oq->pkts_pending); + WRITE_ONCE(oq->pkts_pending, (pkts_pending - pkts_processed)); total_pkts_processed += pkts_processed; } -- cgit v1.2.3 From 2ae7d20fb24f598f60faa8f6ecc856dac782261a Mon Sep 17 00:00:00 2001 From: Vimlesh Kumar Date: Fri, 27 Feb 2026 09:13:59 +0000 Subject: octeon_ep_vf: Relocate counter updates before NAPI Relocate IQ/OQ IN/OUT_CNTS updates to occur before NAPI completion. Moving the IQ/OQ counter updates before napi_complete_done ensures 1. Counter registers are updated before re-enabling interrupts. 2. Prevents a race where new packets arrive but counters aren't properly synchronized. Fixes: 1cd3b407977c3 ("octeon_ep_vf: add Tx/Rx processing and interrupt support") Signed-off-by: Sathesh Edara Signed-off-by: Shinas Rasheed Signed-off-by: Vimlesh Kumar Link: https://patch.msgid.link/20260227091402.1773833-4-vimleshk@marvell.com Signed-off-by: Paolo Abeni --- .../net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c index 562fe945b422..0e76f671a226 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c @@ -286,12 +286,13 @@ static void octep_vf_clean_irqs(struct octep_vf_device *oct) } /** - * octep_vf_enable_ioq_irq() - Enable MSI-x interrupt of a Tx/Rx queue. + * octep_vf_update_pkt() - Update IQ/OQ IN/OUT_CNT registers. * * @iq: Octeon Tx queue data structure. * @oq: Octeon Rx queue data structure. 
*/ -static void octep_vf_enable_ioq_irq(struct octep_vf_iq *iq, struct octep_vf_oq *oq) + +static void octep_vf_update_pkt(struct octep_vf_iq *iq, struct octep_vf_oq *oq) { u32 pkts_pend = oq->pkts_pending; @@ -308,6 +309,17 @@ static void octep_vf_enable_ioq_irq(struct octep_vf_iq *iq, struct octep_vf_oq * /* Flush the previous wrties before writing to RESEND bit */ smp_wmb(); +} + +/** + * octep_vf_enable_ioq_irq() - Enable MSI-x interrupt of a Tx/Rx queue. + * + * @iq: Octeon Tx queue data structure. + * @oq: Octeon Rx queue data structure. + */ +static void octep_vf_enable_ioq_irq(struct octep_vf_iq *iq, + struct octep_vf_oq *oq) +{ writeq(1UL << OCTEP_VF_OQ_INTR_RESEND_BIT, oq->pkts_sent_reg); writeq(1UL << OCTEP_VF_IQ_INTR_RESEND_BIT, iq->inst_cnt_reg); } @@ -333,6 +345,7 @@ static int octep_vf_napi_poll(struct napi_struct *napi, int budget) if (tx_pending || rx_done >= budget) return budget; + octep_vf_update_pkt(ioq_vector->iq, ioq_vector->oq); if (likely(napi_complete_done(napi, rx_done))) octep_vf_enable_ioq_irq(ioq_vector->iq, ioq_vector->oq); -- cgit v1.2.3 From 6c73126ecd1080351b468fe43353b2f705487f44 Mon Sep 17 00:00:00 2001 From: Vimlesh Kumar Date: Fri, 27 Feb 2026 09:14:00 +0000 Subject: octeon_ep_vf: avoid compiler and IQ/OQ reordering Utilize READ_ONCE and WRITE_ONCE APIs for IO queue Tx/Rx variable access to prevent compiler optimization and reordering. Additionally, ensure IO queue OUT/IN_CNT registers are flushed by performing a read-back after writing. The compiler could reorder reads/writes to pkts_pending, last_pkt_count, etc., causing stale values to be used when calculating packets to process or register updates to send to hardware. The Octeon hardware requires a read-back after writing to OUT_CNT/IN_CNT registers to ensure the write has been flushed through any posted write buffers before the interrupt resend bit is set. Without this, we have observed cases where the hardware didn't properly update its internal state. wmb/rmb only provides ordering guarantees but doesn't prevent the compiler from performing optimizations like caching in registers, load tearing etc. 
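[Editor's note] To complement the sketch given for the PF driver above, the read-back-to-flush part of the rationale can be shown in isolation. The register handles and bit value below are placeholders, not the real VF register map:

    #include <linux/types.h>
    #include <linux/io.h>           /* writel(), readl(), writeq() */

    static void demo_post_count_and_rearm(void __iomem *cnt_reg, u32 handled,
                                          void __iomem *resend_reg, u64 resend_bit)
    {
            /* Tell the device how many entries were consumed. */
            writel(handled, cnt_reg);

            /* Read the register back so the posted write is flushed through
             * any intermediate write buffers before the resend bit is armed.
             */
            readl(cnt_reg);

            /* Only now ask the device to re-evaluate its interrupt condition. */
            writeq(resend_bit, resend_reg);
    }

Whether the read-back is strictly required is a property of this particular device; the commit message above states that without it the hardware was observed not to update its internal state.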
Fixes: 1cd3b407977c3 ("octeon_ep_vf: add Tx/Rx processing and interrupt support") Signed-off-by: Sathesh Edara Signed-off-by: Shinas Rasheed Signed-off-by: Vimlesh Kumar Link: https://patch.msgid.link/20260227091402.1773833-5-vimleshk@marvell.com Signed-off-by: Paolo Abeni --- .../ethernet/marvell/octeon_ep_vf/octep_vf_main.c | 21 +++++++++------- .../ethernet/marvell/octeon_ep_vf/octep_vf_rx.c | 28 +++++++++++++++------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c index 0e76f671a226..7f1b93cffb98 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c @@ -294,17 +294,22 @@ static void octep_vf_clean_irqs(struct octep_vf_device *oct) static void octep_vf_update_pkt(struct octep_vf_iq *iq, struct octep_vf_oq *oq) { - u32 pkts_pend = oq->pkts_pending; + u32 pkts_pend = READ_ONCE(oq->pkts_pending); + u32 last_pkt_count = READ_ONCE(oq->last_pkt_count); + u32 pkts_processed = READ_ONCE(iq->pkts_processed); + u32 pkt_in_done = READ_ONCE(iq->pkt_in_done); netdev_dbg(iq->netdev, "enabling intr for Q-%u\n", iq->q_no); - if (iq->pkts_processed) { - writel(iq->pkts_processed, iq->inst_cnt_reg); - iq->pkt_in_done -= iq->pkts_processed; - iq->pkts_processed = 0; + if (pkts_processed) { + writel(pkts_processed, iq->inst_cnt_reg); + readl(iq->inst_cnt_reg); + WRITE_ONCE(iq->pkt_in_done, (pkt_in_done - pkts_processed)); + WRITE_ONCE(iq->pkts_processed, 0); } - if (oq->last_pkt_count - pkts_pend) { - writel(oq->last_pkt_count - pkts_pend, oq->pkts_sent_reg); - oq->last_pkt_count = pkts_pend; + if (last_pkt_count - pkts_pend) { + writel(last_pkt_count - pkts_pend, oq->pkts_sent_reg); + readl(oq->pkts_sent_reg); + WRITE_ONCE(oq->last_pkt_count, pkts_pend); } /* Flush the previous wrties before writing to RESEND bit */ diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.c index 6f865dbbba6c..b579d5b545c4 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.c @@ -325,9 +325,16 @@ static int octep_vf_oq_check_hw_for_pkts(struct octep_vf_device *oct, struct octep_vf_oq *oq) { u32 pkt_count, new_pkts; + u32 last_pkt_count, pkts_pending; pkt_count = readl(oq->pkts_sent_reg); - new_pkts = pkt_count - oq->last_pkt_count; + last_pkt_count = READ_ONCE(oq->last_pkt_count); + new_pkts = pkt_count - last_pkt_count; + + if (pkt_count < last_pkt_count) { + dev_err(oq->dev, "OQ-%u pkt_count(%u) < oq->last_pkt_count(%u)\n", + oq->q_no, pkt_count, last_pkt_count); + } /* Clear the hardware packets counter register if the rx queue is * being processed continuously with-in a single interrupt and @@ -339,8 +346,9 @@ static int octep_vf_oq_check_hw_for_pkts(struct octep_vf_device *oct, pkt_count = readl(oq->pkts_sent_reg); new_pkts += pkt_count; } - oq->last_pkt_count = pkt_count; - oq->pkts_pending += new_pkts; + WRITE_ONCE(oq->last_pkt_count, pkt_count); + pkts_pending = READ_ONCE(oq->pkts_pending); + WRITE_ONCE(oq->pkts_pending, (pkts_pending + new_pkts)); return new_pkts; } @@ -369,7 +377,7 @@ static int __octep_vf_oq_process_rx(struct octep_vf_device *oct, struct sk_buff *skb; u32 read_idx; - read_idx = oq->host_read_idx; + read_idx = READ_ONCE(oq->host_read_idx); rx_bytes = 0; desc_used = 0; for (pkt = 0; pkt < pkts_to_process; pkt++) { @@ -463,7 +471,7 @@ static 
int __octep_vf_oq_process_rx(struct octep_vf_device *oct, napi_gro_receive(oq->napi, skb); } - oq->host_read_idx = read_idx; + WRITE_ONCE(oq->host_read_idx, read_idx); oq->refill_count += desc_used; oq->stats->packets += pkt; oq->stats->bytes += rx_bytes; @@ -486,22 +494,26 @@ int octep_vf_oq_process_rx(struct octep_vf_oq *oq, int budget) { u32 pkts_available, pkts_processed, total_pkts_processed; struct octep_vf_device *oct = oq->octep_vf_dev; + u32 pkts_pending; pkts_available = 0; pkts_processed = 0; total_pkts_processed = 0; while (total_pkts_processed < budget) { /* update pending count only when current one exhausted */ - if (oq->pkts_pending == 0) + pkts_pending = READ_ONCE(oq->pkts_pending); + if (pkts_pending == 0) octep_vf_oq_check_hw_for_pkts(oct, oq); + pkts_pending = READ_ONCE(oq->pkts_pending); pkts_available = min(budget - total_pkts_processed, - oq->pkts_pending); + pkts_pending); if (!pkts_available) break; pkts_processed = __octep_vf_oq_process_rx(oct, oq, pkts_available); - oq->pkts_pending -= pkts_processed; + pkts_pending = READ_ONCE(oq->pkts_pending); + WRITE_ONCE(oq->pkts_pending, (pkts_pending - pkts_processed)); total_pkts_processed += pkts_processed; } -- cgit v1.2.3 From d98c24617a831e92e7224a07dcaed2dd0b02af96 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 23 Feb 2026 14:00:24 -0800 Subject: wifi: cw1200: Fix locking in error paths cw1200_wow_suspend() must only return with priv->conf_mutex locked if it returns zero. This mutex must be unlocked if an error is returned. Add mutex_unlock() calls to the error paths from which that call is missing. This has been detected by the Clang thread-safety analyzer. Fixes: a910e4a94f69 ("cw1200: add driver for the ST-E CW1100 & CW1200 WLAN chipsets") Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20260223220102.2158611-25-bart.vanassche@linux.dev Signed-off-by: Johannes Berg --- drivers/net/wireless/st/cw1200/pm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/st/cw1200/pm.c b/drivers/net/wireless/st/cw1200/pm.c index 120f0379f81d..84eb15d729c7 100644 --- a/drivers/net/wireless/st/cw1200/pm.c +++ b/drivers/net/wireless/st/cw1200/pm.c @@ -264,12 +264,14 @@ int cw1200_wow_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) wiphy_err(priv->hw->wiphy, "PM request failed: %d. WoW is disabled.\n", ret); cw1200_wow_resume(hw); + mutex_unlock(&priv->conf_mutex); return -EBUSY; } /* Force resume if event is coming from the device. */ if (atomic_read(&priv->bh_rx)) { cw1200_wow_resume(hw); + mutex_unlock(&priv->conf_mutex); return -EAGAIN; } -- cgit v1.2.3 From 72c6df8f284b3a49812ce2ac136727ace70acc7c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 23 Feb 2026 14:00:25 -0800 Subject: wifi: wlcore: Fix a locking bug Make sure that wl->mutex is locked before it is unlocked. This has been detected by the Clang thread-safety analyzer. 
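[Editor's note] Both wifi fixes here (cw1200 and wlcore) are instances of the same invariant: every return path must leave the lock in the state the caller expects. A minimal sketch of that invariant, with generic names rather than the drivers' actual functions:

    #include <linux/types.h>
    #include <linux/errno.h>
    #include <linux/mutex.h>

    static int demo_suspend(struct mutex *conf_mutex, bool device_busy)
    {
            mutex_lock(conf_mutex);

            if (device_busy) {
                    /* Error path: drop the lock before returning. Otherwise
                     * the caller, which only expects to hold it on success,
                     * would later unlock a mutex it does not own - exactly
                     * the imbalance the thread-safety analyzer flags.
                     */
                    mutex_unlock(conf_mutex);
                    return -EBUSY;
            }

            /* Success: by contract the mutex stays held until resume. */
            return 0;
    }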
Fixes: 45aa7f071b06 ("wlcore: Use generic runtime pm calls for wowlan elp configuration") Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20260223220102.2158611-26-bart.vanassche@linux.dev Signed-off-by: Johannes Berg --- drivers/net/wireless/ti/wlcore/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 17dd417756f2..1c340a4a0930 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -1875,6 +1875,8 @@ static int __maybe_unused wl1271_op_resume(struct ieee80211_hw *hw) wl->wow_enabled); WARN_ON(!wl->wow_enabled); + mutex_lock(&wl->mutex); + ret = pm_runtime_force_resume(wl->dev); if (ret < 0) { wl1271_error("ELP wakeup failure!"); @@ -1891,8 +1893,6 @@ static int __maybe_unused wl1271_op_resume(struct ieee80211_hw *hw) run_irq_work = true; spin_unlock_irqrestore(&wl->wl_lock, flags); - mutex_lock(&wl->mutex); - /* test the recovery flag before calling any SDIO functions */ pending_recovery = test_bit(WL1271_FLAG_RECOVERY_IN_PROGRESS, &wl->flags); -- cgit v1.2.3 From 60862846308627e9e15546bb647a00de44deb27b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 26 Feb 2026 20:11:14 +0100 Subject: wifi: mt76: mt7996: Fix possible oob access in mt7996_mac_write_txwi_80211() Check frame length before accessing the mgmt fields in mt7996_mac_write_txwi_80211 in order to avoid a possible oob access. Fixes: 98686cd21624c ("wifi: mt76: mt7996: add driver for MediaTek Wi-Fi 7 (802.11be) devices") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260226-mt76-addba-req-oob-access-v1-1-b0f6d1ad4850@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/mediatek/mt76/mt7996/mac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c index 2560e2f46e89..d4f3ee943b47 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c @@ -800,6 +800,7 @@ mt7996_mac_write_txwi_80211(struct mt7996_dev *dev, __le32 *txwi, u32 val; if (ieee80211_is_action(fc) && + skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 && mgmt->u.action.category == WLAN_CATEGORY_BACK && mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ) { if (is_mt7990(&dev->mt76)) -- cgit v1.2.3 From c41a9abd6ae31d130e8f332e7c8800c4c866234b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 26 Feb 2026 20:11:15 +0100 Subject: wifi: mt76: mt7925: Fix possible oob access in mt7925_mac_write_txwi_80211() Check frame length before accessing the mgmt fields in mt7925_mac_write_txwi_80211 in order to avoid a possible oob access. 
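[Editor's note] The bounds check added in these mt76 patches can be written as a stand-alone helper to make the arithmetic explicit. The helper name is made up; the constants and struct ieee80211_mgmt layout are mainline's:

    #include <linux/ieee80211.h>
    #include <linux/skbuff.h>

    static bool demo_is_addba_req(const struct sk_buff *skb)
    {
            const struct ieee80211_mgmt *mgmt = (const void *)skb->data;

            /* IEEE80211_MIN_ACTION_SIZE covers the management header plus the
             * category byte; action_code sits one byte after it, so require
             * one extra byte before dereferencing the ADDBA request fields.
             */
            if (skb->len < IEEE80211_MIN_ACTION_SIZE + 1)
                    return false;

            return ieee80211_is_action(mgmt->frame_control) &&
                   mgmt->u.action.category == WLAN_CATEGORY_BACK &&
                   mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ;
    }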
Fixes: c948b5da6bbec ("wifi: mt76: mt7925: add Mediatek Wi-Fi7 driver for mt7925 chips") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260226-mt76-addba-req-oob-access-v1-2-b0f6d1ad4850@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/mediatek/mt76/mt7925/mac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c index 871b67101976..0d9435900423 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c @@ -668,6 +668,7 @@ mt7925_mac_write_txwi_80211(struct mt76_dev *dev, __le32 *txwi, u32 val; if (ieee80211_is_action(fc) && + skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 && mgmt->u.action.category == WLAN_CATEGORY_BACK && mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ) tid = MT_TX_ADDBA; -- cgit v1.2.3 From 4e10a730d1b511ff49723371ed6d694dd1b2c785 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 26 Feb 2026 20:11:16 +0100 Subject: wifi: mt76: Fix possible oob access in mt76_connac2_mac_write_txwi_80211() Check frame length before accessing the mgmt fields in mt76_connac2_mac_write_txwi_80211 in order to avoid a possible oob access. Fixes: 577dbc6c656d ("mt76: mt7915: enable offloading of sequence number assignment") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260226-mt76-addba-req-oob-access-v1-3-b0f6d1ad4850@kernel.org [fix check to also cover mgmt->u.action.u.addba_req.capab, correct Fixes tag] Signed-off-by: Johannes Berg --- drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c b/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c index 3304b5971be0..b41ca1410da9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c @@ -413,6 +413,7 @@ mt76_connac2_mac_write_txwi_80211(struct mt76_dev *dev, __le32 *txwi, u32 val; if (ieee80211_is_action(fc) && + skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + 1 + 2 && mgmt->u.action.category == WLAN_CATEGORY_BACK && mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ) { u16 capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); -- cgit v1.2.3 From 710f5c76580306cdb9ec51fac8fcf6a8faff7821 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Feb 2026 17:26:03 +0000 Subject: indirect_call_wrapper: do not reevaluate function pointer We have an increasing number of READ_ONCE(xxx->function) combined with INDIRECT_CALL_[1234]() helpers. Unfortunately this forces INDIRECT_CALL_[1234]() to read xxx->function many times, which is not what we wanted. Fix these macros so that xxx->function value is not reloaded. 
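[Editor's note] The underlying hazard is ordinary macro-argument re-evaluation. A small, self-contained userspace illustration (GCC/Clang statement expressions; this is not the kernel macro itself, just the same idea):

    #include <stdio.h>

    static int evaluations;

    typedef int (*op_t)(int);

    static int add_one(int x) { return x + 1; }
    static int add_two(int x) { return x + 2; }

    static op_t load_op(op_t *slot)     /* stands in for READ_ONCE(xxx->function) */
    {
            evaluations++;
            return *slot;
    }

    /* naive form: the argument expression is expanded, and so runs, twice */
    #define CALL_NAIVE(f, f1, ...) \
            ((f) == f1 ? f1(__VA_ARGS__) : (f)(__VA_ARGS__))

    /* fixed form: evaluate the expression once into a local */
    #define CALL_ONCE(f, f1, ...) \
            ({ __typeof__(f) __f = (f); __f == f1 ? f1(__VA_ARGS__) : __f(__VA_ARGS__); })

    int main(void)
    {
            op_t slot = add_two;        /* not the "likely" target */

            evaluations = 0;
            CALL_NAIVE(load_op(&slot), add_one, 1);
            printf("naive: %d loads\n", evaluations);   /* 2: compared once, called once */

            evaluations = 0;
            CALL_ONCE(load_op(&slot), add_one, 1);
            printf("once:  %d loads\n", evaluations);   /* 1 */
            return 0;
    }

The bloat-o-meter comparison below quantifies what removing the duplicated loads saves in the kernel.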
$ scripts/bloat-o-meter -t vmlinux.0 vmlinux add/remove: 0/0 grow/shrink: 1/65 up/down: 122/-1084 (-962) Function old new delta ip_push_pending_frames 59 181 +122 ip6_finish_output 687 681 -6 __udp_enqueue_schedule_skb 1078 1072 -6 ioam6_output 2319 2312 -7 xfrm4_rcv_encap_finish2 64 56 -8 xfrm4_output 297 289 -8 vrf_ip_local_out 278 270 -8 vrf_ip6_local_out 278 270 -8 seg6_input_finish 64 56 -8 rpl_output 700 692 -8 ipmr_forward_finish 124 116 -8 ip_forward_finish 143 135 -8 ip6mr_forward2_finish 100 92 -8 ip6_forward_finish 73 65 -8 input_action_end_bpf 1091 1083 -8 dst_input 52 44 -8 __xfrm6_output 801 793 -8 __xfrm4_output 83 75 -8 bpf_input 500 491 -9 __tcp_check_space 530 521 -9 input_action_end_dt6 291 280 -11 vti6_tnl_xmit 1634 1622 -12 bpf_xmit 1203 1191 -12 rpl_input 497 483 -14 rawv6_send_hdrinc 1355 1341 -14 ndisc_send_skb 1030 1016 -14 ipv6_srh_rcv 1377 1363 -14 ip_send_unicast_reply 1253 1239 -14 ip_rcv_finish 226 212 -14 ip6_rcv_finish 300 286 -14 input_action_end_x_core 205 191 -14 input_action_end_x 355 341 -14 input_action_end_t 205 191 -14 input_action_end_dx6_finish 127 113 -14 input_action_end_dx4_finish 373 359 -14 input_action_end_dt4 426 412 -14 input_action_end_core 186 172 -14 input_action_end_b6_encap 292 278 -14 input_action_end_b6 198 184 -14 igmp6_send 1332 1318 -14 ip_sublist_rcv 864 848 -16 ip6_sublist_rcv 1091 1075 -16 ipv6_rpl_srh_rcv 1937 1920 -17 xfrm_policy_queue_process 1246 1228 -18 seg6_output_core 903 885 -18 mld_sendpack 856 836 -20 NF_HOOK 756 736 -20 vti_tunnel_xmit 1447 1426 -21 input_action_end_dx6 664 642 -22 input_action_end 1502 1480 -22 sock_sendmsg_nosec 134 111 -23 ip6mr_forward2 388 364 -24 sock_recvmsg_nosec 134 109 -25 seg6_input_core 836 810 -26 ip_send_skb 172 146 -26 ip_local_out 140 114 -26 ip6_local_out 140 114 -26 __sock_sendmsg 162 136 -26 __ip_queue_xmit 1196 1170 -26 __ip_finish_output 405 379 -26 ipmr_queue_fwd_xmit 373 346 -27 sock_recvmsg 173 145 -28 ip6_xmit 1635 1607 -28 xfrm_output_resume 1418 1389 -29 ip_build_and_send_pkt 625 591 -34 dst_output 504 432 -72 Total: Before=25217686, After=25216724, chg -0.00% Fixes: 283c16a2dfd3 ("indirect call wrappers: helpers to speed-up indirect calls of builtin") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260227172603.1700433-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/indirect_call_wrapper.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index 35227d47cfc9..dc272b514a01 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -16,22 +16,26 @@ */ #define INDIRECT_CALL_1(f, f1, ...) \ ({ \ - likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__); \ + typeof(f) __f1 = (f); \ + likely(__f1 == f1) ? f1(__VA_ARGS__) : __f1(__VA_ARGS__); \ }) #define INDIRECT_CALL_2(f, f2, f1, ...) \ ({ \ - likely(f == f2) ? f2(__VA_ARGS__) : \ - INDIRECT_CALL_1(f, f1, __VA_ARGS__); \ + typeof(f) __f2 = (f); \ + likely(__f2 == f2) ? f2(__VA_ARGS__) : \ + INDIRECT_CALL_1(__f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALL_3(f, f3, f2, f1, ...) \ ({ \ - likely(f == f3) ? f3(__VA_ARGS__) : \ - INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__); \ + typeof(f) __f3 = (f); \ + likely(__f3 == f3) ? f3(__VA_ARGS__) : \ + INDIRECT_CALL_2(__f3, f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALL_4(f, f4, f3, f2, f1, ...) \ ({ \ - likely(f == f4) ? 
f4(__VA_ARGS__) : \ - INDIRECT_CALL_3(f, f3, f2, f1, __VA_ARGS__); \ + typeof(f) __f4 = (f); \ + likely(__f4 == f4) ? f4(__VA_ARGS__) : \ + INDIRECT_CALL_3(__f4, f3, f2, f1, __VA_ARGS__); \ }) #define INDIRECT_CALLABLE_DECLARE(f) f -- cgit v1.2.3 From 6a877ececd6daa002a9a0002cd0fbca6592a9244 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Fri, 27 Feb 2026 13:23:36 -0700 Subject: net/rds: Fix circular locking dependency in rds_tcp_tune syzbot reported a circular locking dependency in rds_tcp_tune() where sk_net_refcnt_upgrade() is called while holding the socket lock: ====================================================== WARNING: possible circular locking dependency detected ====================================================== kworker/u10:8/15040 is trying to acquire lock: ffffffff8e9aaf80 (fs_reclaim){+.+.}-{0:0}, at: __kmalloc_cache_noprof+0x4b/0x6f0 but task is already holding lock: ffff88805a3c1ce0 (k-sk_lock-AF_INET6){+.+.}-{0:0}, at: rds_tcp_tune+0xd7/0x930 The issue occurs because sk_net_refcnt_upgrade() performs memory allocation (via get_net_track() -> ref_tracker_alloc()) while the socket lock is held, creating a circular dependency with fs_reclaim. Fix this by moving sk_net_refcnt_upgrade() outside the socket lock critical section. This is safe because the fields modified by the sk_net_refcnt_upgrade() call (sk_net_refcnt, ns_tracker) are not accessed by any concurrent code path at this point. v2: - Corrected fixes tag - check patch line wrap nits - ai commentary nits Reported-by: syzbot+2e2cf5331207053b8106@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2e2cf5331207053b8106 Fixes: 3a58f13a881e ("net: rds: acquire refcount on TCP sockets") Signed-off-by: Allison Henderson Link: https://patch.msgid.link/20260227202336.167757-1-achender@kernel.org Signed-off-by: Paolo Abeni --- net/rds/tcp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 04f310255692..654e23d13e3d 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -490,18 +490,24 @@ bool rds_tcp_tune(struct socket *sock) struct rds_tcp_net *rtn; tcp_sock_set_nodelay(sock->sk); - lock_sock(sk); /* TCP timer functions might access net namespace even after * a process which created this net namespace terminated. */ if (!sk->sk_net_refcnt) { - if (!maybe_get_net(net)) { - release_sock(sk); + if (!maybe_get_net(net)) return false; - } + /* + * sk_net_refcnt_upgrade() must be called before lock_sock() + * because it does a GFP_KERNEL allocation, which can trigger + * fs_reclaim and create a circular lock dependency with the + * socket lock. The fields it modifies (sk_net_refcnt, + * ns_tracker) are not accessed by any concurrent code path + * at this point. + */ sk_net_refcnt_upgrade(sk); put_net(net); } + lock_sock(sk); rtn = net_generic(net, rds_tcp_netid); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; -- cgit v1.2.3 From 1a86a1f7d88996085934139fa4c063b6299a2dd3 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Fri, 27 Feb 2026 22:19:37 +0000 Subject: net: Fix rcu_tasks stall in threaded busypoll I was debugging a NIC driver when I noticed that when I enable threaded busypoll, bpftrace hangs when starting up. dmesg showed: rcu_tasks_wait_gp: rcu_tasks grace period number 85 (since boot) is 10658 jiffies old. rcu_tasks_wait_gp: rcu_tasks grace period number 85 (since boot) is 40793 jiffies old. rcu_tasks_wait_gp: rcu_tasks grace period number 85 (since boot) is 131273 jiffies old. 
rcu_tasks_wait_gp: rcu_tasks grace period number 85 (since boot) is 402058 jiffies old. INFO: rcu_tasks detected stalls on tasks: 00000000769f52cd: .N nvcsw: 2/2 holdout: 1 idle_cpu: -1/64 task:napi/eth2-8265 state:R running task stack:0 pid:48300 tgid:48300 ppid:2 task_flags:0x208040 flags:0x00004000 Call Trace: ? napi_threaded_poll_loop+0x27c/0x2c0 ? __pfx_napi_threaded_poll+0x10/0x10 ? napi_threaded_poll+0x26/0x80 ? kthread+0xfa/0x240 ? __pfx_kthread+0x10/0x10 ? ret_from_fork+0x31/0x50 ? __pfx_kthread+0x10/0x10 ? ret_from_fork_asm+0x1a/0x30 The cause is that in threaded busypoll, the main loop is in napi_threaded_poll rather than napi_threaded_poll_loop, where the latter rarely iterates more than once within its loop. For rcu_softirq_qs_periodic inside napi_threaded_poll_loop to report its qs state, the last_qs must be 100ms behind, and this can't happen because napi_threaded_poll_loop rarely iterates in threaded busypoll, and each time napi_threaded_poll_loop is called last_qs is reset to latest jiffies. This patch changes so that in threaded busypoll, last_qs is saved in the outer napi_threaded_poll, and whether busy_poll_last_qs is NULL indicates whether napi_threaded_poll_loop is called for busypoll. This way last_qs would not reset to latest jiffies on each invocation of napi_threaded_poll_loop. Fixes: c18d4b190a46 ("net: Extend NAPI threaded polling to allow kthread based busy polling") Cc: stable@vger.kernel.org Signed-off-by: YiFei Zhu Reviewed-by: Samiullah Khawaja Link: https://patch.msgid.link/20260227221937.1060857-1-zhuyifei@google.com Signed-off-by: Paolo Abeni --- net/core/dev.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index c1a9f7fdcffa..4af4cf2d63a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7794,11 +7794,12 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) +static void napi_threaded_poll_loop(struct napi_struct *napi, + unsigned long *busy_poll_last_qs) { + unsigned long last_qs = busy_poll_last_qs ? *busy_poll_last_qs : jiffies; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; - unsigned long last_qs = jiffies; for (;;) { bool repoll = false; @@ -7827,12 +7828,12 @@ static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) /* When busy poll is enabled, the old packets are not flushed in * napi_complete_done. So flush them here. */ - if (busy_poll) + if (busy_poll_last_qs) gro_flush_normal(&napi->gro, HZ >= 1000); local_bh_enable(); /* Call cond_resched here to avoid watchdog warnings. */ - if (repoll || busy_poll) { + if (repoll || busy_poll_last_qs) { rcu_softirq_qs_periodic(last_qs); cond_resched(); } @@ -7840,11 +7841,15 @@ static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) if (!repoll) break; } + + if (busy_poll_last_qs) + *busy_poll_last_qs = last_qs; } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; + unsigned long last_qs = jiffies; bool want_busy_poll; bool in_busy_poll; unsigned long val; @@ -7862,7 +7867,7 @@ static int napi_threaded_poll(void *data) assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, want_busy_poll); - napi_threaded_poll_loop(napi, want_busy_poll); + napi_threaded_poll_loop(napi, want_busy_poll ? 
&last_qs : NULL); } return 0; @@ -13175,7 +13180,7 @@ static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); - napi_threaded_poll_loop(&sd->backlog, false); + napi_threaded_poll_loop(&sd->backlog, NULL); } static void backlog_napi_setup(unsigned int cpu) -- cgit v1.2.3 From 1336b579f6079fb8520be03624fcd9ba443c930b Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Tue, 3 Mar 2026 22:35:30 +0800 Subject: sched_ext: Remove redundant css_put() in scx_cgroup_init() The iterator css_for_each_descendant_pre() walks the cgroup hierarchy under cgroup_lock(). It does not increment the reference counts on yielded css structs. According to the cgroup documentation, css_put() should only be used to release a reference obtained via css_get() or css_tryget_online(). Since the iterator does not use either of these to acquire a reference, calling css_put() in the error path of scx_cgroup_init() causes a refcount underflow. Remove the unbalanced css_put() to prevent a potential Use-After-Free (UAF) vulnerability. Fixes: 819513666966 ("sched_ext: Add cgroup support") Cc: stable@vger.kernel.org # v6.12+ Signed-off-by: Cheng-Yang Chou Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2ba69b302368..eab6e09b6442 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3589,7 +3589,6 @@ static int scx_cgroup_init(struct scx_sched *sch) ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { - css_put(css); scx_error(sch, "ops.cgroup_init() failed (%d)", ret); return ret; } -- cgit v1.2.3 From eef33aa44935d001747ca97703c08dd6f9031162 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 3 Dec 2025 14:29:48 +0100 Subject: ice: fix adding AQ LLDP filter for VF The referenced commit came from a misunderstanding of the FW LLDP filter AQ (Admin Queue) command due to the error in the internal documentation. Contrary to the assumptions in the original commit, VFs can be added and deleted from this filter without any problems. Introduced dev_info message proved to be useful, so reverting the whole commit does not make sense. Without this fix, trusted VFs do not receive LLDP traffic, if there is an AQ LLDP filter on PF. When trusted VF attempts to add an LLDP multicast MAC address, the following message can be seen in dmesg on host: ice 0000:33:00.0: Failed to add Rx LLDP rule on VSI 20 error: -95 Revert checking VSI type when adding LLDP filter through AQ. 
Fixes: 4d5a1c4e6d49 ("ice: do not add LLDP-specific filter if not necessary") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 6cd63190f55d..74fe74225277 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -6391,7 +6391,7 @@ int ice_lldp_fltr_add_remove(struct ice_hw *hw, struct ice_vsi *vsi, bool add) struct ice_aqc_lldp_filter_ctrl *cmd; struct libie_aq_desc desc; - if (vsi->type != ICE_VSI_PF || !ice_fw_supports_lldp_fltr_ctrl(hw)) + if (!ice_fw_supports_lldp_fltr_ctrl(hw)) return -EOPNOTSUPP; cmd = libie_aq_raw(&desc); -- cgit v1.2.3 From 326256c0a72d4877cec1d4df85357da106233128 Mon Sep 17 00:00:00 2001 From: Jakub Staniszewski Date: Tue, 13 Jan 2026 20:38:16 +0100 Subject: ice: reintroduce retry mechanism for indirect AQ Add retry mechanism for indirect Admin Queue (AQ) commands. To do so we need to keep the command buffer. This technically reverts commit 43a630e37e25 ("ice: remove unused buffer copy code in ice_sq_send_cmd_retry()"), but combines it with a fix in the logic by using a kmemdup() call, making it more robust and less likely to break in the future due to programmer error. Cc: Michal Schmidt Cc: stable@vger.kernel.org Fixes: 3056df93f7a8 ("ice: Re-send some AQ commands, as result of EBUSY AQ error") Signed-off-by: Jakub Staniszewski Co-developed-by: Dawid Osuchowski Signed-off-by: Dawid Osuchowski Reviewed-by: Aleksandr Loktionov Reviewed-by: Przemek Kitszel Reviewed-by: Paul Menzel Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 74fe74225277..fd32c318d3f5 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1841,6 +1841,7 @@ ice_sq_send_cmd_retry(struct ice_hw *hw, struct ice_ctl_q_info *cq, { struct libie_aq_desc desc_cpy; bool is_cmd_for_retry; + u8 *buf_cpy = NULL; u8 idx = 0; u16 opcode; int status; @@ -1850,8 +1851,11 @@ ice_sq_send_cmd_retry(struct ice_hw *hw, struct ice_ctl_q_info *cq, memset(&desc_cpy, 0, sizeof(desc_cpy)); if (is_cmd_for_retry) { - /* All retryable cmds are direct, without buf. */ - WARN_ON(buf); + if (buf) { + buf_cpy = kmemdup(buf, buf_size, GFP_KERNEL); + if (!buf_cpy) + return -ENOMEM; + } memcpy(&desc_cpy, desc, sizeof(desc_cpy)); } @@ -1863,12 +1867,14 @@ ice_sq_send_cmd_retry(struct ice_hw *hw, struct ice_ctl_q_info *cq, hw->adminq.sq_last_status != LIBIE_AQ_RC_EBUSY) break; + if (buf_cpy) + memcpy(buf, buf_cpy, buf_size); memcpy(desc, &desc_cpy, sizeof(desc_cpy)); - msleep(ICE_SQ_SEND_DELAY_TIME_MS); } while (++idx < ICE_SQ_SEND_MAX_EXECUTE); + kfree(buf_cpy); return status; } -- cgit v1.2.3 From fb4903b3354aed4a2301180cf991226f896c87ed Mon Sep 17 00:00:00 2001 From: Jakub Staniszewski Date: Tue, 13 Jan 2026 20:38:17 +0100 Subject: ice: fix retry for AQ command 0x06EE Executing ethtool -m can fail reporting a netlink I/O error while firmware link management holds the i2c bus used to communicate with the module. 
According to Intel(R) Ethernet Controller E810 Datasheet Rev 2.8 [1] Section 3.3.10.4 Read/Write SFF EEPROM (0x06EE) request should to be retried upon receiving EBUSY from firmware. Commit e9c9692c8a81 ("ice: Reimplement module reads used by ethtool") implemented it only for part of ice_get_module_eeprom(), leaving all other calls to ice_aq_sff_eeprom() vulnerable to returning early on getting EBUSY without retrying. Remove the retry loop from ice_get_module_eeprom() and add Admin Queue (AQ) command with opcode 0x06EE to the list of commands that should be retried on receiving EBUSY from firmware. Cc: stable@vger.kernel.org Fixes: e9c9692c8a81 ("ice: Reimplement module reads used by ethtool") Signed-off-by: Jakub Staniszewski Co-developed-by: Dawid Osuchowski Signed-off-by: Dawid Osuchowski Reviewed-by: Aleksandr Loktionov Reviewed-by: Przemek Kitszel Link: https://www.intel.com/content/www/us/en/content-details/613875/intel-ethernet-controller-e810-datasheet.html [1] Reviewed-by: Paul Menzel Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 1 + drivers/net/ethernet/intel/ice/ice_ethtool.c | 35 +++++++++++----------------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index fd32c318d3f5..ce11fea122d0 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1816,6 +1816,7 @@ static bool ice_should_retry_sq_send_cmd(u16 opcode) case ice_aqc_opc_lldp_stop: case ice_aqc_opc_lldp_start: case ice_aqc_opc_lldp_filter_ctrl: + case ice_aqc_opc_sff_eeprom: return true; } diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 3c1a084c2a4b..29e341251754 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -4509,7 +4509,7 @@ ice_get_module_eeprom(struct net_device *netdev, u8 addr = ICE_I2C_EEPROM_DEV_ADDR; struct ice_hw *hw = &pf->hw; bool is_sfp = false; - unsigned int i, j; + unsigned int i; u16 offset = 0; u8 page = 0; int status; @@ -4551,26 +4551,19 @@ ice_get_module_eeprom(struct net_device *netdev, if (page == 0 || !(data[0x2] & 0x4)) { u32 copy_len; - /* If i2c bus is busy due to slow page change or - * link management access, call can fail. This is normal. - * So we retry this a few times. 
- */ - for (j = 0; j < 4; j++) { - status = ice_aq_sff_eeprom(hw, 0, addr, offset, page, - !is_sfp, value, - SFF_READ_BLOCK_SIZE, - 0, NULL); - netdev_dbg(netdev, "SFF %02X %02X %02X %X = %02X%02X%02X%02X.%02X%02X%02X%02X (%X)\n", - addr, offset, page, is_sfp, - value[0], value[1], value[2], value[3], - value[4], value[5], value[6], value[7], - status); - if (status) { - usleep_range(1500, 2500); - memset(value, 0, SFF_READ_BLOCK_SIZE); - continue; - } - break; + status = ice_aq_sff_eeprom(hw, 0, addr, offset, page, + !is_sfp, value, + SFF_READ_BLOCK_SIZE, + 0, NULL); + netdev_dbg(netdev, "SFF %02X %02X %02X %X = %02X%02X%02X%02X.%02X%02X%02X%02X (%pe)\n", + addr, offset, page, is_sfp, + value[0], value[1], value[2], value[3], + value[4], value[5], value[6], value[7], + ERR_PTR(status)); + if (status) { + netdev_err(netdev, "%s: error reading module EEPROM: status %pe\n", + __func__, ERR_PTR(status)); + return status; } /* Make sure we have enough room for the new block */ -- cgit v1.2.3 From fe868b499d16f55bbeea89992edb98043c9de416 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Thu, 22 Jan 2026 03:26:44 +0000 Subject: ice: Fix memory leak in ice_set_ringparam() In ice_set_ringparam, tx_rings and xdp_rings are allocated before rx_rings. If the allocation of rx_rings fails, the code jumps to the done label leaking both tx_rings and xdp_rings. Furthermore, if the setup of an individual Rx ring fails during the loop, the code jumps to the free_tx label which releases tx_rings but leaks xdp_rings. Fix this by introducing a free_xdp label and updating the error paths to ensure both xdp_rings and tx_rings are properly freed if rx_rings allocation or setup fails. Compile tested only. Issue found using a prototype static analysis tool and code review. Fixes: fcea6f3da546 ("ice: Add stats and ethtool support") Fixes: efc2214b6047 ("ice: Add support for XDP") Signed-off-by: Zilin Guan Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 29e341251754..b9be10b58856 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3332,7 +3332,7 @@ process_rx: rx_rings = kzalloc_objs(*rx_rings, vsi->num_rxq); if (!rx_rings) { err = -ENOMEM; - goto done; + goto free_xdp; } ice_for_each_rxq(vsi, i) { @@ -3359,7 +3359,7 @@ rx_unwind: } kfree(rx_rings); err = -ENOMEM; - goto free_tx; + goto free_xdp; } } @@ -3411,6 +3411,13 @@ process_link: } goto done; +free_xdp: + if (xdp_rings) { + ice_for_each_xdp_txq(vsi, i) + ice_free_tx_ring(&xdp_rings[i]); + kfree(xdp_rings); + } + free_tx: /* error cleanup if the Rx allocations failed after getting Tx */ if (tx_rings) { -- cgit v1.2.3 From 636cc3bd12f499c74eaf5dc9a7d5b832f1bb24ed Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Wed, 11 Feb 2026 10:11:40 +0100 Subject: libie: don't unroll if fwlog isn't supported The libie_fwlog_deinit() function can be called during driver unload even when firmware logging was never properly initialized. 
This led to call trace: [ 148.576156] Oops: Oops: 0000 [#1] SMP NOPTI [ 148.576167] CPU: 80 UID: 0 PID: 12843 Comm: rmmod Kdump: loaded Not tainted 6.17.0-rc7next-queue-3oct-01915-g06d79d51cf51 #1 PREEMPT(full) [ 148.576177] Hardware name: HPE ProLiant DL385 Gen10 Plus/ProLiant DL385 Gen10 Plus, BIOS A42 07/18/2020 [ 148.576182] RIP: 0010:__dev_printk+0x16/0x70 [ 148.576196] Code: 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 55 41 54 49 89 d4 55 48 89 fd 53 48 85 f6 74 3c <4c> 8b 6e 50 48 89 f3 4d 85 ed 75 03 4c 8b 2e 48 89 df e8 f3 27 98 [ 148.576204] RSP: 0018:ffffd2fd7ea17a48 EFLAGS: 00010202 [ 148.576211] RAX: ffffd2fd7ea17aa0 RBX: ffff8eb288ae2000 RCX: 0000000000000000 [ 148.576217] RDX: ffffd2fd7ea17a70 RSI: 00000000000000c8 RDI: ffffffffb68d3d88 [ 148.576222] RBP: ffffffffb68d3d88 R08: 0000000000000000 R09: 0000000000000000 [ 148.576227] R10: 00000000000000c8 R11: ffff8eb2b1a49400 R12: ffffd2fd7ea17a70 [ 148.576231] R13: ffff8eb3141fb000 R14: ffffffffc1215b48 R15: ffffffffc1215bd8 [ 148.576236] FS: 00007f5666ba6740(0000) GS:ffff8eb2472b9000(0000) knlGS:0000000000000000 [ 148.576242] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 148.576247] CR2: 0000000000000118 CR3: 000000011ad17000 CR4: 0000000000350ef0 [ 148.576252] Call Trace: [ 148.576258] [ 148.576269] _dev_warn+0x7c/0x96 [ 148.576290] libie_fwlog_deinit+0x112/0x117 [libie_fwlog] [ 148.576303] ixgbe_remove+0x63/0x290 [ixgbe] [ 148.576342] pci_device_remove+0x42/0xb0 [ 148.576354] device_release_driver_internal+0x19c/0x200 [ 148.576365] driver_detach+0x48/0x90 [ 148.576372] bus_remove_driver+0x6d/0xf0 [ 148.576383] pci_unregister_driver+0x2e/0xb0 [ 148.576393] ixgbe_exit_module+0x1c/0xd50 [ixgbe] [ 148.576430] __do_sys_delete_module.isra.0+0x1bc/0x2e0 [ 148.576446] do_syscall_64+0x7f/0x980 It can be reproduced by trying to unload ixgbe driver in recovery mode. Fix that by checking if fwlog is supported before doing unroll. Fixes: 641585bc978e ("ixgbe: fwlog support for e610") Reviewed-by: Aleksandr Loktionov Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libie/fwlog.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/libie/fwlog.c b/drivers/net/ethernet/intel/libie/fwlog.c index 79020d990859..4d0c8370386b 100644 --- a/drivers/net/ethernet/intel/libie/fwlog.c +++ b/drivers/net/ethernet/intel/libie/fwlog.c @@ -1049,6 +1049,10 @@ void libie_fwlog_deinit(struct libie_fwlog *fwlog) { int status; + /* if FW logging isn't supported it means no configuration was done */ + if (!libie_fwlog_supported(fwlog)) + return; + /* make sure FW logging is disabled to not put the FW in a weird state * for the next driver load */ -- cgit v1.2.3 From b84852170153671bb0fa6737a6e48370addd8e1a Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Tue, 10 Feb 2026 15:57:14 +0000 Subject: iavf: fix netdev->max_mtu to respect actual hardware limit iavf sets LIBIE_MAX_MTU as netdev->max_mtu, ignoring vf_res->max_mtu from PF [1]. This allows setting an MTU beyond the actual hardware limit, causing TX queue timeouts [2]. Set correct netdev->max_mtu using vf_res->max_mtu from the PF. Note that currently PF drivers such as ice/i40e set the frame size in vf_res->max_mtu, not MTU. Convert vf_res->max_mtu to MTU before setting netdev->max_mtu. 
[1] # ip -j -d link show $DEV | jq '.[0].max_mtu' 16356 [2] iavf 0000:00:05.0 enp0s5: NETDEV WATCHDOG: CPU: 1: transmit queue 0 timed out 5692 ms iavf 0000:00:05.0 enp0s5: NIC Link is Up Speed is 10 Gbps Full Duplex iavf 0000:00:05.0 enp0s5: NETDEV WATCHDOG: CPU: 6: transmit queue 3 timed out 5312 ms iavf 0000:00:05.0 enp0s5: NIC Link is Up Speed is 10 Gbps Full Duplex ... Fixes: 5fa4caff59f2 ("iavf: switch to Page Pool") Signed-off-by: Kohei Enju Reviewed-by: Alexander Lobakin Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index bceaf4b1b85d..86c1964f42e1 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2793,7 +2793,22 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) netdev->watchdog_timeo = 5 * HZ; netdev->min_mtu = ETH_MIN_MTU; - netdev->max_mtu = LIBIE_MAX_MTU; + + /* PF/VF API: vf_res->max_mtu is max frame size (not MTU). + * Convert to MTU. + */ + if (!adapter->vf_res->max_mtu) { + netdev->max_mtu = LIBIE_MAX_MTU; + } else if (adapter->vf_res->max_mtu < LIBETH_RX_LL_LEN + ETH_MIN_MTU || + adapter->vf_res->max_mtu > + LIBETH_RX_LL_LEN + LIBIE_MAX_MTU) { + netdev_warn_once(adapter->netdev, + "invalid max frame size %d from PF, using default MTU %d", + adapter->vf_res->max_mtu, LIBIE_MAX_MTU); + netdev->max_mtu = LIBIE_MAX_MTU; + } else { + netdev->max_mtu = adapter->vf_res->max_mtu - LIBETH_RX_LL_LEN; + } if (!is_valid_ether_addr(adapter->hw.mac.addr)) { dev_info(&pdev->dev, "Invalid MAC address %pM, using random\n", -- cgit v1.2.3 From d4c13ab36273a8c318ba06799793cc1f5d9c6fa1 Mon Sep 17 00:00:00 2001 From: Vivek Behera Date: Thu, 22 Jan 2026 15:16:52 +0100 Subject: igb: Fix trigger of incorrect irq in igb_xsk_wakeup The current implementation in the igb_xsk_wakeup expects the Rx and Tx queues to share the same irq. This would lead to triggering of incorrect irq in split irq configuration. 
This patch addresses this issue which could impact environments with 2 active cpu cores or when the number of queues is reduced to 2 or less cat /proc/interrupts | grep eno2 167: 0 0 0 0 IR-PCI-MSIX-0000:08:00.0 0-edge eno2 168: 0 0 0 0 IR-PCI-MSIX-0000:08:00.0 1-edge eno2-rx-0 169: 0 0 0 0 IR-PCI-MSIX-0000:08:00.0 2-edge eno2-rx-1 170: 0 0 0 0 IR-PCI-MSIX-0000:08:00.0 3-edge eno2-tx-0 171: 0 0 0 0 IR-PCI-MSIX-0000:08:00.0 4-edge eno2-tx-1 Furthermore it uses the flags input argument to trigger either rx, tx or both rx and tx irqs as specified in the ndo_xsk_wakeup api documentation Fixes: 80f6ccf9f116 ("igb: Introduce XSK data structures and helpers") Signed-off-by: Vivek Behera Reviewed-by: Aleksandr Loktionov Suggested-by: Maciej Fijalkowski Acked-by: Maciej Fijalkowski Tested-by: Saritha Sanigani (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_xsk.c | 38 +++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c index 30ce5fbb5b77..ce4a7b58cad2 100644 --- a/drivers/net/ethernet/intel/igb/igb_xsk.c +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c @@ -524,6 +524,16 @@ bool igb_xmit_zc(struct igb_ring *tx_ring, struct xsk_buff_pool *xsk_pool) return nb_pkts < budget; } +static u32 igb_sw_irq_prep(struct igb_q_vector *q_vector) +{ + u32 eics = 0; + + if (!napi_if_scheduled_mark_missed(&q_vector->napi)) + eics = q_vector->eims_value; + + return eics; +} + int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) { struct igb_adapter *adapter = netdev_priv(dev); @@ -542,20 +552,32 @@ int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) ring = adapter->tx_ring[qid]; - if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags)) - return -ENETDOWN; - if (!READ_ONCE(ring->xsk_pool)) return -EINVAL; - if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) { + if (flags & XDP_WAKEUP_TX) { + if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags)) + return -ENETDOWN; + + eics |= igb_sw_irq_prep(ring->q_vector); + } + + if (flags & XDP_WAKEUP_RX) { + /* If IGB_FLAG_QUEUE_PAIRS is active, the q_vector + * and NAPI is shared between RX and TX. + * If NAPI is already running it would be marked as missed + * from the TX path, making this RX call a NOP + */ + ring = adapter->rx_ring[qid]; + eics |= igb_sw_irq_prep(ring->q_vector); + } + + if (eics) { /* Cause software interrupt */ - if (adapter->flags & IGB_FLAG_HAS_MSIX) { - eics |= ring->q_vector->eims_value; + if (adapter->flags & IGB_FLAG_HAS_MSIX) wr32(E1000_EICS, eics); - } else { + else wr32(E1000_ICS, E1000_ICS_RXDMT0); - } } return 0; -- cgit v1.2.3 From 554a1c34c11a057d01819ce9bb04653a8ffc8071 Mon Sep 17 00:00:00 2001 From: Vivek Behera Date: Tue, 20 Jan 2026 08:52:16 +0100 Subject: igc: Fix trigger of incorrect irq in igc_xsk_wakeup function This patch addresses the issue where the igc_xsk_wakeup function was triggering an incorrect IRQ for tx-0 when the i226 is configured with only 2 combined queues or in an environment with 2 active CPU cores. This prevented XDP Zero-copy send functionality in such split IRQ configurations. The fix implements the correct logic for extracting q_vectors saved during rx and tx ring allocation and utilizes flags provided by the ndo_xsk_wakeup API to trigger the appropriate IRQ. 
Fixes: fc9df2a0b520 ("igc: Enable RX via AF_XDP zero-copy") Fixes: 15fd021bc427 ("igc: Add Tx hardware timestamp request for AF_XDP zero-copy packet") Signed-off-by: Vivek Behera Reviewed-by: Jacob Keller Reviewed-by: Aleksandr loktinov Reviewed-by: Piotr Kwapulinski Reviewed-by: Song Yoong Siang Tested-by: Avigail Dahan Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 34 ++++++++++++++++++++++--------- drivers/net/ethernet/intel/igc/igc_ptp.c | 3 ++- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 27e5c2109138..b2e8d0c0f827 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6906,28 +6906,29 @@ static int igc_xdp_xmit(struct net_device *dev, int num_frames, return nxmit; } -static void igc_trigger_rxtxq_interrupt(struct igc_adapter *adapter, - struct igc_q_vector *q_vector) +static u32 igc_sw_irq_prep(struct igc_q_vector *q_vector) { - struct igc_hw *hw = &adapter->hw; u32 eics = 0; - eics |= q_vector->eims_value; - wr32(IGC_EICS, eics); + if (!napi_if_scheduled_mark_missed(&q_vector->napi)) + eics = q_vector->eims_value; + + return eics; } int igc_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) { struct igc_adapter *adapter = netdev_priv(dev); - struct igc_q_vector *q_vector; + struct igc_hw *hw = &adapter->hw; struct igc_ring *ring; + u32 eics = 0; if (test_bit(__IGC_DOWN, &adapter->state)) return -ENETDOWN; if (!igc_xdp_is_enabled(adapter)) return -ENXIO; - + /* Check if queue_id is valid. Tx and Rx queue numbers are always same */ if (queue_id >= adapter->num_rx_queues) return -EINVAL; @@ -6936,9 +6937,22 @@ int igc_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) if (!ring->xsk_pool) return -ENXIO; - q_vector = adapter->q_vector[queue_id]; - if (!napi_if_scheduled_mark_missed(&q_vector->napi)) - igc_trigger_rxtxq_interrupt(adapter, q_vector); + if (flags & XDP_WAKEUP_RX) + eics |= igc_sw_irq_prep(ring->q_vector); + + if (flags & XDP_WAKEUP_TX) { + /* If IGC_FLAG_QUEUE_PAIRS is active, the q_vector + * and NAPI is shared between RX and TX. + * If NAPI is already running it would be marked as missed + * from the RX path, making this TX call a NOP + */ + ring = adapter->tx_ring[queue_id]; + eics |= igc_sw_irq_prep(ring->q_vector); + } + + if (eics) + /* Cause software interrupt */ + wr32(IGC_EICS, eics); return 0; } diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 7aae83c108fd..44ee19386766 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -550,7 +550,8 @@ static void igc_ptp_free_tx_buffer(struct igc_adapter *adapter, tstamp->buffer_type = 0; /* Trigger txrx interrupt for transmit completion */ - igc_xsk_wakeup(adapter->netdev, tstamp->xsk_queue_index, 0); + igc_xsk_wakeup(adapter->netdev, tstamp->xsk_queue_index, + XDP_WAKEUP_TX); return; } -- cgit v1.2.3 From b06ccbabe2506fd70b9167a644978b049150224a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 3 Mar 2026 01:01:15 -1000 Subject: sched_ext: Fix starvation of scx_enable() under fair-class saturation During scx_enable(), the READY -> ENABLED task switching loop changes the calling thread's sched_class from fair to ext. Since fair has higher priority than ext, saturating fair-class workloads can indefinitely starve the enable thread, hanging the system. 
This was introduced when the enable path switched from preempt_disable() to scx_bypass() which doesn't protect against fair-class starvation. Note that the original preempt_disable() protection wasn't complete either - in partial switch modes, the calling thread could still be starved after preempt_enable() as it may have been switched to ext class. Fix it by offloading the enable body to a dedicated system-wide RT (SCHED_FIFO) kthread which cannot be starved by either fair or ext class tasks. scx_enable() lazily creates the kthread on first use and passes the ops pointer through a struct scx_enable_cmd containing the kthread_work, then synchronously waits for completion. The workfn runs on a different kthread from sch->helper (which runs disable_work), so it can safely flush disable_work on the error path without deadlock. Fixes: 8c2090c504e9 ("sched_ext: Initialize in bypass mode") Cc: stable@vger.kernel.org # v6.12+ Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 66 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index eab6e09b6442..ba51969718f5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4975,20 +4975,30 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) return 0; } -static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) +/* + * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid + * starvation. During the READY -> ENABLED task switching loop, the calling + * thread's sched_class gets switched from fair to ext. As fair has higher + * priority than ext, the calling thread can be indefinitely starved under + * fair-class saturation, leading to a system hang. 
+ */ +struct scx_enable_cmd { + struct kthread_work work; + struct sched_ext_ops *ops; + int ret; +}; + +static void scx_enable_workfn(struct kthread_work *work) { + struct scx_enable_cmd *cmd = + container_of(work, struct scx_enable_cmd, work); + struct sched_ext_ops *ops = cmd->ops; struct scx_sched *sch; struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; int i, cpu, ret; - if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), - cpu_possible_mask)) { - pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); - return -EINVAL; - } - mutex_lock(&scx_enable_mutex); if (scx_enable_state() != SCX_DISABLED) { @@ -5205,13 +5215,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) atomic_long_inc(&scx_enable_seq); - return 0; + cmd->ret = 0; + return; err_free_ksyncs: free_kick_syncs(); err_unlock: mutex_unlock(&scx_enable_mutex); - return ret; + cmd->ret = ret; + return; err_disable_unlock_all: scx_cgroup_unlock(); @@ -5230,7 +5242,41 @@ err_disable: */ scx_error(sch, "scx_enable() failed (%d)", ret); kthread_flush_work(&sch->disable_work); - return 0; + cmd->ret = 0; +} + +static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) +{ + static struct kthread_worker *helper; + static DEFINE_MUTEX(helper_mutex); + struct scx_enable_cmd cmd; + + if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), + cpu_possible_mask)) { + pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); + return -EINVAL; + } + + if (!READ_ONCE(helper)) { + mutex_lock(&helper_mutex); + if (!helper) { + helper = kthread_run_worker(0, "scx_enable_helper"); + if (IS_ERR_OR_NULL(helper)) { + helper = NULL; + mutex_unlock(&helper_mutex); + return -ENOMEM; + } + sched_set_fifo(helper->task); + } + mutex_unlock(&helper_mutex); + } + + kthread_init_work(&cmd.work, scx_enable_workfn); + cmd.ops = ops; + + kthread_queue_work(READ_ONCE(helper), &cmd.work); + kthread_flush_work(&cmd.work); + return cmd.ret; } -- cgit v1.2.3 From 2ffb4f5c2ccb2fa1c049dd11899aee7967deef5a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 1 Mar 2026 11:45:48 -0800 Subject: ipv6: fix NULL pointer deref in ip6_rt_get_dev_rcu() l3mdev_master_dev_rcu() can return NULL when the slave device is being un-slaved from a VRF. All other callers deal with this, but we lost the fallback to loopback in ip6_rt_pcpu_alloc() -> ip6_rt_get_dev_rcu() with commit 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address"). KASAN: null-ptr-deref in range [0x0000000000000108-0x000000000000010f] RIP: 0010:ip6_rt_pcpu_alloc (net/ipv6/route.c:1418) Call Trace: ip6_pol_route (net/ipv6/route.c:2318) fib6_rule_lookup (net/ipv6/fib6_rules.c:115) ip6_route_output_flags (net/ipv6/route.c:2607) vrf_process_v6_outbound (drivers/net/vrf.c:437) I was tempted to rework the un-slaving code to clear the flag first and insert synchronize_rcu() before we remove the upper. But looks like the explicit fallback to loopback_dev is an established pattern. And I guess avoiding the synchronize_rcu() is nice, too. 
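For reference, the established fallback pattern mentioned above, as used by other l3mdev callers (illustrative sketch only; under rcu_read_lock() the master can already be gone while the slave flag is still set):

        master = l3mdev_master_dev_rcu(dev);
        if (!master)
                master = dev_net(dev)->loopback_dev;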
Fixes: 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") Reviewed-by: David Ahern Link: https://patch.msgid.link/20260301194548.927324-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 85df25c36409..7db0c837196c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1063,7 +1063,8 @@ static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) */ if (netif_is_l3_slave(dev) && !rt6_need_strict(&res->f6i->fib6_dst.addr)) - dev = l3mdev_master_dev_rcu(dev); + dev = l3mdev_master_dev_rcu(dev) ? : + dev_net(dev)->loopback_dev; else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; /* last case is netif_is_l3_master(dev) is true in which -- cgit v1.2.3 From 67edfec516d30d3e62925c397be4a1e5185802fc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 2 Mar 2026 12:36:00 -0800 Subject: net/tcp-ao: Fix MAC comparison to be constant-time To prevent timing attacks, MACs need to be compared in constant time. Use the appropriate helper function for this. Fixes: 0a3a809089eb ("net/tcp: Verify inbound TCP-AO signed segments") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20260302203600.13561-1-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/Kconfig | 1 + net/ipv4/tcp_ao.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index b71c22475c51..3ab6247be585 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -748,6 +748,7 @@ config TCP_SIGPOOL config TCP_AO bool "TCP: Authentication Option (RFC5925)" select CRYPTO + select CRYPTO_LIB_UTILS select TCP_SIGPOOL depends on 64BIT && IPV6 != m # seq-number extension needs WRITE_ONCE(u64) help diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 4980caddb0fc..a97cdf3e6af4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include +#include #include #include @@ -922,7 +923,7 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, /* XXX: make it per-AF callback? */ tcp_ao_hash_skb(family, hash_buf, key, sk, skb, traffic_key, (phash - (u8 *)th), sne); - if (memcmp(phash, hash_buf, maclen)) { + if (crypto_memneq(phash, hash_buf, maclen)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); atomic64_inc(&info->counters.pkt_bad); atomic64_inc(&key->pkt_bad); -- cgit v1.2.3 From 4ee7fa6cf78ff26d783d39e2949d14c4c1cd5e7f Mon Sep 17 00:00:00 2001 From: Yung Chih Su Date: Mon, 2 Mar 2026 14:02:47 +0800 Subject: net: ipv4: fix ARM64 alignment fault in multipath hash seed `struct sysctl_fib_multipath_hash_seed` contains two u32 fields (user_seed and mp_seed), making it an 8-byte structure with a 4-byte alignment requirement. In `fib_multipath_hash_from_keys()`, the code evaluates the entire struct atomically via `READ_ONCE()`: mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed; While this silently works on GCC by falling back to unaligned regular loads which the ARM64 kernel tolerates, it causes a fatal kernel panic when compiled with Clang and LTO enabled. Commit e35123d83ee3 ("arm64: lto: Strengthen READ_ONCE() to acquire when CONFIG_LTO=y") strengthens `READ_ONCE()` to use Load-Acquire instructions (`ldar` / `ldapr`) to prevent compiler reordering bugs under Clang LTO. 
Since the macro evaluates the full 8-byte struct, Clang emits a 64-bit `ldar` instruction. ARM64 architecture strictly requires `ldar` to be naturally aligned, thus executing it on a 4-byte aligned address triggers a strict Alignment Fault (FSC = 0x21). Fix the read side by moving the `READ_ONCE()` directly to the `u32` member, which emits a safe 32-bit `ldar Wn`. Furthermore, Eric Dumazet pointed out that `WRITE_ONCE()` on the entire struct in `proc_fib_multipath_hash_set_seed()` is also flawed. Analysis shows that Clang splits this 8-byte write into two separate 32-bit `str` instructions. While this avoids an alignment fault, it destroys atomicity and exposes a tear-write vulnerability. Fix this by explicitly splitting the write into two 32-bit `WRITE_ONCE()` operations. Finally, add the missing `READ_ONCE()` when reading `user_seed` in `proc_fib_multipath_hash_seed()` to ensure proper pairing and concurrency safety. Fixes: 4ee2a8cace3f ("net: ipv4: Add a sysctl to set multipath hash seed") Signed-off-by: Yung Chih Su Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260302060247.7066-1-yuuchihsu@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index b4495c38e0a0..318593743b6e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -559,7 +559,7 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net, siphash_aligned_key_t hash_key; u32 mp_seed; - mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed; + mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed); fib_multipath_hash_construct_key(&hash_key, mp_seed); return flow_hash_from_keys_seed(keys, &hash_key); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 643763bc2142..5654cc9c8a0b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -486,7 +486,8 @@ static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) proc_fib_multipath_hash_rand_seed), }; - WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new); + WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.user_seed, new.user_seed); + WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed, new.mp_seed); } static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write, @@ -500,7 +501,7 @@ static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write int ret; mphs = &net->ipv4.sysctl_fib_multipath_hash_seed; - user_seed = mphs->user_seed; + user_seed = READ_ONCE(mphs->user_seed); tmp = *table; tmp.data = &user_seed; -- cgit v1.2.3 From 5af6e8b54927f7a8d3c7fd02b1bdc09e93d5c079 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 2 Mar 2026 03:40:46 -0800 Subject: netconsole: fix sysdata_release_enabled_show checking wrong flag sysdata_release_enabled_show() checks SYSDATA_TASKNAME instead of SYSDATA_RELEASE, causing the configfs release_enabled attribute to reflect the taskname feature state rather than the release feature state. This is a copy-paste error from the adjacent sysdata_taskname_enabled_show() function. The corresponding _store function already uses the correct SYSDATA_RELEASE flag. 
Fixes: 343f90227070 ("netconsole: implement configfs for release_enabled") Signed-off-by: Breno Leitao Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260302-sysdata_release_fix-v1-1-e5090f677c7c@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/netconsole.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 2db116fb1a7c..3c9acd6e49e8 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -617,7 +617,7 @@ static ssize_t sysdata_release_enabled_show(struct config_item *item, bool release_enabled; dynamic_netconsole_mutex_lock(); - release_enabled = !!(nt->sysdata_fields & SYSDATA_TASKNAME); + release_enabled = !!(nt->sysdata_fields & SYSDATA_RELEASE); dynamic_netconsole_mutex_unlock(); return sysfs_emit(buf, "%d\n", release_enabled); -- cgit v1.2.3 From e2f27363aa6d983504c6836dd0975535e2e9dba0 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Mon, 2 Mar 2026 09:51:24 +0530 Subject: amd-xgbe: fix sleep while atomic on suspend/resume The xgbe_powerdown() and xgbe_powerup() functions use spinlocks (spin_lock_irqsave) while calling functions that may sleep: - napi_disable() can sleep waiting for NAPI polling to complete - flush_workqueue() can sleep waiting for pending work items This causes a "BUG: scheduling while atomic" error during suspend/resume cycles on systems using the AMD XGBE Ethernet controller. The spinlock protection in these functions is unnecessary as these functions are called from suspend/resume paths which are already serialized by the PM core Fix this by removing the spinlock. Since only code that takes this lock is xgbe_powerdown() and xgbe_powerup(), remove it completely. Fixes: c5aa9e3b8156 ("amd-xgbe: Initial AMD 10GbE platform driver") Signed-off-by: Raju Rangoju Link: https://patch.msgid.link/20260302042124.1386445-1-Raju.Rangoju@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 10 ---------- drivers/net/ethernet/amd/xgbe/xgbe-main.c | 1 - drivers/net/ethernet/amd/xgbe/xgbe.h | 3 --- 3 files changed, 14 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 62bb4b8a68e1..8b79d88480db 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1120,7 +1120,6 @@ int xgbe_powerdown(struct net_device *netdev, unsigned int caller) { struct xgbe_prv_data *pdata = netdev_priv(netdev); struct xgbe_hw_if *hw_if = &pdata->hw_if; - unsigned long flags; DBGPR("-->xgbe_powerdown\n"); @@ -1131,8 +1130,6 @@ int xgbe_powerdown(struct net_device *netdev, unsigned int caller) return -EINVAL; } - spin_lock_irqsave(&pdata->lock, flags); - if (caller == XGMAC_DRIVER_CONTEXT) netif_device_detach(netdev); @@ -1148,8 +1145,6 @@ int xgbe_powerdown(struct net_device *netdev, unsigned int caller) pdata->power_down = 1; - spin_unlock_irqrestore(&pdata->lock, flags); - DBGPR("<--xgbe_powerdown\n"); return 0; @@ -1159,7 +1154,6 @@ int xgbe_powerup(struct net_device *netdev, unsigned int caller) { struct xgbe_prv_data *pdata = netdev_priv(netdev); struct xgbe_hw_if *hw_if = &pdata->hw_if; - unsigned long flags; DBGPR("-->xgbe_powerup\n"); @@ -1170,8 +1164,6 @@ int xgbe_powerup(struct net_device *netdev, unsigned int caller) return -EINVAL; } - spin_lock_irqsave(&pdata->lock, flags); - pdata->power_down = 0; xgbe_napi_enable(pdata, 0); @@ -1186,8 +1178,6 @@ int xgbe_powerup(struct net_device *netdev, unsigned int caller) xgbe_start_timers(pdata); - 
spin_unlock_irqrestore(&pdata->lock, flags); - DBGPR("<--xgbe_powerup\n"); return 0; diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c index d1f0419edb23..7d45ea22a02e 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c @@ -76,7 +76,6 @@ struct xgbe_prv_data *xgbe_alloc_pdata(struct device *dev) pdata->netdev = netdev; pdata->dev = dev; - spin_lock_init(&pdata->lock); spin_lock_init(&pdata->xpcs_lock); mutex_init(&pdata->rss_mutex); spin_lock_init(&pdata->tstamp_lock); diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index 1269b8ce9249..e1d7d7150e16 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -1004,9 +1004,6 @@ struct xgbe_prv_data { unsigned int pp3; unsigned int pp4; - /* Overall device lock */ - spinlock_t lock; - /* XPCS indirect addressing lock */ spinlock_t xpcs_lock; unsigned int xpcs_window_def_reg; -- cgit v1.2.3 From 7f5d8e63f3d4dc952548502a2227de780cbcd21f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 2 Mar 2026 11:50:20 -0800 Subject: MAINTAINERS: update the skge/sky2 maintainers Mark the skge and sky2 drivers as orphan. I no longer have any Marvell/SysKonnect boards to test with and mail to Mirko Lindner bounced because Marvell sold off that division. Signed-off-by: Stephen Hemminger Link: https://patch.msgid.link/20260302195120.187183-1-stephen@networkplumber.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7829ff2180ae..bee85351120e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15372,10 +15372,8 @@ F: drivers/crypto/marvell/ F: include/linux/soc/marvell/octeontx2/ MARVELL GIGABIT ETHERNET DRIVERS (skge/sky2) -M: Mirko Lindner -M: Stephen Hemminger L: netdev@vger.kernel.org -S: Odd fixes +S: Orphan F: drivers/net/ethernet/marvell/sk* MARVELL LIBERTAS WIRELESS DRIVER -- cgit v1.2.3 From 46d0d6f50dab706637f4c18a470aac20a21900d3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 2 Mar 2026 12:34:09 -0800 Subject: net/tcp-md5: Fix MAC comparison to be constant-time To prevent timing attacks, MACs need to be compared in constant time. Use the appropriate helper function for this. Fixes: cfb6eeb4c860 ("[TCP]: MD5 Signature Option (RFC2385) support.") Fixes: 658ddaaf6694 ("tcp: md5: RST: getting md5 key from listener") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20260302203409.13388-1-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/Kconfig | 1 + net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 3 ++- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 3ab6247be585..df922f9f5289 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -762,6 +762,7 @@ config TCP_AO config TCP_MD5SIG bool "TCP: MD5 Signature Option support (RFC2385)" select CRYPTO_LIB_MD5 + select CRYPTO_LIB_UTILS help RFC2385 specifies a method of giving MD5 protection to TCP sessions. Its main (only?)
use is to protect BGP sessions between core routers diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8cdc26e8ad68..202a4e57a218 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -244,6 +244,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include +#include #include #include #include @@ -4970,7 +4971,7 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, tcp_v4_md5_hash_skb(newhash, key, NULL, skb); else tp->af_specific->calc_md5_hash(newhash, key, NULL, skb); - if (memcmp(hash_location, newhash, 16) != 0) { + if (crypto_memneq(hash_location, newhash, 16)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); trace_tcp_hash_md5_mismatch(sk, skb); return SKB_DROP_REASON_TCP_MD5FAILURE; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d53d39be291a..910c25cb24e1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -88,6 +88,7 @@ #include #include +#include #include @@ -839,7 +840,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, goto out; tcp_v4_md5_hash_skb(newhash, key, NULL, skb); - if (memcmp(md5_hash_location, newhash, 16) != 0) + if (crypto_memneq(md5_hash_location, newhash, 16)) goto out; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e46a0efae012..5195a46b951e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -68,6 +68,7 @@ #include #include +#include #include @@ -1048,7 +1049,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, key.type = TCP_KEY_MD5; tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); - if (memcmp(md5_hash_location, newhash, 16) != 0) + if (crypto_memneq(md5_hash_location, newhash, 16)) goto out; } #endif -- cgit v1.2.3 From 6ca8379b5d36e22b04e6315c3e49a6083377c862 Mon Sep 17 00:00:00 2001 From: Shengming Hu Date: Sat, 21 Feb 2026 11:30:07 +0800 Subject: fgraph: Fix thresh_return clear per-task notrace When tracing_thresh is enabled, function graph tracing uses trace_graph_thresh_return() as the return handler. Unlike trace_graph_return(), it did not clear the per-task TRACE_GRAPH_NOTRACE flag set by the entry handler for set_graph_notrace addresses. This could leave the task permanently in "notrace" state and effectively disable function graph tracing for that task. Mirror trace_graph_return()'s per-task notrace handling by clearing TRACE_GRAPH_NOTRACE and returning early when set. 
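As a rough sketch of the intended pairing (illustrative only; the entry-side handling already exists, the return side is what this change mirrors from trace_graph_return()):

        /* entry handler: function matches set_graph_notrace */
        if (ftrace_graph_notrace_addr(trace->func))
                *task_var |= TRACE_GRAPH_NOTRACE;   /* suppress output for this call chain */

        /* return handler: must clear the flag again, or the task stays "notrace" */
        if (*task_var & TRACE_GRAPH_NOTRACE) {
                *task_var &= ~TRACE_GRAPH_NOTRACE;
                return;
        }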
Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260221113007819YgrZsMGABff4Rc-O_fZxL@zte.com.cn Fixes: b84214890a9bc ("function_graph: Move graph notrace bit to shadow stack global var") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Shengming Hu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3d8239fee004..817d0f1696b6 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -400,14 +400,15 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, struct ftrace_regs *fregs) { + unsigned long *task_var = fgraph_get_task_var(gops); struct fgraph_times *ftimes; struct trace_array *tr; int size; ftrace_graph_addr_finish(gops, trace); - if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { - trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); + if (*task_var & TRACE_GRAPH_NOTRACE) { + *task_var &= ~TRACE_GRAPH_NOTRACE; return; } -- cgit v1.2.3 From b96d0c59cdbb2a22b2545f6f3d5c6276b05761dd Mon Sep 17 00:00:00 2001 From: Shengming Hu Date: Sat, 21 Feb 2026 11:33:14 +0800 Subject: fgraph: Fix thresh_return nosleeptime double-adjust trace_graph_thresh_return() called handle_nosleeptime() and then delegated to trace_graph_return(), which calls handle_nosleeptime() again. When sleep-time accounting is disabled this double-adjusts calltime and can produce bogus durations (including underflow). Fix this by computing rettime once, applying handle_nosleeptime() only once, using the adjusted calltime for threshold comparison, and writing the return event directly via __trace_graph_return() when the threshold is met. 
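Schematically, assuming the nosleeptime handling adds the accumulated sleep delta to calltime (numbers purely illustrative):

        calltime = 1000, rettime = 1400, sleep delta = 300
        adjusted once:  duration = 1400 - (1000 + 300) = 100
        adjusted twice: duration = 1400 - (1000 + 600) = -200, which wraps to a huge u64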
Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260221113314048jE4VRwIyZEALiYByGK0My@zte.com.cn Fixes: 3c9880f3ab52b ("ftrace: Use a running sleeptime instead of saving on shadow stack") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Shengming Hu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 817d0f1696b6..0d2d3a2ea7dd 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -403,8 +403,12 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, unsigned long *task_var = fgraph_get_task_var(gops); struct fgraph_times *ftimes; struct trace_array *tr; + unsigned int trace_ctx; + u64 calltime, rettime; int size; + rettime = trace_clock_local(); + ftrace_graph_addr_finish(gops, trace); if (*task_var & TRACE_GRAPH_NOTRACE) { @@ -419,11 +423,13 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, tr = gops->private; handle_nosleeptime(tr, trace, ftimes, size); - if (tracing_thresh && - (trace_clock_local() - ftimes->calltime < tracing_thresh)) + calltime = ftimes->calltime; + + if (tracing_thresh && (rettime - calltime < tracing_thresh)) return; - else - trace_graph_return(trace, gops, fregs); + + trace_ctx = tracing_gen_ctx(); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } static struct fgraph_ops funcgraph_ops = { -- cgit v1.2.3 From 0a663b764dbdf135a126284f454c9f01f95a87d4 Mon Sep 17 00:00:00 2001 From: Huiwen He Date: Tue, 24 Feb 2026 10:35:44 +0800 Subject: tracing: Fix syscall events activation by ensuring refcount hits zero When multiple syscall events are specified in the kernel command line (e.g., trace_event=syscalls:sys_enter_openat,syscalls:sys_enter_close), they are often not captured after boot, even though they appear enabled in the tracing/set_event file. The issue stems from how syscall events are initialized. Syscall tracepoints require the global reference count (sys_tracepoint_refcount) to transition from 0 to 1 to trigger the registration of the syscall work (TIF_SYSCALL_TRACEPOINT) for tasks, including the init process (pid 1). The current implementation of early_enable_events() with disable_first=true used an interleaved sequence of "Disable A -> Enable A -> Disable B -> Enable B". If multiple syscalls are enabled, the refcount never drops to zero, preventing the 0->1 transition that triggers actual registration. Fix this by splitting early_enable_events() into two distinct phases: 1. Disable all events specified in the buffer. 2. Enable all events specified in the buffer. This ensures the refcount hits zero before re-enabling, allowing syscall events to be properly activated during early boot. The code is also refactored to use a helper function to avoid logic duplication between the disable and enable phases. 
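A schematic refcount trace with two syscall events A and B on the command line (assuming both were enabled earlier, so the refcount starts at 2; illustrative only):

        interleaved: 2 -(disable A)-> 1 -(enable A)-> 2 -(disable B)-> 1 -(enable B)-> 2
                     the count never reaches 0, so the 0->1 registration never re-runs
        two-phase:   2 -(disable A)-> 1 -(disable B)-> 0 -(enable A)-> 1 -(enable B)-> 2
                     the 0->1 transition re-registers the syscall trace work for
                     existing tasks, including pid 1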
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20260224023544.1250787-1-hehuiwen@kylinos.cn Fixes: ce1039bd3a89 ("tracing: Fix enabling of syscall events on the command line") Signed-off-by: Huiwen He Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 52 ++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9928da636c9d..9c7f26cbe171 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -4668,26 +4668,22 @@ static __init int event_trace_memsetup(void) return 0; } -__init void -early_enable_events(struct trace_array *tr, char *buf, bool disable_first) +/* + * Helper function to enable or disable a comma-separated list of events + * from the bootup buffer. + */ +static __init void __early_set_events(struct trace_array *tr, char *buf, bool enable) { char *token; - int ret; - - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; + while ((token = strsep(&buf, ","))) { if (*token) { - /* Restarting syscalls requires that we stop them first */ - if (disable_first) + if (enable) { + if (ftrace_set_clr_event(tr, token, 1)) + pr_warn("Failed to enable trace event: %s\n", token); + } else { ftrace_set_clr_event(tr, token, 0); - - ret = ftrace_set_clr_event(tr, token, 1); - if (ret) - pr_warn("Failed to enable trace event: %s\n", token); + } } /* Put back the comma to allow this to be called again */ @@ -4696,6 +4692,32 @@ early_enable_events(struct trace_array *tr, char *buf, bool disable_first) } } +/** + * early_enable_events - enable events from the bootup buffer + * @tr: The trace array to enable the events in + * @buf: The buffer containing the comma separated list of events + * @disable_first: If true, disable all events in @buf before enabling them + * + * This function enables events from the bootup buffer. If @disable_first + * is true, it will first disable all events in the buffer before enabling + * them. + * + * For syscall events, which rely on a global refcount to register the + * SYSCALL_WORK_SYSCALL_TRACEPOINT flag (especially for pid 1), we must + * ensure the refcount hits zero before re-enabling them. A simple + * "disable then enable" per-event is not enough if multiple syscalls are + * used, as the refcount will stay above zero. Thus, we need a two-phase + * approach: disable all, then enable all. + */ +__init void +early_enable_events(struct trace_array *tr, char *buf, bool disable_first) +{ + if (disable_first) + __early_set_events(tr, buf, false); + + __early_set_events(tr, buf, true); +} + static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); -- cgit v1.2.3 From cc337974cd1084f9821179eb66f4e470d9fd2ed8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2026 21:35:46 -0500 Subject: ftrace: Disable preemption in the tracepoint callbacks handling filtered pids When function trace PID filtering is enabled, the function tracer will attach a callback to the fork tracepoint as well as the exit tracepoint that will add the forked child PID to the PID filtering list as well as remove the PID that is exiting. Commit a46023d5616e ("tracing: Guard __DECLARE_TRACE() use of __DO_TRACE_CALL() with SRCU-fast") removed the disabling of preemption when calling tracepoint callbacks. 
The callbacks used for the PID filtering accounting depended on preemption being disabled, and now the trigger a "suspicious RCU usage" warning message. Make them explicitly disable preemption. Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20260302213546.156e3e4f@gandalf.local.home Fixes: a46023d5616e ("tracing: Guard __DECLARE_TRACE() use of __DO_TRACE_CALL() with SRCU-fast") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 827fb9a0bf0d..2f72af0357e5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -8611,6 +8611,7 @@ ftrace_pid_follow_sched_process_fork(void *data, struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_sched(tr->function_pids); trace_filter_add_remove_task(pid_list, self, task); @@ -8624,6 +8625,7 @@ ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task) struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_sched(tr->function_pids); trace_filter_add_remove_task(pid_list, NULL, task); -- cgit v1.2.3 From a5dd6f58666f22ae16b98a2177bebc3340d38fe9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 3 Mar 2026 21:57:38 -0500 Subject: tracing: Disable preemption in the tracepoint callbacks handling filtered pids Filtering PIDs for events triggered the following during selftests: [37] event tracing - restricts events based on pid notrace filtering [ 155.874095] [ 155.874869] ============================= [ 155.876037] WARNING: suspicious RCU usage [ 155.877287] 7.0.0-rc1-00004-g8cd473a19bc7 #7 Not tainted [ 155.879263] ----------------------------- [ 155.882839] kernel/trace/trace_events.c:1057 suspicious rcu_dereference_check() usage! [ 155.889281] [ 155.889281] other info that might help us debug this: [ 155.889281] [ 155.894519] [ 155.894519] rcu_scheduler_active = 2, debug_locks = 1 [ 155.898068] no locks held by ftracetest/4364. [ 155.900524] [ 155.900524] stack backtrace: [ 155.902645] CPU: 1 UID: 0 PID: 4364 Comm: ftracetest Not tainted 7.0.0-rc1-00004-g8cd473a19bc7 #7 PREEMPT(lazy) [ 155.902648] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014 [ 155.902651] Call Trace: [ 155.902655] [ 155.902659] dump_stack_lvl+0x67/0x90 [ 155.902665] lockdep_rcu_suspicious+0x154/0x1a0 [ 155.902672] event_filter_pid_sched_process_fork+0x9a/0xd0 [ 155.902678] kernel_clone+0x367/0x3a0 [ 155.902689] __x64_sys_clone+0x116/0x140 [ 155.902696] do_syscall_64+0x158/0x460 [ 155.902700] ? entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 155.902702] ? 
trace_irq_disable+0x1d/0xc0 [ 155.902709] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 155.902711] RIP: 0033:0x4697c3 [ 155.902716] Code: 1f 84 00 00 00 00 00 64 48 8b 04 25 10 00 00 00 45 31 c0 31 d2 31 f6 bf 11 00 20 01 4c 8d 90 d0 02 00 00 b8 38 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 35 89 c2 85 c0 75 2c 64 48 8b 04 25 10 00 00 [ 155.902718] RSP: 002b:00007ffc41150428 EFLAGS: 00000246 ORIG_RAX: 0000000000000038 [ 155.902721] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004697c3 [ 155.902722] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011 [ 155.902724] RBP: 0000000000000000 R08: 0000000000000000 R09: 000000003fccf990 [ 155.902725] R10: 000000003fccd690 R11: 0000000000000246 R12: 0000000000000001 [ 155.902726] R13: 000000003fce8103 R14: 0000000000000001 R15: 0000000000000000 [ 155.902733] [ 155.902747] The tracepoint callbacks recently were changed to allow preemption. The event PID filtering callbacks that were attached to the fork and exit tracepoints expected preemption disabled in order to access the RCU protected PID lists. Add a guard(preempt)() to protect the references to the PID list. Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20260303215738.6ab275af@fedora Fixes: a46023d5616e ("tracing: Guard __DECLARE_TRACE() use of __DO_TRACE_CALL() with SRCU-fast") Link: https://patch.msgid.link/20260303131706.96057f61a48a34c43ce1e396@kernel.org Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9c7f26cbe171..b7343fdfd7b0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1039,6 +1039,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); @@ -1054,6 +1055,7 @@ event_filter_pid_sched_process_fork(void *data, struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_sched(tr->filtered_pids); trace_filter_add_remove_task(pid_list, self, task); -- cgit v1.2.3 From e39bb9e02b68942f8e9359d2a3efe7d37ae6be0e Mon Sep 17 00:00:00 2001 From: Qing Wang Date: Fri, 27 Feb 2026 10:58:42 +0800 Subject: tracing: Fix WARN_ON in tracing_buffers_mmap_close When a process forks, the child process copies the parent's VMAs but the user_mapped reference count is not incremented. As a result, when both the parent and child processes exit, tracing_buffers_mmap_close() is called twice. On the second call, user_mapped is already 0, causing the function to return -ENODEV and triggering a WARN_ON. Normally, this isn't an issue as the memory is mapped with VM_DONTCOPY set. But this is only a hint, and the application can call madvise(MADVISE_DOFORK) which resets the VM_DONTCOPY flag. When the application does that, it can trigger this issue on fork. Fix it by incrementing the user_mapped reference count without re-mapping the pages in the VMA's open callback. 
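A sketch of the user-space sequence that can trigger this (assumes the usual tracefs mount point, and that len was derived from the mapping's meta page; error handling omitted):

        #include <fcntl.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw", O_RDONLY);
        void *map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);

        madvise(map, len, MADV_DOFORK);   /* clears VM_DONTCOPY, so fork() copies the VMA */
        if (fork() == 0)
                _exit(0);                 /* child's copy of the VMA adds an extra ->close() */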
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Cc: Lorenzo Stoakes Link: https://patch.msgid.link/20260227025842.1085206-1-wangqing7171@gmail.com Fixes: cf9f0f7c4c5bb ("tracing: Allow user-space mapping of the ring-buffer") Reported-by: syzbot+3b5dd2030fe08afdf65d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=3b5dd2030fe08afdf65d Tested-by: syzbot+3b5dd2030fe08afdf65d@syzkaller.appspotmail.com Signed-off-by: Qing Wang Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 21 +++++++++++++++++++++ kernel/trace/trace.c | 13 +++++++++++++ 3 files changed, 35 insertions(+) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 876358cfe1b1..d862fa610270 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -248,6 +248,7 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma); +void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu); int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f16f053ef77d..17d0ea0cc3e6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7310,6 +7310,27 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, return err; } +/* + * This is called when a VMA is duplicated (e.g., on fork()) to increment + * the user_mapped counter without remapping pages. + */ +void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask))) + return; + + cpu_buffer = buffer->buffers[cpu]; + + guard(mutex)(&cpu_buffer->mapping_lock); + + if (cpu_buffer->user_mapped) + __rb_inc_dec_mapped(cpu_buffer, true); + else + WARN(1, "Unexpected buffer stat, it should be mapped"); +} + int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 23de3719f495..1e7c032a72d2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8213,6 +8213,18 @@ static inline int get_snapshot_map(struct trace_array *tr) { return 0; } static inline void put_snapshot_map(struct trace_array *tr) { } #endif +/* + * This is called when a VMA is duplicated (e.g., on fork()) to increment + * the user_mapped counter without remapping pages. 
+ */ +static void tracing_buffers_mmap_open(struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = vma->vm_file->private_data; + struct trace_iterator *iter = &info->iter; + + ring_buffer_map_dup(iter->array_buffer->buffer, iter->cpu_file); +} + static void tracing_buffers_mmap_close(struct vm_area_struct *vma) { struct ftrace_buffer_info *info = vma->vm_file->private_data; @@ -8232,6 +8244,7 @@ static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long a } static const struct vm_operations_struct tracing_buffers_vmops = { + .open = tracing_buffers_mmap_open, .close = tracing_buffers_mmap_close, .may_split = tracing_buffers_may_split, }; -- cgit v1.2.3 From fbdfa8da05b6ae44114fc4f9b3e83e1736fd411c Mon Sep 17 00:00:00 2001 From: Naveen Anandhan Date: Sat, 28 Feb 2026 13:17:35 +0530 Subject: selftests: tc-testing: fix list_categories() crash on list type list_categories() builds a set directly from the 'category' field of each test case. Since 'category' is a list, set(map(...)) attempts to insert lists into a set, which raises: TypeError: unhashable type: 'list' Flatten category lists and collect unique category names using set.update() instead. Signed-off-by: Naveen Anandhan Signed-off-by: David S. Miller --- tools/testing/selftests/tc-testing/tdc_helper.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tdc_helper.py b/tools/testing/selftests/tc-testing/tdc_helper.py index 0440d252c4c5..bc447ca57333 100644 --- a/tools/testing/selftests/tc-testing/tdc_helper.py +++ b/tools/testing/selftests/tc-testing/tdc_helper.py @@ -38,10 +38,14 @@ def list_test_cases(testlist): def list_categories(testlist): - """ Show all categories that are present in a test case file. """ - categories = set(map(lambda x: x['category'], testlist)) + """Show all unique categories present in the test cases.""" + categories = set() + for t in testlist: + if 'category' in t: + categories.update(t['category']) + print("Available categories:") - print(", ".join(str(s) for s in categories)) + print(", ".join(sorted(categories))) print("") -- cgit v1.2.3 From 6932256d3a3764f3a5e06e2cb8603be45b6a9fef Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Wed, 25 Feb 2026 15:37:49 -0800 Subject: time/jiffies: Fix sysctl file error on configurations where USER_HZ < HZ Commit 2dc164a48e6fd ("sysctl: Create converter functions with two new macros") incorrectly returns error to user space when jiffies sysctl converter is used. The old overflow check got replaced with an unconditional one: + if (USER_HZ < HZ) + return -EINVAL; which will always be true on configurations with "USER_HZ < HZ". 
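For example, on a common configuration with HZ=1000 and USER_HZ=100, writing 250 to such a sysctl should convert to clock_t_to_jiffies(250) = 2500 jiffies, far below INT_MAX, yet the unconditional check rejected the write with -EINVAL before the conversion was ever attempted.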
Remove the check; it is no longer needed as clock_t_to_jiffies() returns ULONG_MAX for the overflow case and proc_int_u2k_conv_uop() checks for "> INT_MAX" after conversion Fixes: 2dc164a48e6fd ("sysctl: Create converter functions with two new macros") Reported-by: Colm Harrington Signed-off-by: Gerd Rausch Signed-off-by: Joel Granados --- kernel/time/jiffies.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a5c7d15fce72..9daf8c5d9687 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -256,8 +256,6 @@ EXPORT_SYMBOL(proc_dointvec_jiffies); int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ) - return -EINVAL; return proc_dointvec_conv(table, dir, buffer, lenp, ppos, do_proc_int_conv_userhz_jiffies); } -- cgit v1.2.3 From debc1a492b2695d05973994fb0f796dbd9ceaae6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 3 Mar 2026 15:34:20 -0800 Subject: iomap: don't mark folio uptodate if read IO has bytes pending If a folio has ifs metadata attached to it and the folio is partially read in through an async IO helper with the rest of it then being read in through post-EOF zeroing or as inline data, and the helper successfully finishes the read first, then post-EOF zeroing / reading inline will mark the folio as uptodate in iomap_set_range_uptodate(). This is a problem because when the read completion path later calls iomap_read_end(), it will call folio_end_read(), which sets the uptodate bit using XOR semantics. Calling folio_end_read() on a folio that was already marked uptodate clears the uptodate bit. Fix this by not marking the folio as uptodate if the read IO has bytes pending. The folio uptodate state will be set in the read completion path through iomap_end_read() -> folio_end_read(). Reported-by: Wei Gao Suggested-by: Sasha Levin Tested-by: Wei Gao Reviewed-by: Darrick J. Wong Cc: stable@vger.kernel.org # v6.19 Link: https://lore.kernel.org/linux-fsdevel/aYbmy8JdgXwsGaPP@autotest-wegao.qe.prg2.suse.org/ Fixes: b2f35ac4146d ("iomap: add caller-provided callbacks for read and readahead") Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20260303233420.874231-2-joannelkoong@gmail.com Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bc82083e420a..00f0efaf12b2 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -80,18 +80,27 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off, { struct iomap_folio_state *ifs = folio->private; unsigned long flags; - bool uptodate = true; + bool mark_uptodate = true; if (folio_test_uptodate(folio)) return; if (ifs) { spin_lock_irqsave(&ifs->state_lock, flags); - uptodate = ifs_set_range_uptodate(folio, ifs, off, len); + /* + * If a read with bytes pending is in progress, we must not call + * folio_mark_uptodate(). The read completion path + * (iomap_read_end()) will call folio_end_read(), which uses XOR + * semantics to set the uptodate bit. If we set it here, the XOR + * in folio_end_read() will clear it, leaving the folio not + * uptodate. 
+ */ + mark_uptodate = ifs_set_range_uptodate(folio, ifs, off, len) && + !ifs->read_bytes_pending; spin_unlock_irqrestore(&ifs->state_lock, flags); } - if (uptodate) + if (mark_uptodate) folio_mark_uptodate(folio); } -- cgit v1.2.3 From d320f160aa5ff36cdf83c645cca52b615e866e32 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Mar 2026 09:30:02 -0800 Subject: iomap: reject delalloc mappings during writeback Filesystems should never provide a delayed allocation mapping to writeback; they're supposed to allocate the space before replying. This can lead to weird IO errors and crashes in the block layer if the filesystem is being malicious, or if it hadn't set iomap->dev because it's a delalloc mapping. Fix this by failing writeback on delalloc mappings. Currently no filesystems actually misbehave in this manner, but we ought to be stricter about things like that. Cc: stable@vger.kernel.org # v5.5 Fixes: 598ecfbaa742ac ("iomap: lift the xfs writeback code to iomap") Signed-off-by: Darrick J. Wong Link: https://patch.msgid.link/20260302173002.GL13829@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Christian Brauner --- fs/iomap/ioend.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 4d1ef8a2cee9..60546fa14dfe 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -215,17 +215,18 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, WARN_ON_ONCE(!folio->private && map_len < dirty_len); switch (wpc->iomap.type) { - case IOMAP_INLINE: - WARN_ON_ONCE(1); - return -EIO; + case IOMAP_UNWRITTEN: + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + break; + case IOMAP_MAPPED: + break; case IOMAP_HOLE: return map_len; default: - break; + WARN_ON_ONCE(1); + return -EIO; } - if (wpc->iomap.type == IOMAP_UNWRITTEN) - ioend_flags |= IOMAP_IOEND_UNWRITTEN; if (wpc->iomap.flags & IOMAP_F_SHARED) ioend_flags |= IOMAP_IOEND_SHARED; if (folio_test_dropbehind(folio)) -- cgit v1.2.3 From 126fe7ef12ffe42fdc600fe22df733e96fa418ec Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Mon, 2 Mar 2026 18:42:01 -0800 Subject: mailmap: Add entry for Joe Damato My Fastly email address is no longer used. Add a mailmap entry to reflect that. Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260303024202.2526604-1-joe@dama.to Signed-off-by: Jakub Kicinski --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index e1cf6bb85d33..a02e9aa9aed4 100644 --- a/.mailmap +++ b/.mailmap @@ -396,6 +396,7 @@ Jiri Slaby Jisheng Zhang Jisheng Zhang Jishnu Prakash +Joe Damato Joel Granados Johan Hovold Johan Hovold -- cgit v1.2.3 From 18b43bec5437914e3b51bb08dc6c6e7b0a2df4d1 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Tue, 3 Mar 2026 11:37:20 +0800 Subject: mailmap: reflect my gmail as default Use my gmail instead so that I can easily handle those emails that CC me. 
Signed-off-by: Jason Xing Link: https://patch.msgid.link/20260303033720.84108-1-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index a02e9aa9aed4..7fefbcc82d1a 100644 --- a/.mailmap +++ b/.mailmap @@ -348,6 +348,7 @@ Jarkko Sakkinen Jason Gunthorpe Jason Gunthorpe Jason Gunthorpe +Jason Xing Javi Merino Jayachandran C -- cgit v1.2.3 From 7f083faf59d14c04e01ec05a7507f036c965acf8 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Sat, 28 Feb 2026 23:53:07 +0900 Subject: net: sched: avoid qdisc_reset_all_tx_gt() vs dequeue race for lockless qdiscs When shrinking the number of real tx queues, netif_set_real_num_tx_queues() calls qdisc_reset_all_tx_gt() to flush qdiscs for queues which will no longer be used. qdisc_reset_all_tx_gt() currently serializes qdisc_reset() with qdisc_lock(). However, for lockless qdiscs, the dequeue path is serialized by qdisc_run_begin/end() using qdisc->seqlock instead, so qdisc_reset() can run concurrently with __qdisc_run() and free skbs while they are still being dequeued, leading to UAF. This can easily be reproduced on e.g. virtio-net by imposing heavy traffic while frequently changing the number of queue pairs: iperf3 -ub0 -c $peer -t 0 & while :; do ethtool -L eth0 combined 1 ethtool -L eth0 combined 2 done With KASAN enabled, this leads to reports like: BUG: KASAN: slab-use-after-free in __qdisc_run+0x133f/0x1760 ... Call Trace: ... __qdisc_run+0x133f/0x1760 __dev_queue_xmit+0x248f/0x3550 ip_finish_output2+0xa42/0x2110 ip_output+0x1a7/0x410 ip_send_skb+0x2e6/0x480 udp_send_skb+0xb0a/0x1590 udp_sendmsg+0x13c9/0x1fc0 ... Allocated by task 1270 on cpu 5 at 44.558414s: ... alloc_skb_with_frags+0x84/0x7c0 sock_alloc_send_pskb+0x69a/0x830 __ip_append_data+0x1b86/0x48c0 ip_make_skb+0x1e8/0x2b0 udp_sendmsg+0x13a6/0x1fc0 ... Freed by task 1306 on cpu 3 at 44.558445s: ... kmem_cache_free+0x117/0x5e0 pfifo_fast_reset+0x14d/0x580 qdisc_reset+0x9e/0x5f0 netif_set_real_num_tx_queues+0x303/0x840 virtnet_set_channels+0x1bf/0x260 [virtio_net] ethnl_set_channels+0x684/0xae0 ethnl_default_set_doit+0x31a/0x890 ... Serialize qdisc_reset_all_tx_gt() against the lockless dequeue path by taking qdisc->seqlock for TCQ_F_NOLOCK qdiscs, matching the serialization model already used by dev_reset_queue(). Additionally clear QDISC_STATE_NON_EMPTY after reset so the qdisc state reflects an empty queue, avoiding needless re-scheduling. 
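For reference, a rough sketch of the serialization already used by the lockless dequeue path (simplified; the real helpers carry more state handling):

        if (qdisc_run_begin(q)) {       /* spin_trylock(&q->seqlock) for TCQ_F_NOLOCK qdiscs */
                __qdisc_run(q);         /* dequeues skbs */
                qdisc_run_end(q);
        }

qdisc_reset() must therefore be serialized against this via q->seqlock as well, as done below.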
Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking") Signed-off-by: Koichiro Den Link: https://patch.msgid.link/20260228145307.3955532-1-den@valinux.co.jp Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c3a7268b567e..d5d55cb21686 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -778,13 +778,23 @@ static inline bool skb_skip_tc_classify(struct sk_buff *skb) static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) { struct Qdisc *qdisc; + bool nolock; for (; i < dev->num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); if (qdisc) { + nolock = qdisc->flags & TCQ_F_NOLOCK; + + if (nolock) + spin_lock_bh(&qdisc->seqlock); spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); + if (nolock) { + clear_bit(__QDISC_STATE_MISSED, &qdisc->state); + clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); + spin_unlock_bh(&qdisc->seqlock); + } } } } -- cgit v1.2.3 From 165573e41f2f66ef98940cf65f838b2cb575d9d1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Mar 2026 20:55:27 +0000 Subject: tcp: secure_seq: add back ports to TS offset This reverts 28ee1b746f49 ("secure_seq: downgrade to per-host timestamp offsets") tcp_tw_recycle went away in 2017. Zhouyan Deng reported off-path TCP source port leakage via SYN cookie side-channel that can be fixed in multiple ways. One of them is to bring back TCP ports in TS offset randomization. As a bonus, we perform a single siphash() computation to provide both an ISN and a TS offset. Fixes: 28ee1b746f49 ("secure_seq: downgrade to per-host timestamp offsets") Reported-by: Zhouyan Deng Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Acked-by: Florian Westphal Link: https://patch.msgid.link/20260302205527.1982836-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/secure_seq.h | 45 ++++++++++++++++++++++----- include/net/tcp.h | 6 ++-- net/core/secure_seq.c | 80 ++++++++++++++++++------------------------------ net/ipv4/syncookies.c | 11 +++++-- net/ipv4/tcp_input.c | 8 +++-- net/ipv4/tcp_ipv4.c | 37 ++++++++++------------ net/ipv6/syncookies.c | 11 +++++-- net/ipv6/tcp_ipv6.c | 37 ++++++++++------------ 8 files changed, 127 insertions(+), 108 deletions(-) diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index cddebafb9f77..6f996229167b 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -5,16 +5,47 @@ #include struct net; +extern struct net init_net; + +union tcp_seq_and_ts_off { + struct { + u32 seq; + u32 ts_off; + }; + u64 hash64; +}; u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport); -u32 secure_tcp_seq(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr); -u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport); -u32 secure_tcpv6_ts_off(const struct net *net, - const __be32 *saddr, const __be32 *daddr); +union tcp_seq_and_ts_off +secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr, + __be16 sport, __be16 dport); + +static inline u32 secure_tcp_seq(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + union tcp_seq_and_ts_off ts; + + ts = secure_tcp_seq_and_ts_off(&init_net, saddr, 
daddr, + sport, dport); + + return ts.seq; +} + +union tcp_seq_and_ts_off +secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr, + const __be32 *daddr, + __be16 sport, __be16 dport); + +static inline u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport) +{ + union tcp_seq_and_ts_off ts; + + ts = secure_tcpv6_seq_and_ts_off(&init_net, saddr, daddr, + sport, dport); + return ts.seq; +} #endif /* _NET_SECURE_SEQ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index eb8bf63fdafc..978eea2d5df0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -2464,8 +2465,9 @@ struct tcp_request_sock_ops { struct flowi *fl, struct request_sock *req, u32 tw_isn); - u32 (*init_seq)(const struct sk_buff *skb); - u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); + union tcp_seq_and_ts_off (*init_seq_and_ts_off)( + const struct net *net, + const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 9a3965680451..6a6f2cda5aae 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -20,7 +20,6 @@ #include static siphash_aligned_key_t net_secret; -static siphash_aligned_key_t ts_secret; #define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ) @@ -28,11 +27,6 @@ static __always_inline void net_secret_init(void) { net_get_random_once(&net_secret, sizeof(net_secret)); } - -static __always_inline void ts_secret_init(void) -{ - net_get_random_once(&ts_secret, sizeof(ts_secret)); -} #endif #ifdef CONFIG_INET @@ -53,28 +47,9 @@ static u32 seq_scale(u32 seq) #endif #if IS_ENABLED(CONFIG_IPV6) -u32 secure_tcpv6_ts_off(const struct net *net, - const __be32 *saddr, const __be32 *daddr) -{ - const struct { - struct in6_addr saddr; - struct in6_addr daddr; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .daddr = *(struct in6_addr *)daddr, - }; - - if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) - return 0; - - ts_secret_init(); - return siphash(&combined, offsetofend(typeof(combined), daddr), - &ts_secret); -} -EXPORT_IPV6_MOD(secure_tcpv6_ts_off); - -u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport) +union tcp_seq_and_ts_off +secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr, + const __be32 *daddr, __be16 sport, __be16 dport) { const struct { struct in6_addr saddr; @@ -87,14 +62,20 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, .sport = sport, .dport = dport }; - u32 hash; + union tcp_seq_and_ts_off st; net_secret_init(); - hash = siphash(&combined, offsetofend(typeof(combined), dport), - &net_secret); - return seq_scale(hash); + + st.hash64 = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); + + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) + st.ts_off = 0; + + st.seq = seq_scale(st.seq); + return st; } -EXPORT_SYMBOL(secure_tcpv6_seq); +EXPORT_SYMBOL(secure_tcpv6_seq_and_ts_off); u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) @@ -118,33 +99,30 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET -u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr) -{ - if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) - return 0; - - ts_secret_init(); - return 
siphash_2u32((__force u32)saddr, (__force u32)daddr, - &ts_secret); -} - /* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), * but fortunately, `sport' cannot be 0 in any circumstances. If this changes, * it would be easy enough to have the former function use siphash_4u32, passing * the arguments as separate u32. */ -u32 secure_tcp_seq(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) +union tcp_seq_and_ts_off +secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) { - u32 hash; + u32 ports = (__force u32)sport << 16 | (__force u32)dport; + union tcp_seq_and_ts_off st; net_secret_init(); - hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u32)sport << 16 | (__force u32)dport, - &net_secret); - return seq_scale(hash); + + st.hash64 = siphash_3u32((__force u32)saddr, (__force u32)daddr, + ports, &net_secret); + + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) + st.ts_off = 0; + + st.seq = seq_scale(st.seq); + return st; } -EXPORT_SYMBOL_GPL(secure_tcp_seq); +EXPORT_SYMBOL_GPL(secure_tcp_seq_and_ts_off); u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 061751aabc8e..fc3affd9c801 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -378,9 +378,14 @@ static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcp_ts_off(net, - ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr); + union tcp_seq_and_ts_off st; + + st = secure_tcp_seq_and_ts_off(net, + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); + tsoff = st.ts_off; tcp_opt.rcv_tsecr -= tsoff; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7b03f2460751..cba89733d121 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7646,6 +7646,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; + union tcp_seq_and_ts_off st; struct request_sock *req; bool want_cookie = false; struct dst_entry *dst; @@ -7715,9 +7716,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!dst) goto drop_and_free; + if (tmp_opt.tstamp_ok || (!want_cookie && !isn)) + st = af_ops->init_seq_and_ts_off(net, skb); + if (tmp_opt.tstamp_ok) { tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); - tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); + tcp_rsk(req)->ts_off = st.ts_off; } if (!want_cookie && !isn) { int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); @@ -7739,7 +7743,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_release; } - isn = af_ops->init_seq(skb); + isn = st.seq; } tcp_ecn_create_request(req, skb, sk, dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 910c25cb24e1..c7b2463c2e25 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -105,17 +105,14 @@ static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { static DEFINE_MUTEX(tcp_exit_batch_mutex); -static u32 tcp_v4_init_seq(const struct sk_buff *skb) +static union tcp_seq_and_ts_off +tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb) { - return secure_tcp_seq(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - tcp_hdr(skb)->dest, - tcp_hdr(skb)->source); -} - -static u32 tcp_v4_init_ts_off(const 
struct net *net, const struct sk_buff *skb) -{ - return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); + return secure_tcp_seq_and_ts_off(net, + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -327,15 +324,16 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len rt = NULL; if (likely(!tp->repair)) { + union tcp_seq_and_ts_off st; + + st = secure_tcp_seq_and_ts_off(net, + inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port); if (!tp->write_seq) - WRITE_ONCE(tp->write_seq, - secure_tcp_seq(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port)); - WRITE_ONCE(tp->tsoffset, - secure_tcp_ts_off(net, inet->inet_saddr, - inet->inet_daddr)); + WRITE_ONCE(tp->write_seq, st.seq); + WRITE_ONCE(tp->tsoffset, st.ts_off); } atomic_set(&inet->inet_id, get_random_u16()); @@ -1677,8 +1675,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .cookie_init_seq = cookie_v4_init_sequence, #endif .route_req = tcp_v4_route_req, - .init_seq = tcp_v4_init_seq, - .init_ts_off = tcp_v4_init_ts_off, + .init_seq_and_ts_off = tcp_v4_init_seq_and_ts_off, .send_synack = tcp_v4_send_synack, }; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7e007f013ec8..4f6f0d751d6c 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -151,9 +151,14 @@ static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcpv6_ts_off(net, - ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32); + union tcp_seq_and_ts_off st; + + st = secure_tcpv6_seq_and_ts_off(net, + ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); + tsoff = st.ts_off; tcp_opt.rcv_tsecr -= tsoff; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5195a46b951e..bb09d5ccf599 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -105,18 +105,14 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) } } -static u32 tcp_v6_init_seq(const struct sk_buff *skb) +static union tcp_seq_and_ts_off +tcp_v6_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb) { - return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32, - tcp_hdr(skb)->dest, - tcp_hdr(skb)->source); -} - -static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) -{ - return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32); + return secure_tcpv6_seq_and_ts_off(net, + ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); } static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, @@ -320,14 +316,16 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, sk_set_txhash(sk); if (likely(!tp->repair)) { + union tcp_seq_and_ts_off st; + + st = secure_tcpv6_seq_and_ts_off(net, + np->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport); if (!tp->write_seq) - WRITE_ONCE(tp->write_seq, - secure_tcpv6_seq(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport)); - tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32); + 
WRITE_ONCE(tp->write_seq, st.seq); + tp->tsoffset = st.ts_off; } if (tcp_fastopen_defer_connect(sk, &err)) @@ -817,8 +815,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = tcp_v6_route_req, - .init_seq = tcp_v6_init_seq, - .init_ts_off = tcp_v6_init_ts_off, + .init_seq_and_ts_off = tcp_v6_init_seq_and_ts_off, .send_synack = tcp_v6_send_synack, }; -- cgit v1.2.3 From f7d92f11bd33a6eb49c7c812255ef4ab13681f0f Mon Sep 17 00:00:00 2001 From: Ian Ray Date: Mon, 2 Mar 2026 18:32:37 +0200 Subject: net: nfc: nci: Fix zero-length proprietary notifications NCI NFC controllers may have proprietary OIDs with zero-length payload. One example is: drivers/nfc/nxp-nci/core.c, NXP_NCI_RF_TXLDO_ERROR_NTF. Allow a zero length payload in proprietary notifications *only*. Before: -- >8 -- kernel: nci: nci_recv_frame: len 3 -- >8 -- After: -- >8 -- kernel: nci: nci_recv_frame: len 3 kernel: nci: nci_ntf_packet: NCI RX: MT=ntf, PBF=0, GID=0x1, OID=0x23, plen=0 kernel: nci: nci_ntf_packet: unknown ntf opcode 0x123 kernel: nfc nfc0: NFC: RF transmitter couldn't start. Bad power and/or configuration? -- >8 -- After fixing the hardware: -- >8 -- kernel: nci: nci_recv_frame: len 27 kernel: nci: nci_ntf_packet: NCI RX: MT=ntf, PBF=0, GID=0x1, OID=0x5, plen=24 kernel: nci: nci_rf_intf_activated_ntf_packet: rf_discovery_id 1 -- >8 -- Fixes: d24b03535e5e ("nfc: nci: Fix uninit-value in nci_dev_up and nci_ntf_packet") Signed-off-by: Ian Ray Link: https://patch.msgid.link/20260302163238.140576-1-ian.ray@gehealthcare.com Signed-off-by: Jakub Kicinski --- net/nfc/nci/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 6e9b76e2cc56..e95eef85971f 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1482,10 +1482,20 @@ static bool nci_valid_size(struct sk_buff *skb) unsigned int hdr_size = NCI_CTRL_HDR_SIZE; if (skb->len < hdr_size || - !nci_plen(skb->data) || skb->len < hdr_size + nci_plen(skb->data)) { return false; } + + if (!nci_plen(skb->data)) { + /* Allow zero length in proprietary notifications (0x20 - 0x3F). */ + if (nci_opcode_oid(nci_opcode(skb->data)) >= 0x20 && + nci_mt(skb->data) == NCI_MT_NTF_PKT) + return true; + + /* Disallow zero length otherwise. */ + return false; + } + return true; } -- cgit v1.2.3 From a4c2b8be2e5329e7fac6e8f64ddcb8958155cfcb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Mar 2026 01:56:40 +0000 Subject: net_sched: sch_fq: clear q->band_pkt_count[] in fq_reset() When/if a NIC resets, queues are deactivated by dev_deactivate_many(), then reactivated when the reset operation completes. fq_reset() removes all the skbs from various queues. If we do not clear q->band_pkt_count[], these counters keep growing and can eventually reach sch->limit, preventing new packets to be queued. Many thanks to Praveen for discovering the root cause. 
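The failure mode reduces to a counter that survives the reset while the packets it counts do not. A minimal sketch, assuming a simplified qdisc (fake_fq, fake_enqueue and fake_reset are made-up names, not the sch_fq code):

struct fake_fq {
    unsigned int band_pkt_count[3];
    unsigned int limit;
};

static int fake_enqueue(struct fake_fq *q, int band)
{
    if (q->band_pkt_count[band] >= q->limit)
        return -1;                      /* drop: band looks full */
    q->band_pkt_count[band]++;          /* counted on enqueue */
    return 0;
}

static void fake_reset(struct fake_fq *q)
{
    int band;

    /* the queued packets are freed here in the real code; unless the
     * counters are zeroed as well, each deactivate/reactivate cycle
     * leaves them inflated until fake_enqueue() drops everything
     */
    for (band = 0; band < 3; band++)
        q->band_pkt_count[band] = 0;
}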
Fixes: 29f834aa326e ("net_sched: sch_fq: add 3 bands and WRR scheduling") Diagnosed-by: Praveen Kaligineedi Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260304015640.961780-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 80235e85f844..05084c9af48e 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -827,6 +827,7 @@ static void fq_reset(struct Qdisc *sch) for (idx = 0; idx < FQ_BANDS; idx++) { q->band_flows[idx].new_flows.first = NULL; q->band_flows[idx].old_flows.first = NULL; + q->band_pkt_count[idx] = 0; } q->delayed = RB_ROOT; q->flows = 0; -- cgit v1.2.3 From 40bf00ec2ee271df5ba67593991760adf8b5d0ed Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Mon, 2 Mar 2026 16:32:56 -0800 Subject: net: devmem: use READ_ONCE/WRITE_ONCE on binding->dev binding->dev is protected on the write-side in mp_dmabuf_devmem_uninstall() against concurrent writes, but due to the concurrent bare reads in net_devmem_get_binding() and validate_xmit_unreadable_skb() it should be wrapped in a READ_ONCE/WRITE_ONCE pair to make sure no compiler optimizations play with the underlying register in unforeseen ways. Doesn't present a critical bug because the known compiler optimizations don't result in bad behavior. There is no tearing on u64, and load omissions/invented loads would only break if additional binding->dev references were inlined together (they aren't right now). This just more strictly follows the linux memory model (i.e., "Lock-Protected Writes With Lockless Reads" in tools/memory-model/Documentation/access-marking.txt). Fixes: bd61848900bf ("net: devmem: Implement TX path") Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260302-devmem-membar-fix-v2-1-5b33c9cbc28b@meta.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- net/core/devmem.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 4af4cf2d63a4..20610a192ec7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3987,7 +3987,7 @@ static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, if (shinfo->nr_frags > 0) { niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); if (net_is_devmem_iov(niov) && - net_devmem_iov_binding(niov)->dev != dev) + READ_ONCE(net_devmem_iov_binding(niov)->dev) != dev) goto out_free; } diff --git a/net/core/devmem.c b/net/core/devmem.c index 8c9aad776bf4..69d79aee07ef 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -396,7 +396,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, * net_device. 
*/ dst_dev = dst_dev_rcu(dst); - if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) { + if (unlikely(!dst_dev) || + unlikely(dst_dev != READ_ONCE(binding->dev))) { err = -ENODEV; goto out_unlock; } @@ -513,7 +514,8 @@ static void mp_dmabuf_devmem_uninstall(void *mp_priv, xa_erase(&binding->bound_rxqs, xa_idx); if (xa_empty(&binding->bound_rxqs)) { mutex_lock(&binding->lock); - binding->dev = NULL; + ASSERT_EXCLUSIVE_WRITER(binding->dev); + WRITE_ONCE(binding->dev, NULL); mutex_unlock(&binding->lock); } break; -- cgit v1.2.3 From 7bd4b0c4779f978a6528c9b7937d2ca18e936e2c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 08:23:41 -0800 Subject: nfc: nci: free skb on nci_transceive early error paths nci_transceive() takes ownership of the skb passed by the caller, but the -EPROTO, -EINVAL, and -EBUSY error paths return without freeing it. Due to issues clearing NCI_DATA_EXCHANGE fixed by subsequent changes the nci/nci_dev selftest hits the error path occasionally in NIPA, and kmemleak detects leaks: unreferenced object 0xff11000015ce6a40 (size 640): comm "nci_dev", pid 3954, jiffies 4295441246 hex dump (first 32 bytes): 6b 6b 6b 6b 00 a4 00 0c 02 e1 03 6b 6b 6b 6b 6b kkkk.......kkkkk 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk backtrace (crc 7c40cc2a): kmem_cache_alloc_node_noprof+0x492/0x630 __alloc_skb+0x11e/0x5f0 alloc_skb_with_frags+0xc6/0x8f0 sock_alloc_send_pskb+0x326/0x3f0 nfc_alloc_send_skb+0x94/0x1d0 rawsock_sendmsg+0x162/0x4c0 do_syscall_64+0x117/0xfc0 Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260303162346.2071888-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/nci/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index e95eef85971f..e859b834c0f1 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1035,18 +1035,23 @@ static int nci_transceive(struct nfc_dev *nfc_dev, struct nfc_target *target, struct nci_conn_info *conn_info; conn_info = ndev->rf_conn_info; - if (!conn_info) + if (!conn_info) { + kfree_skb(skb); return -EPROTO; + } pr_debug("target_idx %d, len %d\n", target->idx, skb->len); if (!ndev->target_active_prot) { pr_err("unable to exchange data, no active target\n"); + kfree_skb(skb); return -EINVAL; } - if (test_and_set_bit(NCI_DATA_EXCHANGE, &ndev->flags)) + if (test_and_set_bit(NCI_DATA_EXCHANGE, &ndev->flags)) { + kfree_skb(skb); return -EBUSY; + } /* store cb and context to be used on receiving data */ conn_info->data_exchange_cb = cb; -- cgit v1.2.3 From d42449d2c17cdf06d1f63268557450bd3f051e9a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 08:23:42 -0800 Subject: nfc: digital: free skb on digital_in_send error paths digital_in_send() takes ownership of the skb passed by the caller (nfc_data_exchange), make sure it's freed on all error paths. Found looking around the real driver for similar bugs to the one just fixed in nci. 
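The ownership rule behind both fixes, as a minimal sketch (consume_and_send(), queue_for_tx() and struct my_ctx are illustrative names, not part of the NFC code): once a function is documented as consuming the skb, every error return inside it must free the buffer, because the caller will not.

/* sketch: callee owns 'skb' on entry, so failures must not leak it */
static int consume_and_send(struct my_ctx *ctx, struct sk_buff *skb)
{
    int rc;

    if (!ctx) {
        kfree_skb(skb);         /* early error path frees */
        return -EPROTO;
    }

    rc = queue_for_tx(ctx, skb);        /* on success the tx path frees */
    if (rc)
        kfree_skb(skb);         /* late error path frees too */

    return rc;
}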
Fixes: 2c66daecc409 ("NFC Digital: Add NFC-A technology support") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260303162346.2071888-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/digital_core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 3670bb33732e..7cb1e6aaae90 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -707,8 +707,10 @@ static int digital_in_send(struct nfc_dev *nfc_dev, struct nfc_target *target, int rc; data_exch = kzalloc_obj(*data_exch); - if (!data_exch) + if (!data_exch) { + kfree_skb(skb); return -ENOMEM; + } data_exch->cb = cb; data_exch->cb_context = cb_context; @@ -731,8 +733,10 @@ static int digital_in_send(struct nfc_dev *nfc_dev, struct nfc_target *target, data_exch); exit: - if (rc) + if (rc) { + kfree_skb(skb); kfree(data_exch); + } return rc; } -- cgit v1.2.3 From 66083581945bd5b8e99fe49b5aeb83d03f62d053 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 08:23:43 -0800 Subject: nfc: nci: complete pending data exchange on device close In nci_close_device(), complete any pending data exchange before closing. The data exchange callback (e.g. rawsock_data_exchange_complete) holds a socket reference. NIPA occasionally hits this leak: unreferenced object 0xff1100000f435000 (size 2048): comm "nci_dev", pid 3954, jiffies 4295441245 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 27 00 01 40 00 00 00 00 00 00 00 00 00 00 00 00 '..@............ backtrace (crc ec2b3c5): __kmalloc_noprof+0x4db/0x730 sk_prot_alloc.isra.0+0xe4/0x1d0 sk_alloc+0x36/0x760 rawsock_create+0xd1/0x540 nfc_sock_create+0x11f/0x280 __sock_create+0x22d/0x630 __sys_socket+0x115/0x1d0 __x64_sys_socket+0x72/0xd0 do_syscall_64+0x117/0xfc0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 38f04c6b1b68 ("NFC: protect nci_data_exchange transactions") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260303162346.2071888-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/nci/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index e859b834c0f1..43d871525dbc 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -567,6 +567,10 @@ static int nci_close_device(struct nci_dev *ndev) flush_workqueue(ndev->cmd_wq); timer_delete_sync(&ndev->cmd_timer); timer_delete_sync(&ndev->data_timer); + if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) + nci_data_exchange_complete(ndev, NULL, + ndev->cur_conn_id, + -ENODEV); mutex_unlock(&ndev->req_lock); return 0; } @@ -598,6 +602,11 @@ static int nci_close_device(struct nci_dev *ndev) flush_workqueue(ndev->cmd_wq); timer_delete_sync(&ndev->cmd_timer); + timer_delete_sync(&ndev->data_timer); + + if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) + nci_data_exchange_complete(ndev, NULL, ndev->cur_conn_id, + -ENODEV); /* Clear flags except NCI_UNREG */ ndev->flags &= BIT(NCI_UNREG); -- cgit v1.2.3 From 0efdc02f4f6d52f8ca5d5889560f325a836ce0a8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 08:23:44 -0800 Subject: nfc: nci: clear NCI_DATA_EXCHANGE before calling completion callback Move clear_bit(NCI_DATA_EXCHANGE) before invoking the data exchange callback in nci_data_exchange_complete(). The callback (e.g. rawsock_data_exchange_complete) may immediately schedule another data exchange via schedule_work(tx_work). 
On a multi-CPU system, tx_work can run and reach nci_transceive() before the current nci_data_exchange_complete() clears the flag, causing test_and_set_bit(NCI_DATA_EXCHANGE) to return -EBUSY and the new transfer to fail. This causes intermittent flakes in nci/nci_dev in NIPA: # # RUN NCI.NCI1_0.t4t_tag_read ... # # t4t_tag_read: Test terminated by timeout # # FAIL NCI.NCI1_0.t4t_tag_read # not ok 3 NCI.NCI1_0.t4t_tag_read Fixes: 38f04c6b1b68 ("NFC: protect nci_data_exchange transactions") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260303162346.2071888-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/nci/data.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 78f4131af3cf..5f98c73db5af 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -33,7 +33,8 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) { kfree_skb(skb); - goto exit; + clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); + return; } cb = conn_info->data_exchange_cb; @@ -45,6 +46,12 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, timer_delete_sync(&ndev->data_timer); clear_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags); + /* Mark the exchange as done before calling the callback. + * The callback (e.g. rawsock_data_exchange_complete) may + * want to immediately queue another data exchange. + */ + clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); + if (cb) { /* forward skb to nfc core */ cb(cb_context, skb, err); @@ -54,9 +61,6 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, /* no waiting callback, free skb */ kfree_skb(skb); } - -exit: - clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); } /* ----------------- NCI TX Data ----------------- */ -- cgit v1.2.3 From d793458c45df2aed498d7f74145eab7ee22d25aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 08:23:45 -0800 Subject: nfc: rawsock: cancel tx_work before socket teardown In rawsock_release(), cancel any pending tx_work and purge the write queue before orphaning the socket. rawsock_tx_work runs on the system workqueue and calls nfc_data_exchange which dereferences the NCI device. Without synchronization, tx_work can race with socket and device teardown when a process is killed (e.g. by SIGKILL), leading to use-after-free or leaked references. Set SEND_SHUTDOWN first so that if tx_work is already running it will see the flag and skip transmitting, then use cancel_work_sync to wait for any in-progress execution to finish, and finally purge any remaining queued skbs. Fixes: 23b7869c0fd0 ("NFC: add the NFC socket raw protocol") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260303162346.2071888-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/rawsock.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index b049022399ae..f7d7a599fade 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -67,6 +67,17 @@ static int rawsock_release(struct socket *sock) if (sock->type == SOCK_RAW) nfc_sock_unlink(&raw_sk_list, sk); + if (sk->sk_state == TCP_ESTABLISHED) { + /* Prevent rawsock_tx_work from starting new transmits and + * wait for any in-progress work to finish. This must happen + * before the socket is orphaned to avoid a race where + * rawsock_tx_work runs after the NCI device has been freed. 
+ */ + sk->sk_shutdown |= SEND_SHUTDOWN; + cancel_work_sync(&nfc_rawsock(sk)->tx_work); + rawsock_write_queue_purge(sk); + } + sock_orphan(sk); sock_put(sk); -- cgit v1.2.3 From 8c09412e584d9bcc0e71d758ec1008d1c8d1a326 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 3 Mar 2026 11:56:02 +0100 Subject: selftests: mptcp: more stable simult_flows tests By default, the netem qdisc can keep up to 1000 packets under its belly to deal with the configured rate and delay. The simult flows test-case simulates very low speed links, to avoid problems due to slow CPUs and the TCP stack tend to transmit at a slightly higher rate than the (virtual) link constraints. All the above causes a relatively large amount of packets being enqueued in the netem qdiscs - the longer the transfer, the longer the queue - producing increasingly high TCP RTT samples and consequently increasingly larger receive buffer size due to DRS. When the receive buffer size becomes considerably larger than the needed size, the tests results can flake, i.e. because minimal inaccuracy in the pacing rate can lead to a single subflow usage towards the end of the connection for a considerable amount of data. Address the issue explicitly setting netem limits suitable for the configured link speeds and unflake all the affected tests. Fixes: 1a418cb8e888 ("mptcp: simult flow self-tests") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260303-net-mptcp-misc-fixes-7-0-rc2-v1-1-4b5462b6f016@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/simult_flows.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 806aaa7d2d61..d11a8b949aab 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -237,10 +237,13 @@ run_test() for dev in ns2eth1 ns2eth2; do tc -n $ns2 qdisc del dev $dev root >/dev/null 2>&1 done - tc -n $ns1 qdisc add dev ns1eth1 root netem rate ${rate1}mbit $delay1 - tc -n $ns1 qdisc add dev ns1eth2 root netem rate ${rate2}mbit $delay2 - tc -n $ns2 qdisc add dev ns2eth1 root netem rate ${rate1}mbit $delay1 - tc -n $ns2 qdisc add dev ns2eth2 root netem rate ${rate2}mbit $delay2 + + # keep the queued pkts number low, or the RTT estimator will see + # increasing latency over time. + tc -n $ns1 qdisc add dev ns1eth1 root netem rate ${rate1}mbit $delay1 limit 50 + tc -n $ns1 qdisc add dev ns1eth2 root netem rate ${rate2}mbit $delay2 limit 50 + tc -n $ns2 qdisc add dev ns2eth1 root netem rate ${rate1}mbit $delay1 limit 50 + tc -n $ns2 qdisc add dev ns2eth2 root netem rate ${rate2}mbit $delay2 limit 50 # time is measured in ms, account for transfer size, aggregated link speed # and header overhead (10%) -- cgit v1.2.3 From fb8d0bccb221080630efcd9660c9f9349e53cc9e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 3 Mar 2026 11:56:03 +0100 Subject: mptcp: pm: avoid sending RM_ADDR over same subflow RM_ADDR are sent over an active subflow, the first one in the subflows list. There is then a high chance the initial subflow is picked. With the in-kernel PM, when an endpoint is removed, a RM_ADDR is sent, then linked subflows are closed. This is done for each active MPTCP connection. MPTCP endpoints are likely removed because the attached network is no longer available or usable. 
In this case, it is better to avoid sending this RM_ADDR over the subflow that is going to be removed, but prefer sending it over another active and non stale subflow, if any. This modification avoids situations where the other end is not notified when a subflow is no longer usable: typically when the endpoint linked to the initial subflow is removed, especially on the server side. Fixes: 8dd5efb1f91b ("mptcp: send ack for rm_addr") Cc: stable@vger.kernel.org Reported-by: Frank Lorenz Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/612 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260303-net-mptcp-misc-fixes-7-0-rc2-v1-2-4b5462b6f016@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 55 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 7298836469b3..57a456690406 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -212,9 +212,24 @@ void mptcp_pm_send_ack(struct mptcp_sock *msk, spin_lock_bh(&msk->pm.lock); } -void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) +static bool subflow_in_rm_list(const struct mptcp_subflow_context *subflow, + const struct mptcp_rm_list *rm_list) +{ + u8 i, id = subflow_get_local_id(subflow); + + for (i = 0; i < rm_list->nr; i++) { + if (rm_list->ids[i] == id) + return true; + } + + return false; +} + +static void +mptcp_pm_addr_send_ack_avoid_list(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) { - struct mptcp_subflow_context *subflow, *alt = NULL; + struct mptcp_subflow_context *subflow, *stale = NULL, *same_id = NULL; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); @@ -224,19 +239,35 @@ void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) return; mptcp_for_each_subflow(msk, subflow) { - if (__mptcp_subflow_active(subflow)) { - if (!subflow->stale) { - mptcp_pm_send_ack(msk, subflow, false, false); - return; - } + if (!__mptcp_subflow_active(subflow)) + continue; - if (!alt) - alt = subflow; + if (unlikely(subflow->stale)) { + if (!stale) + stale = subflow; + } else if (unlikely(rm_list && + subflow_in_rm_list(subflow, rm_list))) { + if (!same_id) + same_id = subflow; + } else { + goto send_ack; } } - if (alt) - mptcp_pm_send_ack(msk, alt, false, false); + if (same_id) + subflow = same_id; + else if (stale) + subflow = stale; + else + return; + +send_ack: + mptcp_pm_send_ack(msk, subflow, false, false); +} + +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) +{ + mptcp_pm_addr_send_ack_avoid_list(msk, NULL); } int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, @@ -470,7 +501,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); - mptcp_pm_addr_send_ack(msk); + mptcp_pm_addr_send_ack_avoid_list(msk, rm_list); return 0; } -- cgit v1.2.3 From 560edd99b5f58b2d4bbe3c8e51e1eed68d887b0e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 3 Mar 2026 11:56:04 +0100 Subject: selftests: mptcp: join: check RM_ADDR not sent over same subflow This validates the previous commit: RM_ADDR were sent over the first found active subflow which could be the same as the one being removed. It is more likely to loose this notification. For this check, RM_ADDR are explicitly dropped when trying to send them over the initial subflow, when removing the endpoint attached to it. 
If it is dropped, the test will complain because some RM_ADDR have not been received. Note that only the RM_ADDR are dropped, to allow the linked subflow to be quickly and cleanly closed. To only drop those RM_ADDR, a cBPF byte code is used. If the IPTables commands fail, that's OK, the tests will continue to pass, but not validate this part. This can be ignored: another subtest fully depends on such command, and will be marked as skipped. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 8dd5efb1f91b ("mptcp: send ack for rm_addr") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260303-net-mptcp-misc-fixes-7-0-rc2-v1-3-4b5462b6f016@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 36 +++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index dc1f200aaa81..058ad5a13d24 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -104,6 +104,24 @@ CBPF_MPTCP_SUBOPTION_ADD_ADDR="14, 6 0 0 65535, 6 0 0 0" +# IPv4: TCP hdr of 48B, a first suboption of 12B (DACK8), the RM_ADDR suboption +# generated using "nfbpf_compile '(ip[32] & 0xf0) == 0xc0 && ip[53] == 0x0c && +# (ip[66] & 0xf0) == 0x40'" +CBPF_MPTCP_SUBOPTION_RM_ADDR="13, + 48 0 0 0, + 84 0 0 240, + 21 0 9 64, + 48 0 0 32, + 84 0 0 240, + 21 0 6 192, + 48 0 0 53, + 21 0 4 12, + 48 0 0 66, + 84 0 0 240, + 21 0 1 64, + 6 0 0 65535, + 6 0 0 0" + init_partial() { capout=$(mktemp) @@ -4217,6 +4235,14 @@ endpoint_tests() chk_subflow_nr "after no reject" 3 chk_mptcp_info subflows 2 subflows 2 + # To make sure RM_ADDR are sent over a different subflow, but + # allow the rest to quickly and cleanly close the subflow + local ipt=1 + ip netns exec "${ns2}" ${iptables} -I OUTPUT -s "10.0.1.2" \ + -p tcp -m tcp --tcp-option 30 \ + -m bpf --bytecode \ + "$CBPF_MPTCP_SUBOPTION_RM_ADDR" \ + -j DROP || ipt=0 local i for i in $(seq 3); do pm_nl_del_endpoint $ns2 1 10.0.1.2 @@ -4229,6 +4255,7 @@ endpoint_tests() chk_subflow_nr "after re-add id 0 ($i)" 3 chk_mptcp_info subflows 3 subflows 3 done + [ ${ipt} = 1 ] && ip netns exec "${ns2}" ${iptables} -D OUTPUT 1 mptcp_lib_kill_group_wait $tests_pid @@ -4288,11 +4315,20 @@ endpoint_tests() chk_mptcp_info subflows 2 subflows 2 chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 + # To make sure RM_ADDR are sent over a different subflow, but + # allow the rest to quickly and cleanly close the subflow + local ipt=1 + ip netns exec "${ns1}" ${iptables} -I OUTPUT -s "10.0.1.1" \ + -p tcp -m tcp --tcp-option 30 \ + -m bpf --bytecode \ + "$CBPF_MPTCP_SUBOPTION_RM_ADDR" \ + -j DROP || ipt=0 pm_nl_del_endpoint $ns1 42 10.0.1.1 sleep 0.5 chk_subflow_nr "after delete ID 0" 2 chk_mptcp_info subflows 2 subflows 2 chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 + [ ${ipt} = 1 ] && ip netns exec "${ns1}" ${iptables} -D OUTPUT 1 pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal wait_mpj 4 -- cgit v1.2.3 From 579a752464a64cb5f9139102f0e6b90a1f595ceb Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 3 Mar 2026 11:56:05 +0100 Subject: mptcp: pm: in-kernel: always mark signal+subflow endp as used Syzkaller managed to find a combination of 
actions that was generating this warning: msk->pm.local_addr_used == 0 WARNING: net/mptcp/pm_kernel.c:1071 at __mark_subflow_endp_available net/mptcp/pm_kernel.c:1071 [inline], CPU#1: syz.2.17/961 WARNING: net/mptcp/pm_kernel.c:1071 at mptcp_nl_remove_subflow_and_signal_addr net/mptcp/pm_kernel.c:1103 [inline], CPU#1: syz.2.17/961 WARNING: net/mptcp/pm_kernel.c:1071 at mptcp_pm_nl_del_addr_doit+0x81d/0x8f0 net/mptcp/pm_kernel.c:1210, CPU#1: syz.2.17/961 Modules linked in: CPU: 1 UID: 0 PID: 961 Comm: syz.2.17 Not tainted 6.19.0-08368-gfafda3b4b06b #22 PREEMPT(full) Hardware name: QEMU Ubuntu 25.10 PC v2 (i440FX + PIIX, + 10.1 machine, 1996), BIOS 1.17.0-debian-1.17.0-1build1 04/01/2014 RIP: 0010:__mark_subflow_endp_available net/mptcp/pm_kernel.c:1071 [inline] RIP: 0010:mptcp_nl_remove_subflow_and_signal_addr net/mptcp/pm_kernel.c:1103 [inline] RIP: 0010:mptcp_pm_nl_del_addr_doit+0x81d/0x8f0 net/mptcp/pm_kernel.c:1210 Code: 89 c5 e8 46 30 6f fe e9 21 fd ff ff 49 83 ed 80 e8 38 30 6f fe 4c 89 ef be 03 00 00 00 e8 db 49 df fe eb ac e8 24 30 6f fe 90 <0f> 0b 90 e9 1d ff ff ff e8 16 30 6f fe eb 05 e8 0f 30 6f fe e8 9a RSP: 0018:ffffc90001663880 EFLAGS: 00010293 RAX: ffffffff82de1a6c RBX: 0000000000000000 RCX: ffff88800722b500 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff8880158b22d0 R08: 0000000000010425 R09: ffffffffffffffff R10: ffffffff82de18ba R11: 0000000000000000 R12: ffff88800641a640 R13: ffff8880158b1880 R14: ffff88801ec3c900 R15: ffff88800641a650 FS: 00005555722c3500(0000) GS:ffff8880f909d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f66346e0f60 CR3: 000000001607c000 CR4: 0000000000350ef0 Call Trace: genl_family_rcv_msg_doit+0x117/0x180 net/netlink/genetlink.c:1115 genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x3a8/0x3f0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x16d/0x240 net/netlink/af_netlink.c:2550 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1318 [inline] netlink_unicast+0x3e9/0x4c0 net/netlink/af_netlink.c:1344 netlink_sendmsg+0x4aa/0x5b0 net/netlink/af_netlink.c:1894 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg+0xc9/0xf0 net/socket.c:742 ____sys_sendmsg+0x272/0x3b0 net/socket.c:2592 ___sys_sendmsg+0x2de/0x320 net/socket.c:2646 __sys_sendmsg net/socket.c:2678 [inline] __do_sys_sendmsg net/socket.c:2683 [inline] __se_sys_sendmsg net/socket.c:2681 [inline] __x64_sys_sendmsg+0x110/0x1a0 net/socket.c:2681 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x143/0x440 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f66346f826d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffc83d8bdc8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f6634985fa0 RCX: 00007f66346f826d RDX: 00000000040000b0 RSI: 0000200000000740 RDI: 0000000000000007 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f6634985fa8 R13: 00007f6634985fac R14: 0000000000000000 R15: 0000000000001770 The actions that caused that seem to be: - Set the MPTCP subflows limit to 0 - Create an MPTCP endpoint with both the 'signal' and 'subflow' flags - Create a new MPTCP connection from a different address: an ADD_ADDR linked to the MPTCP endpoint 
will be sent ('signal' flag), but no subflows is initiated ('subflow' flag) - Remove the MPTCP endpoint In this case, msk->pm.local_addr_used has been kept to 0 -- because no subflows have been created -- but the corresponding bit in msk->pm.id_avail_bitmap has been cleared when the ADD_ADDR has been sent. This later causes a splat when removing the MPTCP endpoint because msk->pm.local_addr_used has been kept to 0. Now, if an endpoint has both the signal and subflow flags, but it is not possible to create subflows because of the limits or the c-flag case, then the local endpoint counter is still incremented: the endpoint is used at the end. This avoids issues later when removing the endpoint and calling __mark_subflow_endp_available(), which expects msk->pm.local_addr_used to have been previously incremented if the endpoint was marked as used according to msk->pm.id_avail_bitmap. Note that signal_and_subflow variable is reset to false when the limits and the c-flag case allows subflows creation. Also, local_addr_used is only incremented for non ID0 subflows. Fixes: 85df533a787b ("mptcp: pm: do not ignore 'subflow' if 'signal' flag is also set") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/613 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260303-net-mptcp-misc-fixes-7-0-rc2-v1-4-4b5462b6f016@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index b5316a6c7d1b..b2b9df43960e 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -418,6 +418,15 @@ subflow: } exit: + /* If an endpoint has both the signal and subflow flags, but it is not + * possible to create subflows -- the 'while' loop body above never + * executed -- then still mark the endp as used, which is somehow the + * case. This avoids issues later when removing the endpoint and calling + * __mark_subflow_endp_available(), which expects the increment here. + */ + if (signal_and_subflow && local.addr.id != msk->mpc_endpoint_id) + msk->pm.local_addr_used++; + mptcp_pm_nl_check_work_pending(msk); } -- cgit v1.2.3 From 1777f349ff41b62dfe27454b69c27b0bc99ffca5 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 3 Mar 2026 11:56:06 +0100 Subject: selftests: mptcp: join: check removing signal+subflow endp This validates the previous commit: endpoints with both the signal and subflow flags should always be marked as used even if it was not possible to create new subflows due to the MPTCP PM limits. For this test, an extra endpoint is created with both the signal and the subflow flags, and limits are set not to create extra subflows. In this case, an ADD_ADDR is sent, but no subflows are created. Still, the local endpoint is marked as used, and no warning is fired when removing the endpoint, after having sent a RM_ADDR. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. 
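The invariant being exercised can be summarised in a small sketch (struct pm_state and the two helpers are simplified stand-ins, not the in-kernel PM code): clearing an id from the availability bitmap and bumping the used counter must happen together, otherwise the removal path later decrements a counter that was never incremented and fires the warning.

static void mark_endpoint_used(struct pm_state *pm, u8 id)
{
    __clear_bit(id, pm->id_avail_bitmap);
    pm->local_addr_used++;              /* must stay paired with the bit */
}

static void mark_endpoint_available(struct pm_state *pm, u8 id)
{
    WARN_ON_ONCE(pm->local_addr_used == 0);     /* unpaired accounting */
    if (pm->local_addr_used)
        pm->local_addr_used--;
    __set_bit(id, pm->id_avail_bitmap);
}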
Fixes: 85df533a787b ("mptcp: pm: do not ignore 'subflow' if 'signal' flag is also set") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260303-net-mptcp-misc-fixes-7-0-rc2-v1-5-4b5462b6f016@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 058ad5a13d24..a3144d7298a5 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2626,6 +2626,19 @@ remove_tests() chk_rst_nr 0 0 fi + # signal+subflow with limits, remove + if reset "remove signal+subflow with limits"; then + pm_nl_set_limits $ns1 0 0 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,subflow + pm_nl_set_limits $ns2 0 0 + addr_nr_ns1=-1 speed=slow \ + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 0 0 0 + chk_add_nr 1 1 + chk_rm_nr 1 0 invert + chk_rst_nr 0 0 + fi + # addresses remove if reset "remove addresses"; then pm_nl_set_limits $ns1 3 3 -- cgit v1.2.3 From 35dfedce442c4060cfe5b98368bc9643fb995716 Mon Sep 17 00:00:00 2001 From: Ovidiu Panait Date: Tue, 3 Mar 2026 14:58:25 +0000 Subject: net: stmmac: Fix error handling in VLAN add and delete paths stmmac_vlan_rx_add_vid() updates active_vlans and the VLAN hash register before writing the HW filter entry. If the filter write fails, it leaves a stale VID in active_vlans and the hash register. stmmac_vlan_rx_kill_vid() has the reverse problem: it clears active_vlans before removing the HW filter. On failure, the VID is gone from active_vlans but still present in the HW filter table. To fix this, reorder the operations to update the hash table first, then attempt the HW filter operation. If the HW filter fails, roll back both the active_vlans bitmap and the hash table by calling stmmac_vlan_update() again. 
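The resulting structure, as a minimal sketch (add_vid(), update_hash(), write_hw_filter() and struct vlan_state are placeholder names, not the driver's functions): software state is updated first, the hardware write is attempted second, and a failure unwinds both so the bitmap, the hash register and the filter table never disagree.

static int add_vid(struct vlan_state *s, u16 vid, bool is_double)
{
    int ret;

    set_bit(vid, s->active_vlans);      /* software state first */
    ret = update_hash(s, is_double);    /* VLAN hash register */
    if (ret)
        goto rollback_bit;

    ret = write_hw_filter(s, vid);      /* perfect filter entry */
    if (ret)
        goto rollback_hash;

    return 0;

rollback_hash:
    clear_bit(vid, s->active_vlans);
    update_hash(s, is_double);          /* recompute without vid */
    return ret;
rollback_bit:
    clear_bit(vid, s->active_vlans);
    return ret;
}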
Fixes: ed64639bc1e0 ("net: stmmac: Add support for VLAN Rx filtering") Signed-off-by: Ovidiu Panait Link: https://patch.msgid.link/20260303145828.7845-2-ovidiu.panait.rb@renesas.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index e00aa42a1961..b6a127316ab5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -6794,9 +6794,13 @@ static int stmmac_vlan_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid if (priv->hw->num_vlan) { ret = stmmac_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); - if (ret) + if (ret) { + clear_bit(vid, priv->active_vlans); + stmmac_vlan_update(priv, is_double); goto err_pm_put; + } } + err_pm_put: pm_runtime_put(priv->device); @@ -6820,15 +6824,21 @@ static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vi is_double = true; clear_bit(vid, priv->active_vlans); + ret = stmmac_vlan_update(priv, is_double); + if (ret) { + set_bit(vid, priv->active_vlans); + goto del_vlan_error; + } if (priv->hw->num_vlan) { ret = stmmac_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); - if (ret) + if (ret) { + set_bit(vid, priv->active_vlans); + stmmac_vlan_update(priv, is_double); goto del_vlan_error; + } } - ret = stmmac_vlan_update(priv, is_double); - del_vlan_error: pm_runtime_put(priv->device); -- cgit v1.2.3 From e38200e361cbe331806dc454c76c11c7cd95e1b9 Mon Sep 17 00:00:00 2001 From: Ovidiu Panait Date: Tue, 3 Mar 2026 14:58:26 +0000 Subject: net: stmmac: Improve double VLAN handling The double VLAN bits (EDVLP, ESVL, DOVLTC) are handled inconsistently between the two vlan_update_hash() implementations: - dwxgmac2_update_vlan_hash() explicitly clears the double VLAN bits when is_double is false, meaning that adding a 802.1Q VLAN will disable double VLAN mode: $ ip link add link eth0 name eth0.200 type vlan id 200 protocol 802.1ad $ ip link add link eth0 name eth0.100 type vlan id 100 # Double VLAN bits no longer set - vlan_update_hash() sets these bits and only clears them when the last VLAN has been removed, so double VLAN mode remains enabled even after all 802.1AD VLANs are removed. Address both issues by tracking the number of active 802.1AD VLANs in priv->num_double_vlans. Pass this count to stmmac_vlan_update() so both implementations correctly set the double VLAN bits when any 802.1AD VLAN is active, and clear them only when none remain. Also update vlan_update_hash() to explicitly clear the double VLAN bits when is_double is false, matching the dwxgmac2 behavior. 
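The counting rule can be reduced to a short sketch (struct vlan_state and write_double_vlan_bits() are illustrative, not the driver API): the hardware mode is derived from how many 802.1ad VLANs are currently active, not from the protocol of the last add or delete.

static void account_vlan(struct vlan_state *s, bool is_8021ad, bool adding)
{
    if (is_8021ad) {
        if (adding)
            s->num_double_vlans++;
        else if (s->num_double_vlans)
            s->num_double_vlans--;
    }

    /* EDVLP/ESVL/DOVLTC follow the count: set while any 802.1ad VLAN
     * exists, cleared only once the last one is gone
     */
    write_double_vlan_bits(s, s->num_double_vlans > 0);
}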
Signed-off-by: Ovidiu Panait Link: https://patch.msgid.link/20260303145828.7845-3-ovidiu.panait.rb@renesas.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 16 ++++++++++++---- drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c | 8 ++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 51c96a738151..33667a26708c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -323,6 +323,7 @@ struct stmmac_priv { void __iomem *ptpaddr; void __iomem *estaddr; unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; + unsigned int num_double_vlans; int sfty_irq; int sfty_ce_irq; int sfty_ue_irq; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b6a127316ab5..01d181c2c68e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -6775,6 +6775,7 @@ static int stmmac_vlan_update(struct stmmac_priv *priv, bool is_double) static int stmmac_vlan_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid) { struct stmmac_priv *priv = netdev_priv(ndev); + unsigned int num_double_vlans; bool is_double = false; int ret; @@ -6786,7 +6787,8 @@ static int stmmac_vlan_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid is_double = true; set_bit(vid, priv->active_vlans); - ret = stmmac_vlan_update(priv, is_double); + num_double_vlans = priv->num_double_vlans + is_double; + ret = stmmac_vlan_update(priv, num_double_vlans); if (ret) { clear_bit(vid, priv->active_vlans); goto err_pm_put; @@ -6796,11 +6798,13 @@ static int stmmac_vlan_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid ret = stmmac_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); if (ret) { clear_bit(vid, priv->active_vlans); - stmmac_vlan_update(priv, is_double); + stmmac_vlan_update(priv, priv->num_double_vlans); goto err_pm_put; } } + priv->num_double_vlans = num_double_vlans; + err_pm_put: pm_runtime_put(priv->device); @@ -6813,6 +6817,7 @@ err_pm_put: static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid) { struct stmmac_priv *priv = netdev_priv(ndev); + unsigned int num_double_vlans; bool is_double = false; int ret; @@ -6824,7 +6829,8 @@ static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vi is_double = true; clear_bit(vid, priv->active_vlans); - ret = stmmac_vlan_update(priv, is_double); + num_double_vlans = priv->num_double_vlans - is_double; + ret = stmmac_vlan_update(priv, num_double_vlans); if (ret) { set_bit(vid, priv->active_vlans); goto del_vlan_error; @@ -6834,11 +6840,13 @@ static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vi ret = stmmac_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); if (ret) { set_bit(vid, priv->active_vlans); - stmmac_vlan_update(priv, is_double); + stmmac_vlan_update(priv, priv->num_double_vlans); goto del_vlan_error; } } + priv->num_double_vlans = num_double_vlans; + del_vlan_error: pm_runtime_put(priv->device); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c index b18404dd5a8b..de1a70e1c86e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c @@ -183,6 +183,10 @@ static void 
vlan_update_hash(struct mac_device_info *hw, u32 hash, value |= VLAN_EDVLP; value |= VLAN_ESVL; value |= VLAN_DOVLTC; + } else { + value &= ~VLAN_EDVLP; + value &= ~VLAN_ESVL; + value &= ~VLAN_DOVLTC; } writel(value, ioaddr + VLAN_TAG); @@ -193,6 +197,10 @@ static void vlan_update_hash(struct mac_device_info *hw, u32 hash, value |= VLAN_EDVLP; value |= VLAN_ESVL; value |= VLAN_DOVLTC; + } else { + value &= ~VLAN_EDVLP; + value &= ~VLAN_ESVL; + value &= ~VLAN_DOVLTC; } writel(value | perfect_match, ioaddr + VLAN_TAG); -- cgit v1.2.3 From bd7ad51253a76fb35886d01cfe9a37f0e4ed6709 Mon Sep 17 00:00:00 2001 From: Ovidiu Panait Date: Tue, 3 Mar 2026 14:58:27 +0000 Subject: net: stmmac: Fix VLAN HW state restore When the network interface is opened or resumed, a DMA reset is performed, which resets all hardware state, including VLAN state. Currently, only the resume path is restoring the VLAN state via stmmac_restore_hw_vlan_rx_fltr(), but that is incomplete: the VLAN hash table and the VLAN_TAG control bits are not restored. Therefore, add stmmac_vlan_restore(), which restores the full VLAN state by updating both the HW filter entries and the hash table, and call it from both the open and resume paths. The VLAN restore is moved outside of phylink_rx_clk_stop_block/unblock in the resume path because receive clock stop is already disabled when stmmac supports VLAN. Also, remove the hash readback code in vlan_restore_hw_rx_fltr() that attempts to restore VTHM by reading VLAN_HASH_TABLE, as it always reads zero after DMA reset, making it dead code. Fixes: 3cd1cfcba26e ("net: stmmac: Implement VLAN Hash Filtering in XGMAC") Fixes: ed64639bc1e0 ("net: stmmac: Add support for VLAN Rx filtering") Signed-off-by: Ovidiu Panait Link: https://patch.msgid.link/20260303145828.7845-4-ovidiu.panait.rb@renesas.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 24 +++++++++++++++++++++-- drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c | 10 ---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 01d181c2c68e..6d4ce35e990f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -156,6 +156,7 @@ static void stmmac_tx_timer_arm(struct stmmac_priv *priv, u32 queue); static void stmmac_flush_tx_descriptors(struct stmmac_priv *priv, int queue); static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode, u32 rxmode, u32 chan); +static int stmmac_vlan_restore(struct stmmac_priv *priv); #ifdef CONFIG_DEBUG_FS static const struct net_device_ops stmmac_netdev_ops; @@ -4107,6 +4108,8 @@ static int __stmmac_open(struct net_device *dev, phylink_start(priv->phylink); + stmmac_vlan_restore(priv); + ret = stmmac_request_irq(dev); if (ret) goto irq_error; @@ -6853,6 +6856,23 @@ del_vlan_error: return ret; } +static int stmmac_vlan_restore(struct stmmac_priv *priv) +{ + int ret; + + if (!(priv->dev->features & NETIF_F_VLAN_FEATURES)) + return 0; + + if (priv->hw->num_vlan) + stmmac_restore_hw_vlan_rx_fltr(priv, priv->dev, priv->hw); + + ret = stmmac_vlan_update(priv, priv->num_double_vlans); + if (ret) + netdev_err(priv->dev, "Failed to restore VLANs\n"); + + return ret; +} + static int stmmac_bpf(struct net_device *dev, struct netdev_bpf *bpf) { struct stmmac_priv *priv = netdev_priv(dev); @@ -8277,10 +8297,10 @@ int stmmac_resume(struct device *dev) stmmac_init_coalesce(priv); 
phylink_rx_clk_stop_block(priv->phylink); stmmac_set_rx_mode(ndev); - - stmmac_restore_hw_vlan_rx_fltr(priv, ndev, priv->hw); phylink_rx_clk_stop_unblock(priv->phylink); + stmmac_vlan_restore(priv); + stmmac_enable_all_queues(priv); stmmac_enable_all_dma_irq(priv); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c index de1a70e1c86e..fcc34867405e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c @@ -139,9 +139,6 @@ static int vlan_del_hw_rx_fltr(struct net_device *dev, static void vlan_restore_hw_rx_fltr(struct net_device *dev, struct mac_device_info *hw) { - void __iomem *ioaddr = hw->pcsr; - u32 value; - u32 hash; u32 val; int i; @@ -158,13 +155,6 @@ static void vlan_restore_hw_rx_fltr(struct net_device *dev, vlan_write_filter(dev, hw, i, val); } } - - hash = readl(ioaddr + VLAN_HASH_TABLE); - if (hash & VLAN_VLHT) { - value = readl(ioaddr + VLAN_TAG); - value |= VLAN_VTHM; - writel(value, ioaddr + VLAN_TAG); - } } static void vlan_update_hash(struct mac_device_info *hw, u32 hash, -- cgit v1.2.3 From 2cd70e3968f505996d5fefdf7ca684f0f4575734 Mon Sep 17 00:00:00 2001 From: Ovidiu Panait Date: Tue, 3 Mar 2026 14:58:28 +0000 Subject: net: stmmac: Defer VLAN HW configuration when interface is down VLAN register accesses on the MAC side require the PHY RX clock to be active. When the network interface is down, the PHY is suspended and the RX clock is unavailable, causing VLAN operations to fail with timeouts. The VLAN core automatically removes VID 0 after the interface goes down and re-adds it when it comes back up, so these timeouts happen during normal interface down/up: # ip link set end1 down renesas-gbeth 15c40000.ethernet end1: Timeout accessing MAC_VLAN_Tag_Filter renesas-gbeth 15c40000.ethernet end1: failed to kill vid 0081/0 Adding VLANs while the interface is down also fails: # ip link add link end1 name end1.10 type vlan id 10 renesas-gbeth 15c40000.ethernet end1: Timeout accessing MAC_VLAN_Tag_Filter RTNETLINK answers: Device or resource busy To fix this, check if the interface is up before accessing VLAN registers. The software state is always kept up to date regardless of interface state. When the interface is brought up, stmmac_vlan_restore() is called to write the VLAN state to hardware. 
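A minimal sketch of the resulting split (set_vid(), write_hw_filter() and struct vlan_state are placeholder names): the software table is unconditionally updated, the register write is gated on netif_running(), and the open path replays the cached state.

static int set_vid(struct net_device *dev, struct vlan_state *s,
                   int slot, u32 val)
{
    s->sw_filter[slot] = val;   /* cache is always kept current */

    if (!netif_running(dev))    /* PHY RX clock may be gated */
        return 0;               /* applied later by the restore path */

    return write_hw_filter(dev, slot, val);
}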
Fixes: ed64639bc1e0 ("net: stmmac: Add support for VLAN Rx filtering") Signed-off-by: Ovidiu Panait Link: https://patch.msgid.link/20260303145828.7845-5-ovidiu.panait.rb@renesas.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 ++ drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c | 42 +++++++++++++---------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 6d4ce35e990f..6827c99bde8c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -6769,6 +6769,9 @@ static int stmmac_vlan_update(struct stmmac_priv *priv, bool is_double) hash = 0; } + if (!netif_running(priv->dev)) + return 0; + return stmmac_update_vlan_hash(priv, priv->hw, hash, pmatch, is_double); } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c index fcc34867405e..e24efe3bfedb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_vlan.c @@ -76,7 +76,9 @@ static int vlan_add_hw_rx_fltr(struct net_device *dev, } hw->vlan_filter[0] = vid; - vlan_write_single(dev, vid); + + if (netif_running(dev)) + vlan_write_single(dev, vid); return 0; } @@ -97,12 +99,15 @@ static int vlan_add_hw_rx_fltr(struct net_device *dev, return -EPERM; } - ret = vlan_write_filter(dev, hw, index, val); + if (netif_running(dev)) { + ret = vlan_write_filter(dev, hw, index, val); + if (ret) + return ret; + } - if (!ret) - hw->vlan_filter[index] = val; + hw->vlan_filter[index] = val; - return ret; + return 0; } static int vlan_del_hw_rx_fltr(struct net_device *dev, @@ -115,7 +120,9 @@ static int vlan_del_hw_rx_fltr(struct net_device *dev, if (hw->num_vlan == 1) { if ((hw->vlan_filter[0] & VLAN_TAG_VID) == vid) { hw->vlan_filter[0] = 0; - vlan_write_single(dev, 0); + + if (netif_running(dev)) + vlan_write_single(dev, 0); } return 0; } @@ -124,22 +131,23 @@ static int vlan_del_hw_rx_fltr(struct net_device *dev, for (i = 0; i < hw->num_vlan; i++) { if ((hw->vlan_filter[i] & VLAN_TAG_DATA_VEN) && ((hw->vlan_filter[i] & VLAN_TAG_DATA_VID) == vid)) { - ret = vlan_write_filter(dev, hw, i, 0); - if (!ret) - hw->vlan_filter[i] = 0; - else - return ret; + if (netif_running(dev)) { + ret = vlan_write_filter(dev, hw, i, 0); + if (ret) + return ret; + } + + hw->vlan_filter[i] = 0; } } - return ret; + return 0; } static void vlan_restore_hw_rx_fltr(struct net_device *dev, struct mac_device_info *hw) { - u32 val; int i; /* Single Rx VLAN Filter */ @@ -149,12 +157,8 @@ static void vlan_restore_hw_rx_fltr(struct net_device *dev, } /* Extended Rx VLAN Filter Enable */ - for (i = 0; i < hw->num_vlan; i++) { - if (hw->vlan_filter[i] & VLAN_TAG_DATA_VEN) { - val = hw->vlan_filter[i]; - vlan_write_filter(dev, hw, i, val); - } - } + for (i = 0; i < hw->num_vlan; i++) + vlan_write_filter(dev, hw, i, hw->vlan_filter[i]); } static void vlan_update_hash(struct mac_device_info *hw, u32 hash, -- cgit v1.2.3 From b824c3e16c1904bf80df489e293d1e3cbf98896d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 2 Mar 2026 17:26:31 +0100 Subject: net: Provide a PREEMPT_RT specific check for netdev_queue::_xmit_lock After acquiring netdev_queue::_xmit_lock the number of the CPU owning the lock is recorded in netdev_queue::xmit_lock_owner. This works as long as the BH context is not preemptible. 
On PREEMPT_RT the softirq context is preemptible and without the softirq-lock it is possible to have multiple user in __dev_queue_xmit() submitting a skb on the same CPU. This is fine in general but this means also that the current CPU is recorded as netdev_queue::xmit_lock_owner. This in turn leads to the recursion alert and the skb is dropped. Instead checking the for CPU number, that owns the lock, PREEMPT_RT can check if the lockowner matches the current task. Add netif_tx_owned() which returns true if the current context owns the lock by comparing the provided CPU number with the recorded number. This resembles the current check by negating the condition (the current check returns true if the lock is not owned). On PREEMPT_RT use rt_mutex_owner() to return the lock owner and compare the current task against it. Use the new helper in __dev_queue_xmit() and netif_local_xmit_active() which provides a similar check. Update comments regarding pairing READ_ONCE(). Reported-by: Bert Karwatzki Closes: https://lore.kernel.org/all/20260216134333.412332-1-spasswolf@web.de Fixes: 3253cb49cbad4 ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reported-by: Bert Karwatzki Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20260302162631.uGUyIqDT@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 27 ++++++++++++++++++++++----- net/core/dev.c | 5 +---- net/core/netpoll.c | 2 +- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d4e6e00bb90a..67e25f6d15a4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4711,7 +4711,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, cpu); } @@ -4729,7 +4729,7 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } @@ -4738,7 +4738,7 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq) bool ok = spin_trylock(&txq->_xmit_lock); if (likely(ok)) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } return ok; @@ -4746,14 +4746,14 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq) static inline void __netif_tx_unlock(struct netdev_queue *txq) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + /* Pairs with READ_ONCE() in netif_tx_owned() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } @@ -4846,6 +4846,23 @@ static inline void netif_tx_disable(struct net_device *dev) local_bh_enable(); } +#ifndef CONFIG_PREEMPT_RT +static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu) +{ + /* Other cpus might concurrently change 
txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + return READ_ONCE(txq->xmit_lock_owner) == cpu; +} + +#else +static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu) +{ + return rt_mutex_owner(&txq->_xmit_lock.lock) == current; +} + +#endif + static inline void netif_addr_lock(struct net_device *dev) { unsigned char nest_level = 0; diff --git a/net/core/dev.c b/net/core/dev.c index 20610a192ec7..14a83f2035b9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4818,10 +4818,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - /* Other cpus might concurrently change txq->xmit_lock_owner - * to -1 or to their cpu id, but not to our id. - */ - if (READ_ONCE(txq->xmit_lock_owner) != cpu) { + if (!netif_tx_owned(txq, cpu)) { bool is_list = false; if (dev_xmit_recursion()) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a8558a52884f..cd74beffd209 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -132,7 +132,7 @@ static int netif_local_xmit_active(struct net_device *dev) for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) + if (netif_tx_owned(txq, smp_processor_id())) return 1; } -- cgit v1.2.3 From def602e498a4f951da95c95b1b8ce8ae68aa733a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 2 Mar 2026 23:12:37 +0100 Subject: netfilter: nf_tables: unconditionally bump set->nelems before insertion In case that the set is full, a new element gets published then removed without waiting for the RCU grace period, while RCU reader can be walking over it already. To address this issue, add the element transaction even if set is full, but toggle the set_full flag to report -ENFILE so the abort path safely unwinds the set to its previous state. As for element updates, decrement set->nelems to restore it. A simpler fix is to call synchronize_rcu() in the error path. However, with a large batch adding elements to already maxed-out set, this could cause noticeable slowdown of such batches. 
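For illustration, a small userspace sketch (plain C11 atomics, invented names) of the accounting change: bump the counter unconditionally, remember that the limit was crossed, and let the one common error path decrement it again:

  #include <stdatomic.h>
  #include <stdio.h>

  static atomic_uint nelems;

  /* Bump unconditionally; report whether the limit was exceeded so the
   * caller can return an error ("-ENFILE") from its normal exit path.
   */
  static int account_element(unsigned int max, int *set_full)
  {
      unsigned int n = atomic_fetch_add(&nelems, 1) + 1;

      *set_full = n > max;
      return *set_full ? -1 : 0;
  }

  int main(void)
  {
      int full;

      atomic_store(&nelems, 2);           /* set already at its maximum of 2 */
      if (account_element(2, &full))
          atomic_fetch_sub(&nelems, 1);   /* single error path undoes the bump */
      printf("full=%d nelems=%u\n", full, atomic_load(&nelems));
      return 0;
  }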
Fixes: 35d0ac9070ef ("netfilter: nf_tables: fix set->nelems counting with no NLM_F_EXCL") Reported-by: Inseo An Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd7f7e4e2a43..df67932d3e09 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7170,6 +7170,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; + bool set_full = false; u64 expiration; u64 timeout; int err, i; @@ -7461,10 +7462,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto err_elem_free; + if (!(flags & NFT_SET_ELEM_CATCHALL)) { + unsigned int max = nft_set_maxsize(set), nelems; + + nelems = atomic_inc_return(&set->nelems); + if (nelems > max) + set_full = true; + } + trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) { err = -ENOMEM; - goto err_elem_free; + goto err_set_size; } ext->genmask = nft_genmask_cur(ctx->net); @@ -7516,7 +7525,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, ue->priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans); - goto err_elem_free; + goto err_set_size; } } } @@ -7534,23 +7543,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_element_clash; } - if (!(flags & NFT_SET_ELEM_CATCHALL)) { - unsigned int max = nft_set_maxsize(set); - - if (!atomic_add_unless(&set->nelems, 1, max)) { - err = -ENFILE; - goto err_set_full; - } - } - nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans); - return 0; -err_set_full: - nft_setelem_remove(ctx->net, set, elem.priv); + return set_full ? 
-ENFILE : 0; + err_element_clash: kfree(trans); +err_set_size: + if (!(flags & NFT_SET_ELEM_CATCHALL)) + atomic_dec(&set->nelems); err_elem_free: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: -- cgit v1.2.3 From fb7fb4016300ac622c964069e286dc83166a5d52 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 2 Mar 2026 23:28:15 +0100 Subject: netfilter: nf_tables: clone set on flush only Syzbot with fault injection triggered a failing memory allocation with GFP_KERNEL which results in a WARN splat: iter.err WARNING: net/netfilter/nf_tables_api.c:845 at nft_map_deactivate+0x34e/0x3c0 net/netfilter/nf_tables_api.c:845, CPU#0: syz.0.17/5992 Modules linked in: CPU: 0 UID: 0 PID: 5992 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2026 RIP: 0010:nft_map_deactivate+0x34e/0x3c0 net/netfilter/nf_tables_api.c:845 Code: 8b 05 86 5a 4e 09 48 3b 84 24 a0 00 00 00 75 62 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc cc e8 63 6d fa f7 90 <0f> 0b 90 43 +80 7c 35 00 00 0f 85 23 fe ff ff e9 26 fe ff ff 89 d9 RSP: 0018:ffffc900045af780 EFLAGS: 00010293 RAX: ffffffff89ca45bd RBX: 00000000fffffff4 RCX: ffff888028111e40 RDX: 0000000000000000 RSI: 00000000fffffff4 RDI: 0000000000000000 RBP: ffffc900045af870 R08: 0000000000400dc0 R09: 00000000ffffffff R10: dffffc0000000000 R11: fffffbfff1d141db R12: ffffc900045af7e0 R13: 1ffff920008b5f24 R14: dffffc0000000000 R15: ffffc900045af920 FS: 000055557a6a5500(0000) GS:ffff888125496000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb5ea271fc0 CR3: 000000003269e000 CR4: 00000000003526f0 Call Trace: __nft_release_table+0xceb/0x11f0 net/netfilter/nf_tables_api.c:12115 nft_rcv_nl_event+0xc25/0xdb0 net/netfilter/nf_tables_api.c:12187 notifier_call_chain+0x19d/0x3a0 kernel/notifier.c:85 blocking_notifier_call_chain+0x6a/0x90 kernel/notifier.c:380 netlink_release+0x123b/0x1ad0 net/netlink/af_netlink.c:761 __sock_release net/socket.c:662 [inline] sock_close+0xc3/0x240 net/socket.c:1455 Restrict set clone to the flush set command in the preparation phase. Add NFT_ITER_UPDATE_CLONE and use it for this purpose, update the rbtree and pipapo backends to only clone the set when this iteration type is used. As for the existing NFT_ITER_UPDATE type, update the pipapo backend to use the existing set clone if available, otherwise use the existing set representation. After this update, there is no need to clone a set that is being deleted, this includes bound anonymous set. An alternative approach to NFT_ITER_UPDATE_CLONE is to add a .clone interface and call it from the flush set path. 
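A minimal userspace sketch of the dispatch this introduces (invented names standing in for the nftables backend internals): only the new iteration type creates a clone, while plain updates reuse whatever representation already exists:

  #include <stdio.h>

  enum iter_type { ITER_READ, ITER_UPDATE, ITER_UPDATE_CLONE };

  struct set_model {
      int live;                   /* stands in for the RCU-published data */
      int have_clone;             /* stands in for priv->clone            */
  };

  static void walk(struct set_model *s, enum iter_type type)
  {
      switch (type) {
      case ITER_UPDATE_CLONE:
          if (!s->have_clone)
              s->have_clone = 1;          /* clone created on demand */
          printf("walk the clone\n");
          break;
      case ITER_UPDATE:
          /* reuse an existing clone, otherwise walk the live copy */
          printf("walk the %s\n", s->have_clone ? "existing clone" : "live set");
          break;
      case ITER_READ:
          printf("walk the live set (read-only)\n");
          break;
      }
  }

  int main(void)
  {
      struct set_model s = { .live = 1, .have_clone = 0 };

      walk(&s, ITER_UPDATE);          /* no clone is created here         */
      walk(&s, ITER_UPDATE_CLONE);    /* flush path: clone made on demand */
      return 0;
  }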
Reported-by: syzbot+4924a0edc148e8b4b342@syzkaller.appspotmail.com Fixes: 3f1d886cc7c3 ("netfilter: nft_set_pipapo: move cloning of match info to insert/removal path") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 10 +++++++++- net/netfilter/nft_set_hash.c | 1 + net/netfilter/nft_set_pipapo.c | 11 +++++++++-- net/netfilter/nft_set_rbtree.c | 8 +++++--- 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 426534a711b0..ea6f29ad7888 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -320,11 +320,13 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) * @NFT_ITER_UNSPEC: unspecified, to catch errors * @NFT_ITER_READ: read-only iteration over set elements * @NFT_ITER_UPDATE: iteration under mutex to update set element state + * @NFT_ITER_UPDATE_CLONE: clone set before iteration under mutex to update element */ enum nft_iter_type { NFT_ITER_UNSPEC, NFT_ITER_READ, NFT_ITER_UPDATE, + NFT_ITER_UPDATE_CLONE, }; struct nft_set; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index df67932d3e09..058f7004cb2b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -833,6 +833,11 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, } } +/* Use NFT_ITER_UPDATE iterator even if this may be called from the preparation + * phase, the set clone might already exist from a previous command, or it might + * be a set that is going away and does not require a clone. The netns and + * netlink release paths also need to work on the live set. + */ static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { @@ -7903,9 +7908,12 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { + /* The set backend might need to clone the set, do it now from the + * preparation phase, use NFT_ITER_UPDATE_CLONE iterator type. 
+ */ struct nft_set_iter iter = { .genmask = genmask, - .type = NFT_ITER_UPDATE, + .type = NFT_ITER_UPDATE_CLONE, .fn = nft_setelem_flush, }; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 739b992bde59..b0e571c8e3f3 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -374,6 +374,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, { switch (iter->type) { case NFT_ITER_UPDATE: + case NFT_ITER_UPDATE_CLONE: /* only relevant for netlink dumps which use READ type */ WARN_ON_ONCE(iter->skip != 0); diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 7ef4b44471d3..c091898df710 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2144,13 +2144,20 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_pipapo_match *m; switch (iter->type) { - case NFT_ITER_UPDATE: + case NFT_ITER_UPDATE_CLONE: m = pipapo_maybe_clone(set); if (!m) { iter->err = -ENOMEM; return; } - + nft_pipapo_do_walk(ctx, set, m, iter); + break; + case NFT_ITER_UPDATE: + if (priv->clone) + m = priv->clone; + else + m = rcu_dereference_protected(priv->match, + nft_pipapo_transaction_mutex_held(set)); nft_pipapo_do_walk(ctx, set, m, iter); break; case NFT_ITER_READ: diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 3f02e4478216..ee3d4f5b9ff7 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -861,13 +861,15 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_rbtree *priv = nft_set_priv(set); switch (iter->type) { - case NFT_ITER_UPDATE: - lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); - + case NFT_ITER_UPDATE_CLONE: if (nft_array_may_resize(set) < 0) { iter->err = -ENOMEM; break; } + fallthrough; + case NFT_ITER_UPDATE: + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + nft_rbtree_do_walk(ctx, set, iter); break; case NFT_ITER_READ: -- cgit v1.2.3 From 9df95785d3d8302f7c066050117b04cd3c2048c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 Mar 2026 16:31:32 +0100 Subject: netfilter: nft_set_pipapo: split gc into unlink and reclaim phase Yiming Qian reports Use-after-free in the pipapo set type: Under a large number of expired elements, commit-time GC can run for a very long time in a non-preemptible context, triggering soft lockup warnings and RCU stall reports (local denial of service). We must split GC in an unlink and a reclaim phase. We cannot queue elements for freeing until pointers have been swapped. Expired elements are still exposed to both the packet path and userspace dumpers via the live copy of the data structure. call_rcu() does not protect us: dump operations or element lookups starting after call_rcu has fired can still observe the free'd element, unless the commit phase has made enough progress to swap the clone and live pointers before any new reader has picked up the old version. This a similar approach as done recently for the rbtree backend in commit 35f83a75529a ("netfilter: nft_set_rbtree: don't gc elements on insert"). 
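For illustration, a userspace sketch of the unlink/reclaim split (invented names, not the pipapo code): the scan phase only moves expired entries onto a private list, and they are freed only after the live pointer has been swapped:

  #include <stdio.h>
  #include <stdlib.h>

  struct expired {
      struct expired *next;
      int id;
  };

  static struct expired *gc_head;     /* models the private gc list */

  /* Phase 1: unlink expired entries onto the private list, never free here. */
  static void gc_scan(int id)
  {
      struct expired *e = malloc(sizeof(*e));

      if (!e)
          return;
      e->id = id;
      e->next = gc_head;
      gc_head = e;
  }

  /* Phase 2: free the batch, only after the new structure was published. */
  static void gc_reclaim(void)
  {
      while (gc_head) {
          struct expired *e = gc_head;

          gc_head = e->next;
          printf("freeing expired element %d\n", e->id);
          free(e);
      }
  }

  int main(void)
  {
      gc_scan(1);
      gc_scan(2);
      /* ... the live and clone pointers would be swapped here ... */
      gc_reclaim();
      return 0;
  }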
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Reported-by: Yiming Qian Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 5 ++++ net/netfilter/nf_tables_api.c | 5 ---- net/netfilter/nft_set_pipapo.c | 51 +++++++++++++++++++++++++++++++++------ net/netfilter/nft_set_pipapo.h | 2 ++ 4 files changed, 50 insertions(+), 13 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ea6f29ad7888..e2d2bfc1f989 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1863,6 +1863,11 @@ struct nft_trans_gc { struct rcu_head rcu; }; +static inline int nft_trans_gc_space(const struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + static inline void nft_ctx_update(struct nft_ctx *ctx, const struct nft_trans *trans) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 058f7004cb2b..1862bd7fe804 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10493,11 +10493,6 @@ static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) schedule_work(&trans_gc_work); } -static int nft_trans_gc_space(struct nft_trans_gc *trans) -{ - return NFT_TRANS_GC_BATCHCOUNT - trans->count; -} - struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp) { diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index c091898df710..a34632ae6048 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1680,11 +1680,11 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, } /** - * pipapo_gc() - Drop expired entries from set, destroy start and end elements + * pipapo_gc_scan() - Drop expired entries from set and link them to gc list * @set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc_scan(struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); struct net *net = read_pnet(&set->net); @@ -1697,6 +1697,8 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) if (!gc) return; + list_add(&gc->list, &priv->gc_head); + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const struct nft_pipapo_field *f; @@ -1724,9 +1726,13 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) * NFT_SET_ELEM_DEAD_BIT. */ if (__nft_set_elem_expired(&e->ext, tstamp)) { - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); - if (!gc) - return; + if (!nft_trans_gc_space(gc)) { + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; + + list_add(&gc->list, &priv->gc_head); + } nft_pipapo_gc_deactivate(net, set, e); pipapo_drop(m, rulemap); @@ -1740,10 +1746,30 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) } } - gc = nft_trans_gc_catchall_sync(gc); + priv->last_gc = jiffies; +} + +/** + * pipapo_gc_queue() - Free expired elements + * @set: nftables API set representation + */ +static void pipapo_gc_queue(struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_trans_gc *gc, *next; + + /* always do a catchall cycle: */ + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (gc) { + gc = nft_trans_gc_catchall_sync(gc); + if (gc) + nft_trans_gc_queue_sync_done(gc); + } + + /* always purge queued gc elements. 
*/ + list_for_each_entry_safe(gc, next, &priv->gc_head, list) { + list_del(&gc->list); nft_trans_gc_queue_sync_done(gc); - priv->last_gc = jiffies; } } @@ -1797,6 +1823,10 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) * * We also need to create a new working copy for subsequent insertions and * deletions. + * + * After the live copy has been replaced by the clone, we can safely queue + * expired elements that have been collected by pipapo_gc_scan() for + * memory reclaim. */ static void nft_pipapo_commit(struct nft_set *set) { @@ -1807,7 +1837,7 @@ static void nft_pipapo_commit(struct nft_set *set) return; if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) - pipapo_gc(set, priv->clone); + pipapo_gc_scan(set, priv->clone); old = rcu_replace_pointer(priv->match, priv->clone, nft_pipapo_transaction_mutex_held(set)); @@ -1815,6 +1845,8 @@ static void nft_pipapo_commit(struct nft_set *set) if (old) call_rcu(&old->rcu, pipapo_reclaim_match); + + pipapo_gc_queue(set); } static void nft_pipapo_abort(const struct nft_set *set) @@ -2279,6 +2311,7 @@ static int nft_pipapo_init(const struct nft_set *set, f->mt = NULL; } + INIT_LIST_HEAD(&priv->gc_head); rcu_assign_pointer(priv->match, m); return 0; @@ -2328,6 +2361,8 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; + WARN_ON_ONCE(!list_empty(&priv->gc_head)); + m = rcu_dereference_protected(priv->match, true); if (priv->clone) { diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index eaab422aa56a..9aee9a9eaeb7 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -156,12 +156,14 @@ struct nft_pipapo_match { * @clone: Copy where pending insertions and deletions are kept * @width: Total bytes to be matched for one packet, including padding * @last_gc: Timestamp of last garbage collection run, jiffies + * @gc_head: list of nft_trans_gc to queue up for mem reclaim */ struct nft_pipapo { struct nft_pipapo_match __rcu *match; struct nft_pipapo_match *clone; int width; unsigned long last_gc; + struct list_head gc_head; }; struct nft_pipapo_elem; -- cgit v1.2.3 From 0abc73c8a40fd64ac1739c90bb4f42c418d27a5e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 3 Mar 2026 18:56:39 +0100 Subject: net: ethernet: mtk_eth_soc: Reset prog ptr to old_prog in case of error in mtk_xdp_setup() Reset eBPF program pointer to old_prog and do not decrease its ref-count if mtk_open routine in mtk_xdp_setup() fails. 
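A minimal userspace sketch of the resulting error-path ordering (invented names): the old program reference is dropped only after the device reopened successfully, otherwise the old pointer is restored and its reference kept:

  #include <stdio.h>

  struct prog { int refcnt; };

  static int reopen_fails;

  static int dev_reopen(void)
  {
      return reopen_fails ? -1 : 0;
  }

  static void prog_put(struct prog *p)
  {
      if (p)
          p->refcnt--;
  }

  /* Install new_prog; on a reopen failure roll back to the old program and
   * keep its reference, release the old reference only on success.
   */
  static int xdp_setup(struct prog **slot, struct prog *new_prog)
  {
      struct prog *old = *slot;

      *slot = new_prog;
      if (dev_reopen()) {
          *slot = old;
          return -1;
      }
      prog_put(old);
      return 0;
  }

  int main(void)
  {
      struct prog old = { .refcnt = 1 }, newp = { .refcnt = 1 };
      struct prog *slot = &old;

      reopen_fails = 1;
      printf("setup=%d slot_is_old=%d old_refcnt=%d\n",
             xdp_setup(&slot, &newp), slot == &old, old.refcnt);
      return 0;
  }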
Fixes: 7c26c20da5d42 ("net: ethernet: mtk_eth_soc: add basic XDP support") Suggested-by: Paolo Valerio Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260303-mtk-xdp-prog-ptr-fix-v2-1-97b6dbbe240f@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index e5e2ffa9c542..ddc321a02fda 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -3748,12 +3748,21 @@ static int mtk_xdp_setup(struct net_device *dev, struct bpf_prog *prog, mtk_stop(dev); old_prog = rcu_replace_pointer(eth->prog, prog, lockdep_rtnl_is_held()); + + if (netif_running(dev) && need_update) { + int err; + + err = mtk_open(dev); + if (err) { + rcu_assign_pointer(eth->prog, old_prog); + + return err; + } + } + if (old_prog) bpf_prog_put(old_prog); - if (netif_running(dev) && need_update) - return mtk_open(dev); - return 0; } -- cgit v1.2.3 From f26b098d937488e8f5c617d465760a10bfcc7f13 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 5 Mar 2026 10:31:17 +0100 Subject: ftrace: Add MAINTAINERS entries for all ftrace headers There is currently no entry for ftrace_irq.h and ftrace_regs.h. Add a generic entry for all *ftrace* headers to include them and prevent overlooking future ftrace headers. Cc: Masami Hiramatsu Cc: Mark Rutland Link: https://patch.msgid.link/20260305093117.853700-1-jmarchan@redhat.com Signed-off-by: Jerome Marchand Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 61bf550fd37c..b8d1ad952827 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10484,7 +10484,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git F: Documentation/trace/ftrace* F: arch/*/*/*/*ftrace* F: arch/*/*/*ftrace* -F: include/*/ftrace.h +F: include/*/*ftrace* F: kernel/trace/fgraph.c F: kernel/trace/ftrace* F: samples/ftrace -- cgit v1.2.3 From 6be2681514261324c8ee8a1c6f76cefdf700220f Mon Sep 17 00:00:00 2001 From: Sun Jian Date: Wed, 25 Feb 2026 19:14:50 +0800 Subject: selftests/harness: order TEST_F and XFAIL_ADD constructors TEST_F() allocates and registers its struct __test_metadata via mmap() inside its constructor, and only then assigns the _##fixture_##test##_object pointer. XFAIL_ADD() runs in a constructor too and reads _##fixture_##test##_object to initialize xfail->test. If XFAIL_ADD runs first, xfail->test can be NULL and the expected failure will be reported as FAIL. Use constructor priorities to ensure TEST_F registration runs before XFAIL_ADD, without adding extra state or runtime lookups. 
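A standalone C example of the mechanism relied on here (GCC/Clang constructor priorities, lower number runs first); the names below are illustrative stand-ins, not the harness macros:

  #include <stdio.h>

  static const char *registered_test;

  /* Runs first: lower constructor priority numbers run earlier. */
  static void __attribute__((constructor(20000))) register_test(void)
  {
      registered_test = "fixture.test";   /* TEST_F()-style registration */
  }

  /* Runs second, so the pointer set above is already valid. */
  static void __attribute__((constructor(20001))) register_xfail(void)
  {
      printf("xfail attached to: %s\n", registered_test);
  }

  int main(void)
  {
      return 0;
  }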
Fixes: 2709473c9386 ("selftests: kselftest_harness: support using xfail") Signed-off-by: Sun Jian Link: https://patch.msgid.link/20260225111451.347923-1-sun.jian.kdev@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/kselftest_harness.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 16a119a4656c..4afaef01c22e 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -76,6 +76,9 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) memset(s, c, n); } +#define KSELFTEST_PRIO_TEST_F 20000 +#define KSELFTEST_PRIO_XFAIL 20001 + #define TEST_TIMEOUT_DEFAULT 30 /* Utilities exposed to the test definitions */ @@ -465,7 +468,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) fixture_name##_teardown(_metadata, self, variant); \ } \ static struct __test_metadata *_##fixture_name##_##test_name##_object; \ - static void __attribute__((constructor)) \ + static void __attribute__((constructor(KSELFTEST_PRIO_TEST_F))) \ _register_##fixture_name##_##test_name(void) \ { \ struct __test_metadata *object = mmap(NULL, sizeof(*object), \ @@ -880,7 +883,7 @@ struct __test_xfail { .fixture = &_##fixture_name##_fixture_object, \ .variant = &_##fixture_name##_##variant_name##_object, \ }; \ - static void __attribute__((constructor)) \ + static void __attribute__((constructor(KSELFTEST_PRIO_XFAIL))) \ _register_##fixture_name##_##variant_name##_##test_name##_xfail(void) \ { \ _##fixture_name##_##variant_name##_##test_name##_xfail.test = \ -- cgit v1.2.3 From c952291593e54415a7bb74c4a7187a7c7c7e8651 Mon Sep 17 00:00:00 2001 From: Sun Jian Date: Wed, 25 Feb 2026 19:14:51 +0800 Subject: selftests: net: tun: don't abort XFAIL cases The tun UDP tunnel GSO fixture contains XFAIL-marked variants intended to exercise failure paths (e.g. EMSGSIZE / "Message too long"). Using ASSERT_EQ() in these tests aborts the subtest, which prevents the harness from classifying them as XFAIL and can make the overall net: tun test fail. Switch the relevant ASSERT_EQ() checks to EXPECT_EQ() so the subtests continue running and the failures are correctly reported and accounted as XFAIL where applicable. 
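A plain C model of the macro semantics involved (this is not kselftest_harness.h): ASSERT_-style checks abort the remainder of the test body, while EXPECT_-style checks record the failure and continue:

  #include <stdio.h>

  static int failures;

  #define EXPECT_EQ(a, b) do { if ((a) != (b)) failures++; } while (0)
  #define ASSERT_EQ(a, b) do { if ((a) != (b)) { failures++; return; } } while (0)

  static void test_with_assert(void)
  {
      ASSERT_EQ(1, 2);        /* fails and aborts the rest of this test */
      EXPECT_EQ(3, 4);        /* never evaluated */
  }

  static void test_with_expect(void)
  {
      EXPECT_EQ(1, 2);        /* failure recorded, the test keeps running */
      EXPECT_EQ(3, 3);        /* still evaluated */
  }

  int main(void)
  {
      test_with_assert();
      test_with_expect();
      printf("recorded failures: %d\n", failures);    /* prints 2 */
      return 0;
  }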
Signed-off-by: Sun Jian Link: https://patch.msgid.link/20260225111451.347923-2-sun.jian.kdev@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index 8a5cd5cb5472..cf106a49b55e 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -944,8 +944,8 @@ TEST_F(tun_vnet_udptnl, send_gso_packet) ASSERT_EQ(ret, off); ret = receive_gso_packet_from_tunnel(self, variant, &r_num_mss); - ASSERT_EQ(ret, variant->data_size); - ASSERT_EQ(r_num_mss, variant->r_num_mss); + EXPECT_EQ(ret, variant->data_size); + EXPECT_EQ(r_num_mss, variant->r_num_mss); } TEST_F(tun_vnet_udptnl, recv_gso_packet) @@ -955,18 +955,18 @@ TEST_F(tun_vnet_udptnl, recv_gso_packet) int ret, gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; ret = send_gso_packet_into_tunnel(self, variant); - ASSERT_EQ(ret, variant->data_size); + EXPECT_EQ(ret, variant->data_size); memset(&vnet_hdr, 0, sizeof(vnet_hdr)); ret = receive_gso_packet_from_tun(self, variant, &vnet_hdr); - ASSERT_EQ(ret, variant->data_size); + EXPECT_EQ(ret, variant->data_size); if (!variant->no_gso) { - ASSERT_EQ(vh->gso_size, variant->gso_size); + EXPECT_EQ(vh->gso_size, variant->gso_size); gso_type |= (variant->tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4) : (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6); - ASSERT_EQ(vh->gso_type, gso_type); + EXPECT_EQ(vh->gso_type, gso_type); } } -- cgit v1.2.3 From f9b5bf12eb65e1e708a883a1e24712e81ed2f340 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 13:53:01 -0800 Subject: MAINTAINERS: remove Johan Hedberg from Bluetooth subsystem We have not seen emails or tags from Johan in over 5 years, and there is no recent mailing list activity. Marcel Holtmann hasn't provided any tags in the Bluetooth subsystem in over 5 years, but he is active on the Bluetooth mailing list, providing informal review. Luiz Augusto von Dentz is very active, handling essentially all commits and reviews (12% coverage, but Luiz is the sole active committer). Subsystem BLUETOOTH SUBSYSTEM Changes 50 / 411 (12%) Last activity: 2026-02-23 Marcel Holtmann : Johan Hedberg : Luiz Augusto von Dentz : Author 138d7eca445e 2026-02-23 00:00:00 164 Committer 138d7eca445e 2026-02-23 00:00:00 361 Tags 138d7eca445e 2026-02-23 00:00:00 362 Top reviewers: [15]: pmenzel@molgen.mpg.de [8]: keescook@chromium.org [5]: willemb@google.com [4]: horms@kernel.org [3]: kuniyu@amazon.com [3]: luiz.von.dentz@intel.com INACTIVE MAINTAINER Johan Hedberg Acked-by: Marcel Holtmann Link: https://patch.msgid.link/20260303215339.2333548-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index bee85351120e..4eb755b0a0d1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4615,7 +4615,6 @@ F: drivers/bluetooth/ BLUETOOTH SUBSYSTEM M: Marcel Holtmann -M: Johan Hedberg M: Luiz Augusto von Dentz L: linux-bluetooth@vger.kernel.org S: Supported -- cgit v1.2.3 From 80f8a19fe7cf5b1b5d541c7aaa37aa9d7653bda5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 13:53:02 -0800 Subject: MAINTAINERS: remove Manish Chopra from QLogic QL4xxx (now orphan) We have not seen tags from Manish for the QL4xxx driver in over 5 years, and there is no mailing list activity since Oct 2023. There has been no maintainer activity in this subsystem at all. 
Since there is no other maintainer for this driver it becomes an Orphan. Subsystem QLOGIC QL4xxx ETHERNET DRIVER Changes 40 / 74 (54%) (No activity) Top reviewers: [30]: horms@kernel.org [2]: jiri@nvidia.com [2]: shannon.nelson@amd.com [1]: saeedm@nvidia.com [1]: aleksandr.loktionov@intel.com [1]: kory.maincent@bootlin.com INACTIVE MAINTAINER Manish Chopra Link: https://patch.msgid.link/20260303215339.2333548-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4eb755b0a0d1..a83af9a89141 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21451,9 +21451,8 @@ S: Supported F: drivers/scsi/qedi/ QLOGIC QL4xxx ETHERNET DRIVER -M: Manish Chopra L: netdev@vger.kernel.org -S: Maintained +S: Orphan F: drivers/net/ethernet/qlogic/qed/ F: drivers/net/ethernet/qlogic/qede/ F: include/linux/qed/ -- cgit v1.2.3 From e07f796a86b2f0fda976268a08c321579022fcbc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 13:53:03 -0800 Subject: MAINTAINERS: remove Jerin Jacob from Marvell OcteonTX2 We have not seen tags from Jerin for OcteonTX2 in over 5 years. Recent lore activity is in DPDK (non-kernel), not Linux. Sunil, Linu, Geetha, hariprasad, and Subbaraya are active, though the review coverage isn't great (38%). Subsystem MARVELL OCTEONTX2 RVU ADMIN FUNCTION DRIVER Changes 53 / 138 (38%) Last activity: 2026-02-18 Sunil Goutham : Author fc1b2901e0fe 2024-03-08 00:00:00 1 Tags 70f8986ecef1 2025-06-16 00:00:00 9 Linu Cherian : Author a861e5809f3e 2025-10-30 00:00:00 7 Tags a861e5809f3e 2025-10-30 00:00:00 7 Geetha sowjanya : Author 70e9a5760abf 2026-01-29 00:00:00 16 Tags 70e9a5760abf 2026-01-29 00:00:00 20 Jerin Jacob : hariprasad : Author 45be47bf5d7d 2026-02-18 00:00:00 22 Tags 45be47bf5d7d 2026-02-18 00:00:00 25 Subbaraya Sundeep : Author 47a1208776d7 2025-10-30 00:00:00 20 Tags 47a1208776d7 2025-10-30 00:00:00 30 Top reviewers: [36]: horms@kernel.org [4]: jacob.e.keller@intel.com [4]: kalesh-anakkur.purayil@broadcom.com [3]: vadim.fedorenko@linux.dev [2]: shaojijie@huawei.com [2]: jiri@nvidia.com INACTIVE MAINTAINER Jerin Jacob Link: https://patch.msgid.link/20260303215339.2333548-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a83af9a89141..de0f5a38b525 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15469,7 +15469,6 @@ MARVELL OCTEONTX2 RVU ADMIN FUNCTION DRIVER M: Sunil Goutham M: Linu Cherian M: Geetha sowjanya -M: Jerin Jacob M: hariprasad M: Subbaraya Sundeep L: netdev@vger.kernel.org -- cgit v1.2.3 From 77f72ef58fb849cbe39fd69a20f9ae5076cd442b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 13:53:05 -0800 Subject: MAINTAINERS: remove Sean Wang from MediaTek Ethernet and switch We have not seen tags from Sean in over 5 years, with only one mailing list post since 2024. Felix and Lorenzo are active for the Ethernet driver, and Chester, Daniel and DENG Qingfang are active for the switch driver. 
Subsystem MEDIATEK ETHERNET DRIVER Changes 55 / 113 (48%) Last activity: 2025-10-12 Felix Fietkau : Author d4736737110f 2025-09-02 00:00:00 3 Tags d4736737110f 2025-09-02 00:00:00 4 Sean Wang : Lorenzo Bianconi : Author 96326447d466 2025-08-13 00:00:00 35 Tags 3abc0e55ea1f 2025-10-12 00:00:00 40 Top reviewers: [26]: horms@kernel.org [5]: andrew@lunn.ch [4]: jacob.e.keller@intel.com [3]: shannon.nelson@amd.com [3]: michal.swiatkowski@linux.intel.com INACTIVE MAINTAINER Sean Wang Subsystem MEDIATEK SWITCH DRIVER Changes 26 / 70 (37%) Last activity: 2025-12-01 Chester A. Unal : Tags 585943b7ad30 2025-12-01 00:00:00 7 Daniel Golle : Author 497041d76301 2025-04-23 00:00:00 2 Tags 3b87e60d2131 2025-12-01 00:00:00 14 DENG Qingfang : Sean Wang : Top reviewers: [4]: andrew@lunn.ch [4]: florian.fainelli@broadcom.com [4]: arinc.unal@arinc9.com [2]: olteanv@gmail.com INACTIVE MAINTAINER Sean Wang Acked-by: Chester A. Unal Link: https://patch.msgid.link/20260303215339.2333548-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index de0f5a38b525..706fb325bc67 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16157,7 +16157,6 @@ F: drivers/dma/mediatek/ MEDIATEK ETHERNET DRIVER M: Felix Fietkau -M: Sean Wang M: Lorenzo Bianconi L: netdev@vger.kernel.org S: Maintained @@ -16351,7 +16350,6 @@ MEDIATEK SWITCH DRIVER M: Chester A. Unal M: Daniel Golle M: DENG Qingfang -M: Sean Wang L: netdev@vger.kernel.org S: Maintained F: drivers/net/dsa/mt7530-mdio.c -- cgit v1.2.3 From 593cdf14523bfb837b947012cdc6545a53ea67b5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 13:53:06 -0800 Subject: MAINTAINERS: remove DENG Qingfang from MediaTek switch We have not seen tags from DENG Qingfang for the MediaTek switch driver in over 5 years. He is active upstream with PPP/PPPoE patches in net-next. Chester and Daniel are active. Subsystem MEDIATEK SWITCH DRIVER Changes 26 / 70 (37%) Last activity: 2025-12-01 Chester A. Unal : Tags 585943b7ad30 2025-12-01 00:00:00 7 Daniel Golle : Author 497041d76301 2025-04-23 00:00:00 2 Tags 3b87e60d2131 2025-12-01 00:00:00 14 DENG Qingfang : Sean Wang : Top reviewers: [4]: andrew@lunn.ch [4]: florian.fainelli@broadcom.com [4]: arinc.unal@arinc9.com [2]: olteanv@gmail.com INACTIVE MAINTAINER DENG Qingfang Acked-by: Chester A. Unal Link: https://patch.msgid.link/20260303215339.2333548-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 706fb325bc67..49bf6c8da74c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16349,7 +16349,6 @@ F: include/soc/mediatek/smi.h MEDIATEK SWITCH DRIVER M: Chester A. Unal M: Daniel Golle -M: DENG Qingfang L: netdev@vger.kernel.org S: Maintained F: drivers/net/dsa/mt7530-mdio.c -- cgit v1.2.3 From 34f49454b25cd2e532f137174ef59b778f1a09f5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 13:53:07 -0800 Subject: MAINTAINERS: replace Clark Wang with Frank Li for Freescale FEC We have not seen tags from Clark for FEC in over 5 years. He has some limited recent activity on the mailing list in other NXP subsystems (stmmac, phy). Wei Fang and Shenwei Wang are active, with decent review coverage (61%). Frank Li has been reviewing code actively more recenty, let's make it official. 
Subsystem FREESCALE IMX / MXC FEC DRIVER Changes 57 / 92 (61%) Last activity: 2026-02-10 Wei Fang : Author 25eb3058eb70 2026-02-10 00:00:00 33 Tags 25eb3058eb70 2026-02-10 00:00:00 61 Shenwei Wang : Author d466c16026e9 2025-09-14 00:00:00 6 Tags d466c16026e9 2025-09-14 00:00:00 6 Clark Wang : Top reviewers: [23]: Frank.Li@nxp.com [17]: andrew@lunn.ch [4]: csokas.bence@prolan.hu [3]: horms@kernel.org [2]: maxime.chevallier@bootlin.com INACTIVE MAINTAINER Clark Wang Reviewed-by: Wei Fang Link: https://patch.msgid.link/20260303215339.2333548-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 49bf6c8da74c..b21c87332d14 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10167,8 +10167,8 @@ F: drivers/i2c/busses/i2c-cpm.c FREESCALE IMX / MXC FEC DRIVER M: Wei Fang +R: Frank Li R: Shenwei Wang -R: Clark Wang L: imx@lists.linux.dev L: netdev@vger.kernel.org S: Maintained -- cgit v1.2.3 From 60da2d2752ac4513fc3a91a0558dfd8176d2313f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 13:53:08 -0800 Subject: MAINTAINERS: remove Jonathan Lemon from OpenCompute PTP We have not seen emails or tags from Jonathan in over 5 years, and there is no recent mailing list activity. Vadim Fedorenko is active. Subsystem OPENCOMPUTE PTP CLOCK DRIVER Changes 49 / 130 (37%) Last activity: 2025-11-25 Jonathan Lemon : Vadim Fedorenko : Author d3ca2ef0c915 2025-09-19 00:00:00 5 Tags 648282e2d1e5 2025-11-25 00:00:00 20 Top reviewers: [7]: horms@kernel.org [4]: jiri@nvidia.com [3]: richardcochran@gmail.com [2]: aleksandr.loktionov@intel.com INACTIVE MAINTAINER Jonathan Lemon Add Jonathan to CREDITS as the initial author of ptp_ocp. Acked-by: Vadim Fedorenko Link: https://patch.msgid.link/20260303215339.2333548-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index d74c8b2b7ed3..fb9b4dc193a0 100644 --- a/CREDITS +++ b/CREDITS @@ -2415,6 +2415,10 @@ S: Am Muehlenweg 38 S: D53424 Remagen S: Germany +N: Jonathan Lemon +E: jonathan.lemon@gmail.com +D: OpenCompute PTP clock driver (ptp_ocp) + N: Colin Leroy E: colin@colino.net W: http://www.geekounet.org/ diff --git a/MAINTAINERS b/MAINTAINERS index b21c87332d14..d63fe814b750 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19803,7 +19803,6 @@ F: arch/*/boot/dts/ F: include/dt-bindings/ OPENCOMPUTE PTP CLOCK DRIVER -M: Jonathan Lemon M: Vadim Fedorenko L: netdev@vger.kernel.org S: Maintained -- cgit v1.2.3 From 9ede3e910f088fb3ffec0ed821f0b03951000d23 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 13:53:09 -0800 Subject: MAINTAINERS: replace Taras Chornyi with Elad Nachman for Marvell Prestera We have not seen emails or tags from Taras in over 5 years, and there is no recent mailing list activity. Elad Nachman has been providing reviews in the last couple of years and is the top reviewer for this subsystem. 
Subsystem MARVELL PRESTERA ETHERNET SWITCH DRIVER Changes 39 / 157 (24%) (No activity) Top reviewers: [8]: enachman@marvell.com [6]: horms@kernel.org [4]: idosch@nvidia.com [3]: andrew@lunn.ch [3]: jacob.e.keller@intel.com [3]: jiri@nvidia.com INACTIVE MAINTAINER Taras Chornyi Link: https://patch.msgid.link/20260303215339.2333548-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index d63fe814b750..497822aac6aa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15483,7 +15483,7 @@ S: Supported F: drivers/perf/marvell_pem_pmu.c MARVELL PRESTERA ETHERNET SWITCH DRIVER -M: Taras Chornyi +M: Elad Nachman S: Supported W: https://github.com/Marvell-switching/switchdev-prestera F: drivers/net/ethernet/marvell/prestera/ -- cgit v1.2.3 From 4d37e68c46d15beb4af782b19d7f1b382316776c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 13:53:10 -0800 Subject: MAINTAINERS: remove Claudiu Manoil and Alexandre Belloni from Ocelot switch We have not seen tags from Claudiu for the Ocelot switch driver in over 5 years. He is active upstream in other NXP subsystems (ENETC, gianfar), with 46 emails on lore since 2024. We have not seen tags from Alexandre for the Ocelot switch driver in over 5 years. He is very active upstream in other subsystems (RTC, I3C, Atmel/Microchip SoC), with over 1,200 emails on lore since 2024. Vladimir Oltean is active. Subsystem OCELOT ETHERNET SWITCH DRIVER Changes 180 / 494 (36%) Last activity: 2026-02-12 Vladimir Oltean : Author c22ba07c827f 2026-02-10 00:00:00 33 Tags 026f6513c588 2026-02-12 00:00:00 39 Claudiu Manoil : Alexandre Belloni : Top reviewers: [49]: f.fainelli@gmail.com [19]: horms@kernel.org [10]: richardcochran@gmail.com [9]: jacob.e.keller@intel.com [8]: colin.foster@in-advantage.com INACTIVE MAINTAINER Claudiu Manoil Acked-by: Claudiu Manoil Acked-by: Alexandre Belloni Link: https://patch.msgid.link/20260303215339.2333548-11-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 497822aac6aa..d050d756beec 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19216,8 +19216,6 @@ F: tools/objtool/ OCELOT ETHERNET SWITCH DRIVER M: Vladimir Oltean -M: Claudiu Manoil -M: Alexandre Belloni M: UNGLinuxDriver@microchip.com L: netdev@vger.kernel.org S: Supported -- cgit v1.2.3 From ed6579002548f3d9c00380e2aeaa8305d3ed8fc5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 13:53:11 -0800 Subject: MAINTAINERS: remove Thomas Falcon from IBM ibmvnic We have not seen emails or tags from Thomas's IBM address (tlfalcon@linux.ibm.com) in over 5 years. Looks like Thomas is active in perf tooling at Intel (thomas.falcon@intel.com). Subsystem IBM Power SRIOV Virtual NIC Device Driver Changes 49 / 134 (36%) Last activity: 2025-08-26 Haren Myneni : Tags 3c14917953a5 2025-08-26 00:00:00 2 Rick Lindsley : Nick Child : Author d93a6caab5d7 2025-03-25 00:00:00 14 Tags d93a6caab5d7 2025-03-25 00:00:00 16 Thomas Falcon : Top reviewers: [22]: drt@linux.ibm.com [13]: horms@kernel.org [9]: ricklind@linux.vnet.ibm.com [3]: davemarq@linux.ibm.com INACTIVE MAINTAINER Thomas Falcon Move Thomas to CREDITS as the initial author of ibmvnic. 
Acked-by: Thomas Falcon Link: https://patch.msgid.link/20260303215339.2333548-12-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index fb9b4dc193a0..9091bac3d2da 100644 --- a/CREDITS +++ b/CREDITS @@ -1242,6 +1242,10 @@ N: Veaceslav Falico E: vfalico@gmail.com D: Co-maintainer and co-author of the network bonding driver. +N: Thomas Falcon +E: tlfalcon@linux.ibm.com +D: Initial author of the IBM ibmvnic network driver + N: János Farkas E: chexum@shadow.banki.hu D: romfs, various (mostly networking) fixes diff --git a/MAINTAINERS b/MAINTAINERS index d050d756beec..3fcb17815e00 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12212,7 +12212,6 @@ IBM Power SRIOV Virtual NIC Device Driver M: Haren Myneni M: Rick Lindsley R: Nick Child -R: Thomas Falcon L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/ibm/ibmvnic.* -- cgit v1.2.3 From e5e890630533bdc15b26a34bb8e7ef539bdf1322 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 4 Mar 2026 13:03:56 +0100 Subject: net: bridge: fix nd_tbl NULL dereference when IPv6 is disabled When booting with the 'ipv6.disable=1' parameter, the nd_tbl is never initialized because inet6_init() exits before ndisc_init() is called which initializes it. Then, if neigh_suppress is enabled and an ICMPv6 Neighbor Discovery packet reaches the bridge, br_do_suppress_nd() will dereference ipv6_stub->nd_tbl which is NULL, passing it to neigh_lookup(). This causes a kernel NULL pointer dereference. BUG: kernel NULL pointer dereference, address: 0000000000000268 Oops: 0000 [#1] PREEMPT SMP NOPTI [...] RIP: 0010:neigh_lookup+0x16/0xe0 [...] Call Trace: ? neigh_lookup+0x16/0xe0 br_do_suppress_nd+0x160/0x290 [bridge] br_handle_frame_finish+0x500/0x620 [bridge] br_handle_frame+0x353/0x440 [bridge] __netif_receive_skb_core.constprop.0+0x298/0x1110 __netif_receive_skb_one_core+0x3d/0xa0 process_backlog+0xa0/0x140 __napi_poll+0x2c/0x170 net_rx_action+0x2c4/0x3a0 handle_softirqs+0xd0/0x270 do_softirq+0x3f/0x60 Fix this by replacing IS_ENABLED(IPV6) call with ipv6_mod_enabled() in the callers. This is in essence disabling NS/NA suppression when IPv6 is disabled. 
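For illustration, a small standalone sketch of the distinction (the macro and flag below are stand-ins, not the kernel definitions): the build-time check stays true even with ipv6.disable=1, only the runtime check reflects whether IPv6 actually initialised:

  #include <stdbool.h>
  #include <stdio.h>

  #define BUILT_WITH_IPV6 1            /* stand-in for IS_ENABLED(CONFIG_IPV6) */

  static bool ipv6_runtime_enabled;     /* stand-in for ipv6_mod_enabled()      */

  static void handle_nd_frame(void)
  {
      /* Old guard: a build-time constant, still true with ipv6.disable=1,
       * so the suppression path would run against an uninitialised table.
       */
      if (BUILT_WITH_IPV6)
          printf("old guard: suppression path would run\n");

      /* New guard: reflects the boot-time state. */
      if (ipv6_runtime_enabled)
          printf("new guard: run ND suppression\n");
      else
          printf("new guard: skip ND suppression\n");
  }

  int main(void)
  {
      ipv6_runtime_enabled = false;    /* booted with ipv6.disable=1 */
      handle_nd_frame();
      return 0;
  }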
Fixes: ed842faeb2bd ("bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports") Reported-by: Guruprasad C P Closes: https://lore.kernel.org/netdev/CAHXs0ORzd62QOG-Fttqa2Cx_A_VFp=utE2H2VTX5nqfgs7LDxQ@mail.gmail.com/ Signed-off-by: Fernando Fernandez Mancera Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260304120357.9778-1-fmancera@suse.de Signed-off-by: Jakub Kicinski --- net/bridge/br_device.c | 2 +- net/bridge/br_input.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index ee01122f466f..f7502e62dd35 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -74,7 +74,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { br_do_proxy_suppress_arp(skb, br, vid, NULL); - } else if (IS_ENABLED(CONFIG_IPV6) && + } else if (ipv6_mod_enabled() && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 1405f1061a54..2cbae0f9ae1f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -170,7 +170,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb (skb->protocol == htons(ETH_P_ARP) || skb->protocol == htons(ETH_P_RARP))) { br_do_proxy_suppress_arp(skb, br, vid, p); - } else if (IS_ENABLED(CONFIG_IPV6) && + } else if (ipv6_mod_enabled() && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + -- cgit v1.2.3 From 168ff39e4758897d2eee4756977d036d52884c7e Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 4 Mar 2026 13:03:57 +0100 Subject: net: vxlan: fix nd_tbl NULL dereference when IPv6 is disabled When booting with the 'ipv6.disable=1' parameter, the nd_tbl is never initialized because inet6_init() exits before ndisc_init() is called which initializes it. If an IPv6 packet is injected into the interface, route_shortcircuit() is called and a NULL pointer dereference happens on neigh_lookup(). BUG: kernel NULL pointer dereference, address: 0000000000000380 Oops: Oops: 0000 [#1] SMP NOPTI [...] RIP: 0010:neigh_lookup+0x20/0x270 [...] Call Trace: vxlan_xmit+0x638/0x1ef0 [vxlan] dev_hard_start_xmit+0x9e/0x2e0 __dev_queue_xmit+0xbee/0x14e0 packet_sendmsg+0x116f/0x1930 __sys_sendto+0x1f5/0x200 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x12f/0x1590 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix this by adding an early check on route_shortcircuit() when protocol is ETH_P_IPV6. Note that ipv6_mod_enabled() cannot be used here because VXLAN can be built-in even when IPv6 is built as a module. 
Fixes: e15a00aafa4b ("vxlan: add ipv6 route short circuit support") Signed-off-by: Fernando Fernandez Mancera Link: https://patch.msgid.link/20260304120357.9778-2-fmancera@suse.de Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 05558b6afecd..17c941aac32d 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2130,6 +2130,11 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) { struct ipv6hdr *pip6; + /* check if nd_tbl is not initiliazed due to + * ipv6.disable=1 set during boot + */ + if (!ipv6_stub->nd_tbl) + return false; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return false; pip6 = ipv6_hdr(skb); -- cgit v1.2.3 From 21ec92774d1536f71bdc90b0e3d052eff99cf093 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 4 Mar 2026 19:38:13 +0800 Subject: net: ipv6: fix panic when IPv4 route references loopback IPv6 nexthop When a standalone IPv6 nexthop object is created with a loopback device (e.g., "ip -6 nexthop add id 100 dev lo"), fib6_nh_init() misclassifies it as a reject route. This is because nexthop objects have no destination prefix (fc_dst=::), causing fib6_is_reject() to match any loopback nexthop. The reject path skips fib_nh_common_init(), leaving nhc_pcpu_rth_output unallocated. If an IPv4 route later references this nexthop, __mkroute_output() dereferences NULL nhc_pcpu_rth_output and panics. Simplify the check in fib6_nh_init() to only match explicit reject routes (RTF_REJECT) instead of using fib6_is_reject(). The loopback promotion heuristic in fib6_is_reject() is handled separately by ip6_route_info_create_nh(). After this change, the three cases behave as follows: 1. Explicit reject route ("ip -6 route add unreachable 2001:db8::/64"): RTF_REJECT is set, enters reject path, skips fib_nh_common_init(). No behavior change. 2. Implicit loopback reject route ("ip -6 route add 2001:db8::/32 dev lo"): RTF_REJECT is not set, takes normal path, fib_nh_common_init() is called. ip6_route_info_create_nh() still promotes it to reject afterward. nhc_pcpu_rth_output is allocated but unused, which is harmless. 3. Standalone nexthop object ("ip -6 nexthop add id 100 dev lo"): RTF_REJECT is not set, takes normal path, fib_nh_common_init() is called. nhc_pcpu_rth_output is properly allocated, fixing the crash when IPv4 routes reference this nexthop. 
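A simplified, standalone model of the classification change (the predicates below are deliberately reduced stand-ins for fib6_is_reject() and the new RTF_REJECT-only check, not the real kernel logic):

  #include <stdbool.h>
  #include <stdio.h>

  #define RTF_REJECT 0x0200

  struct cfg {
      unsigned int flags;
      bool dev_is_loopback;
      bool has_dst;           /* standalone nexthop objects have no prefix */
  };

  /* Reduced stand-in for the old check: any loopback nexthop without a
   * destination also matched and was treated as a reject route.
   */
  static bool old_is_reject(const struct cfg *c)
  {
      return (c->flags & RTF_REJECT) || (c->dev_is_loopback && !c->has_dst);
  }

  /* The new check in fib6_nh_init(): explicit reject routes only. */
  static bool new_is_reject(const struct cfg *c)
  {
      return c->flags & RTF_REJECT;
  }

  int main(void)
  {
      /* "ip -6 nexthop add id 100 dev lo": case 3 above */
      struct cfg nh = { .flags = 0, .dev_is_loopback = true, .has_dst = false };

      printf("old check: %d (reject path, per-cpu output route never set up)\n",
             old_is_reject(&nh));
      printf("new check: %d (normal path, output route cache allocated)\n",
             new_is_reject(&nh));
      return 0;
  }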
Suggested-by: Ido Schimmel Fixes: 493ced1ac47c ("ipv4: Allow routes to use nexthop objects") Reported-by: syzbot+334190e097a98a1b81bb@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/698f8482.a70a0220.2c38d7.00ca.GAE@google.com/T/ Signed-off-by: Jiayuan Chen Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260304113817.294966-2-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7db0c837196c..08cd86f49bf9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3583,7 +3583,6 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; - int addr_type; int err; fib6_nh->fib_nh_family = AF_INET6; @@ -3625,11 +3624,10 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, fib6_nh->fib_nh_weight = 1; - /* We cannot add true routes via loopback here, - * they would result in kernel looping; promote them to reject routes + /* Reset the nexthop device to the loopback device in case of reject + * routes. */ - addr_type = ipv6_addr_type(&cfg->fc_dst); - if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { + if (cfg->fc_flags & RTF_REJECT) { /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { -- cgit v1.2.3 From 46c1ef0cfcea50aaf0b52316fdab94bf4b45795b Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 4 Mar 2026 19:38:14 +0800 Subject: selftests: net: add test for IPv4 route with loopback IPv6 nexthop Add a regression test for a kernel panic that occurs when an IPv4 route references an IPv6 nexthop object created on the loopback device. The test creates an IPv6 nexthop on lo, binds an IPv4 route to it, then triggers a route lookup via ping to verify the kernel does not crash. ./fib_nexthops.sh Tests passed: 249 Tests failed: 0 Reviewed-by: Ido Schimmel Signed-off-by: Jiayuan Chen Reviewed-by: David Ahern Link: https://patch.msgid.link/20260304113817.294966-3-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_nexthops.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 21026b667667..6eb7f95e70e1 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -1672,6 +1672,17 @@ ipv4_withv6_fcnal() run_cmd "$IP ro replace 172.16.101.1/32 via inet6 2001:db8:50::1 dev veth1" log_test $? 2 "IPv4 route with invalid IPv6 gateway" + + # Test IPv4 route with loopback IPv6 nexthop + # Regression test: loopback IPv6 nexthop was misclassified as reject + # route, skipping nhc_pcpu_rth_output allocation, causing panic when + # an IPv4 route references it and triggers __mkroute_output(). + run_cmd "$IP -6 nexthop add id 20 dev lo" + run_cmd "$IP ro add 172.20.20.0/24 nhid 20" + run_cmd "ip netns exec $me ping -c1 -W1 172.20.20.1" + log_test $? 
1 "IPv4 route with loopback IPv6 nexthop (no crash)" + run_cmd "$IP ro del 172.20.20.0/24" + run_cmd "$IP nexthop del id 20" } ipv4_fcnal_runtime() -- cgit v1.2.3 From e2cedd400c3ec0302ffca2490e8751772906ac23 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Wed, 4 Mar 2026 09:06:02 -0500 Subject: net/sched: act_ife: Fix metalist update behavior Whenever an ife action replace changes the metalist, instead of replacing the old data on the metalist, the current ife code is appending the new metadata. Aside from being innapropriate behavior, this may lead to an unbounded addition of metadata to the metalist which might cause an out of bounds error when running the encode op: [ 138.423369][ C1] ================================================================== [ 138.424317][ C1] BUG: KASAN: slab-out-of-bounds in ife_tlv_meta_encode (net/ife/ife.c:168) [ 138.424906][ C1] Write of size 4 at addr ffff8880077f4ffe by task ife_out_out_bou/255 [ 138.425778][ C1] CPU: 1 UID: 0 PID: 255 Comm: ife_out_out_bou Not tainted 7.0.0-rc1-00169-gfbdfa8da05b6 #624 PREEMPT(full) [ 138.425795][ C1] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 138.425800][ C1] Call Trace: [ 138.425804][ C1] [ 138.425808][ C1] dump_stack_lvl (lib/dump_stack.c:122) [ 138.425828][ C1] print_report (mm/kasan/report.c:379 mm/kasan/report.c:482) [ 138.425839][ C1] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221) [ 138.425844][ C1] ? __virt_addr_valid (./arch/x86/include/asm/preempt.h:95 (discriminator 1) ./include/linux/rcupdate.h:975 (discriminator 1) ./include/linux/mmzone.h:2207 (discriminator 1) arch/x86/mm/physaddr.c:54 (discriminator 1)) [ 138.425853][ C1] ? ife_tlv_meta_encode (net/ife/ife.c:168) [ 138.425859][ C1] kasan_report (mm/kasan/report.c:221 mm/kasan/report.c:597) [ 138.425868][ C1] ? ife_tlv_meta_encode (net/ife/ife.c:168) [ 138.425878][ C1] kasan_check_range (mm/kasan/generic.c:186 (discriminator 1) mm/kasan/generic.c:200 (discriminator 1)) [ 138.425884][ C1] __asan_memset (mm/kasan/shadow.c:84 (discriminator 2)) [ 138.425889][ C1] ife_tlv_meta_encode (net/ife/ife.c:168) [ 138.425893][ C1] ? ife_tlv_meta_encode (net/ife/ife.c:171) [ 138.425898][ C1] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221) [ 138.425903][ C1] ife_encode_meta_u16 (net/sched/act_ife.c:57) [ 138.425910][ C1] ? __pfx_do_raw_spin_lock (kernel/locking/spinlock_debug.c:114) [ 138.425916][ C1] ? __asan_memcpy (mm/kasan/shadow.c:105 (discriminator 3)) [ 138.425921][ C1] ? __pfx_ife_encode_meta_u16 (net/sched/act_ife.c:45) [ 138.425927][ C1] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221) [ 138.425931][ C1] tcf_ife_act (net/sched/act_ife.c:847 net/sched/act_ife.c:879) To solve this issue, fix the replace behavior by adding the metalist to the ife rcu data structure. 
Fixes: aa9fd9a325d51 ("sched: act: ife: update parameters via rcu handling") Reported-by: Ruitong Liu Tested-by: Ruitong Liu Co-developed-by: Victor Nogueira Signed-off-by: Victor Nogueira Signed-off-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260304140603.76500-1-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_ife.h | 4 +- net/sched/act_ife.c | 93 +++++++++++++++++++++------------------------ 2 files changed, 45 insertions(+), 52 deletions(-) diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index c7f24a2da1ca..24d4d5a62b3c 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -13,15 +13,13 @@ struct tcf_ife_params { u8 eth_src[ETH_ALEN]; u16 eth_type; u16 flags; - + struct list_head metalist; struct rcu_head rcu; }; struct tcf_ife_info { struct tc_action common; struct tcf_ife_params __rcu *params; - /* list of metaids allowed */ - struct list_head metalist; }; #define to_ife(a) ((struct tcf_ife_info *)a) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 79df81d12894..d5e8a91bb4eb 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -293,8 +293,8 @@ static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held) /* called when adding new meta information */ static int __add_metainfo(const struct tcf_meta_ops *ops, - struct tcf_ife_info *ife, u32 metaid, void *metaval, - int len, bool atomic, bool exists) + struct tcf_ife_params *p, u32 metaid, void *metaval, + int len, bool atomic) { struct tcf_meta_info *mi = NULL; int ret = 0; @@ -313,45 +313,40 @@ static int __add_metainfo(const struct tcf_meta_ops *ops, } } - if (exists) - spin_lock_bh(&ife->tcf_lock); - list_add_tail(&mi->metalist, &ife->metalist); - if (exists) - spin_unlock_bh(&ife->tcf_lock); + list_add_tail(&mi->metalist, &p->metalist); return ret; } static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops, - struct tcf_ife_info *ife, u32 metaid, - bool exists) + struct tcf_ife_params *p, u32 metaid) { int ret; if (!try_module_get(ops->owner)) return -ENOENT; - ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists); + ret = __add_metainfo(ops, p, metaid, NULL, 0, true); if (ret) module_put(ops->owner); return ret; } -static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, - int len, bool exists) +static int add_metainfo(struct tcf_ife_params *p, u32 metaid, void *metaval, + int len) { const struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret; if (!ops) return -ENOENT; - ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists); + ret = __add_metainfo(ops, p, metaid, metaval, len, false); if (ret) /*put back what find_ife_oplist took */ module_put(ops->owner); return ret; } -static int use_all_metadata(struct tcf_ife_info *ife, bool exists) +static int use_all_metadata(struct tcf_ife_params *p) { struct tcf_meta_ops *o; int rc = 0; @@ -359,7 +354,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists) read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { - rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists); + rc = add_metainfo_and_get_ops(o, p, o->metaid); if (rc == 0) installed += 1; } @@ -371,7 +366,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists) return -EINVAL; } -static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) +static int dump_metalist(struct sk_buff *skb, struct tcf_ife_params *p) { struct tcf_meta_info *e; struct nlattr *nest; @@ -379,14 +374,14 @@ static int 
dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) int total_encoded = 0; /*can only happen on decode */ - if (list_empty(&ife->metalist)) + if (list_empty(&p->metalist)) return 0; nest = nla_nest_start_noflag(skb, TCA_IFE_METALST); if (!nest) goto out_nlmsg_trim; - list_for_each_entry(e, &ife->metalist, metalist) { + list_for_each_entry(e, &p->metalist, metalist) { if (!e->ops->get(skb, e)) total_encoded += 1; } @@ -403,13 +398,11 @@ out_nlmsg_trim: return -1; } -/* under ife->tcf_lock */ -static void _tcf_ife_cleanup(struct tc_action *a) +static void __tcf_ife_cleanup(struct tcf_ife_params *p) { - struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; - list_for_each_entry_safe(e, n, &ife->metalist, metalist) { + list_for_each_entry_safe(e, n, &p->metalist, metalist) { list_del(&e->metalist); if (e->metaval) { if (e->ops->release) @@ -422,18 +415,23 @@ static void _tcf_ife_cleanup(struct tc_action *a) } } +static void tcf_ife_cleanup_params(struct rcu_head *head) +{ + struct tcf_ife_params *p = container_of(head, struct tcf_ife_params, + rcu); + + __tcf_ife_cleanup(p); + kfree(p); +} + static void tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; - spin_lock_bh(&ife->tcf_lock); - _tcf_ife_cleanup(a); - spin_unlock_bh(&ife->tcf_lock); - p = rcu_dereference_protected(ife->params, 1); if (p) - kfree_rcu(p, rcu); + call_rcu(&p->rcu, tcf_ife_cleanup_params); } static int load_metalist(struct nlattr **tb, bool rtnl_held) @@ -455,8 +453,7 @@ static int load_metalist(struct nlattr **tb, bool rtnl_held) return 0; } -static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, - bool exists, bool rtnl_held) +static int populate_metalist(struct tcf_ife_params *p, struct nlattr **tb) { int len = 0; int rc = 0; @@ -468,7 +465,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, val = nla_data(tb[i]); len = nla_len(tb[i]); - rc = add_metainfo(ife, i, val, len, exists); + rc = add_metainfo(p, i, val, len); if (rc) return rc; } @@ -523,6 +520,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, p = kzalloc_obj(*p); if (!p) return -ENOMEM; + INIT_LIST_HEAD(&p->metalist); if (tb[TCA_IFE_METALST]) { err = nla_parse_nested_deprecated(tb2, IFE_META_MAX, @@ -567,8 +565,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } ife = to_ife(*a); - if (ret == ACT_P_CREATED) - INIT_LIST_HEAD(&ife->metalist); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) @@ -600,8 +596,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } if (tb[TCA_IFE_METALST]) { - err = populate_metalist(ife, tb2, exists, - !(flags & TCA_ACT_FLAGS_NO_RTNL)); + err = populate_metalist(p, tb2); if (err) goto metadata_parse_err; } else { @@ -610,7 +605,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, * as we can. 
You better have at least one else we are * going to bail out */ - err = use_all_metadata(ife, exists); + err = use_all_metadata(p); if (err) goto metadata_parse_err; } @@ -626,13 +621,14 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); if (p) - kfree_rcu(p, rcu); + call_rcu(&p->rcu, tcf_ife_cleanup_params); return ret; metadata_parse_err: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: + __tcf_ife_cleanup(p); kfree(p); tcf_idr_release(*a, bind); return err; @@ -679,7 +675,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; - if (dump_metalist(skb, ife)) { + if (dump_metalist(skb, p)) { /*ignore failure to dump metalist */ pr_info("Failed to dump metalist\n"); } @@ -693,13 +689,13 @@ nla_put_failure: return -1; } -static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, +static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_params *p, u16 metaid, u16 mlen, void *mdata) { struct tcf_meta_info *e; /* XXX: use hash to speed up */ - list_for_each_entry(e, &ife->metalist, metalist) { + list_for_each_entry_rcu(e, &p->metalist, metalist) { if (metaid == e->metaid) { if (e->ops) { /* We check for decode presence already */ @@ -716,10 +712,13 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; + struct tcf_ife_params *p; u8 *ifehdr_end; u8 *tlv_data; u16 metalen; + p = rcu_dereference_bh(ife->params); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); @@ -745,7 +744,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, return TC_ACT_SHOT; } - if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) { + if (find_decode_metaid(skb, p, mtype, dlen, curr_data)) { /* abuse overlimits to count when we receive metadata * but dont have an ops for it */ @@ -769,12 +768,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, /*XXX: check if we can do this at install time instead of current * send data path **/ -static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) +static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_params *p) { - struct tcf_meta_info *e, *n; + struct tcf_meta_info *e; int tot_run_sz = 0, run_sz = 0; - list_for_each_entry_safe(e, n, &ife->metalist, metalist) { + list_for_each_entry_rcu(e, &p->metalist, metalist) { if (e->ops->check_presence) { run_sz = e->ops->check_presence(skb, e); tot_run_sz += run_sz; @@ -795,7 +794,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA where ORIGDATA = original ethernet header ... */ - u16 metalen = ife_get_sz(skb, ife); + u16 metalen = ife_get_sz(skb, p); int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; unsigned int skboff = 0; int new_len = skb->len + hdrm; @@ -833,25 +832,21 @@ drop: if (!ife_meta) goto drop; - spin_lock(&ife->tcf_lock); - /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by * ops->presence_check... 
*/ - list_for_each_entry(e, &ife->metalist, metalist) { + list_for_each_entry_rcu(e, &p->metalist, metalist) { if (e->ops->encode) { err = e->ops->encode(skb, (void *)(ife_meta + skboff), e); } if (err < 0) { /* too corrupt to keep around if overwritten */ - spin_unlock(&ife->tcf_lock); goto drop; } skboff += err; } - spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; if (!is_zero_ether_addr(p->eth_src)) -- cgit v1.2.3 From 5d1271ff4c131cd4a601dabdfdcdc1fe1252493f Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Wed, 4 Mar 2026 09:06:03 -0500 Subject: selftests/tc-testing: Add tests exercising act_ife metalist replace behaviour Add 2 test cases to exercise fix in act_ife's internal metalist behaviour. - Update decode ife action into encode with tcindex metadata - Update decode ife action into encode with multiple metadata Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20260304140603.76500-2-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- .../selftests/tc-testing/tc-tests/actions/ife.json | 99 ++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json index f587a32e44c4..808aef4afe22 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json @@ -1279,5 +1279,104 @@ "teardown": [ "$TC actions flush action ife" ] + }, + { + "id": "f2a0", + "name": "Update decode ife action with encode metadata", + "category": [ + "actions", + "ife" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + [ + "$TC actions flush action ife", + 0, + 1, + 255 + ], + "$TC actions add action ife decode index 10" + ], + "cmdUnderTest": "$TC actions replace action ife encode use tcindex 1 index 10", + "expExitCode": "0", + "verifyCmd": "$TC -j actions get action ife index 10", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "ife", + "mode": "encode", + "control_action": { + "type": "pipe" + }, + "type": "0xed3e", + "tcindex": 1, + "index": 10, + "ref": 1, + "bind": 0, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC actions flush action ife" + ] + }, + { + "id": "d352", + "name": "Update decode ife action into encode with multiple metadata", + "category": [ + "actions", + "ife" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + [ + "$TC actions flush action ife", + 0, + 1, + 255 + ], + "$TC actions add action ife decode index 10" + ], + "cmdUnderTest": "$TC actions replace action ife encode use tcindex 1 use mark 22 index 10", + "expExitCode": "0", + "verifyCmd": "$TC -j actions get action ife index 10", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "ife", + "mode": "encode", + "control_action": { + "type": "pipe" + }, + "type": "0xed3e", + "tcindex": 1, + "mark": 22, + "index": 10, + "ref": 1, + "bind": 0, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC actions flush action ife" + ] } ] -- cgit v1.2.3 From 88b6b7f7b216108a09887b074395fa7b751880b1 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:42 +0100 Subject: xdp: use modulo operation to calculate XDP frag tailroom The current formula for calculating XDP tailroom in mbuf packets works only if each frag has its own page (if rxq->frag_size is PAGE_SIZE), this defeats the purpose of the parameter overall and without any indication leads 
to negative calculated tailroom on at least half of frags, if shared pages are used. There are not many drivers that set rxq->frag_size. Among them: * i40e and enetc always split page uniformly between frags, use shared pages * ice uses page_pool frags via libeth, those are power-of-2 and uniformly distributed across page * idpf has variable frag_size with XDP on, so current API is not applicable * mlx5, mtk and mvneta use PAGE_SIZE or 0 as frag_size for page_pool As for AF_XDP ZC, only ice, i40e and idpf declare frag_size for it. Modulo operation yields good results for aligned chunks, they are all power-of-2, between 2K and PAGE_SIZE. Formula without modulo fails when chunk_size is 2K. Buffers in unaligned mode are not distributed uniformly, so modulo operation would not work. To accommodate unaligned buffers, we could define frag_size as data + tailroom, and hence do not subtract offset when calculating tailroom, but this would necessitate more changes in the drivers. Define rxq->frag_size as an even portion of a page that fully belongs to a single frag. When calculating tailroom, locate the data start within such portion by performing a modulo operation on page offset. Fixes: bf25146a5595 ("bpf: add frags support to the bpf_xdp_adjust_tail() API") Acked-by: Jakub Kicinski Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-2-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 0d5d5a17acb2..d6fafb3633b0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4155,7 +4155,8 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; - tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); + tailroom = rxq->frag_size - skb_frag_size(frag) - + skb_frag_off(frag) % rxq->frag_size; if (unlikely(offset > tailroom)) return -EINVAL; -- cgit v1.2.3 From 16394d80539937d348dd3b9ea32415c54e67a81b Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:43 +0100 Subject: xsk: introduce helper to determine rxq->frag_size rxq->frag_size is basically a step between consecutive strictly aligned frames. In ZC mode, chunk size fits exactly, but if chunks are unaligned, there is no safe way to determine accessible space to grow tailroom. Report frag_size to be zero, if chunks are unaligned, chunk_size otherwise. Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-3-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index aefc368449d5..6b9ebae2dc95 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -51,6 +51,11 @@ static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } +static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) +{ + return pool->unaligned ? 
0 : xsk_pool_get_chunk_size(pool); +} + static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { @@ -337,6 +342,11 @@ static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) return 0; } +static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) +{ + return 0; +} + static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { -- cgit v1.2.3 From 02852b47c706772af795d3e28fca99fc9b923b2c Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:44 +0100 Subject: ice: fix rxq info registering in mbuf packets XDP RxQ info contains frag_size, which depends on the MTU. This makes the old way of registering RxQ info before calculating new buffer sizes invalid. Currently, it leads to frag_size being outdated, making it sometimes impossible to grow tailroom in a mbuf packet. E.g. fragments are actually 3K+, but frag size is still as if MTU was 1500. Always register new XDP RxQ info after reconfiguring memory pools. Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-4-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_base.c | 26 +++++++------------------- drivers/net/ethernet/intel/ice/ice_ethtool.c | 1 + drivers/net/ethernet/intel/ice/ice_txrx.c | 4 +++- drivers/net/ethernet/intel/ice/ice_xsk.c | 3 +++ 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index d85b0a4baa37..165ef8afa8b0 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -666,23 +666,12 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) if (ring->vsi->type == ICE_VSI_PF || ring->vsi->type == ICE_VSI_SF || ring->vsi->type == ICE_VSI_LB) { - if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { - err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, - ring->q_index, - ring->q_vector->napi.napi_id, - ring->rx_buf_len); - if (err) - return err; - } - ice_rx_xsk_pool(ring); err = ice_realloc_rx_xdp_bufs(ring, ring->xsk_pool); if (err) return err; if (ring->xsk_pool) { - xdp_rxq_info_unreg(&ring->xdp_rxq); - rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, @@ -705,14 +694,13 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) if (err) return err; - if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { - err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, - ring->q_index, - ring->q_vector->napi.napi_id, - ring->rx_buf_len); - if (err) - goto err_destroy_fq; - } + err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, + ring->q_index, + ring->q_vector->napi.napi_id, + ring->rx_buf_len); + if (err) + goto err_destroy_fq; + xdp_rxq_info_attach_page_pool(&ring->xdp_rxq, ring->pp); } diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index b9be10b58856..301947d53ede 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3342,6 +3342,7 @@ process_rx: rx_rings[i].cached_phctime = pf->ptp.cached_phc_time; rx_rings[i].desc = NULL; rx_rings[i].xdp_buf = NULL; + rx_rings[i].xdp_rxq = (struct xdp_rxq_info){ }; /* this is to allow wr32 to have something to write to * during early allocation of Rx buffers diff --git 
a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index a5bbce68f76c..a2cd4cf37734 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -560,7 +560,9 @@ void ice_clean_rx_ring(struct ice_rx_ring *rx_ring) i = 0; } - if (rx_ring->vsi->type == ICE_VSI_PF && + if ((rx_ring->vsi->type == ICE_VSI_PF || + rx_ring->vsi->type == ICE_VSI_SF || + rx_ring->vsi->type == ICE_VSI_LB) && xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) { xdp_rxq_info_detach_mem_model(&rx_ring->xdp_rxq); xdp_rxq_info_unreg(&rx_ring->xdp_rxq); diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index c673094663a3..0643017541c3 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -899,6 +899,9 @@ void ice_xsk_clean_rx_ring(struct ice_rx_ring *rx_ring) u16 ntc = rx_ring->next_to_clean; u16 ntu = rx_ring->next_to_use; + if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + while (ntc != ntu) { struct xdp_buff *xdp = *ice_xdp_buf(rx_ring, ntc); -- cgit v1.2.3 From e142dc4ef0f451b7ef99d09aaa84e9389af629d7 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:45 +0100 Subject: ice: change XDP RxQ frag_size from DMA write length to xdp.frame_sz The only user of frag_size field in XDP RxQ info is bpf_xdp_frags_increase_tail(). It clearly expects whole buff size instead of DMA write size. Different assumptions in ice driver configuration lead to negative tailroom. This allows to trigger kernel panic, when using XDP_ADJUST_TAIL_GROW_MULTI_BUFF xskxceiver test and changing packet size to 6912 and the requested offset to a huge value, e.g. XSK_UMEM__MAX_FRAME_SIZE * 100. Due to other quirks of the ZC configuration in ice, panic is not observed in ZC mode, but tailroom growing still fails when it should not. Use fill queue buffer truesize instead of DMA write size in XDP RxQ info. Fix ZC mode too by using the new helper. 
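A worked example of the difference (illustrative numbers only, not taken from the driver): assume a page split into two 2K buffers, and a fragment that starts at page offset 2048 and already carries 1536 bytes. With the modulo formula from bpf_xdp_frags_increase_tail(),

    tailroom = frag_size - frag_len - frag_off % frag_size
             = 2048 - 1536 - (2048 % 2048) = 512

when frag_size is registered as the 2K truesize, but

    tailroom = 3072 - 1536 - (2048 % 3072) = -512

if a 3K DMA-write length is registered instead, which is the negative tailroom this patch eliminates.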
Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-5-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_base.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 165ef8afa8b0..1667f686ff75 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -661,7 +661,6 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) { struct device *dev = ice_pf_to_dev(ring->vsi->back); u32 num_bufs = ICE_DESC_UNUSED(ring); - u32 rx_buf_len; int err; if (ring->vsi->type == ICE_VSI_PF || ring->vsi->type == ICE_VSI_SF || @@ -672,12 +671,12 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) return err; if (ring->xsk_pool) { - rx_buf_len = - xsk_pool_get_rx_frame_size(ring->xsk_pool); + u32 frag_size = + xsk_pool_get_rx_frag_step(ring->xsk_pool); err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ring->q_index, ring->q_vector->napi.napi_id, - rx_buf_len); + frag_size); if (err) return err; err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, @@ -697,7 +696,7 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ring->q_index, ring->q_vector->napi.napi_id, - ring->rx_buf_len); + ring->truesize); if (err) goto err_destroy_fq; -- cgit v1.2.3 From 8f497dc8a61429cc004720aa8e713743355d80cf Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:46 +0100 Subject: i40e: fix registering XDP RxQ info Current way of handling XDP RxQ info in i40e has a problem, where frag_size is not updated when xsk_buff_pool is detached or when MTU is changed, this leads to growing tail always failing for multi-buffer packets. Couple XDP RxQ info registering with buffer allocations and unregistering with cleaning the ring. 
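A condensed view of the pairing this establishes (paraphrased from the diff below, not literal driver code):

    i40e_configure_rx_ring():
            ring->xsk_pool = i40e_xsk_pool(ring);
            __xdp_rxq_info_reg(&ring->xdp_rxq, ...);   /* registered with the current buffer size */
            xdp_rxq_info_reg_mem_model(...);           /* XSK pool or page-shared */
            /* later failures unwind through the new unreg_xdp label */

    i40e_clean_rx_ring():
            if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
                    xdp_rxq_info_unreg(&rx_ring->xdp_rxq);   /* paired teardown */

Because registration now happens on the same path that recomputes the buffer sizes, the registered frag_size can no longer go stale across MTU changes or pool attach/detach.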
Fixes: a045d2f2d03d ("i40e: set xdp_rxq_info::frag_size") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-6-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 34 ++++++++++++++++------------- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 5 +++-- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 7b9e147d7365..781ec5aa814b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3583,18 +3583,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) if (ring->vsi->type != I40E_VSI_MAIN) goto skip; - if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { - err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, - ring->queue_index, - ring->q_vector->napi.napi_id, - ring->rx_buf_len); - if (err) - return err; - } - ring->xsk_pool = i40e_xsk_pool(ring); if (ring->xsk_pool) { - xdp_rxq_info_unreg(&ring->xdp_rxq); ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ring->queue_index, @@ -3606,17 +3596,23 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) MEM_TYPE_XSK_BUFF_POOL, NULL); if (err) - return err; + goto unreg_xdp; dev_info(&vsi->back->pdev->dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", ring->queue_index); } else { + err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, + ring->queue_index, + ring->q_vector->napi.napi_id, + ring->rx_buf_len); + if (err) + return err; err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err) - return err; + goto unreg_xdp; } skip: @@ -3654,7 +3650,8 @@ skip: dev_info(&vsi->back->pdev->dev, "Failed to clear LAN Rx queue context on Rx ring %d (pf_q %d), error: %d\n", ring->queue_index, pf_q, err); - return -ENOMEM; + err = -ENOMEM; + goto unreg_xdp; } /* set the context in the HMC */ @@ -3663,7 +3660,8 @@ skip: dev_info(&vsi->back->pdev->dev, "Failed to set LAN Rx queue context on Rx ring %d (pf_q %d), error: %d\n", ring->queue_index, pf_q, err); - return -ENOMEM; + err = -ENOMEM; + goto unreg_xdp; } /* configure Rx buffer alignment */ @@ -3671,7 +3669,8 @@ skip: if (I40E_2K_TOO_SMALL_WITH_PADDING) { dev_info(&vsi->back->pdev->dev, "2k Rx buffer is too small to fit standard MTU and skb_shared_info\n"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto unreg_xdp; } clear_ring_build_skb_enabled(ring); } else { @@ -3701,6 +3700,11 @@ skip: } return 0; +unreg_xdp: + if (ring->vsi->type == I40E_VSI_MAIN) + xdp_rxq_info_unreg(&ring->xdp_rxq); + + return err; } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 34db7d8866b0..894f2d06d39d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1470,6 +1470,9 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) if (!rx_ring->rx_bi) return; + if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + if (rx_ring->xsk_pool) { i40e_xsk_clean_rx_ring(rx_ring); goto skip_free; @@ -1527,8 +1530,6 @@ skip_free: void i40e_free_rx_resources(struct i40e_ring *rx_ring) { i40e_clean_rx_ring(rx_ring); - if (rx_ring->vsi->type == I40E_VSI_MAIN) - xdp_rxq_info_unreg(&rx_ring->xdp_rxq); rx_ring->xdp_prog = NULL; kfree(rx_ring->rx_bi); rx_ring->rx_bi = NULL; -- cgit v1.2.3 From 
c69d22c6c46a1d792ba8af3d8d6356fdc0e6f538 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:47 +0100 Subject: i40e: use xdp.frame_sz as XDP RxQ info frag_size The only user of frag_size field in XDP RxQ info is bpf_xdp_frags_increase_tail(). It clearly expects whole buffer size instead of DMA write size. Different assumptions in i40e driver configuration lead to negative tailroom. Set frag_size to the same value as frame_sz in shared pages mode, use new helper to set frag_size when AF_XDP ZC is active. Fixes: a045d2f2d03d ("i40e: set xdp_rxq_info::frag_size") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-7-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 781ec5aa814b..926d001b2150 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3569,6 +3569,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) u16 pf_q = vsi->base_queue + ring->queue_index; struct i40e_hw *hw = &vsi->back->hw; struct i40e_hmc_obj_rxq rx_ctx; + u32 xdp_frame_sz; int err = 0; bool ok; @@ -3578,6 +3579,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) memset(&rx_ctx, 0, sizeof(rx_ctx)); ring->rx_buf_len = vsi->rx_buf_len; + xdp_frame_sz = i40e_rx_pg_size(ring) / 2; /* XDP RX-queue info only needed for RX rings exposed to XDP */ if (ring->vsi->type != I40E_VSI_MAIN) @@ -3585,11 +3587,12 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) ring->xsk_pool = i40e_xsk_pool(ring); if (ring->xsk_pool) { + xdp_frame_sz = xsk_pool_get_rx_frag_step(ring->xsk_pool); ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ring->queue_index, ring->q_vector->napi.napi_id, - ring->rx_buf_len); + xdp_frame_sz); if (err) return err; err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, @@ -3605,7 +3608,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ring->queue_index, ring->q_vector->napi.napi_id, - ring->rx_buf_len); + xdp_frame_sz); if (err) return err; err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, @@ -3616,7 +3619,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) } skip: - xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq); + xdp_init_buff(&ring->xdp, xdp_frame_sz, &ring->xdp_rxq); rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len, BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT)); -- cgit v1.2.3 From 75d9228982f23d68066ca0b7d87014c3eb8ddc85 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:48 +0100 Subject: libeth, idpf: use truesize as XDP RxQ info frag_size The only user of frag_size field in XDP RxQ info is bpf_xdp_frags_increase_tail(). It clearly expects whole buffer size instead of DMA write size. Different assumptions in idpf driver configuration lead to negative tailroom. To make it worse, buffer sizes are not actually uniform in idpf when splitq is enabled, as there are several buffer queues, so rxq->rx_buf_size is meaningless in this case. Use truesize of the first bufq in AF_XDP ZC, as there is only one. Disable growing tail for regular splitq. 
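For context, registering a frag_size of 0 is what disables tail growth: bpf_xdp_frags_increase_tail() (the check is visible as context in the net/core/filter.c hunk earlier in this series) starts with

    if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
            return -EOPNOTSUPP;

so regular splitq queues, which now register with frag_size = 0, refuse the grow outright instead of computing a tailroom from a meaningless rx_buf_size.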
Fixes: ac8a861f632e ("idpf: prepare structures to support XDP") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-8-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/xdp.c | 6 +++++- drivers/net/ethernet/intel/idpf/xsk.c | 1 + drivers/net/ethernet/intel/libeth/xsk.c | 1 + include/net/libeth/xsk.h | 3 +++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c index 6ac9c6624c2a..cbccd4546768 100644 --- a/drivers/net/ethernet/intel/idpf/xdp.c +++ b/drivers/net/ethernet/intel/idpf/xdp.c @@ -47,12 +47,16 @@ static int __idpf_xdp_rxq_info_init(struct idpf_rx_queue *rxq, void *arg) { const struct idpf_vport *vport = rxq->q_vector->vport; const struct idpf_q_vec_rsrc *rsrc; + u32 frag_size = 0; bool split; int err; + if (idpf_queue_has(XSK, rxq)) + frag_size = rxq->bufq_sets[0].bufq.truesize; + err = __xdp_rxq_info_reg(&rxq->xdp_rxq, vport->netdev, rxq->idx, rxq->q_vector->napi.napi_id, - rxq->rx_buf_size); + frag_size); if (err) return err; diff --git a/drivers/net/ethernet/intel/idpf/xsk.c b/drivers/net/ethernet/intel/idpf/xsk.c index 676cbd80774d..d95d3efdfd36 100644 --- a/drivers/net/ethernet/intel/idpf/xsk.c +++ b/drivers/net/ethernet/intel/idpf/xsk.c @@ -403,6 +403,7 @@ int idpf_xskfq_init(struct idpf_buf_queue *bufq) bufq->pending = fq.pending; bufq->thresh = fq.thresh; bufq->rx_buf_size = fq.buf_len; + bufq->truesize = fq.truesize; if (!idpf_xskfq_refill(bufq)) netdev_err(bufq->pool->netdev, diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c index 846e902e31b6..4882951d5c9c 100644 --- a/drivers/net/ethernet/intel/libeth/xsk.c +++ b/drivers/net/ethernet/intel/libeth/xsk.c @@ -167,6 +167,7 @@ int libeth_xskfq_create(struct libeth_xskfq *fq) fq->pending = fq->count; fq->thresh = libeth_xdp_queue_threshold(fq->count); fq->buf_len = xsk_pool_get_rx_frame_size(fq->pool); + fq->truesize = xsk_pool_get_rx_frag_step(fq->pool); return 0; } diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h index 481a7b28e6f2..82b5d21aae87 100644 --- a/include/net/libeth/xsk.h +++ b/include/net/libeth/xsk.h @@ -597,6 +597,7 @@ __libeth_xsk_run_pass(struct libeth_xdp_buff *xdp, * @pending: current number of XSkFQEs to refill * @thresh: threshold below which the queue is refilled * @buf_len: HW-writeable length per each buffer + * @truesize: step between consecutive buffers, 0 if none exists * @nid: ID of the closest NUMA node with memory */ struct libeth_xskfq { @@ -614,6 +615,8 @@ struct libeth_xskfq { u32 thresh; u32 buf_len; + u32 truesize; + int nid; }; -- cgit v1.2.3 From f8e18abf183dbd636a8725532c7f5aa58957de84 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:49 +0100 Subject: net: enetc: use truesize as XDP RxQ info frag_size The only user of frag_size field in XDP RxQ info is bpf_xdp_frags_increase_tail(). It clearly expects truesize instead of DMA write size. Different assumptions in enetc driver configuration lead to negative tailroom. Set frag_size to the same value as frame_sz. 
Fixes: 2768b2e2f7d2 ("net: enetc: register XDP RX queues with frag_size") Reviewed-by: Aleksandr Loktionov Reviewed-by: Vladimir Oltean Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-9-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 70768392912c..a146ceaf2ed6 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -3467,7 +3467,7 @@ static int enetc_int_vector_init(struct enetc_ndev_priv *priv, int i, priv->rx_ring[i] = bdr; err = __xdp_rxq_info_reg(&bdr->xdp.rxq, priv->ndev, i, 0, - ENETC_RXB_DMA_SIZE_XDP); + ENETC_RXB_TRUESIZE); if (err) goto free_vector; -- cgit v1.2.3 From 8821e857759be9db3cde337ad328b71fe5c8a55f Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:50 +0100 Subject: xdp: produce a warning when calculated tailroom is negative MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many ethernet drivers report xdp Rx queue frag size as being the same as DMA write size. However, the only user of this field, namely bpf_xdp_frags_increase_tail(), clearly expects a truesize. Such difference leads to unspecific memory corruption issues under certain circumstances, e.g. in ixgbevf maximum DMA write size is 3 KB, so when running xskxceiver's XDP_ADJUST_TAIL_GROW_MULTI_BUFF, 6K packet fully uses all DMA-writable space in 2 buffers. This would be fine, if only rxq->frag_size was properly set to 4K, but value of 3K results in a negative tailroom, because there is a non-zero page offset. We are supposed to return -EINVAL and be done with it in such case, but due to tailroom being stored as an unsigned int, it is reported to be somewhere near UINT_MAX, resulting in a tail being grown, even if the requested offset is too much (it is around 2K in the abovementioned test). This later leads to all kinds of unspecific calltraces. 
[ 7340.337579] xskxceiver[1440]: segfault at 1da718 ip 00007f4161aeac9d sp 00007f41615a6a00 error 6 [ 7340.338040] xskxceiver[1441]: segfault at 7f410000000b ip 00000000004042b5 sp 00007f415bffecf0 error 4 [ 7340.338179] in libc.so.6[61c9d,7f4161aaf000+160000] [ 7340.339230] in xskxceiver[42b5,400000+69000] [ 7340.340300] likely on CPU 6 (core 0, socket 6) [ 7340.340302] Code: ff ff 01 e9 f4 fe ff ff 0f 1f 44 00 00 4c 39 f0 74 73 31 c0 ba 01 00 00 00 f0 0f b1 17 0f 85 ba 00 00 00 49 8b 87 88 00 00 00 <4c> 89 70 08 eb cc 0f 1f 44 00 00 48 8d bd f0 fe ff ff 89 85 ec fe [ 7340.340888] likely on CPU 3 (core 0, socket 3) [ 7340.345088] Code: 00 00 00 ba 00 00 00 00 be 00 00 00 00 89 c7 e8 31 ca ff ff 89 45 ec 8b 45 ec 85 c0 78 07 b8 00 00 00 00 eb 46 e8 0b c8 ff ff <8b> 00 83 f8 69 74 24 e8 ff c7 ff ff 8b 00 83 f8 0b 74 18 e8 f3 c7 [ 7340.404334] Oops: general protection fault, probably for non-canonical address 0x6d255010bdffc: 0000 [#1] SMP NOPTI [ 7340.405972] CPU: 7 UID: 0 PID: 1439 Comm: xskxceiver Not tainted 6.19.0-rc1+ #21 PREEMPT(lazy) [ 7340.408006] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-5.fc42 04/01/2014 [ 7340.409716] RIP: 0010:lookup_swap_cgroup_id+0x44/0x80 [ 7340.410455] Code: 83 f8 1c 73 39 48 ba ff ff ff ff ff ff ff 03 48 8b 04 c5 20 55 fa bd 48 21 d1 48 89 ca 83 e1 01 48 d1 ea c1 e1 04 48 8d 04 90 <8b> 00 48 83 c4 10 d3 e8 c3 cc cc cc cc 31 c0 e9 98 b7 dd 00 48 89 [ 7340.412787] RSP: 0018:ffffcc5c04f7f6d0 EFLAGS: 00010202 [ 7340.413494] RAX: 0006d255010bdffc RBX: ffff891f477895a8 RCX: 0000000000000010 [ 7340.414431] RDX: 0001c17e3fffffff RSI: 00fa070000000000 RDI: 000382fc7fffffff [ 7340.415354] RBP: 00fa070000000000 R08: ffffcc5c04f7f8f8 R09: ffffcc5c04f7f7d0 [ 7340.416283] R10: ffff891f4c1a7000 R11: ffffcc5c04f7f9c8 R12: ffffcc5c04f7f7d0 [ 7340.417218] R13: 03ffffffffffffff R14: 00fa06fffffffe00 R15: ffff891f47789500 [ 7340.418229] FS: 0000000000000000(0000) GS:ffff891ffdfaa000(0000) knlGS:0000000000000000 [ 7340.419489] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7340.420286] CR2: 00007f415bfffd58 CR3: 0000000103f03002 CR4: 0000000000772ef0 [ 7340.421237] PKRU: 55555554 [ 7340.421623] Call Trace: [ 7340.421987] [ 7340.422309] ? softleaf_from_pte+0x77/0xa0 [ 7340.422855] swap_pte_batch+0xa7/0x290 [ 7340.423363] zap_nonpresent_ptes.constprop.0.isra.0+0xd1/0x270 [ 7340.424102] zap_pte_range+0x281/0x580 [ 7340.424607] zap_pmd_range.isra.0+0xc9/0x240 [ 7340.425177] unmap_page_range+0x24d/0x420 [ 7340.425714] unmap_vmas+0xa1/0x180 [ 7340.426185] exit_mmap+0xe1/0x3b0 [ 7340.426644] __mmput+0x41/0x150 [ 7340.427098] exit_mm+0xb1/0x110 [ 7340.427539] do_exit+0x1b2/0x460 [ 7340.427992] do_group_exit+0x2d/0xc0 [ 7340.428477] get_signal+0x79d/0x7e0 [ 7340.428957] arch_do_signal_or_restart+0x34/0x100 [ 7340.429571] exit_to_user_mode_loop+0x8e/0x4c0 [ 7340.430159] do_syscall_64+0x188/0x6b0 [ 7340.430672] ? __do_sys_clone3+0xd9/0x120 [ 7340.431212] ? switch_fpu_return+0x4e/0xd0 [ 7340.431761] ? arch_exit_to_user_mode_prepare.isra.0+0xa1/0xc0 [ 7340.432498] ? do_syscall_64+0xbb/0x6b0 [ 7340.433015] ? __handle_mm_fault+0x445/0x690 [ 7340.433582] ? count_memcg_events+0xd6/0x210 [ 7340.434151] ? handle_mm_fault+0x212/0x340 [ 7340.434697] ? do_user_addr_fault+0x2b4/0x7b0 [ 7340.435271] ? clear_bhb_loop+0x30/0x80 [ 7340.435788] ? clear_bhb_loop+0x30/0x80 [ 7340.436299] ? clear_bhb_loop+0x30/0x80 [ 7340.436812] ? 
clear_bhb_loop+0x30/0x80 [ 7340.437323] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 7340.437973] RIP: 0033:0x7f4161b14169 [ 7340.438468] Code: Unable to access opcode bytes at 0x7f4161b1413f. [ 7340.439242] RSP: 002b:00007ffc6ebfa770 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca [ 7340.440173] RAX: fffffffffffffe00 RBX: 00000000000005a1 RCX: 00007f4161b14169 [ 7340.441061] RDX: 00000000000005a1 RSI: 0000000000000109 RDI: 00007f415bfff990 [ 7340.441943] RBP: 00007ffc6ebfa7a0 R08: 0000000000000000 R09: 00000000ffffffff [ 7340.442824] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 7340.443707] R13: 0000000000000000 R14: 00007f415bfff990 R15: 00007f415bfff6c0 [ 7340.444586] [ 7340.444922] Modules linked in: rfkill intel_rapl_msr intel_rapl_common intel_uncore_frequency_common skx_edac_common nfit libnvdimm kvm_intel vfat fat kvm snd_pcm irqbypass rapl iTCO_wdt snd_timer intel_pmc_bxt iTCO_vendor_support snd ixgbevf virtio_net soundcore i2c_i801 pcspkr libeth_xdp net_failover i2c_smbus lpc_ich failover libeth virtio_balloon joydev 9p fuse loop zram lz4hc_compress lz4_compress 9pnet_virtio 9pnet netfs ghash_clmulni_intel serio_raw qemu_fw_cfg [ 7340.449650] ---[ end trace 0000000000000000 ]--- The issue can be fixed in all in-tree drivers, but we cannot just trust OOT drivers to not do this. Therefore, make tailroom a signed int and produce a warning when it is negative to prevent such mistakes in the future. Fixes: bf25146a5595 ("bpf: add frags support to the bpf_xdp_adjust_tail() API") Reviewed-by: Aleksandr Loktionov Reviewed-by: Toke Høiland-Jørgensen Acked-by: Martin KaFai Lau Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-10-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index d6fafb3633b0..a77d23fe2359 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4150,13 +4150,14 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; struct xdp_rxq_info *rxq = xdp->rxq; - unsigned int tailroom; + int tailroom; if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag) % rxq->frag_size; + WARN_ON_ONCE(tailroom < 0); if (unlikely(offset > tailroom)) return -EINVAL; -- cgit v1.2.3
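The unsigned wraparound described in the last commit above can be reproduced in isolation with a few lines of plain C (a stand-alone sketch with made-up numbers, not kernel code; only the types mirror bpf_xdp_frags_increase_tail() before and after the fix):

    #include <stdio.h>

    int main(void)
    {
            /* Made-up numbers: a fragment that already fills its buffer, with a
             * stale frag_size that is smaller than the buffer's real truesize. */
            int frag_size = 3072, frag_len = 3072, frag_off = 1024, offset = 2048;

            unsigned int old_tailroom = frag_size - frag_len - frag_off % frag_size;
            int new_tailroom = frag_size - frag_len - frag_off % frag_size;

            /* old code: the negative result wraps to ~UINT_MAX, so the signed/unsigned
             * comparison "offset > tailroom" is false and the grow is wrongly allowed */
            printf("unsigned tailroom: %u  grow rejected: %d\n",
                   old_tailroom, offset > old_tailroom);
            /* fixed code: tailroom stays negative, the check rejects the grow and
             * WARN_ON_ONCE() flags the mis-registered frag_size */
            printf("signed tailroom:   %d  grow rejected: %d\n",
                   new_tailroom, offset > new_tailroom);
            return 0;
    }

With these inputs the first comparison wrongly allows the grow while the second rejects it, which is exactly the behavioural change of storing tailroom as a signed int and warning when it goes negative.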