From dad0d91cc2c3e6b6fb285ccfe7ddf71525797198 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 28 Apr 2026 18:14:18 +0200 Subject: mm/slab: Add kvfree_atomic() helper kvmalloc() now supports non-sleeping GFP flags, including the vmalloc fallback path. This means it may return vmalloc memory even for GFP_ATOMIC and GFP_NOWAIT allocations. Freeing such memory with kvfree() may then end up calling vfree(), which is not safe for non-sleeping contexts. Introduce kvfree_atomic() helper for such cases. It mirrors kvfree(), but uses vfree_atomic() for vmalloced memory. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka (SUSE) Acked-by: Harry Yoo (Oracle) Signed-off-by: Herbert Xu --- include/linux/slab.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 15a60b501b95..2b5ab488e96b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1234,6 +1234,9 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) +extern void kvfree_atomic(const void *addr); +DEFINE_FREE(kvfree_atomic, void *, if (!IS_ERR_OR_NULL(_T)) kvfree_atomic(_T)) + extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); -- cgit v1.2.3 From 1f7305d87aa23db2579df222eba504a333c2c978 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 5 May 2026 17:52:03 +0100 Subject: KVM: arm64: Work around C1-Pro erratum 4193714 for protected guests C1-Pro cores with SME have an erratum where TLBI+DSB does not complete all outstanding SME accesses. Instead a DSB needs to be executed on the affected CPUs. The implication is that pages cannot be unmapped from the host Stage 2 and then provided to a protected guest or to the hypervisor. Host SME accesses may still complete after this point. This erratum breaks pKVM's guarantees, and the workaround is hard to implement as EL2 and EL1 share a security state meaning EL1 can mask IPIs sent by EL2, leading to interrupt blackouts. Instead, do this in EL3. This has the advantage of a separate security state, meaning lower EL cannot mask the IPI. It is also simpler for EL3 to know about CPUs that are off or in PSCI's CPU_SUSPEND. Add the needed hook to host_stage2_set_owner_metadata_locked(). This covers the cases where the host loses access to a page: __pkvm_host_donate_guest() __pkvm_guest_unshare_host() host_stage2_set_owner_locked() when owner_id == PKVM_ID_HYP Since pKVM relies on the firmware call for correctness, check for the firmware counterpart during protected KVM initialisation and fail the pKVM initialisation if it is missing. Signed-off-by: James Morse Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Mark Rutland Cc: Marc Zyngier Cc: Oliver Upton Cc: Will Deacon Cc: Vincent Donnefort Cc: Lorenzo Pieralisi Cc: Sudeep Holla Link: https://patch.msgid.link/20260505165205.2690919-1-catalin.marinas@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 21 +++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 23 ++++++++++++++++++++++- include/linux/arm-smccc.h | 6 ++++++ 3 files changed, 49 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 8bb2c7422cc8..34c9950884d5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -4,6 +4,7 @@ * Author: Christoffer Dall */ +#include #include #include #include @@ -2638,6 +2639,22 @@ static int init_pkvm_host_sve_state(void) return 0; } +static int pkvm_check_sme_dvmsync_fw_call(void) +{ + struct arm_smccc_res res; + + if (!cpus_have_final_cap(ARM64_WORKAROUND_4193714)) + return 0; + + arm_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res); + if (res.a0) { + kvm_err("pKVM requires firmware support for C1-Pro erratum 4193714\n"); + return -ENODEV; + } + + return 0; +} + /* * Finalizes the initialization of hyp mode, once everything else is initialized * and the initialziation process cannot fail. @@ -2838,6 +2855,10 @@ static int __init init_hyp_mode(void) if (err) goto out_err; + err = pkvm_check_sme_dvmsync_fw_call(); + if (err) + goto out_err; + err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 28a471d1927c..a3050e2b65b1 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -5,6 +5,7 @@ */ #include + #include #include #include @@ -14,6 +15,7 @@ #include +#include #include #include #include @@ -29,6 +31,19 @@ static struct hyp_pool host_s2_pool; static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); #define current_vm (*this_cpu_ptr(&__current_vm)) +static void pkvm_sme_dvmsync_fw_call(void) +{ + if (alternative_has_cap_unlikely(ARM64_WORKAROUND_4193714)) { + struct arm_smccc_res res; + + /* + * Ignore the return value. Probing for the workaround + * availability took place in init_hyp_mode(). + */ + hyp_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res); + } +} + static void guest_lock_component(struct pkvm_hyp_vm *vm) { hyp_spin_lock(&vm->lock); @@ -574,8 +589,14 @@ static int host_stage2_set_owner_metadata_locked(phys_addr_t addr, u64 size, ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, addr, size, &host_s2_pool, KVM_HOST_INVALID_PTE_TYPE_DONATION, annotation); - if (!ret) + if (!ret) { + /* + * After stage2 maintenance has happened, but before the page + * owner has changed. + */ + pkvm_sme_dvmsync_fw_call(); __host_update_page_state(addr, size, PKVM_NOPAGE); + } return ret; } diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 50b47eba7d01..e7195750d21b 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -105,6 +105,12 @@ ARM_SMCCC_SMC_32, \ 0, 0x3fff) +/* C1-Pro erratum 4193714: SME DVMSync early acknowledgement */ +#define ARM_SMCCC_CPU_WORKAROUND_4193714 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_CPU, 0x10) + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_32, \ -- cgit v1.2.3 From 57c347a2e2473bfb5c1f1132a3209c55efbe640b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 30 Apr 2026 08:11:02 -0700 Subject: platform/x86: intel: Add notifiers support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases a driver using services of vsec_tpmi driver requires some processing before vsec_tpmi exits. For example a children using debugfs can't use debugfs as this will be deleted by the vsec_tpmi driver. This is the case when unbind using PCI driver interface. In this case the remove callback of vsec_tpmi driver is called first, then remove callback of its children. Add support of blocking chain notifiers support. Notify on successful probe and before clean up in the remove callback. Fixes: 811f67c51636 ("platform/x86/intel/tpmi: Add new auxiliary driver for performance limits") Signed-off-by: Srinivas Pandruvada Cc: Stable@vger.kernel.org Link: https://patch.msgid.link/20260430151103.1549733-3-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec_tpmi.c | 19 +++++++++++++++++++ include/linux/intel_tpmi.h | 6 ++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/drivers/platform/x86/intel/vsec_tpmi.c b/drivers/platform/x86/intel/vsec_tpmi.c index a38014e81e85..16fd7aa41f20 100644 --- a/drivers/platform/x86/intel/vsec_tpmi.c +++ b/drivers/platform/x86/intel/vsec_tpmi.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -188,6 +189,20 @@ struct tpmi_feature_state { /* Used during auxbus device creation */ static DEFINE_IDA(intel_vsec_tpmi_ida); +static BLOCKING_NOTIFIER_HEAD(tpmi_notify_list); + +int tpmi_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&tpmi_notify_list, nb); +} +EXPORT_SYMBOL_NS_GPL(tpmi_register_notifier, "INTEL_TPMI"); + +int tpmi_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&tpmi_notify_list, nb); +} +EXPORT_SYMBOL_NS_GPL(tpmi_unregister_notifier, "INTEL_TPMI"); + struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev) { struct intel_vsec_device *vsec_dev = auxdev_to_ivdev(auxdev); @@ -832,6 +847,8 @@ static int intel_vsec_tpmi_init(struct auxiliary_device *auxdev) return ret; } + blocking_notifier_call_chain(&tpmi_notify_list, TPMI_CORE_INIT, auxdev); + return 0; } @@ -845,6 +862,8 @@ static void tpmi_remove(struct auxiliary_device *auxdev) { struct intel_tpmi_info *tpmi_info = auxiliary_get_drvdata(auxdev); + blocking_notifier_call_chain(&tpmi_notify_list, TPMI_CORE_EXIT, auxdev); + debugfs_remove_recursive(tpmi_info->dbgfs_dir); } diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index 94c06bf214fb..15f02422e9ca 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -28,6 +28,12 @@ enum intel_tpmi_id { TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ }; +#define TPMI_CORE_INIT 0 +#define TPMI_CORE_EXIT 1 + +int tpmi_register_notifier(struct notifier_block *nb); +int tpmi_unregister_notifier(struct notifier_block *nb); + struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev); struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index); int tpmi_get_resource_count(struct auxiliary_device *auxdev); -- cgit v1.2.3 From c73370c677646e86fc4b1780fb07027bdf847375 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 28 Apr 2026 16:58:56 +0100 Subject: btrfs: tracepoints: fix sleep while in atomic context in btrfs_sync_file() The trace event btrfs_sync_file() is called in an atomic context (all trace events are) and its call to dput(), which is needed due to the call to dget_parent(), can sleep, triggering a kernel splat. This can be reproduced by enabling the trace event and running btrfs/056 from fstests for example. The splat shown in dmesg is the following: [53.919] BUG: sleeping function called from invalid context at fs/dcache.c:970 [53.947] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 32773, name: xfs_io [53.988] preempt_count: 2, expected: 0 [53.967] RCU nest depth: 0, expected: 0 [53.943] Preemption disabled at: [53.944] [<0000000000000000>] 0x0 [54.078] CPU: 0 UID: 0 PID: 32773 Comm: xfs_io Tainted: G W 7.1.0-rc1-btrfs-next-232+ #1 PREEMPT(full) [54.070] Tainted: [W]=WARN [54.071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [54.072] Call Trace: [54.074] [54.076] dump_stack_lvl+0x56/0x80 [54.079] __might_resched.cold+0xd6/0x10f [54.072] dput.part.0+0x24/0x110 [54.078] trace_event_raw_event_btrfs_sync_file+0x75/0x140 [btrfs] [54.089] btrfs_sync_file+0x1ed/0x530 [btrfs] [54.087] ? __handle_mm_fault+0x8ae/0xed0 [54.089] btrfs_do_write_iter+0x172/0x210 [btrfs] [54.091] vfs_write+0x21f/0x450 [54.094] __x64_sys_pwrite64+0x8d/0xc0 [54.096] ? do_user_addr_fault+0x20c/0x670 [54.099] do_syscall_64+0x60/0xf20 [54.092] ? clear_bhb_loop+0x60/0xb0 [54.094] entry_SYSCALL_64_after_hwframe+0x76/0x7e So stop using dget_parent() and dput() and access the parent dentry directly as dentry->d_parent. This is also what ext4 is doing in its equivalent trace event ext4_sync_file_enter(). Fixes: a85b46db143f ("btrfs: tracepoints: get correct superblock from dentry in event btrfs_sync_file()") Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8ad7a2d76c1d..ec1df8b94517 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -771,10 +771,8 @@ TRACE_EVENT(btrfs_sync_file, TP_fast_assign( struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); - struct dentry *parent = dget_parent(dentry); - struct inode *parent_inode = d_inode(parent); + struct inode *parent_inode = d_inode(dentry->d_parent); - dput(parent); TP_fast_assign_fsid(btrfs_sb(inode->i_sb)); __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->parent = btrfs_ino(BTRFS_I(parent_inode)); -- cgit v1.2.3 From b62eb8dcf2c47d4d676a434efbd57c4f776f7829 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:14 +0200 Subject: netfilter: x_tables: allocate hook ops while under mutex arp/ip(6)t_register_table() add the table to the per-netns list via xt_register_table() before allocating the per-netns hook ops copy via kmemdup_array(). This leaves a window where the table is visible in the list with ops=NULL. If the pernet exit happens runs concurrently the pre_exit callback finds the table via xt_find_table() and passes the NULL ops pointer to nf_unregister_net_hooks(), causing a NULL dereference: general protection fault in nf_unregister_net_hooks+0xbc/0x150 RIP: nf_unregister_net_hooks (net/netfilter/core.c:613) Call Trace: ipt_unregister_table_pre_exit iptable_mangle_net_pre_exit ops_pre_exit_list cleanup_net Fix by moving the ops allocation into the xtables core so the table is never in the list without valid ops. Also ensure the table is no longer processing packets before its torn down on error unwind. nf_register_net_hooks might have published at least one hook; call synchronize_rcu() if there was an error. audit log register message gets deferred until all operations have passed, this avoids need to emit another ureg message in case of error unwinding. Based on earlier patch by Tristan Madani. Fixes: f9006acc8dfe5 ("netfilter: arp_tables: pass table pointer via nf_hook_ops") Fixes: ee177a54413a ("netfilter: ip6_tables: pass table pointer via nf_hook_ops") Fixes: ae689334225f ("netfilter: ip_tables: pass table pointer via nf_hook_ops") Link: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + net/ipv4/netfilter/arp_tables.c | 35 +++----------------------- net/ipv4/netfilter/ip_tables.c | 41 +++---------------------------- net/ipv6/netfilter/ip6_tables.c | 38 +++-------------------------- net/netfilter/x_tables.c | 50 ++++++++++++++++++++++++++++++++------ 5 files changed, 55 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index a81b46af5118..cb4b694dd9e4 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -305,6 +305,7 @@ struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 97ead883e4a1..c02e46a0271a 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1522,13 +1522,11 @@ int arpt_register_table(struct net *net, const struct arpt_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1543,7 +1541,7 @@ int arpt_register_table(struct net *net, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct arpt_entry *iter; @@ -1553,31 +1551,6 @@ int arpt_register_table(struct net *net, return PTR_ERR(new_table); } - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - - return ret; - -out_free: - __arpt_unregister_table(net, new_table); return ret; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 23c8deff8095..488c5945ebb2 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1724,13 +1724,11 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1745,7 +1743,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ipt_entry *iter; @@ -1755,37 +1753,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return PTR_ERR(new_table); } - /* No template? No need to do anything. This is used by 'nat' table, it registers - * with the nat core instead of the netfilter core. - */ - if (!template_ops) - return 0; - - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - - return ret; - -out_free: - __ipt_unregister_table(net, new_table); return ret; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index d585ac3c1113..dbe7c7acd702 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1733,13 +1733,11 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1754,7 +1752,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ip6t_entry *iter; @@ -1764,34 +1762,6 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return PTR_ERR(new_table); } - if (!template_ops) - return 0; - - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - - return ret; - -out_free: - __ip6t_unregister_table(net, new_table); return ret; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index bb0cb3959551..06f27bea9eed 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1542,7 +1542,6 @@ xt_replace_table(struct xt_table *table, unsigned int num_counters, private = do_replace_table(table, num_counters, newinfo, error); if (private) audit_log_nfcfg(table->name, table->af, private->number, - !private->number ? AUDIT_XT_OP_REGISTER : AUDIT_XT_OP_REPLACE, GFP_KERNEL); @@ -1552,20 +1551,32 @@ EXPORT_SYMBOL_GPL(xt_replace_table); struct xt_table *xt_register_table(struct net *net, const struct xt_table *input_table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t, *table = NULL; + struct nf_hook_ops *ops = NULL; struct xt_table_info *private; - struct xt_table *t, *table; - int ret; + unsigned int num_ops; + int ret = -EINVAL; + + num_ops = hweight32(input_table->valid_hooks); + if (num_ops == 0) + goto out; + + ret = -ENOMEM; + if (template_ops) { + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); + if (!ops) + goto out; + } /* Don't add one object to multiple lists. */ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); - if (!table) { - ret = -ENOMEM; + if (!table) goto out; - } mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ @@ -1579,7 +1590,7 @@ struct xt_table *xt_register_table(struct net *net, /* Simplifies replace_table code. */ table->private = bootstrap; - if (!xt_replace_table(table, 0, newinfo, &ret)) + if (!do_replace_table(table, 0, newinfo, &ret)) goto unlock; private = table->private; @@ -1588,14 +1599,37 @@ struct xt_table *xt_register_table(struct net *net, /* save number of initial entries */ private->initial_entries = private->number; + if (ops) { + int i; + + for (i = 0; i < num_ops; i++) + ops[i].priv = table; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) { + mutex_unlock(&xt[table->af].mutex); + /* nf_register_net_hooks() might have published a + * base chain before internal error unwind. + */ + synchronize_rcu(); + goto out; + } + + table->ops = ops; + } + + audit_log_nfcfg(table->name, table->af, private->number, + AUDIT_XT_OP_REGISTER, GFP_KERNEL); + list_add(&table->list, &xt_net->tables[table->af]); mutex_unlock(&xt[table->af].mutex); return table; unlock: mutex_unlock(&xt[table->af].mutex); - kfree(table); out: + kfree(table); + kfree(ops); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(xt_register_table); -- cgit v1.2.3 From 527d6931473b75d90e38942aae6537d1a527f1fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:15 +0200 Subject: netfilter: x_tables: add and use xt_unregister_table_pre_exit Remove the copypasted variants of _pre_exit and add one single function in the xtables core. ebtables is not compatible with x_tables and therefore unchanged. This is a preparation patch to reduce noise in the followup bug fixes. Reviewed-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + include/linux/netfilter_arp/arp_tables.h | 1 - include/linux/netfilter_ipv4/ip_tables.h | 1 - include/linux/netfilter_ipv6/ip6_tables.h | 1 - net/ipv4/netfilter/arp_tables.c | 9 --------- net/ipv4/netfilter/arptable_filter.c | 2 +- net/ipv4/netfilter/ip_tables.c | 9 --------- net/ipv4/netfilter/iptable_filter.c | 2 +- net/ipv4/netfilter/iptable_mangle.c | 2 +- net/ipv4/netfilter/iptable_nat.c | 1 + net/ipv4/netfilter/iptable_raw.c | 2 +- net/ipv4/netfilter/iptable_security.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 9 --------- net/ipv6/netfilter/ip6table_filter.c | 2 +- net/ipv6/netfilter/ip6table_mangle.c | 2 +- net/ipv6/netfilter/ip6table_nat.c | 1 + net/ipv6/netfilter/ip6table_raw.c | 2 +- net/ipv6/netfilter/ip6table_security.c | 2 +- net/netfilter/x_tables.c | 29 +++++++++++++++++++++++++++++ 19 files changed, 41 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index cb4b694dd9e4..74486714ae20 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -309,6 +309,7 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index a40aaf645fa4..05631a25e622 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -53,7 +53,6 @@ int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *ops); void arpt_unregister_table(struct net *net, const char *name); -void arpt_unregister_table_pre_exit(struct net *net, const char *name); extern unsigned int arpt_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 132b0e4a6d4d..13593391d605 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -26,7 +26,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *ops); -void ipt_unregister_table_pre_exit(struct net *net, const char *name); void ipt_unregister_table_exit(struct net *net, const char *name); /* Standard entry. */ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 8b8885a73c76..c6d5b927830d 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -27,7 +27,6 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *); int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *ops); -void ip6t_unregister_table_pre_exit(struct net *net, const char *name); void ip6t_unregister_table_exit(struct net *net, const char *name); extern unsigned int ip6t_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c02e46a0271a..bd348b7bad2c 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1554,15 +1554,6 @@ int arpt_register_table(struct net *net, return ret; } -void arpt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); -} -EXPORT_SYMBOL(arpt_unregister_table_pre_exit); - void arpt_unregister_table(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 78cd5ee24448..393d9a8c7739 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -43,7 +43,7 @@ static int arptable_filter_table_init(struct net *net) static void __net_exit arptable_filter_net_pre_exit(struct net *net) { - arpt_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_ARP, "filter"); } static void __net_exit arptable_filter_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 488c5945ebb2..864489928fb5 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1756,14 +1756,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return ret; } -void ipt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); -} - void ipt_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); @@ -1854,7 +1846,6 @@ static void __exit ip_tables_fini(void) } EXPORT_SYMBOL(ipt_register_table); -EXPORT_SYMBOL(ipt_unregister_table_pre_exit); EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); module_init(ip_tables_init); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 3ab908b74795..b2fbd9651d61 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -61,7 +61,7 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 385d945d8ebe..a99e61996197 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -96,7 +96,7 @@ static int iptable_mangle_table_init(struct net *net) static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "mangle"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 625a1ca13b1b..8fc4912e790d 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -129,6 +129,7 @@ static int iptable_nat_table_init(struct net *net) static void __net_exit iptable_nat_net_pre_exit(struct net *net) { ipt_nat_unregister_lookups(net); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat"); } static void __net_exit iptable_nat_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 0e7f53964d0a..42511721e538 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -53,7 +53,7 @@ static int iptable_raw_table_init(struct net *net) static void __net_exit iptable_raw_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "raw"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "raw"); } static void __net_exit iptable_raw_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index d885443cb267..4646bf6d7d2b 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -50,7 +50,7 @@ static int iptable_security_table_init(struct net *net) static void __net_exit iptable_security_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "security"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "security"); } static void __net_exit iptable_security_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index dbe7c7acd702..edf50bc7787e 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1765,14 +1765,6 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return ret; } -void ip6t_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); -} - void ip6t_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); @@ -1864,7 +1856,6 @@ static void __exit ip6_tables_fini(void) } EXPORT_SYMBOL(ip6t_register_table); -EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index e8992693e14a..f05a9e4b2c67 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -60,7 +60,7 @@ static int __net_init ip6table_filter_net_init(struct net *net) static void __net_exit ip6table_filter_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "filter"); } static void __net_exit ip6table_filter_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 8dd4cd0c47bd..afa4a5703e43 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -89,7 +89,7 @@ static int ip6table_mangle_table_init(struct net *net) static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "mangle"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "mangle"); } static void __net_exit ip6table_mangle_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 5be723232df8..bb8aa3fc42b4 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -131,6 +131,7 @@ static int ip6table_nat_table_init(struct net *net) static void __net_exit ip6table_nat_net_pre_exit(struct net *net) { ip6t_nat_unregister_lookups(net); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat"); } static void __net_exit ip6table_nat_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index fc9f6754028f..32d2da81c52a 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -52,7 +52,7 @@ static int ip6table_raw_table_init(struct net *net) static void __net_exit ip6table_raw_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "raw"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "raw"); } static void __net_exit ip6table_raw_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 4df14a9bae78..3dfd8d6ea4b9 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -49,7 +49,7 @@ static int ip6table_security_table_init(struct net *net) static void __net_exit ip6table_security_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "security"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "security"); } static void __net_exit ip6table_security_net_exit(struct net *net) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 06f27bea9eed..9c1e896c7b03 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1650,6 +1650,35 @@ void *xt_unregister_table(struct xt_table *table) return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); + +/** + * xt_unregister_table_pre_exit - pre-shutdown unregister of a table + * @net: network namespace + * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6) + * @name: name of the table to unregister + * + * Unregisters the specified netfilter table from the given network namespace + * and also unregisters the hooks from netfilter core: no new packets will be + * processed. + */ +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_net->tables[af], list) { + if (strcmp(t->name, name) == 0) { + mutex_unlock(&xt[af].mutex); + + if (t->ops) /* nat table registers with nat core, t->ops is NULL. */ + nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks)); + return; + } + } + mutex_unlock(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_table_pre_exit); #endif #ifdef CONFIG_PROC_FS -- cgit v1.2.3 From b4597d5fd7d2f8cebfffd40dffb5e003cc78964c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:17 +0200 Subject: netfilter: x_tables: add and use xtables_unregister_table_exit Previous change added xtables_unregister_table_pre_exit to detach the table from the packetpath and to unlink it from the active table list. In case of rmmod, userspace that is doing set/getsockopt for this table will not be able to re-instantiate the table: 1. The larval table has been removed already 2. existing instantiated table is no longer on the xt pernet table list. This adds the second stage helper: unlink the table from the dying list, free the hook ops (if any) and do the audit notification. It replaces xt_unregister_table(). Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Reported-by: Tristan Madani Reviewed-by: Tristan Madani Closes: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- net/ipv4/netfilter/arp_tables.c | 9 ++--- net/ipv4/netfilter/ip_tables.c | 9 ++--- net/ipv4/netfilter/iptable_nat.c | 5 ++- net/ipv6/netfilter/ip6_tables.c | 9 ++--- net/ipv6/netfilter/ip6table_nat.c | 5 ++- net/netfilter/x_tables.c | 81 +++++++++++++++++++++++++++++--------- 7 files changed, 83 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 74486714ae20..5a1c5c336fa4 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -308,8 +308,8 @@ struct xt_table *xt_register_table(struct net *net, const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); -void *xt_unregister_table(struct xt_table *table); void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index bd348b7bad2c..ad2259678c78 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1501,13 +1501,11 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len static void __arpt_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; + void *loc_cpu_entry; struct arpt_entry *iter; - private = xt_unregister_table(table); - /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) @@ -1515,6 +1513,7 @@ static void __arpt_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int arpt_register_table(struct net *net, @@ -1556,7 +1555,7 @@ int arpt_register_table(struct net *net, void arpt_unregister_table(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_ARP, name); if (table) __arpt_unregister_table(net, table); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 864489928fb5..5cbdb0815857 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1704,12 +1704,10 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) static void __ipt_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; struct ipt_entry *iter; - - private = xt_unregister_table(table); + void *loc_cpu_entry; /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; @@ -1718,6 +1716,7 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int ipt_register_table(struct net *net, const struct xt_table *table, @@ -1758,7 +1757,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table, void ipt_unregister_table_exit(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV4, name); if (table) __ipt_unregister_table(net, table); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 8fc4912e790d..a0df72554025 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -119,8 +119,11 @@ static int iptable_nat_table_init(struct net *net) } ret = ipt_nat_register_lookups(net); - if (ret < 0) + if (ret < 0) { + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat"); + synchronize_rcu(); ipt_unregister_table_exit(net, "nat"); + } kfree(repl); return ret; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index edf50bc7787e..9d9c3763f2f5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1713,12 +1713,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) static void __ip6t_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; struct ip6t_entry *iter; - - private = xt_unregister_table(table); + void *loc_cpu_entry; /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; @@ -1727,6 +1725,7 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int ip6t_register_table(struct net *net, const struct xt_table *table, @@ -1767,7 +1766,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, void ip6t_unregister_table_exit(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV6, name); if (table) __ip6t_unregister_table(net, table); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index bb8aa3fc42b4..c2394e2c94b5 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -121,8 +121,11 @@ static int ip6table_nat_table_init(struct net *net) } ret = ip6t_nat_register_lookups(net); - if (ret < 0) + if (ret < 0) { + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat"); + synchronize_rcu(); ip6t_unregister_table_exit(net, "nat"); + } kfree(repl); return ret; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 9c1e896c7b03..4e6708c23922 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -55,6 +55,9 @@ static struct list_head xt_templates[NFPROTO_NUMPROTO]; struct xt_pernet { struct list_head tables[NFPROTO_NUMPROTO]; + + /* stash area used during netns exit */ + struct list_head dead_tables[NFPROTO_NUMPROTO]; }; struct compat_delta { @@ -1634,23 +1637,6 @@ out: } EXPORT_SYMBOL_GPL(xt_register_table); -void *xt_unregister_table(struct xt_table *table) -{ - struct xt_table_info *private; - - mutex_lock(&xt[table->af].mutex); - private = table->private; - list_del(&table->list); - mutex_unlock(&xt[table->af].mutex); - audit_log_nfcfg(table->name, table->af, private->number, - AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); - kfree(table->ops); - kfree(table); - - return private; -} -EXPORT_SYMBOL_GPL(xt_unregister_table); - /** * xt_unregister_table_pre_exit - pre-shutdown unregister of a table * @net: network namespace @@ -1660,6 +1646,14 @@ EXPORT_SYMBOL_GPL(xt_unregister_table); * Unregisters the specified netfilter table from the given network namespace * and also unregisters the hooks from netfilter core: no new packets will be * processed. + * + * This must be called prior to xt_unregister_table_exit() from the pernet + * .pre_exit callback. After this call, the table is no longer visible to + * the get/setsockopt path. In case of rmmod, module exit path must have + * called xt_unregister_template() prior to unregistering pernet ops to + * prevent re-instantiation of the table. + * + * See also: xt_unregister_table_exit() */ void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) { @@ -1669,6 +1663,7 @@ void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_net->tables[af], list) { if (strcmp(t->name, name) == 0) { + list_move(&t->list, &xt_net->dead_tables[af]); mutex_unlock(&xt[af].mutex); if (t->ops) /* nat table registers with nat core, t->ops is NULL. */ @@ -1679,6 +1674,50 @@ void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) mutex_unlock(&xt[af].mutex); } EXPORT_SYMBOL(xt_unregister_table_pre_exit); + +/** + * xt_unregister_table_exit - remove a table during namespace teardown + * @net: the network namespace from which to unregister the table + * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6) + * @name: name of the table to unregister + * + * Completes the unregister process for a table. This must be called from + * the pernet ops .exit callback. This is the second stage after + * xt_unregister_table_pre_exit(). + * + * pair with xt_unregister_table_pre_exit() during namespace shutdown. + * + * Return: the unregistered table or NULL if the table was never + * instantiated. The caller needs to kfree() the table after it + * has removed the family specific matches/targets. + */ +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *table; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(table, &xt_net->dead_tables[af], list) { + struct nf_hook_ops *ops = NULL; + + if (strcmp(table->name, name) != 0) + continue; + + list_del(&table->list); + + audit_log_nfcfg(table->name, table->af, table->private->number, + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + swap(table->ops, ops); + mutex_unlock(&xt[af].mutex); + + kfree(ops); + return table; + } + mutex_unlock(&xt[af].mutex); + + return NULL; +} +EXPORT_SYMBOL_GPL(xt_unregister_table_exit); #endif #ifdef CONFIG_PROC_FS @@ -2125,8 +2164,10 @@ static int __net_init xt_net_init(struct net *net) struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) { INIT_LIST_HEAD(&xt_net->tables[i]); + INIT_LIST_HEAD(&xt_net->dead_tables[i]); + } return 0; } @@ -2135,8 +2176,10 @@ static void __net_exit xt_net_exit(struct net *net) struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) { WARN_ON_ONCE(!list_empty(&xt_net->tables[i])); + WARN_ON_ONCE(!list_empty(&xt_net->dead_tables[i])); + } } static struct pernet_operations xt_net_ops = { -- cgit v1.2.3 From dcb0f9aefdd604d36710fda53c25bd7cf4a3e37a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 7 May 2026 13:00:28 +0200 Subject: netfilter: nf_conntrack_expect: restore helper propagation via expectation A recent series to fix expectations broke helper propagation via expectation, this mechanism is used by the sip and h323 helper. This also propagates the conntrack helper to expected connections. I changed semantics of exp->helper which now tells us the actual helper that created the expectation. Add an explicit assign_helper field to expectations for this purpose and update helpers to use it. Restore this feature for userspace conntrack helper via ctnetlink nfqueue integration so it is again possible to attach a helper to an expectation, where it makes sense. This is not restored via ctnetlink expectation creation as there is no client for such feature. Use the expectation layer 4 protocol number for the helper lookup for consistency. Make sure the expectation using this helper propagation mechanism also go away when the helper is unregistered. Fixes: 9c42bc9db90a ("netfilter: nf_conntrack_expect: honor expectation helper field") Fixes: 917b61fa2042 ("netfilter: ctnetlink: ignore explicit helper on new expectations") Reported-by: Ilya Maximets Tested-by: Ilya Maximets Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_expect.h | 5 ++++- net/netfilter/nf_conntrack_broadcast.c | 1 + net/netfilter/nf_conntrack_core.c | 7 +++++-- net/netfilter/nf_conntrack_expect.c | 1 + net/netfilter/nf_conntrack_h323_main.c | 12 ++++++------ net/netfilter/nf_conntrack_helper.c | 5 +++++ net/netfilter/nf_conntrack_netlink.c | 18 ++++++++++++++++-- net/netfilter/nf_conntrack_sip.c | 2 +- 8 files changed, 39 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index e9a8350e7ccf..80f50fd0f7ad 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -45,9 +45,12 @@ struct nf_conntrack_expect { void (*expectfn)(struct nf_conn *new, struct nf_conntrack_expect *this); - /* Helper to assign to new connection */ + /* Helper that created this expectation */ struct nf_conntrack_helper __rcu *helper; + /* Helper to assign to new connection */ + struct nf_conntrack_helper __rcu *assign_helper; + /* The conntrack of the master connection */ struct nf_conn *master; diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 4f39bf7c843f..75e53fde6b29 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -72,6 +72,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, exp->flags = NF_CT_EXPECT_PERMANENT; exp->class = NF_CT_EXPECT_CLASS_DEFAULT; rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, NULL); write_pnet(&exp->net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b08189226320..8ba5b22a1eef 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1811,14 +1811,17 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { + struct nf_conntrack_helper *assign_helper; + /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ ct->master = exp->master; - if (exp->helper) { + assign_helper = rcu_dereference(exp->assign_helper); + if (assign_helper) { help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) - rcu_assign_pointer(help->helper, exp->helper); + rcu_assign_pointer(help->helper, assign_helper); } #ifdef CONFIG_NF_CONNTRACK_MARK diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 24d0576d84b7..8e943efbdf0a 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -344,6 +344,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, helper = rcu_dereference(help->helper); rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, NULL); write_pnet(&exp->net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 3f5c50455b71..b2fe6554b9cf 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -643,7 +643,7 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, &nf_conntrack_helper_h245); + rcu_assign_pointer(exp->assign_helper, &nf_conntrack_helper_h245); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -767,7 +767,7 @@ static int expect_callforwarding(struct sk_buff *skb, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -1234,7 +1234,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3 : NULL, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ nathook = rcu_dereference(nfct_h323_nat_hook); @@ -1306,7 +1306,7 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_UDP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_ras); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_ras); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect RAS "); @@ -1523,7 +1523,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); @@ -1577,7 +1577,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a715304a53d8..b594cd244fe1 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -400,6 +400,11 @@ static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) this = rcu_dereference_protected(exp->helper, lockdep_is_held(&nf_conntrack_expect_lock)); + if (this == me) + return true; + + this = rcu_dereference_protected(exp->assign_helper, + lockdep_is_held(&nf_conntrack_expect_lock)); return this == me; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index eda5fe4a75c8..d7209d124111 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2634,6 +2634,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, + const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask); @@ -2860,6 +2861,7 @@ static int ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report) { + struct nf_conntrack_helper *assign_helper = NULL; struct nlattr *cda[CTA_EXPECT_MAX+1]; struct nf_conntrack_tuple tuple, mask; struct nf_conntrack_expect *exp; @@ -2875,8 +2877,18 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (err < 0) return err; + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + assign_helper = __nf_conntrack_helper_find(helpname, + nf_ct_l3num(ct), + tuple.dst.protonum); + if (!assign_helper) + return -EOPNOTSUPP; + } + exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, - &tuple, &mask); + assign_helper, &tuple, &mask); if (IS_ERR(exp)) return PTR_ERR(exp); @@ -3515,6 +3527,7 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, + const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { @@ -3568,6 +3581,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, exp->zone = ct->zone; #endif rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, assign_helper); exp->tuple = *tuple; exp->mask.src.u3 = mask->src.u3; exp->mask.src.u.all = mask->src.u.all; @@ -3623,7 +3637,7 @@ ctnetlink_create_expect(struct net *net, ct = nf_ct_tuplehash_to_ctrack(h); rcu_read_lock(); - exp = ctnetlink_alloc_expect(cda, ct, &tuple, &mask); + exp = ctnetlink_alloc_expect(cda, ct, NULL, &tuple, &mask); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto err_rcu; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 1eb55907d470..d24bfa9e8234 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1383,7 +1383,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), saddr, &daddr, proto, NULL, &port); exp->timeout.expires = sip_timeout * HZ; - rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, helper); exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; hooks = rcu_dereference(nf_nat_sip_hooks); -- cgit v1.2.3 From 307abfac04a254c09c5705d816b33354acee97a0 Mon Sep 17 00:00:00 2001 From: Jianpeng Chang Date: Fri, 8 May 2026 09:56:36 +0900 Subject: kprobes: skip non-symbol addresses in kprobe_add_ksym_blacklist() When kprobe_add_area_blacklist() iterates through a section like .kprobes.text, the start address may not correspond to a named symbol. On ARM64 with CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS=y (introduced by commit baaf553d3bc3 ("arm64: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS")), the compiler flag -fpatchable-function-entry=4,2 inserts 2 NOPs before each function entry point for ftrace call_ops. These pre-function NOPs sit at the section base address, before the first named function symbol. The compiler emits a $x mapping symbol at offset 0x00 to mark the start of code, but find_kallsyms_symbol() ignores mapping symbols. Without CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS (e.g. defconfig), no pre-function NOPs are inserted, the first function starts at offset 0x00, and the bug does not trigger. This only affects modules that have a .kprobes.text section (i.e. those using the __kprobes annotation). Modules using NOKPROBE_SYMBOL() instead (like kretprobe_example.ko) blacklist exact function addresses via the _kprobe_blacklist section and are not affected. For kprobe_example.ko on ARM64 with -fpatchable-function-entry=4,2, the .kprobes.text section layout is: offset 0x00: $x + 2 NOPs (mapping symbol + ftrace preamble) offset 0x08: handler_post (64 bytes) offset 0x50: handler_pre (68 bytes) kprobe_add_area_blacklist() starts iterating from the section base address (offset 0x00), which only has the $x mapping symbol. kprobe_add_ksym_blacklist() then calls kallsyms_lookup_size_offset() for this address, which goes through: kallsyms_lookup_size_offset() -> module_address_lookup() -> find_kallsyms_symbol() find_kallsyms_symbol() scans all module symbols to find the closest preceding symbol. Since no named text symbol exists at offset 0x00, find_kallsyms_symbol() picks __UNIQUE_ID_vermagic (a .modinfo symbol whose address is in the temporary image) as the "best" match. The computed "size" = next_text_symbol - modinfo_symbol spans across these two unrelated memory regions, creating a blacklist entry with a bogus range of tens of terabytes. Whether this causes a visible failure depends on address randomization, here is what happens on Raspberry Pi 4/5: - On RPi5, the bogus size was ~35 TB. start + size stayed within 64-bit range, so the blacklist entry covered the entire kernel text. register_kprobe() in the module's own init function failed with -EINVAL. - On RPi4, the bogus size was ~75 TB. start + size overflowed 64 bits and wrapped to a small address near zero. The range check (addr >= start && addr < end) then failed because end wrapped around, so the bogus entry was accidentally harmless and kprobes worked by luck. The same bug exists on both machines, but randomization determines whether the integer overflow masks it or not. Fix this by adding notrace to the __kprobes macro. Functions in .kprobes.text are kprobe infrastructure handlers that should never be traced by ftrace. With notrace, the compiler stops inserting them and the non-symbol gap at the section start disappears entirely. Link: https://lore.kernel.org/all/20260506012706.2785785-1-jianpeng.chang.cn@windriver.com/ Fixes: baaf553d3bc3 ("arm64: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS") Signed-off-by: Jianpeng Chang Signed-off-by: Masami Hiramatsu (Google) --- include/asm-generic/kprobes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/kprobes.h b/include/asm-generic/kprobes.h index 060eab094e5a..5290a2b2e15a 100644 --- a/include/asm-generic/kprobes.h +++ b/include/asm-generic/kprobes.h @@ -14,7 +14,7 @@ static unsigned long __used \ _kbl_addr_##fname = (unsigned long)fname; # define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname) /* Use this to forbid a kprobes attach on very low level functions */ -# define __kprobes __section(".kprobes.text") +# define __kprobes notrace __section(".kprobes.text") # define nokprobe_inline __always_inline #else # define NOKPROBE_SYMBOL(fname) -- cgit v1.2.3 From 411c1cf430392c905e39f12bc305dd994da0b426 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 8 May 2026 15:20:23 +0100 Subject: arm64/entry: Fix arm64-specific rseq brokenness Mathias Stearn reports that since v6.19, there are two big issues affecting rseq: (1) On arm64 specifically, rseq critical sections aren't aborted when they should be. (2) The 'cpu_id_start' field is no longer written by the kernel in all cases it used to be, including some cases where TCMalloc depends on the kernel clobbering the field. This patch fixes issue #1. This patch DOES NOT fix issue #2, which will need to be addressed by other patches. The arm64-specific brokenness is a result of commits: 2fc0e4b4126c ("rseq: Record interrupt from user space") 39a167560a61 ("rseq: Optimize event setting") The first commit failed to add a call to rseq_note_user_irq_entry() on arm64. Thus arm64 never sets rseq_event::user_irq to record that it may be necessary to abort an active rseq critical section upon return to userspace. On its own, this commit had no functional impact as the value of rseq_event::user_irq was not consumed. The second commit relied upon rseq_event::user_irq to determine whether or not to bother to perform rseq work when returning to userspace. As rseq_event::user_irq wasn't set on arm64, this work would be skipped, and consequently an active rseq critical section would not be aborted. Fix this by giving arm64 syscall-specific entry/exit paths, and performing the relevant logic in syscall and non-syscall paths, including calling rseq_note_user_irq_entry() for non-syscall entry. Currently arm64 cannot use syscall_enter_from_user_mode(), syscall_exit_to_user_mode(), and irqentry_exit_to_user_mode(), due to ordering constraints with exception masking, and risk of ABI breakage for syscall tracing/audit/etc. For the moment the entry/exit logic is left as arm64-specific, directly using enter_from_user_mode() and exit_to_user_mode(), but mirroring the generic code. I intend to follow up with refactoring/cleanup, as we did for kernel mode entry paths in commit: 041aa7a85390 ("entry: Split preemption from irqentry_exit_to_kernel_mode()") ... which will allow arm64 to use the GENERIC_IRQ_ENTRY functions directly. Fixes: 39a167560a61 ("rseq: Optimize event setting") Reported-by: Mathias Stearn Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Catalin Marinas Link: https://lore.kernel.org/regressions/CAHnCjA25b+nO2n5CeifknSKHssJpPrjnf+dtr7UgzRw4Zgu=oA@mail.gmail.com/ Link: https://patch.msgid.link/20260508142023.3268622-1-mark.rutland@arm.com --- arch/arm64/kernel/entry-common.c | 31 ++++++++++++++++++++++++------- include/linux/irq-entry-common.h | 8 -------- include/linux/rseq_entry.h | 19 ------------------- 3 files changed, 24 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index cb54335465f6..c7a23f7c2212 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -62,6 +62,13 @@ static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, irqentry_exit_to_kernel_mode_after_preempt(regs, state); } +static __always_inline void arm64_syscall_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + mte_disable_tco_entry(current); + sme_enter_from_user_mode(); +} + /* * Handle IRQ/context state management when entering from user mode. * Before this function is called it is not safe to call regular kernel code, @@ -70,20 +77,30 @@ static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); + rseq_note_user_irq_entry(); mte_disable_tco_entry(current); sme_enter_from_user_mode(); } +static __always_inline void arm64_syscall_exit_to_user_mode(struct pt_regs *regs) +{ + local_irq_disable(); + syscall_exit_to_user_mode_prepare(regs); + local_daif_mask(); + sme_exit_to_user_mode(); + mte_check_tfsr_exit(); + exit_to_user_mode(); +} + /* * Handle IRQ/context state management when exiting to user mode. * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ - static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { local_irq_disable(); - exit_to_user_mode_prepare_legacy(regs); + irqentry_exit_to_user_mode_prepare(regs); local_daif_mask(); sme_exit_to_user_mode(); mte_check_tfsr_exit(); @@ -92,7 +109,7 @@ static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) { - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); } /* @@ -716,12 +733,12 @@ static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr) static void noinstr el0_svc(struct pt_regs *regs) { - arm64_enter_from_user_mode(regs); + arm64_syscall_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); fpsimd_syscall_exit(); } @@ -868,11 +885,11 @@ static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) static void noinstr el0_svc_compat(struct pt_regs *regs) { - arm64_enter_from_user_mode(regs); + arm64_syscall_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); do_el0_svc_compat(regs); - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); } static void noinstr el0_bkpt32(struct pt_regs *regs, unsigned long esr) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 167fba7dbf04..1fabf0f5ea8e 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -218,14 +218,6 @@ static __always_inline void __exit_to_user_mode_validate(void) lockdep_sys_exit(); } -/* Temporary workaround to keep ARM64 alive */ -static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) -{ - __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK); - rseq_exit_to_user_mode_legacy(); - __exit_to_user_mode_validate(); -} - /** * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 2d0295df5107..63bc72086e75 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -749,24 +749,6 @@ static __always_inline void rseq_irqentry_exit_to_user_mode(void) ev->events = 0; } -/* Required to keep ARM64 working */ -static __always_inline void rseq_exit_to_user_mode_legacy(void) -{ - struct rseq_event *ev = ¤t->rseq.event; - - rseq_stat_inc(rseq_stats.exit); - - if (static_branch_unlikely(&rseq_debug_enabled)) - WARN_ON_ONCE(ev->sched_switch); - - /* - * Ensure that event (especially user_irq) is cleared when the - * interrupt did not result in a schedule and therefore the - * rseq processing did not clear it. - */ - ev->events = 0; -} - void __rseq_debug_syscall_return(struct pt_regs *regs); static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs) @@ -782,7 +764,6 @@ static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned } static inline void rseq_syscall_exit_to_user_mode(void) { } static inline void rseq_irqentry_exit_to_user_mode(void) { } -static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ */ -- cgit v1.2.3 From b2ed01e7ad3de80333e9b962a44024b094bc0b2b Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Tue, 28 Apr 2026 11:44:42 +0200 Subject: drm/ttm: Fix ttm_bo_swapout() infinite LRU walk on swapout failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When ttm_tt_swapout() fails, the current code calls ttm_resource_add_bulk_move() followed by ttm_resource_move_to_lru_tail() to restore the resource's bulk_move membership. However, ttm_resource_move_to_lru_tail() places the resource at the tail of the LRU list which, relative to the walk cursor's hitch node (placed immediately after the resource when it was yielded), puts the resource *in front of the* the hitch. The next list_for_each_entry_continue() from the hitch finds the same resource again, causing an infinite loop. Fix by deferring del_bulk_move to the success path only. On the success path, TTM_TT_FLAG_SWAPPED has just been set by ttm_tt_swapout() but the resource is still tracked in the bulk_move range, so ttm_resource_del_bulk_move()'s !ttm_resource_unevictable() guard would incorrectly skip the removal. Introduce ttm_resource_del_bulk_move_unevictable() which bypasses that guard. Reported-by: Jatin Kataria Fixes: fc5d96670eb2 ("drm/ttm: Move swapped objects off the manager's LRU list") Cc: Christian König Cc: Matthew Brost Cc: Cc: # v6.13+ Assisted-by: GitHub_Copilot:claude-sonnet-4.6 Signed-off-by: Thomas Hellström Reviewed-by: Christian König Tested-by: Boqun Feng Link: https://patch.msgid.link/20260428094442.16985-1-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/ttm/ttm_bo.c | 16 ++++++---------- drivers/gpu/drm/ttm/ttm_resource.c | 13 +++++++++++++ include/drm/ttm/ttm_resource.h | 2 ++ 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index d85f0a37ac35..293401705542 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1177,17 +1177,13 @@ ttm_bo_swapout_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo) bdev->funcs->swap_notify(bo); if (ttm_tt_is_populated(tt)) { - spin_lock(&bdev->lru_lock); - ttm_resource_del_bulk_move(bo->resource, bo); - spin_unlock(&bdev->lru_lock); - ret = ttm_tt_swapout(bdev, tt, swapout_walk->gfp_flags); - - spin_lock(&bdev->lru_lock); - if (ret) - ttm_resource_add_bulk_move(bo->resource, bo); - ttm_resource_move_to_lru_tail(bo->resource); - spin_unlock(&bdev->lru_lock); + if (!ret) { + spin_lock(&bdev->lru_lock); + ttm_resource_del_bulk_move_unevictable(bo->resource, bo); + ttm_resource_move_to_lru_tail(bo->resource); + spin_unlock(&bdev->lru_lock); + } } out: diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index 9f36631d48b6..0e5f1582f13d 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -292,6 +292,19 @@ void ttm_resource_del_bulk_move(struct ttm_resource *res, ttm_lru_bulk_move_del(bo->bulk_move, res); } +/* + * Remove a resource from its bulk_move, bypassing the unevictable check. + * Use only when the resource is known to still be tracked in the range despite + * the BO having just become unevictable; asserts that this is the case. + */ +void ttm_resource_del_bulk_move_unevictable(struct ttm_resource *res, + struct ttm_buffer_object *bo) +{ + WARN_ON_ONCE(!ttm_resource_unevictable(res, bo)); + if (bo->bulk_move) + ttm_lru_bulk_move_del(bo->bulk_move, res); +} + /* Move a resource to the LRU or bulk tail */ void ttm_resource_move_to_lru_tail(struct ttm_resource *res) { diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h index 33e80f30b8b8..a5d386583fb6 100644 --- a/include/drm/ttm/ttm_resource.h +++ b/include/drm/ttm/ttm_resource.h @@ -448,6 +448,8 @@ void ttm_resource_add_bulk_move(struct ttm_resource *res, struct ttm_buffer_object *bo); void ttm_resource_del_bulk_move(struct ttm_resource *res, struct ttm_buffer_object *bo); +void ttm_resource_del_bulk_move_unevictable(struct ttm_resource *res, + struct ttm_buffer_object *bo); void ttm_resource_move_to_lru_tail(struct ttm_resource *res); void ttm_resource_init(struct ttm_buffer_object *bo, -- cgit v1.2.3 From e68eadffb724b36ffd3d5619e0efcaf29ec2a175 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 16:24:16 +0800 Subject: ipv6: flowlabel: enforce per-netns limit for unprivileged callers fl_size, fl_ht and ip6_fl_lock in net/ipv6/ip6_flowlabel.c are file scope and shared across netns. mem_check() reads fl_size to decide whether to deny non-CAP_NET_ADMIN callers. capable() runs against init_user_ns, so an unprivileged user in any non-init userns can push fl_size past FL_MAX_SIZE - FL_MAX_SIZE / 4 and starve every other unprivileged userns on the host. Add struct netns_ipv6::flowlabel_count, bumped and decremented next to fl_size in fl_intern, ip6_fl_gc and ip6_fl_purge. The new field fills the existing 4-byte hole after ipmr_seq, so struct netns_ipv6 stays the same size on 64-bit builds. Bump FL_MAX_SIZE from 4096 to 8192. It has been 4096 since the file was added. Machines and connection counts have grown. mem_check() folds an extra per-netns ceiling into the existing non-CAP_NET_ADMIN conditional. The ceiling is half of the total budget that unprivileged callers have ever been able to use, i.e. (FL_MAX_SIZE - FL_MAX_SIZE / 4) / 2 = 3072 entries. With FL_MAX_SIZE doubled, this preserves the original per-user reach of 3K (what an unprivileged caller could already obtain before this change), while forcing an attacker to spread allocations across at least two netns to exhaust the global non-CAP_NET_ADMIN budget. CAP_NET_ADMIN against init_user_ns still bypasses both caps. The previous patch took ip6_fl_lock across mem_check and fl_intern, so the new flowlabel_count read in mem_check and the new flowlabel_count++ in fl_intern run under the same critical section. flowlabel_count is therefore plain int, like fl_size. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Willem de Bruijn Reviewed-by: Willem de Bruijn Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506082416.2259567-3-maoyixie.tju@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 1 + net/ipv6/ip6_flowlabel.c | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 499e4288170f..875916d60bfe 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -119,6 +119,7 @@ struct netns_ipv6 { struct fib_notifier_ops *notifier_ops; struct fib_notifier_ops *ip6mr_notifier_ops; atomic_t ipmr_seq; + int flowlabel_count; struct { struct hlist_head head; spinlock_t lock; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index a8974643195a..b1ccdf0dc646 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -36,7 +36,7 @@ /* FL hash table */ #define FL_MAX_PER_SOCK 32 -#define FL_MAX_SIZE 4096 +#define FL_MAX_SIZE 8192 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) @@ -162,8 +162,9 @@ static void ip6_fl_gc(struct timer_list *unused) ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; - fl_free(fl); fl_size--; + fl->fl_net->ipv6.flowlabel_count--; + fl_free(fl); continue; } if (!sched || time_before(ttd, sched)) @@ -197,6 +198,7 @@ static void __net_exit ip6_fl_purge(struct net *net) *flp = fl->next; fl_free(fl); fl_size--; + net->ipv6.flowlabel_count--; continue; } flp = &fl->next; @@ -243,6 +245,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); fl_size++; + net->ipv6.flowlabel_count++; return NULL; } @@ -460,6 +463,9 @@ done: static int mem_check(struct sock *sk) { + const int unpriv_total_limit = FL_MAX_SIZE - (FL_MAX_SIZE / 4); + const int unpriv_user_limit = unpriv_total_limit / 2; + struct net *net = sock_net(sk); int room; struct ipv6_fl_socklist *sfl; int count = 0; @@ -478,7 +484,9 @@ static int mem_check(struct sock *sk) if (room <= 0 || ((count >= FL_MAX_PER_SOCK || - (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && + (count > 0 && room < FL_MAX_SIZE / 2) || + room < FL_MAX_SIZE / 4 || + net->ipv6.flowlabel_count >= unpriv_user_limit) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; -- cgit v1.2.3 From f2ab4fd02777c4081be38c35f939e4dc529b8952 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 7 May 2026 14:04:26 +0200 Subject: net: nsh: fix incorrect header length macros NSH header length is a 6-bit field that encodes the total length of the header in 4-byte words. So the maximum length is 0b111111 * 4, which is 252 and not 256. The maximum context length is the same number minus the length of the base header (8), so 244. These macros are used to validate push_nsh() action in openvswitch. Miscalculation here doesn't cause any real issues. In the worst case the oversized context is truncated while building the header, so we'll construct and send a broken packet, which is not a big problem, as any receiver should validate the fields. No invalid memory accesses will happen during the header push. But we should fix the macros to reject the incorrect actions in the first place. Using previously defined values and calculating the length instead of defining numbers directly, so it's easier to understand where they come from and harder to make a mistake. Fixes: 1f0b7744c505 ("net: add NSH header structures and helpers") Signed-off-by: Ilya Maximets Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20260507120434.2962505-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/net/nsh.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/nsh.h b/include/net/nsh.h index 16a751093896..15a26c590815 100644 --- a/include/net/nsh.h +++ b/include/net/nsh.h @@ -247,10 +247,10 @@ struct nshhdr { #define NSH_M_TYPE1_LEN 24 /* NSH header maximum Length. */ -#define NSH_HDR_MAX_LEN 256 +#define NSH_HDR_MAX_LEN ((NSH_LEN_MASK >> NSH_LEN_SHIFT) * 4) /* NSH context headers maximum Length. */ -#define NSH_CTX_HDRS_MAX_LEN 248 +#define NSH_CTX_HDRS_MAX_LEN (NSH_HDR_MAX_LEN - NSH_BASE_HDR_LEN) static inline struct nshhdr *nsh_hdr(struct sk_buff *skb) { -- cgit v1.2.3 From efda25ee84325385f859d10872590e90ce837243 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 6 May 2026 20:07:13 +0000 Subject: genetlink: free the skb on 'group >= family->n_mcgrps' These methods generally consume ownership of the provided skb, so even if an error path is encountered, the skb is freed. This is because the very first thing they do after some initial setup is to unconditionally consume the skb via consume_skb(skb). Any subsequent errors lead to the core netlink layer freeing the skb. However, there is one check that occurs before ownership is passed, which is the check for the group index. So if this error condition is encountered, then the skb is leaked. This error condition is generally considered a violation of the netlink API, so it's not expected to occur under normal circumstances. For the same reason, no callers check for this error condition, and no callers need to be adjusted. However, we should still follow the same ownership semantics of the rest of the function. Thus, free the skb in this codepath. Suggested-by: Andrew Lunn Suggested-by: Matthew Maurer Fixes: 2a94fe48f32c ("genetlink: make multicast groups const, prevent abuse") Link: https://lore.kernel.org/r/845b36ba-7b3a-41f2-acb2-b284f253e2ca@lunn.ch Signed-off-by: Alice Ryhl Link: https://patch.msgid.link/20260506-genlmsg-return-v2-1-a63ee2a055d6@google.com Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 4 +++- net/netlink/genetlink.c | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 7b84f2cef8b1..d70510ac31ab 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -489,8 +489,10 @@ genlmsg_multicast_netns_filtered(const struct genl_family *family, netlink_filter_fn filter, void *filter_data) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + nlmsg_free(skb); return -EINVAL; + } group = family->mcgrp_offset + group; return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, flags, filter, filter_data); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d251d894afd4..0da39eaed255 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1972,8 +1972,10 @@ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + kfree_skb(skb); return -EINVAL; + } group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group); @@ -1986,8 +1988,10 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + kfree_skb(skb); return; + } group = family->mcgrp_offset + group; nlmsg_notify(sk, skb, info->snd_portid, group, -- cgit v1.2.3 From cceb8fa9cb2cf98e31d81ecf6353b6ba5ac57744 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: sched_ext: Replace SCX_TASK_OFF_TASKS flag with SCX_TASK_DEAD state SCX_TASK_OFF_TASKS marked tasks already through sched_ext_dead() so cgroup task iteration would skip them. This can be expressed better with a task state. Replace the flag with SCX_TASK_DEAD. scx_disable_and_exit_task() resets state to NONE on its way out, so sched_ext_dead() now sets DEAD after the wrapper returns. The validation matrix grows NONE -> DEAD, warns on DEAD -> NONE, and tightens READY's predecessor to INIT or ENABLED so the new DEAD value cannot silently transition to READY. Prepares for the following enable vs dead race fix. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 9 +++++---- kernel/sched/ext.c | 17 +++++++++++------ 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index adb9a4de068a..9f1a326ad03e 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,24 +101,25 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ - SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* - * Bits 8 and 9 are used to carry task state: + * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext + * DEAD terminal state set by sched_ext_dead() */ - SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ - SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_SHIFT = 8, + SCX_TASK_STATE_BITS = 3, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 81841277a54f..2fc4a12711f9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -723,17 +723,22 @@ static void scx_set_task_state(struct task_struct *p, u32 state) switch (state) { case SCX_TASK_NONE: + warn = prev_state == SCX_TASK_DEAD; break; case SCX_TASK_INIT: warn = prev_state != SCX_TASK_NONE; p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: - warn = prev_state == SCX_TASK_NONE; + warn = !(prev_state == SCX_TASK_INIT || + prev_state == SCX_TASK_ENABLED); break; case SCX_TASK_ENABLED: warn = prev_state != SCX_TASK_READY; break; + case SCX_TASK_DEAD: + warn = prev_state != SCX_TASK_NONE; + break; default: WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", prev_state, state, p->comm, p->pid); @@ -972,11 +977,11 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) /* * cgroup_task_dead() removes the dead tasks from cset->tasks * after sched_ext_dead() and cgroup iteration may see tasks - * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS - * is set by sched_ext_dead() under @p's rq lock. Test it to + * which already finished sched_ext_dead(). %SCX_TASK_DEAD is + * set by sched_ext_dead() under @p's rq lock. Test it to * avoid visiting tasks which are already dead from SCX POV. */ - if (p->scx.flags & SCX_TASK_OFF_TASKS) { + if (scx_get_task_state(p) == SCX_TASK_DEAD) { __scx_task_iter_rq_unlock(iter); continue; } @@ -3847,7 +3852,7 @@ void sched_ext_dead(struct task_struct *p) * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. * - * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see + * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup * iteration is only used from sub-sched paths, which require root * enabled. Root enable transitions every live task to at least READY. @@ -3858,7 +3863,7 @@ void sched_ext_dead(struct task_struct *p) rq = task_rq_lock(p, &rf); scx_disable_and_exit_task(scx_task_sched(p), p); - p->scx.flags |= SCX_TASK_OFF_TASKS; + scx_set_task_state(p, SCX_TASK_DEAD); task_rq_unlock(rq, p, &rf); } } -- cgit v1.2.3 From c941d7391f258d5d06e0f7e962a52f99a547a83e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: sched_ext: Close root-enable vs sched_ext_dead() race with SCX_TASK_INIT_BEGIN scx_root_enable_workfn() drops the iter rq lock for ops.init_task() and a TASK_DEAD @p can fall through sched_ext_dead() in that window. The race hits when sched_ext_dead() observes SCX_TASK_INIT (the intermediate state before @p->scx.sched is published) and dereferences NULL via SCX_HAS_OP(NULL, exit_task), or observes SCX_TASK_NONE during the unlocked init window and skips cleanup so exit_task() never runs. Add SCX_TASK_INIT_BEGIN. The enable path writes NONE -> INIT_BEGIN under the iter rq lock, then takes the rq lock again after init to walk INIT_BEGIN -> INIT -> READY. sched_ext_dead() that wins the rq-lock race observes INIT_BEGIN and sets DEAD without calling into ops; the post-init recheck unwinds via scx_sub_init_cancel_task(). scx_fork() runs single-threaded against sched_ext_dead() (the task is not on scx_tasks until scx_post_fork() adds it) so its INIT_BEGIN -> INIT walk needs no rq-lock pairing; it rolls back to NONE on ops.init_task() failure. The validation matrix grows the INIT_BEGIN row and the INIT_BEGIN -> DEAD edge; INIT now requires INIT_BEGIN as the predecessor. scx_sub_disable()'s migration writes INIT_BEGIN as a synthetic predecessor to satisfy the tightened verification. The sub-sched paths still race with sched_ext_dead() during the unlocked init window. This will be fixed by the next patch. Reported-by: zhidao su Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/ Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 10 +++++---- kernel/sched/ext.c | 56 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9f1a326ad03e..2129e18ada58 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -106,6 +106,7 @@ enum scx_ent_flags { * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet + * INIT_BEGIN ops.init_task() in flight; see sched_ext_dead() * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext @@ -116,10 +117,11 @@ enum scx_ent_flags { SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, - SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, - SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, - SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, - SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT_BEGIN = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 5 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2fc4a12711f9..29fa9ffe7c7b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -725,8 +725,11 @@ static void scx_set_task_state(struct task_struct *p, u32 state) case SCX_TASK_NONE: warn = prev_state == SCX_TASK_DEAD; break; - case SCX_TASK_INIT: + case SCX_TASK_INIT_BEGIN: warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_INIT_BEGIN; p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: @@ -737,7 +740,8 @@ static void scx_set_task_state(struct task_struct *p, u32 state) warn = prev_state != SCX_TASK_READY; break; case SCX_TASK_DEAD: - warn = prev_state != SCX_TASK_NONE; + warn = !(prev_state == SCX_TASK_NONE || + prev_state == SCX_TASK_INIT_BEGIN); break; default: WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", @@ -3753,9 +3757,12 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) #else struct scx_sched *sch = scx_root; #endif + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); ret = __scx_init_task(sch, p, true); - if (unlikely(ret)) + if (unlikely(ret)) { + scx_set_task_state(p, SCX_TASK_NONE); return ret; + } scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, sch); } @@ -3856,13 +3863,18 @@ void sched_ext_dead(struct task_struct *p) * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup * iteration is only used from sub-sched paths, which require root * enabled. Root enable transitions every live task to at least READY. + * + * %INIT_BEGIN means ops.init_task() is running for @p. Don't call + * into ops; transition to %DEAD so the post-init recheck unwinds + * via scx_sub_init_cancel_task(). */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_disable_and_exit_task(scx_task_sched(p), p); + if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) + scx_disable_and_exit_task(scx_task_sched(p), p); scx_set_task_state(p, SCX_TASK_DEAD); task_rq_unlock(rq, p, &rf); } @@ -5773,6 +5785,7 @@ static void scx_sub_disable(struct scx_sched *sch) * $p having already been initialized, and then enable. */ scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, parent); scx_set_task_state(p, SCX_TASK_READY); @@ -6878,6 +6891,9 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { + struct rq_flags rf; + struct rq *rq; + /* * @p may already be dead, have lost all its usages counts and * be waiting for RCU grace period before being freed. @p can't @@ -6886,10 +6902,26 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (!tryget_task_struct(p)) continue; + /* + * Set %INIT_BEGIN under the iter's rq lock so that a concurrent + * sched_ext_dead() does not call ops.exit_task() on @p while + * ops.init_task() is running. If sched_ext_dead() runs before + * this store, it has already removed @p from scx_tasks and the + * iter won't visit @p; if it runs after, it observes + * %INIT_BEGIN and transitions to %DEAD without calling ops, + * leaving the post-init recheck below to unwind. + */ + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_task_iter_unlock(&sti); ret = __scx_init_task(sch, p, false); + + rq = task_rq_lock(p, &rf); + if (unlikely(ret)) { + if (scx_get_task_state(p) != SCX_TASK_DEAD) + scx_set_task_state(p, SCX_TASK_NONE); + task_rq_unlock(rq, p, &rf); put_task_struct(p); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", @@ -6897,10 +6929,20 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable_unlock_all; } - scx_set_task_state(p, SCX_TASK_INIT); - scx_set_task_sched(p, sch); - scx_set_task_state(p, SCX_TASK_READY); + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. + * ops.exit_task() is owed to the sched __scx_init_task() + * ran against; call it now. + */ + scx_sub_init_cancel_task(sch, p); + } else { + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); + scx_set_task_state(p, SCX_TASK_READY); + } + task_rq_unlock(rq, p, &rf); put_task_struct(p); } scx_task_iter_stop(&sti); -- cgit v1.2.3 From 657b594b2084b39a4bc6d8493aa2140cb00cea49 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 7 May 2026 16:46:29 +0900 Subject: fprobe: Fix unregister_fprobe() to wait for RCU grace period Commit 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") changed fprobe to register struct fprobe to an rcu-hlist, but it forgot to wait for RCU GP. Thus there can be use-after-free if the fprobe is released right after unregistering. This can be happened on fprobe event and sample module code. To fix this issue, add synchronize_rcu() in unregister_fprobe(). Note that BPF is OK because fprobe is used as a part of bpf_kprobe_multi_link. This unregisters its fprobe in bpf_kprobe_multi_link_release() and it is deallocated via bpf_kprobe_multi_link_dealloc(), which is invoked from bpf_link_defer_dealloc_rcu_gp() RCU callback. For BPF, this also introduced unregister_fprobe_async() which does NOT wait for RCU grace priod. Link: https://lore.kernel.org/all/177813998919.256460.2809243930741138224.stgit@mhiramat.tok.corp.google.com/ Fixes: 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 5 +++++ kernel/trace/bpf_trace.c | 3 ++- kernel/trace/fprobe.c | 23 +++++++++++++++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 0a3bcd1718f3..be1b38c981d4 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -94,6 +94,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); int register_fprobe_syms(struct fprobe *fp, const char **syms, int num); int unregister_fprobe(struct fprobe *fp); +int unregister_fprobe_async(struct fprobe *fp); bool fprobe_is_registered(struct fprobe *fp); int fprobe_count_ips_from_filter(const char *filter, const char *notfilter); #else @@ -113,6 +114,10 @@ static inline int unregister_fprobe(struct fprobe *fp) { return -EOPNOTSUPP; } +static inline int unregister_fprobe_async(struct fprobe *fp) +{ + return -EOPNOTSUPP; +} static inline bool fprobe_is_registered(struct fprobe *fp) { return false; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index af7079aa0f36..a02bd258677e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2384,7 +2384,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link) struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); - unregister_fprobe(&kmulti_link->fp); + /* Don't wait for RCU GP here. */ + unregister_fprobe_async(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index cc49ebd2a773..f378613ad120 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -1093,14 +1093,15 @@ static int unregister_fprobe_nolock(struct fprobe *fp) } /** - * unregister_fprobe() - Unregister fprobe. + * unregister_fprobe_async() - Unregister fprobe without RCU GP wait * @fp: A fprobe data structure to be unregistered. * * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will NOT wait until the fprobe is no longer used. * * Return 0 if @fp is unregistered successfully, -errno if not. */ -int unregister_fprobe(struct fprobe *fp) +int unregister_fprobe_async(struct fprobe *fp) { guard(mutex)(&fprobe_mutex); if (!fp || !fprobe_registered(fp)) @@ -1108,6 +1109,24 @@ int unregister_fprobe(struct fprobe *fp) return unregister_fprobe_nolock(fp); } + +/** + * unregister_fprobe() - Unregister fprobe with RCU GP wait + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will block until the fprobe is no longer used. + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret = unregister_fprobe_async(fp); + + if (!ret) + synchronize_rcu(); + return ret; +} EXPORT_SYMBOL_GPL(unregister_fprobe); static int __init fprobe_initcall(void) -- cgit v1.2.3 From dec85d2fbd20de3711a71e65397dfdb40c3fa953 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 6 May 2026 09:37:02 +0000 Subject: irqchip/gic-v5: Move LPI allocation into the LPI domain The IPI and ITS MSI domains currently allocate and release LPIs directly, then pass the selected LPI ID to the parent LPI domain. This leaks the LPI domain's allocation policy into its child domains and forces each child to duplicate part of the parent domain's teardown. Make the LPI domain allocate LPIs in its .alloc() callback and release them in a matching .free() callback. Child domains can then request a parent interrupt without passing an implementation-specific LPI ID, and the LPI lifetime is tied to the domain that owns the LPI namespace. Remove the gicv5_alloc_lpi() and gicv5_free_lpi() wrappers now that no external caller needs to manage LPIs directly. This is a preparatory change for an actual leakage problem in the allocation code and therefore tagged with the same Fixes tag. Fixes: 0f0101325876 ("irqchip/gic-v5: Add GICv5 LPI/IPI support") Signed-off-by: Sascha Bischoff Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260506093634.382062-2-sascha.bischoff@arm.com --- drivers/irqchip/irq-gic-v5-its.c | 14 ++-------- drivers/irqchip/irq-gic-v5.c | 53 +++++++++++++++++++------------------- include/linux/irqchip/arm-gic-v5.h | 3 --- 3 files changed, 28 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c index 36a8d1368f0e..36d03f82ef68 100644 --- a/drivers/irqchip/irq-gic-v5-its.c +++ b/drivers/irqchip/irq-gic-v5-its.c @@ -929,8 +929,8 @@ static void gicv5_its_free_eventid(struct gicv5_its_dev *its_dev, u32 event_id_b static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - u32 device_id, event_id_base, lpi; struct gicv5_its_dev *its_dev; + u32 device_id, event_id_base; msi_alloc_info_t *info = arg; irq_hw_number_t hwirq; struct irq_data *irqd; @@ -949,16 +949,8 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi device_id = its_dev->device_id; for (i = 0; i < nr_irqs; i++) { - ret = gicv5_alloc_lpi(); - if (ret < 0) { - pr_debug("Failed to find free LPI!\n"); - goto out_free_irqs; - } - lpi = ret; - - ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi); + ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, NULL); if (ret) { - gicv5_free_lpi(lpi); goto out_free_irqs; } @@ -983,7 +975,6 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi out_free_irqs: while (--i >= 0) { irqd = irq_domain_get_irq_data(domain, virq + i); - gicv5_free_lpi(irqd->parent_data->hwirq); irq_domain_reset_irq_data(irqd); irq_domain_free_irqs_parent(domain, virq + i, 1); } @@ -1013,7 +1004,6 @@ static void gicv5_its_irq_domain_free(struct irq_domain *domain, unsigned int vi for (i = 0; i < nr_irqs; i++) { d = irq_domain_get_irq_data(domain, virq + i); - gicv5_free_lpi(d->parent_data->hwirq); irq_domain_reset_irq_data(d); irq_domain_free_irqs_parent(domain, virq + i, 1); } diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 6b0903be8ebf..15a2a04398d2 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -59,16 +59,6 @@ static void release_lpi(u32 lpi) ida_free(&lpi_ida, lpi); } -int gicv5_alloc_lpi(void) -{ - return alloc_lpi(); -} - -void gicv5_free_lpi(u32 lpi) -{ - release_lpi(lpi); -} - static void gicv5_ppi_priority_init(void) { write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR0_EL1); @@ -806,18 +796,36 @@ static void gicv5_lpi_config_reset(struct irq_data *d) gicv5_lpi_irq_write_pending_state(d, false); } +static void gicv5_irq_lpi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d; + + if (WARN_ON_ONCE(nr_irqs != 1)) + return; + + d = irq_domain_get_irq_data(domain, virq); + + release_lpi(d->hwirq); + + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(d); +} + static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { irq_hw_number_t hwirq; struct irq_data *irqd; - u32 *lpi = arg; int ret; if (WARN_ON_ONCE(nr_irqs != 1)) return -EINVAL; - hwirq = *lpi; + ret = alloc_lpi(); + if (ret < 0) + return ret; + hwirq = ret; irqd = irq_domain_get_irq_data(domain, virq); @@ -826,8 +834,10 @@ static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int vi irqd_set_single_target(irqd); ret = gicv5_irs_iste_alloc(hwirq); - if (ret < 0) + if (ret < 0) { + release_lpi(hwirq); return ret; + } gicv5_hwirq_init(hwirq, GICV5_IRQ_PRI_MI, GICV5_HWIRQ_TYPE_LPI); gicv5_lpi_config_reset(irqd); @@ -837,7 +847,7 @@ static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int vi static const struct irq_domain_ops gicv5_irq_lpi_domain_ops = { .alloc = gicv5_irq_lpi_domain_alloc, - .free = gicv5_irq_domain_free, + .free = gicv5_irq_lpi_domain_free, }; void __init gicv5_init_lpi_domain(void) @@ -859,21 +869,12 @@ static int gicv5_irq_ipi_domain_alloc(struct irq_domain *domain, unsigned int vi { struct irq_data *irqd; int ret, i; - u32 lpi; for (i = 0; i < nr_irqs; i++) { - ret = gicv5_alloc_lpi(); - if (ret < 0) + ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, NULL); + if (ret) return ret; - lpi = ret; - - ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi); - if (ret) { - gicv5_free_lpi(lpi); - return ret; - } - irqd = irq_domain_get_irq_data(domain, virq + i); irq_domain_set_hwirq_and_chip(domain, virq + i, i, @@ -899,8 +900,6 @@ static void gicv5_irq_ipi_domain_free(struct irq_domain *domain, unsigned int vi if (!d) return; - gicv5_free_lpi(d->parent_data->hwirq); - irq_set_handler(virq + i, NULL); irq_domain_reset_irq_data(d); irq_domain_free_irqs_parent(domain, virq + i, 1); diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 40d2fce68294..f78787e654f4 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -425,9 +425,6 @@ struct gicv5_its_itt_cfg { void gicv5_init_lpis(u32 max); void gicv5_deinit_lpis(void); -int gicv5_alloc_lpi(void); -void gicv5_free_lpi(u32 lpi); - void __init gicv5_its_of_probe(struct device_node *parent); void __init gicv5_its_acpi_probe(void); #endif -- cgit v1.2.3 From 4314a44564eb1565349fed7a4192344c5f46fc85 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Wed, 6 May 2026 17:47:12 +0800 Subject: bpf: Fix out-of-bounds read in bpf_patch_call_args() The interpreters_args array only accommodates stack depths up to MAX_BPF_STACK (512 bytes). However, do_misc_fixups() may allow a larger stack depth if JIT is requested. If JIT compilation later fails and falls back to the interpreter, the verifier invokes bpf_patch_call_args() with this oversized stack depth. This causes a load-time out-of-bounds (OOB) read when calculating the interpreter function pointer index. Fix this by changing bpf_patch_call_args() to return an int and explicitly rejecting the JIT fallback (returning -EINVAL) if the stack depth exceeds MAX_BPF_STACK. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Signed-off-by: Yazhou Tang Acked-by: Xu Kuohai Link: https://lore.kernel.org/r/20260506094714.419842-2-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 6 +++++- kernel/bpf/fixups.c | 7 ++++++- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 01e203964892..52b30e9ea431 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2917,7 +2917,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON -void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); #endif struct btf *bpf_get_btf_vmlinux(void); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b018ff48875..63044ebe5721 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2394,13 +2394,17 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) #undef PROG_NAME_LIST #ifdef CONFIG_BPF_SYSCALL -void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) +int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); + /* Prevent out-of-bounds read to interpreters_args */ + if (stack_depth > MAX_BPF_STACK) + return -EINVAL; insn->off = (s16) insn->imm; insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; + return 0; } #endif #endif diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index fba9e8c00878..df8f48091321 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1416,7 +1416,12 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) depth = get_callee_stack_depth(env, insn, i); if (depth < 0) return depth; - bpf_patch_call_args(insn, depth); + err = bpf_patch_call_args(insn, depth); + if (err) { + verbose(env, "stack depth %d exceeds interpreter stack depth limit\n", + depth); + return err; + } } err = 0; #endif -- cgit v1.2.3 From 58a8f3e2501dc14b8e00e883d6aaf0600a239da7 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Wed, 6 May 2026 17:47:13 +0800 Subject: bpf: Fix s16 truncation for large bpf-to-bpf call offsets Currently, the BPF instruction set allows bpf-to-bpf calls (or internal calls, pseudo calls) to use a 32-bit imm field to represent the relative jump offset. However, when JIT is disabled or falls back to the interpreter, the verifier invokes bpf_patch_call_args() to rewrite the call instruction. In this function, the 32-bit imm is downcast to s16 and stored in the off field. void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); insn->off = (s16) insn->imm; insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } If the original imm exceeds the s16 range (i.e., a jump offset greater than 32767 instructions), this downcast silently truncates the offset, resulting in an incorrect call target. Fix this by: 1. In bpf_patch_call_args(), keeping the imm field unchanged and using the off field to store the index of the interpreter function. 2. In ___bpf_prog_run() for the JMP_CALL_ARGS case, retrieving the interpreter function pointer from the interpreters_args array using the off field as the index, and passing the original imm to calculate the last argument of the interpreter function. After these changes, the truncation issue is resolved, and __bpf_call_base_args is also no longer needed and can be removed, which makes the code cleaner. Performance: In ___bpf_prog_run() for the JMP_CALL_ARGS case, changing the retrieval of the interpreter function pointer from pointer addition to direct array indexing improves performance. The possible reason is that the latter has better instruction-level parallelism. See the v5 discussion [1] for more details. [1] https://lore.kernel.org/bpf/f120c3c4-6999-414a-b514-518bb64b4758@zju.edu.cn/ To avoid requiring bpftool changes, keep the new imm/off encoding internal and restore the legacy xlated dump layout in bpf_insn_prepare_dump(). For bpf-to-bpf call offsets that do not fit in s16, export off as 0 instead of a truncated and misleading value. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Fixes: 7105e828c087 ("bpf: allow for correlation of maps and helpers in dump") Suggested-by: Xu Kuohai Suggested-by: Puranjay Mohan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Signed-off-by: Yazhou Tang Link: https://lore.kernel.org/r/20260506094714.419842-3-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ include/linux/filter.h | 3 --- kernel/bpf/core.c | 21 ++++++++++++++------- kernel/bpf/fixups.c | 6 +++--- kernel/bpf/syscall.c | 26 ++++++++++++++++++++++++++ 5 files changed, 49 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 52b30e9ea431..cd191c5fdb0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2918,6 +2918,12 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 ua #ifndef CONFIG_BPF_JIT_ALWAYS_ON int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +s32 bpf_call_args_imm(s16 idx); +#else +static inline s32 bpf_call_args_imm(s16 idx) +{ + return 0; +} #endif struct btf *bpf_get_btf_vmlinux(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index 1ec6d5ba64cc..88a241aac36a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1151,9 +1151,6 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -#define __bpf_call_base_args \ - ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ - (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 63044ebe5721..6aa2a8b24030 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1771,6 +1771,9 @@ static u32 abs_s32(s32 x) return x >= 0 ? (u32)x : -(u32)x; } +static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn); + /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -2077,10 +2080,9 @@ select_insn: CONT; JMP_CALL_ARGS: - BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, - BPF_R3, BPF_R4, - BPF_R5, - insn + insn->off + 1); + BPF_R0 = interpreters_args[insn->off](BPF_R1, BPF_R2, BPF_R3, + BPF_R4, BPF_R5, + insn + insn->imm + 1); CONT; JMP_TAIL_CALL: { @@ -2400,12 +2402,17 @@ int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) /* Prevent out-of-bounds read to interpreters_args */ if (stack_depth > MAX_BPF_STACK) return -EINVAL; - insn->off = (s16) insn->imm; - insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - - __bpf_call_base_args; + insn->off = (round_up(stack_depth, 32) / 32) - 1; insn->code = BPF_JMP | BPF_CALL_ARGS; return 0; } + +s32 bpf_call_args_imm(s16 idx) +{ + if (WARN_ON_ONCE(idx < 0 || idx >= ARRAY_SIZE(interpreters_args))) + return 0; + return BPF_CALL_IMM(interpreters_args[idx]); +} #endif #endif diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index df8f48091321..3692adf62558 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1250,9 +1250,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) } if (!bpf_pseudo_call(insn)) continue; - insn->off = env->insn_aux_data[i].call_imm; - subprog = bpf_find_subprog(env, i + insn->off + 1); - insn->imm = subprog; + insn->imm = env->insn_aux_data[i].call_imm; + subprog = bpf_find_subprog(env, i + insn->imm + 1); + insn->off = subprog; } prog->jited = 1; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3c0214ca934..630d530782fe 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4919,6 +4919,29 @@ out: return map; } +static void prepare_dump_pseudo_call(struct bpf_insn *insn) +{ + s32 call_off = insn->imm; + + /* + * BPF_CALL_ARGS only exists for interpreter fallback. + * 1. For interpreter (BPF_CALL_ARGS): insn->off is the index of + * interpreters_args array, so here using bpf_call_args_imm() + * to get the real address offset. + * 2. For JIT (BPF_CALL): insn->off is the subprog id. + */ + if (insn->code == (BPF_JMP | BPF_CALL_ARGS)) + insn->imm = bpf_call_args_imm(insn->off); + else + insn->imm = insn->off; + + /* Avoid dumping a truncated and misleading pc-relative offset. */ + if (call_off > S16_MAX || call_off < S16_MIN) + insn->off = 0; + else + insn->off = call_off; +} + static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, const struct cred *f_cred) { @@ -4944,6 +4967,9 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, } if (code == (BPF_JMP | BPF_CALL) || code == (BPF_JMP | BPF_CALL_ARGS)) { + /* Restore the legacy xlated dump layout. */ + if (insns[i].src_reg == BPF_PSEUDO_CALL) + prepare_dump_pseudo_call(&insns[i]); if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; if (!bpf_dump_raw_ok(f_cred)) -- cgit v1.2.3 From 5dd74441cbf42c22e874450eb6a6bbb19390a216 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Sat, 9 May 2026 18:20:31 +0800 Subject: cgroup/cpuset: Reserve DL bandwidth only for root-domain moves cpuset_can_attach() currently adds the bandwidth of all migrating SCHED_DEADLINE tasks to sum_migrate_dl_bw. If the source and destination cpuset effective CPU masks do not overlap, the whole sum is then reserved in the destination root domain. set_cpus_allowed_dl(), however, subtracts bandwidth from the source root domain only when the affinity change really moves the task between root domains. A DL task can move between cpusets that are still in the same root domain, so including that task in sum_migrate_dl_bw can reserve destination bandwidth without a matching source-side subtraction. Share the root-domain move test with set_cpus_allowed_dl(). Keep nr_migrate_dl_tasks counting all migrating deadline tasks for cpuset DL task accounting, but add to sum_migrate_dl_bw only for tasks that need a root-domain bandwidth move. Keep using the destination cpuset effective CPU mask and leave the broader can_attach()/attach() transaction model unchanged. Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails") Cc: stable@vger.kernel.org # v6.10+ Signed-off-by: Guopeng Zhang Reviewed-by: Waiman Long Acked-by: Juri Lelli Tested-by: Juri Lelli Signed-off-by: Tejun Heo --- include/linux/sched/deadline.h | 9 +++++++++ kernel/cgroup/cpuset-internal.h | 1 + kernel/cgroup/cpuset.c | 33 ++++++++++++++++++--------------- kernel/sched/deadline.c | 13 ++++++++++--- 4 files changed, 38 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 1198138cb839..273538200a44 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -33,6 +33,15 @@ struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); +/* + * Return whether moving DL task @p to @new_mask requires moving DL + * bandwidth accounting between root domains. This helper is specific to + * DL bandwidth move accounting semantics and is shared by + * cpuset_can_attach() and set_cpus_allowed_dl() so both paths use the + * same source root-domain test. + */ +extern bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask); extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index bb4e692bea30..f7aaf01f7cd5 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -167,6 +167,7 @@ struct cpuset { */ int nr_deadline_tasks; int nr_migrate_dl_tasks; + /* DL bandwidth that needs destination reservation for this attach. */ u64 sum_migrate_dl_bw; /* * CPU used for temporary DL bandwidth allocation during attach; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 3fbf6e7f68c3..e84e801e22cf 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2993,7 +2993,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cpuset *cs, *oldcs; struct task_struct *task; bool setsched_check; - int ret; + int cpu, ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); @@ -3038,28 +3038,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) } if (dl_task(task)) { + /* + * Count all migrating DL tasks for cpuset task accounting. + * Only tasks that need a root-domain bandwidth move + * contribute to sum_migrate_dl_bw. + */ cs->nr_migrate_dl_tasks++; - cs->sum_migrate_dl_bw += task->dl.dl_bw; + if (dl_task_needs_bw_move(task, cs->effective_cpus)) + cs->sum_migrate_dl_bw += task->dl.dl_bw; } } - if (!cs->nr_migrate_dl_tasks) + if (!cs->sum_migrate_dl_bw) goto out_success; - if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { - int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); - - if (unlikely(cpu >= nr_cpu_ids)) { - ret = -EINVAL; - goto out_unlock; - } + cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); + if (unlikely(cpu >= nr_cpu_ids)) { + ret = -EINVAL; + goto out_unlock; + } - ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); - if (ret) - goto out_unlock; + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) + goto out_unlock; - cs->dl_bw_cpu = cpu; - } + cs->dl_bw_cpu = cpu; out_success: /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index edca7849b165..7db4c87df83b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3107,20 +3107,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, struct affinity_context *ctx) { - struct root_domain *src_rd; struct rq *rq; WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); - src_rd = rq->rd; /* * Migrating a SCHED_DEADLINE task between exclusive * cpusets (different root_domains) entails a bandwidth * update. We already made space for us in the destination * domain (see cpuset_can_attach()). */ - if (!cpumask_intersects(src_rd->span, ctx->new_mask)) { + if (dl_task_needs_bw_move(p, ctx->new_mask)) { struct dl_bw *src_dl_b; src_dl_b = dl_bw_of(cpu_of(rq)); @@ -3137,6 +3135,15 @@ static void set_cpus_allowed_dl(struct task_struct *p, set_cpus_allowed_common(p, ctx); } +bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask) +{ + if (!dl_task(p)) + return false; + + return !cpumask_intersects(task_rq(p)->rd->span, new_mask); +} + /* Assumes rq->lock is held */ static void rq_online_dl(struct rq *rq) { -- cgit v1.2.3 From b5782e2d462c028096f922abca46318cec890670 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:40 +0100 Subject: netfs: Fix missing barriers when accessing stream->subrequests locklessly The list of subrequests attached to stream->subrequests is accessed without locks by netfs_collect_read_results() and netfs_collect_write_results(), and then they access subreq->flags without taking a barrier after getting the subreq pointer from the list. Relatedly, the functions that build the list don't use any sort of write barrier when constructing the list to make sure that the NETFS_SREQ_IN_PROGRESS flag is perceived to be set first if no lock is taken. Fix this by: (1) Add a new list_add_tail_release() function that uses a release barrier to set the pointer to the new member of the list. (2) Add a new list_first_entry_or_null_acquire() function that uses an acquire barrier to read the pointer to the first member in a list (or return NULL). (3) Use list_add_tail_release() when adding a subreq to ->subrequests. (4) Use list_first_entry_or_null_acquire() when initially accessing the front of the list (when an item is removed, the pointer to the new front iterm is obtained under the same lock). Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Link: https://sashiko.dev/#/patchset/20260326104544.509518-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-4-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 3 ++- fs/netfs/misc.c | 1 + fs/netfs/read_collect.c | 6 ++++-- fs/netfs/write_collect.c | 6 ++++-- fs/netfs/write_issue.c | 3 ++- include/linux/list.h | 37 +++++++++++++++++++++++++++++++++++++ 6 files changed, 50 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a27ed501b6d4..15d73026ff64 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -168,7 +168,8 @@ void netfs_queue_read(struct netfs_io_request *rreq, * remove entries off of the front. */ spin_lock(&rreq->lock); - list_add_tail(&subreq->rreq_link, &stream->subrequests); + /* Write IN_PROGRESS before pointer to new subreq */ + list_add_tail_release(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { if (!stream->active) { stream->collected_to = subreq->start; diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 6df89c92b10b..21357907b7ee 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -356,6 +356,7 @@ void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, DEFINE_WAIT(myself); list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + smp_rmb(); /* Read ->next before IN_PROGRESS. */ if (!netfs_check_subreq_in_progress(subreq)) continue; diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index d2d902f46627..3c9b847885c2 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -205,8 +205,10 @@ reassess: * in progress. The issuer thread may be adding stuff to the tail * whilst we're doing this. */ - front = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); + front = list_first_entry_or_null_acquire(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + /* Read first subreq pointer before IN_PROGRESS flag. */ + while (front) { size_t transferred; diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index b194447f4b11..7fbf50907a7f 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -228,8 +228,10 @@ reassess_streams: if (!smp_load_acquire(&stream->active)) continue; - front = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); + front = list_first_entry_or_null_acquire(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + /* Read first subreq pointer before IN_PROGRESS flag. */ + while (front) { trace_netfs_collect_sreq(wreq, front); //_debug("sreq [%x] %llx %zx/%zx", diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 2db688f94125..b0e9690bb90c 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -204,7 +204,8 @@ void netfs_prepare_write(struct netfs_io_request *wreq, * remove entries off of the front. */ spin_lock(&wreq->lock); - list_add_tail(&subreq->rreq_link, &stream->subrequests); + /* Write IN_PROGRESS before pointer to new subreq */ + list_add_tail_release(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { if (!stream->active) { stream->collected_to = subreq->start; diff --git a/include/linux/list.h b/include/linux/list.h index 00ea8e5fb88b..09d979976b3b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -191,6 +191,29 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head) __list_add(new, head->prev, head); } +/** + * list_add_tail_release - add a new entry with release barrier + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head, using a release barrier to set + * the ->next pointer that points to it. This is useful for implementing + * queues, in particular one that the elements will be walked through forwards + * locklessly. + */ +static inline void list_add_tail_release(struct list_head *new, + struct list_head *head) +{ + struct list_head *prev = head->prev; + + if (__list_add_valid(new, prev, head)) { + new->next = head; + new->prev = prev; + head->prev = new; + smp_store_release(&prev->next, new); + } +} + /* * Delete a list entry by making the prev/next entries * point to each other. @@ -644,6 +667,20 @@ static inline void list_splice_tail_init(struct list_head *list, pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) +/** + * list_first_entry_or_null_acquire - get the first element from a list with barrier + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null_acquire(ptr, type, member) ({ \ + struct list_head *head__ = (ptr); \ + struct list_head *pos__ = smp_load_acquire(&head__->next); \ + pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ +}) + /** * list_last_entry_or_null - get the last element from a list * @ptr: the list head to take the element from. -- cgit v1.2.3 From 2c8f4742bb76117d735f92a3932d85239b16c494 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:42 +0100 Subject: netfs: Fix potential for tearing in ->remote_i_size and ->zero_point Fix potential tearing in using ->remote_i_size and ->zero_point by copying i_size_read() and i_size_write() and using the same seqcount as for i_size. We need to make sure that netfslib and the filesystems that use it always hold i_lock whilst updating any of the sizes to prevent i_size_seqcount from getting corrupted. Fixes: 4058f742105e ("netfs: Keep track of the actual remote file size") Fixes: 100ccd18bb41 ("netfs: Optimise away reads above the point at which there can be no data") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-6-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/9p/v9fs_vfs.h | 13 -- fs/9p/vfs_inode.c | 6 +- fs/9p/vfs_inode_dotl.c | 12 +- fs/afs/file.c | 24 +++- fs/afs/inode.c | 31 +++-- fs/afs/internal.h | 11 +- fs/afs/write.c | 2 +- fs/netfs/buffered_read.c | 6 +- fs/netfs/buffered_write.c | 2 +- fs/netfs/direct_write.c | 6 +- fs/netfs/misc.c | 32 +++-- fs/netfs/write_collect.c | 9 +- fs/smb/client/cifsfs.c | 38 ++++-- fs/smb/client/cifssmb.c | 3 +- fs/smb/client/file.c | 13 +- fs/smb/client/inode.c | 14 ++- fs/smb/client/readdir.c | 3 +- fs/smb/client/smb2ops.c | 42 ++++--- fs/smb/client/smb2pdu.c | 3 +- include/linux/netfs.h | 293 ++++++++++++++++++++++++++++++++++++++++++++-- 20 files changed, 450 insertions(+), 113 deletions(-) (limited to 'include') diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index d3aefbec4de6..34c115d7c250 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -75,17 +75,4 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) int v9fs_open_to_dotl_flags(int flags); -static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size) -{ - /* - * 32-bit need the lock, concurrent updates could break the - * sequences and make i_size_read() loop forever. - * 64-bit updates are atomic and can skip the locking. - */ - if (sizeof(i_size) > sizeof(long)) - spin_lock(&inode->i_lock); - i_size_write(inode, i_size); - if (sizeof(i_size) > sizeof(long)) - spin_unlock(&inode->i_lock); -} #endif diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index d1508b1fe109..f468acb8ee7d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1141,11 +1141,13 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - v9inode->netfs.remote_i_size = stat->length; + spin_lock(&inode->i_lock); + netfs_write_remote_i_size(inode, stat->length); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) - v9fs_i_size_write(inode, stat->length); + i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ inode->i_blocks = (stat->length + 512 - 1) >> 9; + spin_unlock(&inode->i_lock); v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 71796a89bcf4..141fb54db65d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -634,10 +634,12 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - v9inode->netfs.remote_i_size = stat->st_size; + spin_lock(&inode->i_lock); + netfs_write_remote_i_size(inode, stat->st_size); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) - v9fs_i_size_write(inode, stat->st_size); + i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; + spin_unlock(&inode->i_lock); } else { if (stat->st_result_mask & P9_STATS_ATIME) { inode_set_atime(inode, stat->st_atime_sec, @@ -662,13 +664,15 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; } + spin_lock(&inode->i_lock); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && stat->st_result_mask & P9_STATS_SIZE) { - v9inode->netfs.remote_i_size = stat->st_size; - v9fs_i_size_write(inode, stat->st_size); + netfs_write_remote_i_size(inode, stat->st_size); + i_size_write(inode, stat->st_size); } if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; + spin_unlock(&inode->i_lock); } if (stat->st_result_mask & P9_STATS_GEN) inode->i_generation = stat->st_gen; diff --git a/fs/afs/file.c b/fs/afs/file.c index 85696ac984cc..0467742bfeee 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -427,21 +427,35 @@ static void afs_free_request(struct netfs_io_request *rreq) afs_put_wb_key(rreq->netfs_priv2); } -static void afs_update_i_size(struct inode *inode, loff_t new_i_size) +/* + * Set the file size and block count, taking ->cb_lock and ->i_lock to maintain + * coherency and prevent 64-bit tearing on 32-bit arches. + * + * Also, estimate the number of 512 bytes blocks used, rounded up to nearest 1K + * for consistency with other AFS clients. + */ +void afs_set_i_size(struct afs_vnode *vnode, loff_t new_i_size) { - struct afs_vnode *vnode = AFS_FS_I(inode); + struct inode *inode = &vnode->netfs.inode; loff_t i_size; write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->netfs.inode); + spin_lock(&inode->i_lock); + i_size = i_size_read(inode); if (new_i_size > i_size) { - i_size_write(&vnode->netfs.inode, new_i_size); - inode_set_bytes(&vnode->netfs.inode, new_i_size); + i_size_write(inode, new_i_size); + inode_set_bytes(inode, round_up(new_i_size, 1024)); } + spin_unlock(&inode->i_lock); write_sequnlock(&vnode->cb_lock); fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size); } +static void afs_update_i_size(struct inode *inode, loff_t new_i_size) +{ + afs_set_i_size(AFS_FS_I(inode), new_i_size); +} + static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq) { struct afs_vnode *vnode = AFS_FS_I(wreq->inode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index a5173434f786..19fe2e392885 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -224,7 +224,8 @@ static int afs_inode_init_from_status(struct afs_operation *op, return afs_protocol_error(NULL, afs_eproto_file_type); } - afs_set_i_size(vnode, status->size); + i_size_write(inode, status->size); + inode_set_bytes(inode, status->size); afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; @@ -253,7 +254,8 @@ static void afs_apply_status(struct afs_operation *op, { struct afs_file_status *status = &vp->scb.status; struct afs_vnode *vnode = vp->vnode; - struct inode *inode = &vnode->netfs.inode; + struct netfs_inode *ictx = &vnode->netfs; + struct inode *inode = &ictx->inode; struct timespec64 t; umode_t mode; bool unexpected_jump = false; @@ -336,6 +338,8 @@ static void afs_apply_status(struct afs_operation *op, } if (data_changed) { + unsigned long long zero_point, size = status->size; + inode_set_iversion_raw(inode, status->data_version); /* Only update the size if the data version jumped. If the @@ -343,16 +347,25 @@ static void afs_apply_status(struct afs_operation *op, * idea of what the size should be that's not the same as * what's on the server. */ - vnode->netfs.remote_i_size = status->size; - if (change_size || status->size > i_size_read(inode)) { - afs_set_i_size(vnode, status->size); + spin_lock(&inode->i_lock); + + if (change_size || size > i_size_read(inode)) { + /* We can read the sizes directly as we hold i_lock. */ + zero_point = ictx->_zero_point; + if (unexpected_jump) - vnode->netfs.zero_point = status->size; + zero_point = size; + netfs_write_sizes(inode, size, size, zero_point); + inode_set_bytes(inode, size); inode_set_ctime_to_ts(inode, t); inode_set_atime_to_ts(inode, t); + } else { + netfs_write_remote_i_size(inode, size); } + spin_unlock(&inode->i_lock); + if (op->ops == &afs_fetch_data_operation) - op->fetch.subreq->rreq->i_size = status->size; + op->fetch.subreq->rreq->i_size = size; } } @@ -709,7 +722,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, * it, but we need to give userspace the server's size. */ if (S_ISDIR(inode->i_mode)) - stat->size = vnode->netfs.remote_i_size; + stat->size = netfs_read_remote_i_size(inode); } while (read_seqretry(&vnode->cb_lock, seq)); return 0; @@ -889,7 +902,7 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && attr->ia_size < i_size && - attr->ia_size > vnode->netfs.remote_i_size) { + attr->ia_size > netfs_read_remote_i_size(inode)) { truncate_setsize(inode, attr->ia_size); netfs_resize_file(&vnode->netfs, size, false); fscache_resize_cookie(afs_vnode_cache(vnode), diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 599353c33337..816dc848ea71 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1157,6 +1157,7 @@ extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); void afs_fetch_data_async_rx(struct work_struct *work); void afs_fetch_data_immediate_cancel(struct afs_call *call); +void afs_set_i_size(struct afs_vnode *vnode, loff_t new_i_size); /* * flock.c @@ -1758,16 +1759,6 @@ static inline void afs_update_dentry_version(struct afs_operation *op, (void *)(unsigned long)dir_vp->scb.status.data_version; } -/* - * Set the file size and block count. Estimate the number of 512 bytes blocks - * used, rounded up to nearest 1K for consistency with other AFS clients. - */ -static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) -{ - i_size_write(&vnode->netfs.inode, size); - vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; -} - /* * Check for a conflicting operation on a directory that we just unlinked from. * If someone managed to sneak a link or an unlink in on the file we just diff --git a/fs/afs/write.c b/fs/afs/write.c index fcfed9d24e0a..7f34b939706a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -142,7 +142,7 @@ static void afs_issue_write_worker(struct work_struct *work) afs_begin_vnode_operation(op); op->store.write_iter = &subreq->io_iter; - op->store.i_size = umax(pos + len, vnode->netfs.remote_i_size); + op->store.i_size = umax(pos + len, netfs_read_remote_i_size(&vnode->netfs.inode)); op->mtime = inode_get_mtime(&vnode->netfs.inode); afs_wait_for_operation(op); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index fee0aebf5a3d..ebd84a6cc3f0 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -209,7 +209,6 @@ static void netfs_issue_read(struct netfs_io_request *rreq, static void netfs_read_to_pagecache(struct netfs_io_request *rreq, struct readahead_control *ractl) { - struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned long long start = rreq->start; ssize_t size = rreq->len; int ret = 0; @@ -233,7 +232,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq, source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size); subreq->source = source; if (source == NETFS_DOWNLOAD_FROM_SERVER) { - unsigned long long zp = umin(ictx->zero_point, rreq->i_size); + unsigned long long zero_point = netfs_read_zero_point(rreq->inode); + unsigned long long zp = umin(zero_point, rreq->i_size); size_t len = subreq->len; if (unlikely(rreq->origin == NETFS_READ_SINGLE)) @@ -249,7 +249,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq, pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx", rreq->debug_id, subreq->debug_index, subreq->len, size, - subreq->start, ictx->zero_point, rreq->i_size); + subreq->start, zero_point, rreq->i_size); netfs_cancel_read(subreq, ret); break; } diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 05ea5b0cc0e8..b6ecd059dc4f 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -230,7 +230,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, * server would just return a block of zeros or a short read if * we try to read it. */ - if (fpos >= ctx->zero_point) { + if (fpos >= netfs_read_zero_point(inode)) { folio_zero_segment(folio, 0, offset); copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index f9ab69de3e29..25f8ceb15fad 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -376,8 +376,10 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret < 0) goto out; end = iocb->ki_pos + iov_iter_count(from); - if (end > ictx->zero_point) - ictx->zero_point = end; + spin_lock(&inode->i_lock); + if (end > ictx->_zero_point) + netfs_write_zero_point(inode, end); + spin_unlock(&inode->i_lock); fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode), FSCACHE_INVAL_DIO_WRITE); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 21357907b7ee..bad661ff2bec 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -211,18 +211,25 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct netfs_folio *finfo; - struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); + struct inode *inode = folio_inode(folio); + struct netfs_inode *ctx = netfs_inode(inode); size_t flen = folio_size(folio); _enter("{%lx},%zx,%zx", folio->index, offset, length); if (offset == 0 && length == flen) { - unsigned long long i_size = i_size_read(&ctx->inode); + unsigned long long i_size, remote_i_size, zero_point; unsigned long long fpos = folio_pos(folio), end; + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); end = umin(fpos + flen, i_size); - if (fpos < i_size && end > ctx->zero_point) - ctx->zero_point = end; + if (fpos < i_size && end > zero_point) { + spin_lock(&inode->i_lock); + end = umin(fpos + flen, inode->i_size); + if (fpos < i_size && end > ctx->_zero_point) + netfs_write_zero_point(inode, end); + spin_unlock(&inode->i_lock); + } } folio_wait_private_2(folio); /* [DEPRECATED] */ @@ -292,15 +299,22 @@ EXPORT_SYMBOL(netfs_invalidate_folio); */ bool netfs_release_folio(struct folio *folio, gfp_t gfp) { - struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); - unsigned long long end; + struct inode *inode = folio_inode(folio); + struct netfs_inode *ctx = netfs_inode(inode); + unsigned long long i_size, remote_i_size, zero_point, end; if (folio_test_dirty(folio)) return false; - end = umin(folio_next_pos(folio), i_size_read(&ctx->inode)); - if (end > ctx->zero_point) - ctx->zero_point = end; + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); + end = umin(folio_next_pos(folio), i_size); + if (end > zero_point) { + spin_lock(&inode->i_lock); + end = umin(folio_next_pos(folio), inode->i_size); + if (end > ctx->_zero_point) + netfs_write_zero_point(inode, end); + spin_unlock(&inode->i_lock); + } if (folio_test_private(folio)) return false; diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 7fbf50907a7f..24fc2bb2f8a4 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -57,7 +57,8 @@ static void netfs_dump_request(const struct netfs_io_request *rreq) int netfs_folio_written_back(struct folio *folio) { enum netfs_folio_trace why = netfs_folio_trace_clear; - struct netfs_inode *ictx = netfs_inode(folio->mapping->host); + struct inode *inode = folio_inode(folio); + struct netfs_inode *ictx = netfs_inode(inode); struct netfs_folio *finfo; struct netfs_group *group = NULL; int gcount = 0; @@ -69,8 +70,10 @@ int netfs_folio_written_back(struct folio *folio) unsigned long long fend; fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len; - if (fend > ictx->zero_point) - ictx->zero_point = fend; + spin_lock(&ictx->inode.i_lock); + if (fend > ictx->_zero_point) + netfs_write_zero_point(inode, fend); + spin_unlock(&ictx->inode.i_lock); folio_detach_private(folio); group = finfo->netfs_group; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 9f76b0347fa9..feac491c5070 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -434,7 +434,8 @@ cifs_alloc_inode(struct super_block *sb) spin_lock_init(&cifs_inode->writers_lock); cifs_inode->writers = 0; cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ - cifs_inode->netfs.remote_i_size = 0; + cifs_inode->netfs._remote_i_size = 0; + cifs_inode->netfs._zero_point = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; cifs_inode->epoch = 0; @@ -1303,7 +1304,8 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, struct cifsFileInfo *smb_file_src = src_file->private_data; struct cifsFileInfo *smb_file_target = dst_file->private_data; struct cifs_tcon *target_tcon, *src_tcon; - unsigned long long destend, fstart, fend, old_size, new_size; + unsigned long long i_size, old_size, new_size, zero_point; + unsigned long long destend, fstart, fend; unsigned int xid; int rc; @@ -1347,7 +1349,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, * Advance the EOF marker after the flush above to the end of the range * if it's short of that. */ - if (src_cifsi->netfs.remote_i_size < off + len) { + if (netfs_read_remote_i_size(src_inode) < off + len) { rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); if (rc < 0) goto unlock; @@ -1368,16 +1370,18 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false); if (rc) goto unlock; - if (fend > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = fend + 1; - old_size = target_cifsi->netfs.remote_i_size; + + spin_lock(&target_inode->i_lock); + if (fend > zero_point) + netfs_write_zero_point(target_inode, fend + 1); + i_size = target_inode->i_size; + spin_unlock(&target_inode->i_lock); /* Discard all the folios that overlap the destination region. */ cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend); truncate_inode_pages_range(&target_inode->i_data, fstart, fend); - fscache_invalidate(cifs_inode_cookie(target_inode), NULL, - i_size_read(target_inode), 0); + fscache_invalidate(cifs_inode_cookie(target_inode), NULL, i_size, 0); rc = -EOPNOTSUPP; if (target_tcon->ses->server->ops->duplicate_extents) { @@ -1402,8 +1406,12 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, rc = -EINVAL; } } - if (rc == 0 && new_size > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = new_size; + if (rc == 0) { + spin_lock(&target_inode->i_lock); + if (new_size > target_cifsi->netfs._zero_point) + netfs_write_zero_point(target_inode, new_size); + spin_unlock(&target_inode->i_lock); + } } /* force revalidate of size and timestamps of target file now @@ -1474,7 +1482,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, * Advance the EOF marker after the flush above to the end of the range * if it's short of that. */ - if (src_cifsi->netfs.remote_i_size < off + len) { + if (netfs_read_remote_i_size(src_inode) < off + len) { rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); if (rc < 0) goto unlock; @@ -1502,8 +1510,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, fscache_resize_cookie(cifs_inode_cookie(target_inode), i_size_read(target_inode)); } - if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = destoff + rc; + if (rc > 0) { + spin_lock(&target_inode->i_lock); + if (destoff + rc > target_cifsi->netfs._zero_point) + netfs_write_zero_point(target_inode, destoff + rc); + spin_unlock(&target_inode->i_lock); + } } file_accessed(src_file); diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 3990a9012264..9e27bfa7376b 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1465,6 +1465,7 @@ cifs_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid) struct cifs_io_subrequest *rdata = mid->callback_data; struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); + struct inode *inode = &ictx->inode; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 1, .rq_iter = rdata->subreq.io_iter }; @@ -1538,7 +1539,7 @@ do_retry: } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && - rdata->subreq.start + trans >= ictx->remote_i_size) { + rdata->subreq.start + trans >= netfs_read_remote_i_size(inode)) { rdata->result = 0; __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); } else if (rdata->got_bytes > 0) { diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 664a2c223089..b60344125f27 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2517,18 +2517,23 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result) { struct netfs_io_request *wreq = wdata->rreq; - struct netfs_inode *ictx = netfs_inode(wreq->inode); + struct inode *inode = wreq->inode; + struct netfs_inode *ictx = netfs_inode(inode); loff_t wrend; if (result > 0) { + spin_lock(&inode->i_lock); + wrend = wdata->subreq.start + wdata->subreq.transferred + result; - if (wrend > ictx->zero_point && + if (wrend > ictx->_zero_point && (wdata->rreq->origin == NETFS_UNBUFFERED_WRITE || wdata->rreq->origin == NETFS_DIO_WRITE)) - ictx->zero_point = wrend; - if (wrend > ictx->remote_i_size) + netfs_write_zero_point(inode, wrend); + if (wrend > ictx->_remote_i_size) netfs_resize_file(ictx, wrend, true); + + spin_unlock(&inode->i_lock); } netfs_write_subrequest_terminated(&wdata->subreq, result); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 16a5310155d5..9472c0a6c187 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -119,7 +119,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); mtime = inode_get_mtime(inode); if (timespec64_equal(&mtime, &fattr->cf_mtime) && - cifs_i->netfs.remote_i_size == fattr->cf_eof) { + netfs_read_remote_i_size(inode) == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); return; @@ -173,12 +173,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, CIFS_I(inode)->time = 0; /* force reval */ return -ESTALE; } - if (inode_state_read_once(inode) & I_NEW) - CIFS_I(inode)->netfs.zero_point = fattr->cf_eof; - cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); + if (inode_state_read_once(inode) & I_NEW) + netfs_write_zero_point(inode, fattr->cf_eof); + fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); fattr->cf_atime = timestamp_truncate(fattr->cf_atime, inode); fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode); @@ -212,7 +212,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, else clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); - cifs_i->netfs.remote_i_size = fattr->cf_eof; + netfs_write_remote_i_size(inode, fattr->cf_eof); /* * Can't safely change the file size here if the client is writing to * it due to potential races. @@ -2772,7 +2772,9 @@ cifs_revalidate_mapping(struct inode *inode) if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_RW_CACHE) goto skip_invalidate; - cifs_inode->netfs.zero_point = cifs_inode->netfs.remote_i_size; + spin_lock(&inode->i_lock); + netfs_write_zero_point(inode, netfs_inode(inode)->_remote_i_size); + spin_unlock(&inode->i_lock); rc = filemap_invalidate_inode(inode, true, 0, LLONG_MAX); if (rc) { cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n", diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index be22bbc4a65a..e860fa08b5e3 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -143,7 +143,8 @@ retry: fattr->cf_rdev = inode->i_rdev; fattr->cf_uid = inode->i_uid; fattr->cf_gid = inode->i_gid; - fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size; + fattr->cf_eof = + netfs_read_remote_i_size(inode); fattr->cf_symlink_target = NULL; } else { CIFS_I(inode)->time = 0; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index e6cb9b144530..0ea3ce1b94ea 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3402,8 +3402,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - struct netfs_inode *ictx = netfs_inode(inode); - unsigned long long i_size, new_size, remote_size; + unsigned long long i_size, new_size, remote_i_size, zero_point; long rc; unsigned int xid; @@ -3414,9 +3413,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, filemap_invalidate_lock(inode->i_mapping); - i_size = i_size_read(inode); - remote_size = ictx->remote_i_size; - if (offset + len >= remote_size && offset < i_size) { + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); + if (offset + len >= remote_i_size && offset < i_size) { unsigned long long top = umin(offset + len, i_size); rc = filemap_write_and_wait_range(inode->i_mapping, offset, top - 1); @@ -3449,9 +3447,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, new_size); if (rc >= 0) { truncate_setsize(inode, new_size); + spin_lock(&inode->i_lock); netfs_resize_file(&cifsi->netfs, new_size, true); - if (offset < cifsi->netfs.zero_point) - cifsi->netfs.zero_point = offset; + if (offset < cifsi->netfs._zero_point) + netfs_write_zero_point(inode, offset); + spin_unlock(&inode->i_lock); fscache_resize_cookie(cifs_inode_cookie(inode), new_size); } } @@ -3474,7 +3474,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; - unsigned long long end = offset + len, i_size, remote_i_size; + unsigned long long end = offset + len, i_size, remote_i_size, zero_point; long rc; unsigned int xid; __u8 set_sparse = 1; @@ -3516,14 +3516,17 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, * that we locally hole-punch the tail of the dirty data, the proposed * EOF update will end up in the wrong place. */ - i_size = i_size_read(inode); - remote_i_size = netfs_inode(inode)->remote_i_size; + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); + if (end > remote_i_size && i_size > remote_i_size) { unsigned long long extend_to = umin(end, i_size); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, extend_to); - if (rc >= 0) - netfs_inode(inode)->remote_i_size = extend_to; + if (rc >= 0) { + spin_lock(&inode->i_lock); + netfs_write_remote_i_size(inode, extend_to); + spin_unlock(&inode->i_lock); + } } unlock: @@ -3787,7 +3790,6 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - struct netfs_inode *ictx = &cifsi->netfs; loff_t old_eof, new_eof; xid = get_xid(); @@ -3805,7 +3807,9 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_pagecache_range(inode, off, old_eof); - ictx->zero_point = old_eof; + spin_lock(&inode->i_lock); + netfs_write_zero_point(inode, old_eof); + spin_unlock(&inode->i_lock); netfs_wait_for_outstanding_io(inode); rc = smb2_copychunk_range(xid, cfile, cfile, off + len, @@ -3822,8 +3826,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, rc = 0; truncate_setsize(inode, new_eof); + spin_lock(&inode->i_lock); netfs_resize_file(&cifsi->netfs, new_eof, true); - ictx->zero_point = new_eof; + netfs_write_zero_point(inode, new_eof); + spin_unlock(&inode->i_lock); fscache_resize_cookie(cifs_inode_cookie(inode), new_eof); out_2: filemap_invalidate_unlock(inode->i_mapping); @@ -3866,13 +3872,17 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_setsize(inode, new_eof); + spin_lock(&inode->i_lock); netfs_resize_file(&cifsi->netfs, i_size_read(inode), true); + spin_unlock(&inode->i_lock); fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode)); rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); if (rc < 0) goto out_2; - cifsi->netfs.zero_point = new_eof; + spin_lock(&inode->i_lock); + netfs_write_zero_point(inode, new_eof); + spin_unlock(&inode->i_lock); rc = smb3_zero_data(file, tcon, off, len, xid); if (rc < 0) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 995fcdd30681..3bd300347f16 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4608,6 +4608,7 @@ smb2_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid) struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); struct smb2_hdr *shdr = (struct smb2_hdr *)rdata->iov[0].iov_base; + struct inode *inode = &ictx->inode; struct cifs_credits credits = { .value = 0, .instance = 0, @@ -4721,7 +4722,7 @@ do_retry: } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && - rdata->subreq.start + trans >= ictx->remote_i_size) { + rdata->subreq.start + trans >= netfs_read_remote_i_size(inode)) { __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ba17ac5bf356..4fd1d796ad73 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -62,8 +62,8 @@ struct netfs_inode { struct fscache_cookie *cache; #endif struct mutex wb_lock; /* Writeback serialisation */ - loff_t remote_i_size; /* Size of the remote file */ - loff_t zero_point; /* Size after which we assume there's no data + loff_t _remote_i_size; /* Size of the remote file */ + loff_t _zero_point; /* Size after which we assume there's no data * on the server */ atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; @@ -474,6 +474,254 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode) return container_of(inode, struct netfs_inode, inode); } +/** + * netfs_read_remote_i_size - Read remote_i_size safely + * @inode: The inode to access + * + * Read remote_i_size safely without the potential for tearing on 32-bit + * arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline unsigned long long netfs_read_remote_i_size(const struct inode *inode) +{ + const struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); + unsigned long long remote_i_size; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + remote_i_size = ictx->_remote_i_size; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + remote_i_size = ictx->_remote_i_size; + preempt_enable(); +#else + /* Pairs with smp_store_release() in netfs_write_remote_i_size() */ + remote_i_size = smp_load_acquire(&ictx->_remote_i_size); +#endif + return remote_i_size; +} + +/* + * netfs_write_remote_i_size - Set remote_i_size safely + * @inode: The inode to access + * @remote_i_size: The new value for the size of the file on the server + * + * Set remote_i_size safely without the potential for tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_remote_i_size(), netfs_write_remote_i_size() does + * need locking around it (normally i_rwsem), otherwise on 32bit/SMP an update + * of i_size_seqcount can be lost, resulting in subsequent i_size_read() calls + * spinning forever. + */ +static inline void netfs_write_remote_i_size(struct inode *inode, + unsigned long long remote_i_size) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + ictx->_remote_i_size = remote_i_size; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + ictx->_remote_i_size = remote_i_size; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_remote_i_size() to + * ensure changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&ictx->_remote_i_size, remote_i_size); +#endif +} + +/** + * netfs_read_zero_point - Read zero_point safely + * @inode: The inode to access + * + * Read zero_point safely without the potential for tearing on 32-bit + * arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline unsigned long long netfs_read_zero_point(const struct inode *inode) +{ + struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); + unsigned long long zero_point; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + zero_point = ictx->_zero_point; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + zero_point = ictx->_zero_point; + preempt_enable(); +#else + /* Pairs with smp_store_release() in netfs_write_zero_point() */ + zero_point = smp_load_acquire(&ictx->_zero_point); +#endif + return zero_point; +} + +/* + * netfs_write_zero_point - Set zero_point safely + * @inode: The inode to access + * @zero_point: The new value for the point beyond which the server has no data + * + * Set zero_point safely without the potential for tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_zero_point(), netfs_write_zero_point() does need + * locking around it (normally i_rwsem), otherwise on 32bit/SMP an update of + * i_size_seqcount can be lost, resulting in subsequent read calls spinning + * forever. + */ +static inline void netfs_write_zero_point(struct inode *inode, + unsigned long long zero_point) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + ictx->_zero_point = zero_point; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + ictx->_zero_point = zero_point; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_zero_point() to + * ensure changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&ictx->_zero_point, zero_point); +#endif +} + +/** + * netfs_read_sizes - Read remote_i_size and zero_point safely + * @inode: The inode to access + * @i_size: Where to return the local file size. + * @remote_i_size: Where to return the size of the file on the server + * @zero_point: Where to return the the point beyond which the server has no data + * + * Read remote_i_size and zero_point safely without the potential for tearing + * on 32-bit arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline void netfs_read_sizes(const struct inode *inode, + unsigned long long *i_size, + unsigned long long *remote_i_size, + unsigned long long *zero_point) +{ + const struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + *i_size = inode->i_size; + *remote_i_size = ictx->_remote_i_size; + *zero_point = ictx->_zero_point; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + *i_size = inode->i_size; + *remote_i_size = ictx->_remote_i_size; + *zero_point = ictx->_zero_point; + preempt_enable(); +#else + /* Pairs with smp_store_release() in i_size_write() */ + *i_size = smp_load_acquire(&inode->i_size); + /* Pairs with smp_store_release() in netfs_write_remote_i_size() */ + *remote_i_size = smp_load_acquire(&ictx->_remote_i_size); + /* Pairs with smp_store_release() in netfs_write_zero_point() */ + *zero_point = smp_load_acquire(&ictx->_zero_point); +#endif +} + +/* + * netfs_write_sizes - Set i_size, remote_i_size and zero_point safely + * @inode: The inode to access + * @i_size: The new value for the local size of the file + * @remote_i_size: The new value for the size of the file on the server + * @zero_point: The new value for the point beyond which the server has no data + * + * Set both remote_i_size and zero_point safely without the potential for + * tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_zero_point(), netfs_write_zero_point() does need + * locking around it (normally i_rwsem), otherwise on 32bit/SMP an update of + * i_size_seqcount can be lost, resulting in subsequent read calls spinning + * forever. + */ +static inline void netfs_write_sizes(struct inode *inode, + unsigned long long i_size, + unsigned long long remote_i_size, + unsigned long long zero_point) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + inode->i_size = i_size; + ictx->_remote_i_size = remote_i_size; + ictx->_zero_point = zero_point; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + inode->i_size = i_size; + ictx->_remote_i_size = remote_i_size; + ictx->_zero_point = zero_point; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in i_size_read(), + * netfs_read_remote_i_size() and netfs_read_zero_point() to ensure + * changes related to inode size (such as page contents) are visible + * before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); + smp_store_release(&ictx->_remote_i_size, remote_i_size); + smp_store_release(&ictx->_zero_point, zero_point); +#endif +} + /** * netfs_inode_init - Initialise a netfslib inode context * @ctx: The netfs inode to initialise @@ -488,8 +736,8 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, bool use_zero_point) { ctx->ops = ops; - ctx->remote_i_size = i_size_read(&ctx->inode); - ctx->zero_point = LLONG_MAX; + ctx->_remote_i_size = i_size_read(&ctx->inode); + ctx->_zero_point = LLONG_MAX; ctx->flags = 0; atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) @@ -498,7 +746,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, mutex_init(&ctx->wb_lock); /* ->releasepage() drives zero_point */ if (use_zero_point) { - ctx->zero_point = ctx->remote_i_size; + ctx->_zero_point = ctx->_remote_i_size; mapping_set_release_always(ctx->inode.i_mapping); } } @@ -511,13 +759,40 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, * * Inform the netfs lib that a file got resized so that it can adjust its state. */ -static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size, +static inline void netfs_resize_file(struct netfs_inode *ictx, + unsigned long long new_i_size, bool changed_on_server) { +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + struct inode *inode = &ictx->inode; + + preempt_disable(); + write_seqcount_begin(&inode->i_size_seqcount); + if (changed_on_server) + ictx->_remote_i_size = new_i_size; + if (new_i_size < ictx->_zero_point) + ictx->_zero_point = new_i_size; + write_seqcount_end(&inode->i_size_seqcount); + preempt_enable(); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); if (changed_on_server) - ctx->remote_i_size = new_i_size; - if (new_i_size < ctx->zero_point) - ctx->zero_point = new_i_size; + ictx->_remote_i_size = new_i_size; + if (new_i_size < ictx->_zero_point) + ictx->_zero_point = new_i_size; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_remote_i_size and + * netfs_read_zero_point() to ensure changes related to inode size + * (such as page contents) are visible before we see the changed inode + * size. + */ + if (changed_on_server) + smp_store_release(&ictx->_remote_i_size, new_i_size); + if (new_i_size < ictx->_zero_point) + smp_store_release(&ictx->_zero_point, new_i_size); +#endif } /** -- cgit v1.2.3 From 156ac2ec2ee77c44c4eb7439d6d165247ba12247 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:48 +0100 Subject: netfs: Fix netfs_invalidate_folio() to clear dirty bit if all changes gone If a streaming write is made, this will leave the relevant modified folio in a not-uptodate, but dirty state with a netfs_folio struct hung off of folio->private indicating the dirty range. Subsequently truncating the file such that the dirty data in the folio is removed, but the first part of the folio theoretically remains will cause the netfs_folio struct to be discarded... but will leave the dirty flag set. If the folio is then read via mmap(), netfs_read_folio() will see that the page is dirty and jump to netfs_read_gaps() to fill in the missing bits. netfs_read_gaps(), however, expects there to be a netfs_folio struct present and can oops because truncate removed it. Fix this by calling folio_cancel_dirty() in netfs_invalidate_folio() in the event that all the dirty data in the folio is erased (as nfs does). Also add some tracepoints to log modifications to a dirty page. This can be reproduced with something like: dd if=/dev/zero of=/xfstest.test/foo bs=1M count=1 umount /xfstest.test mount /xfstest.test xfs_io -c "w 0xbbbf 0xf96c" \ -c "truncate 0xbbbf" \ -c "mmap -r 0xb000 0x11000" \ -c "mr 0xb000 0x11000" \ /xfstest.test/foo with fscaching disabled (otherwise streaming writes are suppressed) and a change to netfs_perform_write() to disallow streaming writes if the fd is open O_RDWR: if (//(file->f_mode & FMODE_READ) || <--- comment this out netfs_is_cache_enabled(ctx)) { It should be reproducible even without this change, but if prevents the above trivial xfs_io command from reproducing it. Note that the initial dd is important: the file must start out sufficiently large that the zero-point logic doesn't just clear the gaps because it knows there's nothing in the file to read yet. Unmounting and mounting is needed to clear the pagecache (there are other ways to do that that may also work). This was initially reproduced with the generic/522 xfstest on some patches that remove the FMODE_READ restriction. Fixes: 9ebff83e6481 ("netfs: Prep to use folio->private for write grouping and streaming write") Reported-by: Marc Dionne Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-12-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/misc.c | 6 +++++- include/trace/events/netfs.h | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 723571ca1b88..24b20e80e9a8 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -263,6 +263,7 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) /* Move the start of the data. */ finfo->dirty_len = fend - iend; finfo->dirty_offset = offset; + trace_netfs_folio(folio, netfs_folio_trace_invalidate_front); return; } @@ -271,12 +272,14 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) */ if (iend >= fend) { finfo->dirty_len = offset - fstart; + trace_netfs_folio(folio, netfs_folio_trace_invalidate_tail); return; } /* A partial write was split. The caller has already zeroed * it, so just absorb the hole. */ + trace_netfs_folio(folio, netfs_folio_trace_invalidate_middle); } return; @@ -284,8 +287,9 @@ erase_completely: netfs_put_group(netfs_folio_group(folio)); folio_detach_private(folio); folio_clear_uptodate(folio); + folio_cancel_dirty(folio); kfree(finfo); - return; + trace_netfs_folio(folio, netfs_folio_trace_invalidate_all); } EXPORT_SYMBOL(netfs_invalidate_folio); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 8c936fc575d5..0b702f74aefe 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -194,6 +194,10 @@ EM(netfs_folio_trace_copy_to_cache, "mark-copy") \ EM(netfs_folio_trace_end_copy, "end-copy") \ EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ + EM(netfs_folio_trace_invalidate_all, "inval-all") \ + EM(netfs_folio_trace_invalidate_front, "inval-front") \ + EM(netfs_folio_trace_invalidate_middle, "inval-mid") \ + EM(netfs_folio_trace_invalidate_tail, "inval-tail") \ EM(netfs_folio_trace_kill, "kill") \ EM(netfs_folio_trace_kill_cc, "kill-cc") \ EM(netfs_folio_trace_kill_g, "kill-g") \ -- cgit v1.2.3 From 7b4dcf1b9455a6e52ac7478b4057dbe10359576d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:50 +0100 Subject: netfs: Fix streaming write being overwritten In order to avoid reading whilst writing, netfslib will allow "streaming writes" in which dirty data is stored directly into folios without reading them first. Such folios are marked dirty but may not be marked uptodate. If a folio is entirely written by a streaming write, uptodate will be set, otherwise it will have a netfs_folio struct attached to ->private recording the dirty region. In the event that a partially written streaming write page is to be overwritten entirely by a single write(), netfs_perform_write() will try to copy over it, but doesn't discard the netfs_folio if it succeeds; further, it doesn't correctly handle a partial copy that overwrites some of the dirty data. Fix this by the following: (1) If the folio is successfully overwritten, free the netfs_folio struct before marking the page uptodate. (2) If the copy to the folio partially fails, but short of the dirty data, just ignore the copy. (3) If the copy partially fails and overwrites some of the dirty data, accept the copy, update the netfs_folio struct to record the new data. If the folio is now filled, free the netfs_folio and set uptodate, otherwise return a partial write. Found with: fsx -q -N 1000000 -p 10000 -o 128000 -l 600000 \ /xfstest.test/junk --replay-ops=junk.fsxops using the following as junk.fsxops: truncate 0x0 0 0x927c0 write 0x63fb8 0x53c8 0 copy_range 0xb704 0x19b9 0x24429 0x79380 write 0x2402b 0x144a2 0x90660 * write 0x204d5 0x140a0 0x927c0 * copy_range 0x1f72c 0x137d0 0x7a906 0x927c0 * read 0x00000 0x20000 0x9157c read 0x20000 0x20000 0x9157c read 0x40000 0x20000 0x9157c read 0x60000 0x20000 0x9157c read 0x7e1a0 0xcfb9 0x9157c on cifs with the default cache option. It shows folio 0x24 misbehaving if the FMODE_READ check is commented out in netfs_perform_write(): if (//(file->f_mode & FMODE_READ) || netfs_is_cache_enabled(ctx)) { and no fscache. This was initially found with the generic/522 xfstest. Fixes: 8f52de0077ba ("netfs: Reduce number of conditional branches in netfs_perform_write()") Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-14-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 47 ++++++++++++++++++++++++++++++++------------ include/trace/events/netfs.h | 3 +++ 2 files changed, 37 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 278aeb074e75..991552724868 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -246,18 +246,38 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, /* See if we can write a whole folio in one go. */ if (!maybe_trouble && offset == 0 && part >= flen) { copied = copy_folio_from_iter_atomic(folio, offset, part, iter); - if (unlikely(copied == 0)) + if (likely(copied == part)) { + if (finfo) { + trace = netfs_whole_folio_modify_filled; + goto folio_now_filled; + } + __netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + trace = netfs_whole_folio_modify; + goto copied; + } + if (copied == 0) goto copy_failed; - if (unlikely(copied < part)) { + if (!finfo || copied <= finfo->dirty_offset) { maybe_trouble = true; iov_iter_revert(iter, copied); copied = 0; folio_unlock(folio); goto retry; } - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); - trace = netfs_whole_folio_modify; + + /* We overwrote some existing dirty data, so we have to + * accept the partial write. + */ + finfo->dirty_len += finfo->dirty_offset; + if (finfo->dirty_len == flen) { + trace = netfs_whole_folio_modify_filled_efault; + goto folio_now_filled; + } + if (copied > finfo->dirty_len) + finfo->dirty_len = copied; + finfo->dirty_offset = 0; + trace = netfs_whole_folio_modify_efault; goto copied; } @@ -327,16 +347,10 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto copy_failed; finfo->dirty_len += copied; if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { - if (finfo->netfs_group) - folio_change_private(folio, finfo->netfs_group); - else - folio_detach_private(folio); - folio_mark_uptodate(folio); - kfree(finfo); trace = netfs_streaming_cont_filled_page; - } else { - trace = netfs_streaming_write_cont; + goto folio_now_filled; } + trace = netfs_streaming_write_cont; goto copied; } @@ -350,6 +364,13 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto out; continue; + folio_now_filled: + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + folio_mark_uptodate(folio); + kfree(finfo); copied: trace_netfs_folio(folio, trace); flush_dcache_folio(folio); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 0b702f74aefe..aa9940ba307b 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -177,6 +177,9 @@ EM(netfs_folio_is_uptodate, "mod-uptodate") \ EM(netfs_just_prefetch, "mod-prefetch") \ EM(netfs_whole_folio_modify, "mod-whole-f") \ + EM(netfs_whole_folio_modify_efault, "mod-whole-f!") \ + EM(netfs_whole_folio_modify_filled, "mod-whole-f+") \ + EM(netfs_whole_folio_modify_filled_efault, "mod-whole-f+!") \ EM(netfs_modify_and_clear, "mod-n-clear") \ EM(netfs_streaming_write, "mod-streamw") \ EM(netfs_streaming_write_cont, "mod-streamw+") \ -- cgit v1.2.3 From dbe556972100fabb8e5a1b3d2163831ff07b1e8e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:56 +0100 Subject: netfs: Fix potential UAF in netfs_unlock_abandoned_read_pages() netfs_unlock_abandoned_read_pages(rreq) accesses the index of the folios it is wanting to unlock and compares that to rreq->no_unlock_folio so that it doesn't unlock a folio being read for netfs_perform_write() or netfs_write_begin(). However, given that netfs_unlock_abandoned_read_pages() is called _after_ NETFS_RREQ_IN_PROGRESS is cleared, the one folio that it's not allowed to dereference is the one specified by ->no_unlock_folio as ownership immediately reverts to the caller. Fix this by storing the folio pointer instead and using that rather than the index. Also fix netfs_unlock_read_folio() where the same applies. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-20-dhowells@redhat.com cc: Paulo Alcantara cc: Viacheslav Dubeyko cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 4 ++-- fs/netfs/read_collect.c | 2 +- fs/netfs/read_retry.c | 2 +- include/linux/netfs.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 004d426c02b4..83d0b8153e96 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -670,7 +670,7 @@ retry: ret = PTR_ERR(rreq); goto error; } - rreq->no_unlock_folio = folio->index; + rreq->no_unlock_folio = folio; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); ret = netfs_begin_cache_read(rreq, ctx); @@ -736,7 +736,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, goto error; } - rreq->no_unlock_folio = folio->index; + rreq->no_unlock_folio = folio; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); ret = netfs_begin_cache_read(rreq, ctx); if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 3c9b847885c2..23660a590124 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -83,7 +83,7 @@ static void netfs_unlock_read_folio(struct netfs_io_request *rreq, } just_unlock: - if (folio->index == rreq->no_unlock_folio && + if (folio == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { _debug("no unlock"); } else { diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index e10eb5a07332..f59a70f3a086 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -292,7 +292,7 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq) struct folio *folio = folioq_folio(p, slot); if (folio && !folioq_is_marked2(p, slot)) { - if (folio->index == rreq->no_unlock_folio && + if (folio == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { _debug("no unlock"); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4fd1d796ad73..243c0f737938 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -252,7 +252,7 @@ struct netfs_io_request { unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ unsigned long long abandon_to; /* Position to abandon folios to */ - pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ + const struct folio *no_unlock_folio; /* Don't unlock this folio after read */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; unsigned int rsize; /* Maximum read size (0 for none) */ -- cgit v1.2.3 From ccde2ac757c713535b224233a296de40efe5212d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:58 +0100 Subject: netfs: Fix folio->private handling in netfs_perform_write() Under some circumstances, netfs_perform_write() doesn't correctly manipulate folio->private between NULL, NETFS_FOLIO_COPY_TO_CACHE, pointing to a group and pointing to a netfs_folio struct, leading to potential multiple attachments of private data with associated folio ref leaks and also leaks of netfs_folio structs or netfs_group refs. Fix this by consolidating the place at which a folio is marked uptodate in one place and having that look at what's attached to folio->private and decide how to clean it up and then set the new group. Also, the content shouldn't be flushed if group is NULL, even if a group is specified in the netfs_group parameter, as that would be the case for a new folio. A filesystem should always specify netfs_group or never specify netfs_group. The Sashiko auto-review tool noted that it was theoretically possible that the fpos >= ctx->zero_point section might leak if it modified a streaming write folio. This is unlikely, but with a network filesystem, third party changes can happen. It also pointed out that __netfs_set_group() would leak if called multiple times on the same folio from the "whole folio modify section". Fixes: 8f52de0077ba ("netfs: Reduce number of conditional branches in netfs_perform_write()") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-22-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 134 ++++++++++++++++++++++++++----------------- include/trace/events/netfs.h | 1 + 2 files changed, 82 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index f79fb5996540..6bde3320bcec 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -12,24 +12,6 @@ #include #include "internal.h" -static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) -{ - if (netfs_group) - folio_attach_private(folio, netfs_get_group(netfs_group)); -} - -static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) -{ - void *priv = folio_get_private(folio); - - if (unlikely(priv != netfs_group)) { - if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) - folio_attach_private(folio, netfs_get_group(netfs_group)); - else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) - folio_detach_private(folio); - } -} - /* * Grab a folio for writing and lock it. Attempt to allocate as large a folio * as possible to hold as much of the remaining length as possible in one go. @@ -157,6 +139,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, size_t offset; /* Offset into pagecache folio */ size_t part; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ + void *priv; offset = pos & (max_chunk - 1); part = min(max_chunk - offset, iov_iter_count(iter)); @@ -202,6 +185,25 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto error_folio_unlock; } + finfo = netfs_folio_info(folio); + group = netfs_folio_group(folio); + + /* If the requested group differs from the group set on the + * page, then we need to flush out the folio if it has a group + * set (ie. is non-NULL). Note that COPY_TO_CACHE is a special + * case, being a netfs annotation rather than an actual group. + * + * The filesystem isn't permitted to mix writes with groups and + * writes without groups as the NULL group is used to indicate + * that no group is set. + */ + if (unlikely(group != netfs_group) && + group != NETFS_FOLIO_COPY_TO_CACHE && + group) { + WARN_ON_ONCE(!netfs_group); + goto flush_content; + } + /* Decide how we should modify a folio. We might be attempting * to do write-streaming, as we don't want to a local RMW cycle * if we can avoid it. If we're doing local caching or content @@ -209,22 +211,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, * file is open readably, then we let ->read_folio() fill in * the gaps. */ - finfo = netfs_folio_info(folio); - group = netfs_folio_group(folio); - - if (unlikely(group != netfs_group) && - group != NETFS_FOLIO_COPY_TO_CACHE) - goto flush_content; - if (folio_test_uptodate(folio)) { if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; - netfs_set_group(folio, netfs_group); trace = netfs_folio_is_uptodate; - goto copied; + goto copied_uptodate; } /* If the page is above the zero-point then we assume that the @@ -237,24 +231,22 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(copied == 0)) goto copy_failed; folio_zero_segment(folio, offset + copied, flen); - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); - trace = netfs_modify_and_clear; - goto copied; + if (finfo) + trace = netfs_modify_and_clear_rm_finfo; + else + trace = netfs_modify_and_clear; + goto mark_uptodate; } /* See if we can write a whole folio in one go. */ if (!maybe_trouble && offset == 0 && part >= flen) { copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (likely(copied == part)) { - if (finfo) { + if (finfo) trace = netfs_whole_folio_modify_filled; - goto folio_now_filled; - } - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); - trace = netfs_whole_folio_modify; - goto copied; + else + trace = netfs_whole_folio_modify; + goto mark_uptodate; } if (copied == 0) goto copy_failed; @@ -272,7 +264,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, finfo->dirty_len += finfo->dirty_offset; if (finfo->dirty_len == flen) { trace = netfs_whole_folio_modify_filled_efault; - goto folio_now_filled; + goto mark_uptodate; } if (copied > finfo->dirty_len) finfo->dirty_len = copied; @@ -300,11 +292,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; - netfs_set_group(folio, netfs_group); trace = netfs_just_prefetch; - goto copied; + goto copied_uptodate; } + /* Do a streaming write on a folio that has nothing in it yet. */ if (!finfo) { ret = -EIO; if (WARN_ON(folio_get_private(folio))) @@ -313,10 +305,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(copied == 0)) goto copy_failed; if (offset == 0 && copied == flen) { - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); trace = netfs_streaming_filled_page; - goto copied; + goto mark_uptodate; } finfo = kzalloc_obj(*finfo); @@ -345,7 +335,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, finfo->dirty_len += copied; if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { trace = netfs_streaming_cont_filled_page; - goto folio_now_filled; + goto mark_uptodate; } trace = netfs_streaming_write_cont; goto copied; @@ -361,13 +351,36 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto out; continue; - folio_now_filled: - if (finfo->netfs_group) - folio_change_private(folio, finfo->netfs_group); - else - folio_detach_private(folio); + /* Mark a folio as being up to data when we've filled it + * completely. If the folio has a group attached, then it must + * be the same group, otherwise we should have flushed it out + * above. We have to get rid of the netfs_folio struct if + * there was one. + */ + mark_uptodate: folio_mark_uptodate(folio); - kfree(finfo); + + copied_uptodate: + priv = folio_get_private(folio); + if (likely(priv == netfs_group)) { + /* Already set correctly; no change required. */ + } else if (priv == NETFS_FOLIO_COPY_TO_CACHE) { + if (!netfs_group) + folio_detach_private(folio); + else + folio_change_private(folio, netfs_get_group(netfs_group)); + } else if (!priv) { + folio_attach_private(folio, netfs_get_group(netfs_group)); + } else { + WARN_ON_ONCE(!finfo); + if (netfs_group) + /* finfo->netfs_group has a ref */ + folio_change_private(folio, netfs_group); + else + folio_detach_private(folio); + kfree(finfo); + } + copied: trace_netfs_folio(folio, trace); flush_dcache_folio(folio); @@ -530,6 +543,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct inode *inode = file_inode(file); struct netfs_inode *ictx = netfs_inode(inode); vm_fault_t ret = VM_FAULT_NOPAGE; + void *priv; int err; _enter("%lx", folio->index); @@ -550,7 +564,9 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr } group = netfs_folio_group(folio); - if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) { + if (group && + group != netfs_group && + group != NETFS_FOLIO_COPY_TO_CACHE) { folio_unlock(folio); err = filemap_fdatawrite_range(mapping, folio_pos(folio), @@ -572,7 +588,19 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus); else trace_netfs_folio(folio, netfs_folio_trace_mkwrite); - netfs_set_group(folio, netfs_group); + + priv = folio_get_private(folio); + if (priv != netfs_group) { + if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) + folio_detach_private(folio); + else if (netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) + folio_change_private(folio, netfs_get_group(netfs_group)); + else if (netfs_group && !priv) + folio_attach_private(folio, netfs_get_group(netfs_group)); + else + WARN_ON_ONCE(1); + } + file_update_time(file); set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags); if (ictx->ops->post_modify) diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index aa9940ba307b..082cb03c6131 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -181,6 +181,7 @@ EM(netfs_whole_folio_modify_filled, "mod-whole-f+") \ EM(netfs_whole_folio_modify_filled_efault, "mod-whole-f+!") \ EM(netfs_modify_and_clear, "mod-n-clear") \ + EM(netfs_modify_and_clear_rm_finfo, "mod-n-clear+") \ EM(netfs_streaming_write, "mod-streamw") \ EM(netfs_streaming_write_cont, "mod-streamw+") \ EM(netfs_flush_content, "flush") \ -- cgit v1.2.3 From 11f152c0acaa924d93339000cb785d34e003aff5 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 21 Apr 2026 16:27:01 +0200 Subject: xen/arm: Replace __ASSEMBLY__ with __ASSEMBLER__ in interface.h While the GCC and Clang compilers already define __ASSEMBLER__ automatically when compiling assembly code, __ASSEMBLY__ is a macro that only gets defined by the Makefiles in the kernel. This can be very confusing when switching between userspace and kernelspace coding, or when dealing with uapi headers that rather should use __ASSEMBLER__ instead. So let's standardize now on the __ASSEMBLER__ macro that is provided by the compilers. Signed-off-by: Thomas Huth Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Message-ID: <20260421142701.548978-1-thuth@redhat.com> --- include/xen/arm/interface.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/xen/arm/interface.h b/include/xen/arm/interface.h index c3eada2642aa..61360b89da40 100644 --- a/include/xen/arm/interface.h +++ b/include/xen/arm/interface.h @@ -30,7 +30,7 @@ #define __HYPERVISOR_platform_op_raw __HYPERVISOR_platform_op -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Explicitly size integers that represent pfns in the interface with * Xen so that we can have one ABI that works for 32 and 64 bit guests. * Note that this means that the xen_pfn_t type may be capable of -- cgit v1.2.3 From 2c85c61d1332e1e16f020d76951baf167dcb6f7a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 4 May 2026 10:47:22 +0200 Subject: HID: pass the buffer size to hid_report_raw_event commit 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") enforced the provided data to be at least the size of the declared buffer in the report descriptor to prevent a buffer overflow. However, we can try to be smarter by providing both the buffer size and the data size, meaning that hid_report_raw_event() can make better decision whether we should plaining reject the buffer (buffer overflow attempt) or if we can safely memset it to 0 and pass it to the rest of the stack. Fixes: 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires Acked-by: Johan Hovold Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- drivers/hid/bpf/hid_bpf_dispatch.c | 6 ++++-- drivers/hid/hid-core.c | 42 +++++++++++++++++++++++++------------- drivers/hid/hid-gfrm.c | 4 ++-- drivers/hid/hid-logitech-hidpp.c | 2 +- drivers/hid/hid-multitouch.c | 2 +- drivers/hid/hid-primax.c | 2 +- drivers/hid/hid-vivaldi-common.c | 2 +- drivers/hid/wacom_sys.c | 6 +++--- drivers/staging/greybus/hid.c | 2 +- include/linux/hid.h | 4 ++-- include/linux/hid_bpf.h | 14 ++++++++----- 11 files changed, 53 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c index 50c7b45c59e3..d0130658091b 100644 --- a/drivers/hid/bpf/hid_bpf_dispatch.c +++ b/drivers/hid/bpf/hid_bpf_dispatch.c @@ -24,7 +24,8 @@ EXPORT_SYMBOL(hid_ops); u8 * dispatch_hid_bpf_device_event(struct hid_device *hdev, enum hid_report_type type, u8 *data, - u32 *size, int interrupt, u64 source, bool from_bpf) + size_t *buf_size, u32 *size, int interrupt, u64 source, + bool from_bpf) { struct hid_bpf_ctx_kern ctx_kern = { .ctx = { @@ -74,6 +75,7 @@ dispatch_hid_bpf_device_event(struct hid_device *hdev, enum hid_report_type type *size = ret; } + *buf_size = ctx_kern.ctx.allocated_size; return ctx_kern.data; } EXPORT_SYMBOL_GPL(dispatch_hid_bpf_device_event); @@ -505,7 +507,7 @@ __hid_bpf_input_report(struct hid_bpf_ctx *ctx, enum hid_report_type type, u8 *b if (ret) return ret; - return hid_ops->hid_input_report(ctx->hid, type, buf, size, 0, (u64)(long)ctx, true, + return hid_ops->hid_input_report(ctx->hid, type, buf, size, size, 0, (u64)(long)ctx, true, lock_already_taken); } diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 61afec5915ec..a806820df7e5 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2033,24 +2033,32 @@ int __hid_request(struct hid_device *hid, struct hid_report *report, } EXPORT_SYMBOL_GPL(__hid_request); -int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, - int interrupt) +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt) { struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_report *report; struct hid_driver *hdrv; int max_buffer_size = HID_MAX_BUFFER_SIZE; u32 rsize, csize = size; + size_t bsize = bufsize; u8 *cdata = data; int ret = 0; report = hid_get_report(report_enum, data); if (!report) - goto out; + return 0; + + if (unlikely(bsize < csize)) { + hid_warn_ratelimited(hid, "Event data for report %d is incorrect (%d vs %ld)\n", + report->id, csize, bsize); + return -EINVAL; + } if (report_enum->numbered) { cdata++; csize--; + bsize--; } rsize = hid_compute_report_size(report); @@ -2063,11 +2071,16 @@ int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 * else if (rsize > max_buffer_size) rsize = max_buffer_size; + if (bsize < rsize) { + hid_warn_ratelimited(hid, "Event data for report %d was too short (%d vs %ld)\n", + report->id, rsize, bsize); + return -EINVAL; + } + if (csize < rsize) { - hid_warn_ratelimited(hid, "Event data for report %d was too short (%d vs %d)\n", - report->id, rsize, csize); - ret = -EINVAL; - goto out; + dbg_hid("report %d is too short, (%d < %d)\n", report->id, + csize, rsize); + memset(cdata + csize, 0, rsize - csize); } if ((hid->claimed & HID_CLAIMED_HIDDEV) && hid->hiddev_report_event) @@ -2075,7 +2088,7 @@ int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 * if (hid->claimed & HID_CLAIMED_HIDRAW) { ret = hidraw_report_event(hid, data, size); if (ret) - goto out; + return ret; } if (hid->claimed != HID_CLAIMED_HIDRAW && report->maxfield) { @@ -2087,15 +2100,15 @@ int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 * if (hid->claimed & HID_CLAIMED_INPUT) hidinput_report_event(hid, report); -out: + return ret; } EXPORT_SYMBOL_GPL(hid_report_raw_event); static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, - bool lock_already_taken) + u8 *data, size_t bufsize, u32 size, int interrupt, u64 source, + bool from_bpf, bool lock_already_taken) { struct hid_report_enum *report_enum; struct hid_driver *hdrv; @@ -2120,7 +2133,8 @@ static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, report_enum = hid->report_enum + type; hdrv = hid->driver; - data = dispatch_hid_bpf_device_event(hid, type, data, &size, interrupt, source, from_bpf); + data = dispatch_hid_bpf_device_event(hid, type, data, &bufsize, &size, interrupt, + source, from_bpf); if (IS_ERR(data)) { ret = PTR_ERR(data); goto unlock; @@ -2149,7 +2163,7 @@ static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, goto unlock; } - ret = hid_report_raw_event(hid, type, data, size, interrupt); + ret = hid_report_raw_event(hid, type, data, bufsize, size, interrupt); unlock: if (!lock_already_taken) @@ -2171,7 +2185,7 @@ unlock: int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) { - return __hid_input_report(hid, type, data, size, interrupt, 0, + return __hid_input_report(hid, type, data, size, size, interrupt, 0, false, /* from_bpf */ false /* lock_already_taken */); } diff --git a/drivers/hid/hid-gfrm.c b/drivers/hid/hid-gfrm.c index 699186ff2349..d2a56bf92b41 100644 --- a/drivers/hid/hid-gfrm.c +++ b/drivers/hid/hid-gfrm.c @@ -66,7 +66,7 @@ static int gfrm_raw_event(struct hid_device *hdev, struct hid_report *report, switch (data[1]) { case GFRM100_SEARCH_KEY_DOWN: ret = hid_report_raw_event(hdev, HID_INPUT_REPORT, search_key_dn, - sizeof(search_key_dn), 1); + sizeof(search_key_dn), sizeof(search_key_dn), 1); break; case GFRM100_SEARCH_KEY_AUDIO_DATA: @@ -74,7 +74,7 @@ static int gfrm_raw_event(struct hid_device *hdev, struct hid_report *report, case GFRM100_SEARCH_KEY_UP: ret = hid_report_raw_event(hdev, HID_INPUT_REPORT, search_key_up, - sizeof(search_key_up), 1); + sizeof(search_key_up), sizeof(search_key_up), 1); break; default: diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index b1330d23bd2d..b3ff9265377b 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -3673,7 +3673,7 @@ static int hidpp10_consumer_keys_raw_event(struct hidpp_device *hidpp, memcpy(&consumer_report[1], &data[3], 4); /* We are called from atomic context */ hid_report_raw_event(hidpp->hid_dev, HID_INPUT_REPORT, - consumer_report, 5, 1); + consumer_report, sizeof(consumer_report), 5, 1); return 1; } diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index e82a3c4e5b44..eeab0b6e32cc 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -533,7 +533,7 @@ static void mt_get_feature(struct hid_device *hdev, struct hid_report *report) } ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, buf, - size, 0); + size, size, 0); if (ret) dev_warn(&hdev->dev, "failed to report feature\n"); } diff --git a/drivers/hid/hid-primax.c b/drivers/hid/hid-primax.c index e44d79dff8de..8db054280afb 100644 --- a/drivers/hid/hid-primax.c +++ b/drivers/hid/hid-primax.c @@ -44,7 +44,7 @@ static int px_raw_event(struct hid_device *hid, struct hid_report *report, data[0] |= (1 << (data[idx] - 0xE0)); data[idx] = 0; } - hid_report_raw_event(hid, HID_INPUT_REPORT, data, size, 0); + hid_report_raw_event(hid, HID_INPUT_REPORT, data, size, size, 0); return 1; default: /* unknown report */ diff --git a/drivers/hid/hid-vivaldi-common.c b/drivers/hid/hid-vivaldi-common.c index bf734055d4b6..b12bb5cc091a 100644 --- a/drivers/hid/hid-vivaldi-common.c +++ b/drivers/hid/hid-vivaldi-common.c @@ -85,7 +85,7 @@ void vivaldi_feature_mapping(struct hid_device *hdev, } ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, report_data, - report_len, 0); + report_len, report_len, 0); if (ret) { dev_warn(&hdev->dev, "failed to report feature %d\n", field->report->id); diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 0d1c6d90fe21..a32320b351e3 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -90,7 +90,7 @@ static void wacom_wac_queue_flush(struct hid_device *hdev, kfree(buf); continue; } - err = hid_report_raw_event(hdev, HID_INPUT_REPORT, buf, size, false); + err = hid_report_raw_event(hdev, HID_INPUT_REPORT, buf, size, size, false); if (err) { hid_warn(hdev, "%s: unable to flush event due to error %d\n", __func__, err); @@ -334,7 +334,7 @@ static void wacom_feature_mapping(struct hid_device *hdev, data, n, WAC_CMD_RETRIES); if (ret == n && features->type == HID_GENERIC) { ret = hid_report_raw_event(hdev, - HID_FEATURE_REPORT, data, n, 0); + HID_FEATURE_REPORT, data, n, n, 0); } else if (ret == 2 && features->type != HID_GENERIC) { features->touch_max = data[1]; } else { @@ -395,7 +395,7 @@ static void wacom_feature_mapping(struct hid_device *hdev, data, n, WAC_CMD_RETRIES); if (ret == n) { ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, - data, n, 0); + data, n, n, 0); } else { hid_warn(hdev, "%s: could not retrieve sensor offsets\n", __func__); diff --git a/drivers/staging/greybus/hid.c b/drivers/staging/greybus/hid.c index 1f58c907c036..f1f9f6fbc00e 100644 --- a/drivers/staging/greybus/hid.c +++ b/drivers/staging/greybus/hid.c @@ -201,7 +201,7 @@ static void gb_hid_init_report(struct gb_hid *ghid, struct hid_report *report) * we just need to setup the input fields, so using * hid_report_raw_event is safe. */ - hid_report_raw_event(ghid->hid, report->type, ghid->inbuf, size, 1); + hid_report_raw_event(ghid->hid, report->type, ghid->inbuf, ghid->bufsize, size, 1); } static void gb_hid_init_reports(struct gb_hid *ghid) diff --git a/include/linux/hid.h b/include/linux/hid.h index 442a80d79e89..ac432a2ef415 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1298,8 +1298,8 @@ static inline u32 hid_report_len(struct hid_report *report) return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } -int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, - int interrupt); +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index a2e47dbcf82c..19fffa4574a4 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -72,8 +72,8 @@ struct hid_ops { int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, u64 source, bool from_bpf); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, - bool lock_already_taken); + u8 *data, size_t bufsize, u32 size, int interrupt, u64 source, + bool from_bpf, bool lock_already_taken); struct module *owner; const struct bus_type *bus_type; }; @@ -200,7 +200,8 @@ struct hid_bpf { #ifdef CONFIG_HID_BPF u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, - u32 *size, int interrupt, u64 source, bool from_bpf); + size_t *buf_size, u32 *size, int interrupt, u64 source, + bool from_bpf); int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, u32 size, enum hid_report_type rtype, @@ -215,8 +216,11 @@ int hid_bpf_device_init(struct hid_device *hid); const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size); #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 *size, int interrupt, - u64 source, bool from_bpf) { return data; } + u8 *data, size_t *buf_size, u32 *size, + int interrupt, u64 source, bool from_bpf) +{ + return data; +} static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, u8 *buf, u32 size, enum hid_report_type rtype, -- cgit v1.2.3 From 206342541fc887ae919774a43942dc883161fece Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 4 May 2026 10:47:23 +0200 Subject: HID: core: introduce hid_safe_input_report() hid_input_report() is used in too many places to have a commit that doesn't cross subsystem borders. Instead of changing the API, introduce a new one when things matters in the transport layers: - usbhid - i2chid This effectively revert to the old behavior for those two transport layers. Fixes: 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 25 +++++++++++++++++++++++++ drivers/hid/i2c-hid/i2c-hid-core.c | 7 ++++--- drivers/hid/usbhid/hid-core.c | 11 ++++++----- include/linux/hid.h | 2 ++ 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index a806820df7e5..b3596851c719 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2181,6 +2181,7 @@ unlock: * @interrupt: distinguish between interrupt and control transfers * * This is data entry for lower layers. + * Legacy, please use hid_safe_input_report() instead. */ int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) @@ -2191,6 +2192,30 @@ int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data } EXPORT_SYMBOL_GPL(hid_input_report); +/** + * hid_safe_input_report - report data from lower layer (usb, bt...) + * + * @hid: hid device + * @type: HID report type (HID_*_REPORT) + * @data: report contents + * @bufsize: allocated size of the data buffer + * @size: useful size of data parameter + * @interrupt: distinguish between interrupt and control transfers + * + * This is data entry for lower layers. + * Please use this function instead of the non safe version because we provide + * here the size of the buffer, allowing hid-core to make smarter decisions + * regarding the incoming buffer. + */ +int hid_safe_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt) +{ + return __hid_input_report(hid, type, data, bufsize, size, interrupt, 0, + false, /* from_bpf */ + false /* lock_already_taken */); +} +EXPORT_SYMBOL_GPL(hid_safe_input_report); + bool hid_match_one_id(const struct hid_device *hdev, const struct hid_device_id *id) { diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index 5a183af3d5c6..e0a302544cef 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -574,9 +574,10 @@ static void i2c_hid_get_input(struct i2c_hid *ihid) if (ihid->hid->group != HID_GROUP_RMI) pm_wakeup_event(&ihid->client->dev, 0); - hid_input_report(ihid->hid, HID_INPUT_REPORT, - ihid->inbuf + sizeof(__le16), - ret_size - sizeof(__le16), 1); + hid_safe_input_report(ihid->hid, HID_INPUT_REPORT, + ihid->inbuf + sizeof(__le16), + ihid->bufsize - sizeof(__le16), + ret_size - sizeof(__le16), 1); } return; diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index fbbfc0f60829..5af93b9b1fb5 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -283,9 +283,9 @@ static void hid_irq_in(struct urb *urb) break; usbhid_mark_busy(usbhid); if (!test_bit(HID_RESUME_RUNNING, &usbhid->iofl)) { - hid_input_report(urb->context, HID_INPUT_REPORT, - urb->transfer_buffer, - urb->actual_length, 1); + hid_safe_input_report(urb->context, HID_INPUT_REPORT, + urb->transfer_buffer, urb->transfer_buffer_length, + urb->actual_length, 1); /* * autosuspend refused while keys are pressed * because most keyboards don't wake up when @@ -482,9 +482,10 @@ static void hid_ctrl(struct urb *urb) switch (status) { case 0: /* success */ if (usbhid->ctrl[usbhid->ctrltail].dir == USB_DIR_IN) - hid_input_report(urb->context, + hid_safe_input_report(urb->context, usbhid->ctrl[usbhid->ctrltail].report->type, - urb->transfer_buffer, urb->actual_length, 0); + urb->transfer_buffer, urb->transfer_buffer_length, + urb->actual_length, 0); break; case -ESHUTDOWN: /* unplug */ unplug = 1; diff --git a/include/linux/hid.h b/include/linux/hid.h index ac432a2ef415..bfb9859f391e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1030,6 +1030,8 @@ struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_ty int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); +int hid_safe_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); -- cgit v1.2.3 From 32d5019ed3b6ff4439cb075fb275f655c8a2059c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2026 07:01:47 +0200 Subject: block: pass a minsize argument to bio_iov_iter_bounce When bouncing for block size > PAGE_SIZE file systems that require file system block size alignment (e.g. zoned XFS), the bio needs to be big enough to fit an entire block. Fixes: 8dd5e7c75d7b ("block: add helpers to bounce buffer an iov_iter into bios") Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260507050153.1298375-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 23 +++++++++++++---------- fs/iomap/direct-io.c | 2 +- include/linux/bio.h | 3 ++- 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index b8972dba68a0..f3e5d8bea08c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1279,11 +1279,12 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, return bio_iov_iter_align_down(bio, iter, len_align_mask); } -static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size) +static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size, + size_t minsize) { struct folio *folio; - while (*size > PAGE_SIZE) { + while (*size > minsize) { folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size)); if (folio) return folio; @@ -1307,7 +1308,7 @@ static void bio_free_folios(struct bio *bio) } static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, - size_t maxlen) + size_t maxlen, size_t minsize) { size_t total_len = min(maxlen, iov_iter_count(iter)); @@ -1322,13 +1323,13 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, size_t this_len = min(total_len, SZ_1M); struct folio *folio; - if (this_len > PAGE_SIZE * 2) + if (this_len > minsize * 2) this_len = rounddown_pow_of_two(this_len); if (bio->bi_iter.bi_size > BIO_MAX_SIZE - this_len) break; - folio = folio_alloc_greedy(GFP_KERNEL, &this_len); + folio = folio_alloc_greedy(GFP_KERNEL, &this_len, minsize); if (!folio) break; bio_add_folio_nofail(bio, folio, this_len, 0); @@ -1348,12 +1349,12 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, } static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, - size_t maxlen) + size_t maxlen, size_t minsize) { size_t len = min3(iov_iter_count(iter), maxlen, SZ_1M); struct folio *folio; - folio = folio_alloc_greedy(GFP_KERNEL, &len); + folio = folio_alloc_greedy(GFP_KERNEL, &len, minsize); if (!folio) return -ENOMEM; @@ -1390,6 +1391,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, * @bio: bio to send * @iter: iter to read from / write into * @maxlen: maximum size to bounce + * @minsize: minimum folio allocation size * * Helper for direct I/O implementations that need to bounce buffer because * we need to checksum the data or perform other operations that require @@ -1397,11 +1399,12 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, * copies the data into it. Needs to be paired with bio_iov_iter_unbounce() * called on completion. */ -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen) +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen, + size_t minsize) { if (op_is_write(bio_op(bio))) - return bio_iov_iter_bounce_write(bio, iter, maxlen); - return bio_iov_iter_bounce_read(bio, iter, maxlen); + return bio_iov_iter_bounce_write(bio, iter, maxlen, minsize); + return bio_iov_iter_bounce_read(bio, iter, maxlen, minsize); } static void bvec_unpin(struct bio_vec *bv, bool mark_dirty) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b0a6549b3848..b36ee619cdcd 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -355,7 +355,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, if (dio->flags & IOMAP_DIO_BOUNCE) ret = bio_iov_iter_bounce(bio, dio->submit.iter, - iomap_max_bio_size(&iter->iomap)); + iomap_max_bio_size(&iter->iomap), alignment); else ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1); diff --git a/include/linux/bio.h b/include/linux/bio.h index 97d747320b35..dc17780d6c1e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -475,7 +475,8 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen); +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen, + size_t minsize); void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, -- cgit v1.2.3 From df733ddc263dbe5f471e7c80c8b669532f56bf76 Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Mon, 11 May 2026 07:46:42 -0700 Subject: vfio/pci: Make VFIO_PCI_OFFSET_TO_INDEX() return unsigned VFIO_PCI_OFFSET_TO_INDEX() is used in several places with a signed parameter (e.g. loff_t). Because it makes no sense for a BAR/resource index to be negative, enforce this in the macro. This fixes at least one current issue, where vfio_pci_ioeventfd() uses this macro with an unvalidated signed loff_t returned into a signed type, leading to a possible negative array access. This instance does test against an out-of-bounds positive value, so treating the index as unsigned fixes this issue. Fixes: 89e1f7d4c66d8 ("vfio: Add PCI device driver") Signed-off-by: Matt Evans Link: https://lore.kernel.org/r/20260511144642.2926799-1-mattev@meta.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 2ebba746c18f..89165b769e5c 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -21,7 +21,7 @@ #define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 -#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_TO_INDEX(off) ((u64)(off) >> VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) -- cgit v1.2.3 From 620072fd783290ad92c2d445a47b0a61b161f352 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Apr 2026 12:31:17 -0700 Subject: mm/damon: fix damos_stat tracepoint format for sz_applied The print format is wrongly marking sz_applied as sz_tried. Fix it. Link: https://lore.kernel.org/20260426193119.88095-1-sj@kernel.org Fixes: 804c26b961da ("mm/damon/core: add trace point for damos stat per apply interval") Signed-off-by: SeongJae Park Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: # 7.0.x Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 24fc402ab3c8..7e25f4469b81 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -41,7 +41,7 @@ TRACE_EVENT(damos_stat_after_apply_interval, ), TP_printk("ctx_idx=%u scheme_idx=%u nr_tried=%lu sz_tried=%lu " - "nr_applied=%lu sz_tried=%lu sz_ops_filter_passed=%lu " + "nr_applied=%lu sz_applied=%lu sz_ops_filter_passed=%lu " "qt_exceeds=%lu nr_snapshots=%lu", __entry->context_idx, __entry->scheme_idx, __entry->nr_tried, __entry->sz_tried, -- cgit v1.2.3 From 6a288a4ddb4a994490505ab5f41c445f8e6b6467 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Tue, 21 Apr 2026 17:39:07 +0200 Subject: mm/page_alloc: fix initialization of tags of the huge zero folio with init_on_free __GFP_ZEROTAGS semantics are currently a bit weird, but effectively this flag is only ever set alongside __GFP_ZERO and __GFP_SKIP_KASAN. If we run with init_on_free, we will zero out pages during __free_pages_prepare(), to skip zeroing on the allocation path. However, when allocating with __GFP_ZEROTAG set, post_alloc_hook() will consequently not only skip clearing page content, but also skip clearing tag memory. Not clearing tags through __GFP_ZEROTAGS is irrelevant for most pages that will get mapped to user space through set_pte_at() later: set_pte_at() and friends will detect that the tags have not been initialized yet (PG_mte_tagged not set), and initialize them. However, for the huge zero folio, which will be mapped through a PMD marked as special, this initialization will not be performed, ending up exposing whatever tags were still set for the pages. The docs (Documentation/arch/arm64/memory-tagging-extension.rst) state that allocation tags are set to 0 when a page is first mapped to user space. That no longer holds with the huge zero folio when init_on_free is enabled. Fix it by decoupling __GFP_ZEROTAGS from __GFP_ZERO, passing to tag_clear_highpages() whether we want to also clear page content. Invert the meaning of the tag_clear_highpages() return value to have clearer semantics. Reproduced with the huge zero folio by modifying the check_buffer_fill arm64/mte selftest to use a 2 MiB area, after making sure that pages have a non-0 tag set when freeing (note that, during boot, we will not actually initialize tags, but only set KASAN_TAG_KERNEL in the page flags). $ ./check_buffer_fill 1..20 ... not ok 17 Check initial tags with private mapping, sync error mode and mmap memory not ok 18 Check initial tags with private mapping, sync error mode and mmap/mprotect memory ... This code needs more cleanups; we'll tackle that next, like decoupling __GFP_ZEROTAGS from __GFP_SKIP_KASAN. [akpm@linux-foundation.org: s/__GPF_ZERO/__GFP_ZERO/, per David] Link: https://lore.kernel.org/20260421-zerotags-v2-1-05cb1035482e@kernel.org Fixes: adfb6609c680 ("mm/huge_memory: initialise the tags of the huge zero folio") Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Catalin Marinas Tested-by: Lance Yang Cc: Brendan Jackman Cc: Dev Jain Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Will Deacon Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- arch/arm64/include/asm/page.h | 2 +- arch/arm64/mm/fault.c | 11 +++++++---- include/linux/gfp_types.h | 10 +++++----- include/linux/highmem.h | 7 ++++--- mm/page_alloc.c | 8 ++++---- 5 files changed, 21 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index e25d0d18f6d7..58200de8a221 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -33,7 +33,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr); #define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio -bool tag_clear_highpages(struct page *to, int numpages); +bool tag_clear_highpages(struct page *to, int numpages, bool clear_pages); #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0f3c5c7ca054..739800835920 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -1018,7 +1018,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, return vma_alloc_folio(flags, 0, vma, vaddr); } -bool tag_clear_highpages(struct page *page, int numpages) +bool tag_clear_highpages(struct page *page, int numpages, bool clear_pages) { /* * Check if MTE is supported and fall back to clear_highpage(). @@ -1026,13 +1026,16 @@ bool tag_clear_highpages(struct page *page, int numpages) * post_alloc_hook() will invoke tag_clear_highpages(). */ if (!system_supports_mte()) - return false; + return clear_pages; /* Newly allocated pages, shouldn't have been tagged yet */ for (int i = 0; i < numpages; i++, page++) { WARN_ON_ONCE(!try_page_mte_tagging(page)); - mte_zero_clear_page_tags(page_address(page)); + if (clear_pages) + mte_zero_clear_page_tags(page_address(page)); + else + mte_clear_page_tags(page_address(page)); set_page_mte_tagged(page); } - return true; + return false; } diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6c75df30a281..cd4972a7c97c 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -273,11 +273,11 @@ enum { * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself - * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that - * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting - * memory tags at the same time as zeroing memory has minimal additional - * performance impact. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time. Setting memory tags at + * the same time as zeroing memory (e.g., with __GFP_ZERO) has minimal + * additional performance impact. However, __GFP_ZEROTAGS also zeroes the tags + * even if memory is not getting zeroed at allocation time (e.g., + * with init_on_free). * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by diff --git a/include/linux/highmem.h b/include/linux/highmem.h index af03db851a1d..d7aac9de1c8a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -347,10 +347,11 @@ static inline void clear_highpage_kasan_tagged(struct page *page) #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -/* Return false to let people know we did not initialize the pages */ -static inline bool tag_clear_highpages(struct page *page, int numpages) +/* Returns true if the caller has to initialize the pages */ +static inline bool tag_clear_highpages(struct page *page, int numpages, + bool clear_pages) { - return false; + return clear_pages; } #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 227d58dc3de6..23c7298d3be2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1808,9 +1808,9 @@ static inline bool should_skip_init(gfp_t flags) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + const bool zero_tags = gfp_flags & __GFP_ZEROTAGS; bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); - bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); int i; set_page_private(page, 0); @@ -1832,11 +1832,11 @@ inline void post_alloc_hook(struct page *page, unsigned int order, */ /* - * If memory tags should be zeroed - * (which happens only when memory should be initialized as well). + * Clearing tags can efficiently clear the memory for us as well, if + * required. */ if (zero_tags) - init = !tag_clear_highpages(page, 1 << order); + init = tag_clear_highpages(page, 1 << order, /* clear_pages= */init); if (!should_skip_kasan_unpoison(gfp_flags) && kasan_unpoison_pages(page, order, init)) { -- cgit v1.2.3 From 6624bba469a325ecd699feae400b77cd11c76b98 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Mon, 11 May 2026 23:30:59 +0800 Subject: macsec: use rcu_work to defer RX SA crypto cleanup out of softirq crypto_free_aead() can internally invoke vunmap() (e.g. via dma_free_attrs() in hardware crypto drivers such as hisi_sec2). vunmap() must not be called from softirq context, but free_rxsa() is an RCU callback that runs in softirq, leading to a kernel crash: vunmap+0x4c/0x70 __iommu_dma_free+0xd0/0x138 dma_free_attrs+0xf4/0x100 sec_aead_exit+0x64/0xb8 [hisi_sec2] crypto_destroy_tfm+0x98/0x110 free_rxsa+0x28/0x50 [macsec] rcu_do_batch+0x184/0x460 rcu_core+0xf4/0x1f8 handle_softirqs+0x118/0x330 Use rcu_work to defer the cleanup to a workqueue. rcu_work dispatches the worker asynchronously after the RCU grace period, so no thread blocks waiting, and concurrent releases of multiple SAs naturally share the same grace period. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Jinliang Zheng Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260511153102.2640368-3-alexjlzheng@tencent.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 8 +++++--- include/net/macsec.h | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index ef5ac634f916..e7ad24f1ea5b 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -176,9 +176,10 @@ static void macsec_rxsc_put(struct macsec_rx_sc *sc) call_rcu(&sc->rcu_head, free_rx_sc_rcu); } -static void free_rxsa(struct rcu_head *head) +static void free_rxsa_work(struct work_struct *work) { - struct macsec_rx_sa *sa = container_of(head, struct macsec_rx_sa, rcu); + struct macsec_rx_sa *sa = + container_of(to_rcu_work(work), struct macsec_rx_sa, destroy_work); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); @@ -188,7 +189,7 @@ static void free_rxsa(struct rcu_head *head) static void macsec_rxsa_put(struct macsec_rx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) - call_rcu(&sa->rcu, free_rxsa); + queue_rcu_work(macsec_wq, &sa->destroy_work); } static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) @@ -1409,6 +1410,7 @@ static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len, rx_sa->next_pn = 1; refcount_set(&rx_sa->refcnt, 1); spin_lock_init(&rx_sa->lock); + INIT_RCU_WORK(&rx_sa->destroy_work, free_rxsa_work); return 0; } diff --git a/include/net/macsec.h b/include/net/macsec.h index bc7de5b53e54..0980ef36fbf0 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -123,6 +124,7 @@ struct macsec_dev_stats { * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats + * @destroy_work: deferred work to free the SA in process context after RCU grace period */ struct macsec_rx_sa { struct macsec_key key; @@ -136,7 +138,7 @@ struct macsec_rx_sa { bool active; struct macsec_rx_sa_stats __percpu *stats; struct macsec_rx_sc *sc; - struct rcu_head rcu; + struct rcu_work destroy_work; }; struct pcpu_rx_sc_stats { -- cgit v1.2.3 From 552cc2306c3d87632f44a655737d1d367c2a3295 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Mon, 11 May 2026 23:31:00 +0800 Subject: macsec: use rcu_work to defer TX SA crypto cleanup out of softirq free_txsa() is an RCU callback running in softirq context, but calls crypto_free_aead() which can invoke vunmap() internally on hardware crypto drivers (e.g. hisi_sec2), triggering a kernel crash. Use rcu_work to defer the cleanup to a workqueue, for the same reasons as the analogous fix to free_rxsa() in the previous patch. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Jinliang Zheng Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260511153102.2640368-4-alexjlzheng@tencent.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 8 +++++--- include/net/macsec.h | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index e7ad24f1ea5b..f904f4d16b45 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -205,9 +205,10 @@ static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) return sa; } -static void free_txsa(struct rcu_head *head) +static void free_txsa_work(struct work_struct *work) { - struct macsec_tx_sa *sa = container_of(head, struct macsec_tx_sa, rcu); + struct macsec_tx_sa *sa = + container_of(to_rcu_work(work), struct macsec_tx_sa, destroy_work); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); @@ -217,7 +218,7 @@ static void free_txsa(struct rcu_head *head) static void macsec_txsa_put(struct macsec_tx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) - call_rcu(&sa->rcu, free_txsa); + queue_rcu_work(macsec_wq, &sa->destroy_work); } static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) @@ -1510,6 +1511,7 @@ static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len, tx_sa->active = false; refcount_set(&tx_sa->refcnt, 1); spin_lock_init(&tx_sa->lock); + INIT_RCU_WORK(&tx_sa->destroy_work, free_txsa_work); return 0; } diff --git a/include/net/macsec.h b/include/net/macsec.h index 0980ef36fbf0..d962093ee923 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -176,6 +176,7 @@ struct macsec_rx_sc { * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats + * @destroy_work: deferred work to free the SA in process context after RCU grace period */ struct macsec_tx_sa { struct macsec_key key; @@ -188,7 +189,7 @@ struct macsec_tx_sa { refcount_t refcnt; bool active; struct macsec_tx_sa_stats __percpu *stats; - struct rcu_head rcu; + struct rcu_work destroy_work; }; /** -- cgit v1.2.3 From e83f5e24da741fa9405aeeff00b08c5ee7c37b88 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Wed, 6 May 2026 19:43:30 +0800 Subject: Bluetooth: serialize accept_q access bt_sock_poll() walks the accept queue without synchronization, while child teardown can unlink the same socket and drop its last reference. The unsynchronized accept queue walk has existed since the initial Bluetooth import. Protect accept_q with a dedicated lock for queue updates and polling. Also rework bt_accept_dequeue() to take temporary child references under the queue lock before dropping it and locking the child socket. Fixes: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Reported-by: Jann Horn Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Signed-off-by: Jiexun Wang Reviewed-by: Jann Horn Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 1 + net/bluetooth/af_bluetooth.c | 87 +++++++++++++++++++++++++++++---------- 2 files changed, 66 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 69eed69f7f26..3faea66b1979 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -398,6 +398,7 @@ void baswap(bdaddr_t *dst, const bdaddr_t *src); struct bt_sock { struct sock sk; struct list_head accept_q; + spinlock_t accept_q_lock; /* protects accept_q */ struct sock *parent; unsigned long flags; void (*skb_msg_name)(struct sk_buff *, void *, int *); diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 33d053d63407..9d68dd86023c 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -154,6 +154,7 @@ struct sock *bt_sock_alloc(struct net *net, struct socket *sock, sock_init_data(sock, sk); INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + spin_lock_init(&bt_sk(sk)->accept_q_lock); sock_reset_flag(sk, SOCK_ZAPPED); @@ -214,6 +215,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) { const struct cred *old_cred; struct pid *old_pid; + struct bt_sock *par = bt_sk(parent); BT_DBG("parent %p, sk %p", parent, sk); @@ -224,9 +226,13 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) else lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); bt_sk(sk)->parent = parent; + spin_lock_bh(&par->accept_q_lock); + list_add_tail(&bt_sk(sk)->accept_q, &par->accept_q); + sk_acceptq_added(parent); + spin_unlock_bh(&par->accept_q_lock); + /* Copy credentials from parent since for incoming connections the * socket is allocated by the kernel. */ @@ -244,8 +250,6 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) bh_unlock_sock(sk); else release_sock(sk); - - sk_acceptq_added(parent); } EXPORT_SYMBOL(bt_accept_enqueue); @@ -254,45 +258,72 @@ EXPORT_SYMBOL(bt_accept_enqueue); */ void bt_accept_unlink(struct sock *sk) { + struct sock *parent = bt_sk(sk)->parent; + BT_DBG("sk %p state %d", sk, sk->sk_state); + spin_lock_bh(&bt_sk(parent)->accept_q_lock); list_del_init(&bt_sk(sk)->accept_q); - sk_acceptq_removed(bt_sk(sk)->parent); + sk_acceptq_removed(parent); + spin_unlock_bh(&bt_sk(parent)->accept_q_lock); bt_sk(sk)->parent = NULL; sock_put(sk); } EXPORT_SYMBOL(bt_accept_unlink); +static struct sock *bt_accept_get(struct sock *parent, struct sock *sk) +{ + struct bt_sock *bt = bt_sk(parent); + struct sock *next = NULL; + + /* accept_q is modified from child teardown paths too, so take a + * temporary reference before dropping the queue lock. + */ + spin_lock_bh(&bt->accept_q_lock); + + if (sk) { + if (bt_sk(sk)->parent != parent) + goto out; + + if (!list_is_last(&bt_sk(sk)->accept_q, &bt->accept_q)) { + next = &list_next_entry(bt_sk(sk), accept_q)->sk; + sock_hold(next); + } + } else if (!list_empty(&bt->accept_q)) { + next = &list_first_entry(&bt->accept_q, + struct bt_sock, accept_q)->sk; + sock_hold(next); + } + +out: + spin_unlock_bh(&bt->accept_q_lock); + return next; +} + struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) { - struct bt_sock *s, *n; - struct sock *sk; + struct sock *sk, *next; BT_DBG("parent %p", parent); restart: - list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { - sk = (struct sock *)s; - + for (sk = bt_accept_get(parent, NULL); sk; sk = next) { /* Prevent early freeing of sk due to unlink and sock_kill */ - sock_hold(sk); lock_sock(sk); /* Check sk has not already been unlinked via * bt_accept_unlink() due to serialisation caused by sk locking */ - if (!bt_sk(sk)->parent) { + if (bt_sk(sk)->parent != parent) { BT_DBG("sk %p, already unlinked", sk); release_sock(sk); sock_put(sk); - /* Restart the loop as sk is no longer in the list - * and also avoid a potential infinite loop because - * list_for_each_entry_safe() is not thread safe. - */ goto restart; } + next = bt_accept_get(parent, sk); + /* sk is safely in the parent list so reduce reference count */ sock_put(sk); @@ -310,6 +341,8 @@ restart: sock_graft(sk, newsock); release_sock(sk); + if (next) + sock_put(next); return sk; } @@ -518,18 +551,28 @@ EXPORT_SYMBOL(bt_sock_stream_recvmsg); static inline __poll_t bt_accept_poll(struct sock *parent) { - struct bt_sock *s, *n; + struct bt_sock *bt = bt_sk(parent); + struct bt_sock *s; struct sock *sk; + __poll_t mask = 0; + + spin_lock_bh(&bt->accept_q_lock); + list_for_each_entry(s, &bt->accept_q, accept_q) { + int state; - list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { sk = (struct sock *)s; - if (sk->sk_state == BT_CONNECTED || - (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) && - sk->sk_state == BT_CONNECT2)) - return EPOLLIN | EPOLLRDNORM; + state = READ_ONCE(sk->sk_state); + + if (state == BT_CONNECTED || + (test_bit(BT_SK_DEFER_SETUP, &bt->flags) && + state == BT_CONNECT2)) { + mask = EPOLLIN | EPOLLRDNORM; + break; + } } + spin_unlock_bh(&bt->accept_q_lock); - return 0; + return mask; } __poll_t bt_sock_poll(struct file *file, struct socket *sock, -- cgit v1.2.3 From 31e62c2ebbfdc3fe3dbdf5e02c92a9dc67087a3a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 May 2026 11:37:18 -0700 Subject: ptrace: slightly saner 'get_dumpable()' logic The 'dumpability' of a task is fundamentally about the memory image of the task - the concept comes from whether it can core dump or not - and makes no sense when you don't have an associated mm. And almost all users do in fact use it only for the case where the task has a mm pointer. But we have one odd special case: ptrace_may_access() uses 'dumpable' to check various other things entirely independently of the MM (typically explicitly using flags like PTRACE_MODE_READ_FSCREDS). Including for threads that no longer have a VM (and maybe never did, like most kernel threads). It's not what this flag was designed for, but it is what it is. The ptrace code does check that the uid/gid matches, so you do have to be uid-0 to see kernel thread details, but this means that the traditional "drop capabilities" model doesn't make any difference for this all. Make it all make a *bit* more sense by saying that if you don't have a MM pointer, we'll use a cached "last dumpability" flag if the thread ever had a MM (it will be zero for kernel threads since it is never set), and require a proper CAP_SYS_PTRACE capability to override. Reported-by: Qualys Security Advisory Cc: Oleg Nesterov Cc: Kees Cook Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +++ kernel/exit.c | 1 + kernel/ptrace.c | 22 ++++++++++++++++------ 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 368c7b4d7cb5..ee06cba5c6f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1002,6 +1002,9 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif + /* Save user-dumpable when mm goes away */ + unsigned user_dumpable:1; + /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; diff --git a/kernel/exit.c b/kernel/exit.c index 9a909993ab1d..f50d73c272d6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -571,6 +571,7 @@ static void exit_mm(void) */ smp_mb__after_spinlock(); local_irq_disable(); + current->user_dumpable = (get_dumpable(mm) == SUID_DUMP_USER); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 68c17daef8d4..130043bfc209 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -272,11 +272,24 @@ static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) return ns_capable(ns, CAP_SYS_PTRACE); } +static bool task_still_dumpable(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm = task->mm; + if (mm) { + if (get_dumpable(mm) == SUID_DUMP_USER) + return true; + return ptrace_has_cap(mm->user_ns, mode); + } + + if (task->user_dumpable) + return true; + return ptrace_has_cap(&init_user_ns, mode); +} + /* Returns 0 on success, -errno on denial. */ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; - struct mm_struct *mm; kuid_t caller_uid; kgid_t caller_gid; @@ -337,11 +350,8 @@ ok: * Pairs with a write barrier in commit_creds(). */ smp_rmb(); - mm = task->mm; - if (mm && - ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(mm->user_ns, mode))) - return -EPERM; + if (!task_still_dumpable(task, mode)) + return -EPERM; return security_ptrace_access_check(task, mode); } -- cgit v1.2.3 From 5522d65d81a711c60a9969d37a485d48d0ad1496 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 10 May 2026 13:46:05 +0300 Subject: ipvs: avoid possible loop in ip_vs_dst_event on resizing Sashiko points out that unprivileged user can frequently call ip_vs_flush() or ip_vs_del_service() to trigger svc_table_changes updates that can lead to infinite loop in ip_vs_dst_event(). This can also happen if the user triggers frequent table resizing without deleting all services. We should also consider the possible effects if the user triggers many NETDEV_DOWN events. One way to solve it is to hold svc_resize_sem in ip_vs_dst_event() but this can block the dev notifier during the whole resizing process. Instead, use new rw_semaphore svc_replace_sem to protect just the svc_table replacement which is a short code section. Then hold svc_replace_sem in ip_vs_dst_event() to serialize with replacing the svc_table. As result, loop is avoided as there is no need to repeat the table walking from the start. By this way changes in svc_table_changes can happen only when all services are removed and all dev references dropped which allows us to abort the table walking. As IP_VS_WORK_SVC_NORESIZE is the flag used to stop the svc_resize_work under service_mutex, we should check only this flag often but not while under service_mutex. To remove the mutex_trylock() for service_mutex in the second phase where the resizer installs the new table after rehashing, we will avoid holding the service_mutex there. As result, the code in configuration context which is under service_mutex should access ipvs->svc_table under RCU because it can be replaced at anytime and released after a RCU grace period. As for ip_vs_zero_all(), it needs different solution as a table walker which can escape single RCU read-side critical section: to hold the svc_replace_sem to prevent table to be replaced. In ip_vs_status_show() prefer to hold svc_replace_sem to avoid many loops, just detect if the svc_table is removed. Prefer the newly attached table for the u_thresh/l_thresh checks to know when to grow/shrink while adding or deleting services because the new table size is based on the latest parameters. Link: https://sashiko.dev/#/patchset/20260505001648.360569-1-pablo%40netfilter.org Fixes: 840aac3d900d ("ipvs: use resizable hash table for services") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 3 +- net/netfilter/ipvs/ip_vs_ctl.c | 187 +++++++++++++++++++++++++++-------------- 2 files changed, 124 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 02762ce73a0c..a02e569813d2 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1186,8 +1186,9 @@ struct netns_ipvs { struct timer_list dest_trash_timer; /* expiration timer */ struct mutex service_mutex; /* service reconfig */ struct rw_semaphore svc_resize_sem; /* svc_table resizing */ + struct rw_semaphore svc_replace_sem; /* svc_table replace */ struct delayed_work svc_resize_work; /* resize svc_table */ - atomic_t svc_table_changes;/* ++ on new table */ + atomic_t svc_table_changes;/* ++ on table changes */ /* Service counters */ atomic_t num_services[IP_VS_AF_MAX]; /* Services */ atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c7c7f6a7a9f6..bd9cae44d214 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -327,18 +327,22 @@ ip_vs_use_count_dec(void) /* Service hashing: * Operation Locking order * --------------------------------------------------------------------------- - * add table service_mutex, svc_resize_sem(W) - * del table service_mutex - * move between tables svc_resize_sem(W), seqcount_t(W), bit lock - * add/del service service_mutex, bit lock + * add first table service_mutex + * attach new table service_mutex + * add/del service service_mutex, RCU, bit lock + * move between tables (rehash) svc_resize_sem(W), seqcount_t(W), bit lock + * replace old with attached svc_resize_sem(W), svc_replace_sem(W) * find service RCU, seqcount_t(R) * walk services(blocking) service_mutex, svc_resize_sem(R) * walk services(non-blocking) RCU, seqcount_t(R) + * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R) + * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R) + * del table service_mutex after stopped work * - * - new tables are linked/unlinked under service_mutex and svc_resize_sem - * - new table is linked on resizing and all operations can run in parallel - * in 2 tables until the new table is registered as current one - * - two contexts can modify buckets: config and table resize, both in + * - new table is attached on resizing under service_mutex and all operations + * can run in parallel in 2 tables until the new table is registered as current + * one + * - two contexts can modify buckets: config and table resize (work), both in * process context * - only table resizer can move entries, so we do not protect t->seqc[] * items with t->lock[] @@ -346,9 +350,13 @@ ip_vs_use_count_dec(void) * services are moved to new table * - move operations may disturb readers: find operation will not miss entries * but walkers may see same entry twice if they are forced to retry chains - * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold - * service_mutex to disallow new tables to be installed or to check + * or to walk the newly attached second table + * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to check * svc_table_changes and repeat the RCU read section if new table is installed + * - walkers may serialize with the whole resizing process (svc_resize_sem) + * to prevent seeing same service twice or just with the svc_table + * replace (svc_replace_sem) when we can see entries twice but we + * prefer to run concurrently with the rehashing. */ /* @@ -387,9 +395,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* increase its refcnt because it is referenced by the svc table */ atomic_inc(&svc->refcnt); + /* We know if new table is attached under service_mutex but rely on + * RCU to hold the old table to be freed in resizer + */ + rcu_read_lock(); + + /* This can be the old or the new table */ + t = rcu_dereference(ipvs->svc_table); + /* New entries go into recent table */ - t = rcu_dereference_protected(ipvs->svc_table, 1); - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); if (svc->fwmark == 0) { /* @@ -410,6 +425,8 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) hlist_bl_add_head_rcu(&svc->s_list, head); hlist_bl_unlock(head); + rcu_read_unlock(); + return 1; } @@ -432,7 +449,13 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) return 0; } - t = rcu_dereference_protected(ipvs->svc_table, 1); + /* We know if new table is attached under service_mutex but rely on + * RCU to hold the old table to be freed in resizer + */ + rcu_read_lock(); + + /* This can be the old or the new table */ + t = rcu_dereference(ipvs->svc_table); hash_key = READ_ONCE(svc->hash_key); /* We need to lock the bucket in the right table */ if (ip_vs_rht_same_table(t, hash_key)) { @@ -443,13 +466,13 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) /* Moved to new table ? */ if (hash_key != hash_key2) { hlist_bl_unlock(head); - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); head = t->buckets + (hash_key2 & t->mask); hlist_bl_lock(head); } } else { /* It is already moved to new table */ - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); head = t->buckets + (hash_key & t->mask); hlist_bl_lock(head); } @@ -459,6 +482,8 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); hlist_bl_unlock(head); + + rcu_read_unlock(); return 1; } @@ -666,15 +691,14 @@ static void svc_resize_work_handler(struct work_struct *work) goto unlock_sem; more_work = false; clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags); - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + if (!READ_ONCE(ipvs->enable)) goto unlock_m; t = rcu_dereference_protected(ipvs->svc_table, 1); /* Do nothing if table is removed */ if (!t) goto unlock_m; - /* New table needs to be registered? BUG! */ - if (t != rcu_dereference_protected(t->new_tbl, 1)) + /* New table already attached? BUG! */ + if (t != rcu_access_pointer(t->new_tbl)) goto unlock_m; lfactor = sysctl_svc_lfactor(ipvs); @@ -691,6 +715,7 @@ static void svc_resize_work_handler(struct work_struct *work) /* Flip the table_id */ t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + /* Attach new table */ rcu_assign_pointer(t->new_tbl, t_new); /* Allow add/del to new_tbl while moving from old table */ mutex_unlock(&ipvs->service_mutex); @@ -698,8 +723,8 @@ static void svc_resize_work_handler(struct work_struct *work) ip_vs_rht_for_each_bucket(t, bucket, head) { same_bucket: if (++limit >= 16) { - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, + /* Check if work is stopped */ + if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) goto unlock_sem; if (resched_score >= 100) { @@ -764,16 +789,12 @@ same_bucket: goto same_bucket; } - /* Tables can be switched only under service_mutex */ - while (!mutex_trylock(&ipvs->service_mutex)) { - cond_resched(); - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) - goto unlock_sem; - } - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) - goto unlock_m; + /* Serialize with readers that don't like svc_table changes */ + down_write(&ipvs->svc_replace_sem); + + /* Check if work is stopped to avoid synchronize_rcu() */ + if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_repl; rcu_assign_pointer(ipvs->svc_table, t_new); /* Inform readers that new table is installed */ @@ -781,8 +802,8 @@ same_bucket: atomic_inc(&ipvs->svc_table_changes); t_free = t; -unlock_m: - mutex_unlock(&ipvs->service_mutex); +unlock_repl: + up_write(&ipvs->svc_replace_sem); unlock_sem: up_write(&ipvs->svc_resize_sem); @@ -801,6 +822,11 @@ out: test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) return; queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); + return; + +unlock_m: + mutex_unlock(&ipvs->service_mutex); + goto unlock_sem; } static inline void @@ -1691,6 +1717,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_pe *pe = NULL; int ret_hooks = -1; int ret = 0; + bool grow; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1732,16 +1759,25 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif - t = rcu_dereference_protected(ipvs->svc_table, 1); + /* The old table can be freed, protect it with RCU */ + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); if (!t) { int lfactor = sysctl_svc_lfactor(ipvs); int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); + rcu_read_unlock(); t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); if (!t_new) { ret = -ENOMEM; goto out_err; } + grow = false; + } else { + /* Even the currently attached new table may need to grow */ + t = rcu_dereference(t->new_tbl); + grow = ip_vs_get_num_services(ipvs) + 1 > t->u_thresh; + rcu_read_unlock(); } if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { @@ -1800,6 +1836,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, goto out_err; if (t_new) { + /* Add table for first time */ clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); rcu_assign_pointer(ipvs->svc_table, t_new); t_new = NULL; @@ -1831,8 +1868,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_svc_hash(svc); /* Schedule resize work */ - if (t && ip_vs_get_num_services(ipvs) > t->u_thresh && - !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) + if (grow && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); @@ -2054,7 +2090,6 @@ static int ip_vs_del_service(struct ip_vs_service *svc) return -EEXIST; ipvs = svc->ipvs; ip_vs_unlink_service(svc, false); - t = rcu_dereference_protected(ipvs->svc_table, 1); /* Drop the table if no more services */ ns = ip_vs_get_num_services(ipvs); @@ -2062,6 +2097,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* Stop the resizer and drop the tables */ set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); cancel_delayed_work_sync(&ipvs->svc_resize_work); + t = rcu_dereference_protected(ipvs->svc_table, 1); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); /* Inform readers that table is removed */ @@ -2075,11 +2111,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc) t = p; } } - } else if (ns <= t->l_thresh && - !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, - &ipvs->work_flags)) { - queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, - 1); + } else { + bool shrink; + + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); + /* Even the currently attached new table may need to shrink */ + t = rcu_dereference(t->new_tbl); + shrink = ns <= t->l_thresh; + rcu_read_unlock(); + if (shrink && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, + &ipvs->work_flags)) + queue_delayed_work(system_unbound_wq, + &ipvs->svc_resize_work, 1); } return 0; } @@ -2184,17 +2228,21 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, struct ip_vs_service *svc; struct hlist_bl_node *e; struct ip_vs_dest *dest; - int old_gen, new_gen; + int old_gen; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + /* Allow concurrent rehashing on resize but to avoid loop + * serialize with installing the new table. + */ + down_read(&ipvs->svc_replace_sem); + old_gen = atomic_read(&ipvs->svc_table_changes); rcu_read_lock(); -repeat: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { @@ -2207,17 +2255,17 @@ repeat: } resched_score++; if (resched_score >= 100) { - resched_score = 0; cond_resched_rcu(); - new_gen = atomic_read(&ipvs->svc_table_changes); - /* New table installed ? */ - if (old_gen != new_gen) { - old_gen = new_gen; - goto repeat; - } + /* Flushed? So no more dev refs */ + if (atomic_read(&ipvs->svc_table_changes) != old_gen) + goto done; + resched_score = 0; } } + +done: rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); return NOTIFY_DONE; } @@ -2244,6 +2292,10 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) struct ip_vs_service *svc; struct hlist_bl_node *e; + /* svc_table can not be replaced (svc_replace_sem) or + * removed (service_mutex) + */ + down_read(&ipvs->svc_replace_sem); rcu_read_lock(); ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { @@ -2259,6 +2311,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) } rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; @@ -3062,6 +3115,7 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) u32 sum; int i; + /* Info for conns */ rcu_read_lock(); t = rcu_dereference(ipvs->conn_tab); @@ -3123,6 +3177,12 @@ repeat_conn: } after_conns: + rcu_read_unlock(); + + /* Info for services */ + down_read(&ipvs->svc_replace_sem); + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); count = ip_vs_get_num_services(ipvs); @@ -3133,9 +3193,7 @@ after_conns: if (!count) goto after_svc; old_gen = atomic_read(&ipvs->svc_table_changes); - loops = 0; -repeat_svc: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ memset(counts, 0, sizeof(counts)); ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { @@ -3157,15 +3215,10 @@ repeat_svc: if (resched_score >= 100) { resched_score = 0; cond_resched_rcu(); - new_gen = atomic_read(&ipvs->svc_table_changes); - /* New table installed ? */ - if (old_gen != new_gen) { - /* Too many changes? */ - if (++loops >= 5) - goto after_svc; - old_gen = new_gen; - goto repeat_svc; - } + /* Flushed? */ + if (atomic_read(&ipvs->svc_table_changes) != + old_gen) + goto after_svc; } counts[count]++; } @@ -3184,6 +3237,9 @@ repeat_svc: } after_svc: + rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); + seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", ipvs->est_kt_count, ipvs->est_max_threads); seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); @@ -3191,7 +3247,6 @@ after_svc: ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * IPVS_EST_NTICKS); - rcu_read_unlock(); return 0; } @@ -3503,7 +3558,7 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, int ret = 0; lockdep_assert_held(&ipvs->svc_resize_sem); - /* All service modifications are disabled, go ahead */ + /* All svc_table modifications are disabled, go ahead */ ip_vs_rht_walk_buckets(ipvs->svc_table, head) { hlist_bl_for_each_entry(svc, e, head, s_list) { /* Only expose IPv4 entries to old interface */ @@ -3687,7 +3742,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) pr_err("length: %u != %zu\n", *len, size); return -EINVAL; } - /* Protect against table resizer moving the entries. + /* Prevent modifications to the list with services. * Try reverse locking, so that we do not hold the mutex * while waiting for semaphore. */ @@ -4029,6 +4084,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, int start = cb->args[0]; int idx = 0; + /* Make sure we do not see same service twice during resize */ down_read(&ipvs->svc_resize_sem); rcu_read_lock(); ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { @@ -5072,6 +5128,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); init_rwsem(&ipvs->svc_resize_sem); + init_rwsem(&ipvs->svc_replace_sem); INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler); atomic_set(&ipvs->svc_table_changes, 0); RCU_INIT_POINTER(ipvs->svc_table, NULL); -- cgit v1.2.3 From b2870fc21601db9133bc70c48c603b487614fa3b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 14 May 2026 16:46:38 +0200 Subject: netfilter: br_netfilter: Reallocate headroom if necessary in neigh_hh_bridge() neigh_hh_bridge() assumes the skb always has sufficient headroom to copy the aligned L2 header. This assumption can trigger the crash reported below using the following netfilter setup: $modprobe br_netfilter $sysctl -w net.bridge.bridge-nf-call-iptables=1 $root@OpenWrt:~# nft list ruleset table ip nat { chain prerouting { type nat hook prerouting priority dstnat; policy accept; ip daddr 192.168.83.123 dnat to 192.168.83.120 } } - iperf3 client (192.168.83.119) --> bridge (192.168.83.118) --> iperf3 server (192.168.83.120) the iperf3 client is sending packet for 192.168.83.123 to the bridge device. [ 1579.036575] Unable to handle kernel write to read-only memory at virtual address ffffff8004d76ffe [ 1579.045482] Mem abort info: [ 1579.048273] ESR = 0x000000009600004f [ 1579.052024] EC = 0x25: DABT (current EL), IL = 32 bits [ 1579.057363] SET = 0, FnV = 0 [ 1579.060417] EA = 0, S1PTW = 0 [ 1579.063550] FSC = 0x0f: level 3 permission fault [ 1579.068345] Data abort info: [ 1579.071224] ISV = 0, ISS = 0x0000004f, ISS2 = 0x00000000 [ 1579.076720] CM = 0, WnR = 1, TnD = 0, TagAccess = 0 [ 1579.081770] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 1579.087092] swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000080dc4000 [ 1579.093794] [ffffff8004d76ffe] pgd=180000009ffff003, p4d=180000009ffff003, pud=180000009ffff003, pmd=180000009ffe3003, pte=0060000084d76787 [ 1579.106343] Internal error: Oops: 000000009600004f [#1] SMP [ 1579.193824] CPU: 0 UID: 0 PID: 235 Comm: napi/qdma_eth-3 Tainted: G O 6.12.57 #0 [ 1579.202614] Tainted: [O]=OOT_MODULE [ 1579.206102] Hardware name: Airoha AN7581 Evaluation Board (DT) [ 1579.211929] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 1579.218889] pc : br_nf_pre_routing_finish_bridge+0x1ac/0xcc8 [br_netfilter] [ 1579.225859] lr : br_nf_pre_routing_finish_bridge+0x18c/0xcc8 [br_netfilter] [ 1579.232822] sp : ffffffc0817cba20 [ 1579.236128] x29: ffffffc0817cba20 x28: 0000000000000000 x27: ffffff8002b89000 [ 1579.243273] x26: ffffff8004d7700e x25: 0000000000000008 x24: 0000000000000000 [ 1579.250416] x23: ffffffc08179d4c0 x22: 0000000000000000 x21: ffffffc08179d4c0 [ 1579.257561] x20: ffffff8004d9b800 x19: ffffff8015010000 x18: 0000000000000014 [ 1579.264704] x17: ffffffbf9e930000 x16: ffffffc0817c8000 x15: 0000000000000070 [ 1579.271848] x14: 0000000000000080 x13: 0000000000000001 x12: 0000000000000000 [ 1579.278993] x11: ffffffc0798caae0 x10: ffffff8014db6fd8 x9 : 0000000000000000 [ 1579.286136] x8 : 0000000000000003 x7 : ffffffc08171f628 x6 : 000000001a3b83d3 [ 1579.293281] x5 : 0000000000000000 x4 : 1beb76f22fee0000 x3 : ffffff8004d7700e [ 1579.300425] x2 : 0000000000000000 x1 : ffffff8004d9b8bc x0 : ffffff80026ed000 [ 1579.307570] Call trace: [ 1579.310018] br_nf_pre_routing_finish_bridge+0x1ac/0xcc8 [br_netfilter] [ 1579.316632] br_nf_hook_thresh+0xd4/0x14bc [br_netfilter] [ 1579.322032] br_nf_hook_thresh+0x250/0x14bc [br_netfilter] [ 1579.327517] br_nf_hook_thresh+0x76c/0x14bc [br_netfilter] [ 1579.333003] br_handle_frame+0x180/0x480 [ 1579.336935] __netif_receive_skb_core.constprop.0+0x540/0xf40 [ 1579.342682] __netif_receive_skb_one_core+0x28/0x50 [ 1579.347561] process_backlog+0x98/0x1e0 [ 1579.351398] __napi_poll+0x34/0x1c4 [ 1579.354887] net_rx_action+0x178/0x330 [ 1579.358638] handle_softirqs+0x108/0x2d4 [ 1579.362560] __do_softirq+0x10/0x18 [ 1579.366051] ____do_softirq+0xc/0x20 [ 1579.369627] call_on_irq_stack+0x30/0x4c [ 1579.373550] do_softirq_own_stack+0x18/0x20 [ 1579.377734] do_softirq+0x4c/0x60 [ 1579.381050] __local_bh_enable_ip+0x88/0x98 [ 1579.385234] napi_threaded_poll_loop+0x188/0x21c [ 1579.389853] napi_threaded_poll+0x70/0x80 [ 1579.393863] kthread+0xd8/0xdc [ 1579.396918] ret_from_fork+0x10/0x20 [ 1579.400499] Code: 88dffc22 3707ffc2 f9406663 f9406684 (f81f0064) [ 1579.406589] ---[ end trace 0000000000000000 ]--- [ 1579.411209] Kernel panic - not syncing: Oops: Fatal exception in interrupt [ 1579.418083] SMP: stopping secondary CPUs [ 1579.422012] Kernel Offset: disabled Fix the issue reallocating the skb headroom if necessary in neigh_hh_bridge routine. Fixes: e179e6322ac33 ("netfilter: bridge-netfilter: Fix MAC header handling with IP DNAT") Reviewed-by: Ido Schimmel Signed-off-by: Lorenzo Bianconi Signed-off-by: Pablo Neira Ayuso --- include/net/neighbour.h | 8 ++++++-- net/bridge/br_netfilter_hooks.c | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 2dfee6d4258a..8860cc2175fc 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -489,11 +489,15 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { - unsigned int seq, hh_alen; + unsigned int seq, hh_alen = HH_DATA_ALIGN(ETH_HLEN); + int err; + + err = skb_cow_head(skb, hh_alen); + if (err) + return err; do { seq = read_seqbegin(&hh->hh_lock); - hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 0ab1c94db4b9..0a394e5f4391 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -297,7 +297,11 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_ goto free_skb; } - neigh_hh_bridge(&neigh->hh, skb); + if (neigh_hh_bridge(&neigh->hh, skb)) { + neigh_release(neigh); + goto free_skb; + } + skb->dev = br_indev; ret = br_handle_frame_finish(net, sk, skb); -- cgit v1.2.3 From e196115ec330a18de415bdb9f5071aa9f08e53ce Mon Sep 17 00:00:00 2001 From: Haoze Xie Date: Fri, 15 May 2026 11:19:02 +0800 Subject: netfilter: nf_queue: hold bridge skb->dev while queued br_pass_frame_up() rewrites skb->dev from the ingress port to the bridge master before queueing bridge LOCAL_IN packets. NFQUEUE only holds references on state.in/out and bridge physdevs, so a queued bridge packet can retain a freed bridge master in skb->dev until reinjection. When the verdict is reinjected later, br_netif_receive_skb() re-enters the receive path with skb->dev still pointing at the freed bridge master, triggering a use-after-free. Store skb->dev in the queue entry, hold a reference on it for the queue lifetime, and use the saved device when dropping queued packets during NETDEV_DOWN handling. Fixes: ac2863445686 ("netfilter: bridge: add nf_afinfo to enable queuing to userspace") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Haoze Xie Signed-off-by: Ren Wei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 1 + net/netfilter/nf_queue.c | 4 +++- net/netfilter/nfnetlink_queue.c | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index d17035d14d96..3978c3174cdb 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -14,6 +14,7 @@ struct nf_queue_entry { struct list_head list; struct rhash_head hash_node; struct sk_buff *skb; + struct net_device *skb_dev; unsigned int id; unsigned int hook_index; /* index in hook_entries->hook[] */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index a6c81c04b3a5..57b450024a99 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -61,6 +61,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) struct nf_hook_state *state = &entry->state; /* Release those devices we held, or Alexey will kill me. */ + dev_put(entry->skb_dev); dev_put(state->in); dev_put(state->out); if (state->sk) @@ -102,6 +103,7 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) return false; + dev_hold(entry->skb_dev); dev_hold(state->in); dev_hold(state->out); @@ -202,11 +204,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, *entry = (struct nf_queue_entry) { .skb = skb, + .skb_dev = skb->dev, .state = *state, .hook_index = index, .size = sizeof(*entry) + route_key_size, }; - __nf_queue_entry_init_physdevs(entry); if (!nf_queue_entry_get_refs(entry)) { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 58304fd1f70f..984a0eb9e149 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1212,6 +1212,8 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) if (physinif == ifindex || physoutif == ifindex) return 1; #endif + if (entry->skb_dev && entry->skb_dev->ifindex == ifindex) + return 1; if (entry->state.in) if (entry->state.in->ifindex == ifindex) return 1; -- cgit v1.2.3 From 3d562d35a044ae798cab421c65a116f8cedfa5d4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 17 May 2026 09:55:28 +0200 Subject: bpf: Check global subprog exception paths Global subprogs are verified independently and are not descended into when their callers are symbolically executed. This means a caller can hold references or locks across a global subprog call that may throw, while the verifier only checks the non-exceptional return path at the call site. Record whether a subprog might throw in the CFG summary pass, alongside the existing might_sleep and packet-data-changing summaries, and propagate that effect through reachable callees. When a global subprog is marked as possibly throwing, push the normal continuation and validate the exceptional path immediately at the call site, avoiding a synthetic exception state and associated special case in the pruning checks. Fixes: f18b03fabaa9 ("bpf: Implement BPF exceptions") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260517075530.3461166-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/cfg.c | 13 ++++++++++++- kernel/bpf/verifier.c | 23 +++++++++++++++++------ 3 files changed, 31 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b148f816f25b..185b2aa43a42 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -729,6 +729,7 @@ struct bpf_subprog_info { */ s16 fastcall_stack_off; bool has_tail_call: 1; + bool might_throw: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; bool is_cb: 1; @@ -1308,6 +1309,7 @@ void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog); int bpf_find_subprog(struct bpf_verifier_env *env, int off); +bool bpf_is_throw_kfunc(struct bpf_insn *insn); int bpf_compute_const_regs(struct bpf_verifier_env *env); int bpf_prune_dead_branches(struct bpf_verifier_env *env); int bpf_check_cfg(struct bpf_verifier_env *env); diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c index 998f42a8189a..26d37066465f 100644 --- a/kernel/bpf/cfg.c +++ b/kernel/bpf/cfg.c @@ -64,11 +64,19 @@ static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) subprog->might_sleep = true; } +static void mark_subprog_might_throw(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = bpf_find_containing_subprog(env, off); + subprog->might_throw = true; +} + /* 't' is an index of a call-site. * 'w' is a callee entry point. * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. * Rely on DFS traversal order and absence of recursive calls to guarantee that - * callee's change_pkt_data marks would be correct at that moment. + * callee's effect marks would be correct at that moment. */ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) { @@ -78,6 +86,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) callee = bpf_find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; caller->might_sleep |= callee->might_sleep; + caller->might_throw |= callee->might_throw; } enum { @@ -509,6 +518,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_subprog_might_sleep(env, t); if (ret == 0 && bpf_is_kfunc_pkt_changing(&meta)) mark_subprog_changes_pkt_data(env, t); + if (ret == 0 && bpf_is_throw_kfunc(insn)) + mark_subprog_might_throw(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 88b40c979b56..7fb88e1cd7c4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -442,7 +442,6 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) static bool is_sync_callback_calling_kfunc(u32 btf_id); static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); -static bool is_bpf_throw_kfunc(struct bpf_insn *insn); static bool is_bpf_wq_set_callback_kfunc(u32 btf_id); static bool is_task_work_add_kfunc(u32 func_id); @@ -5405,7 +5404,7 @@ continue_func: if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) { bool err = false; - if (!is_bpf_throw_kfunc(insn + i)) + if (!bpf_is_throw_kfunc(insn + i)) continue; for (tmp = idx; tmp >= 0 && !err; tmp = dinfo[tmp].caller) { if (subprog[tmp].is_cb) { @@ -9499,6 +9498,9 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins return 0; } +static int process_bpf_exit_full(struct bpf_verifier_env *env, + bool *do_print_state, bool exception_exit); + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -9552,6 +9554,17 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; } + if (env->subprog_info[subprog].might_throw) { + struct bpf_verifier_state *branch; + + branch = push_stack(env, *insn_idx + 1, *insn_idx, false); + if (IS_ERR(branch)) { + verbose(env, "failed to push state for global subprog exception path\n"); + return PTR_ERR(branch); + } + return process_bpf_exit_full(env, NULL, true); + } + /* continue with next insn after call */ return 0; } @@ -11782,7 +11795,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id) is_task_work_add_kfunc(btf_id); } -static bool is_bpf_throw_kfunc(struct bpf_insn *insn) +bool bpf_is_throw_kfunc(struct bpf_insn *insn) { return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw]; @@ -12972,8 +12985,6 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); -static int process_bpf_exit_full(struct bpf_verifier_env *env, - bool *do_print_state, bool exception_exit); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) @@ -13354,7 +13365,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) env->prog->call_session_cookie = true; - if (is_bpf_throw_kfunc(insn)) + if (bpf_is_throw_kfunc(insn)) return process_bpf_exit_full(env, NULL, true); return 0; -- cgit v1.2.3 From f233124fb36cd57ef09f96d517a38ab4b902e15e Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 14 May 2026 09:39:01 +0200 Subject: ata: libata-scsi: do not use the deferred QC feature on PMPs with CBS When using Port Multipliers (PMPs) with Command-Based Switching (CBS), you can only issue commands to one link at a time. For PMPs with CBS, there is already code to handle commands being sent to different links in sata_pmp_qc_defer_cmd_switch() using ap->excl_link. sata_sil24 also makes use of ap->excl_link. A user on the list reported that commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") broke PMPs with CBS. The commit introduced code that stores a deferred qc in ap->deferred_qc, to later be issued via a workqueue. It turns out that this change is incompatible with the existing ap->excl_link handling used by PMPs with CBS. Thus, modify sata_pmp_qc_defer_cmd_switch() and sil24_qc_defer() to return ATA_DEFER_LINK_EXCL, and make sure that the deferred QC handling via workqueue is not used for this return value. This way, PMPs with CBS will work once again. Note that the starvation referenced in commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") can only happen on libsas ports, and libsas does not support Port Multipliers, thus there is no harm of reverting back to the previous way of deferring commands for PMPs with CBS. Non-libsas ports connected to anything but a PMP with CBS (e.g. a normal drive or a PMP with FBS) will continue using the deferred workqueue, since it does result in lower completion latencies for non-NCQ commands, even though the workqueue is not strictly needed to avoid starvation for non-libsas ports. If we want to modify the scope of the workqueue issuing to also handle PMPs with CBS, then we should ensure that we can save both NCQ and non-NCQ commands in ap->deferred_qc, while also removing the existing PMP CBS handling using ap->excl_link, such that we don't duplicate features. While at it, also add a comment explaining how the ap->excl_link mechanism works. Fixes: 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") Tested-by: Tommy Kelly Reported-by: Tommy Kelly Closes: https://lore.kernel.org/linux-ide/ce09cc21-a8e9-4845-b205-35411e22fba9@tkel.ly/ Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-pmp.c | 13 ++++++++++++- drivers/ata/libata-scsi.c | 8 ++++++++ drivers/ata/sata_sil24.c | 6 +++++- include/linux/libata.h | 1 + 4 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c index e3adc008fed1..7e889534d73b 100644 --- a/drivers/ata/libata-pmp.c +++ b/drivers/ata/libata-pmp.c @@ -110,13 +110,24 @@ int sata_pmp_qc_defer_cmd_switch(struct ata_queued_cmd *qc) { struct ata_link *link = qc->dev->link; struct ata_port *ap = link->ap; + int ret; if (ap->excl_link == NULL || ap->excl_link == link) { if (ap->nr_active_links == 0 || ata_link_active(link)) { qc->flags |= ATA_QCFLAG_CLEAR_EXCL; - return ata_std_qc_defer(qc); + ret = ata_std_qc_defer(qc); + if (ret == ATA_DEFER_LINK) + return ATA_DEFER_LINK_EXCL; + return ret; } + /* + * Note: ap->excl_link contains the link that is next in line, + * i.e. implicit round robin. If there is only one link + * dispatching, ap->excl_link will be left unclaimed, allowing + * other links to set ap->excl_link, ensuring that the currently + * active link cannot queue any more. + */ ap->excl_link = link; } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index f03b6326ad2d..ca29744c57f9 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1787,6 +1787,14 @@ static int ata_scsi_qc_issue(struct ata_port *ap, struct ata_queued_cmd *qc) case ATA_DEFER_LINK: ret = SCSI_MLQUEUE_DEVICE_BUSY; goto defer_qc; + case ATA_DEFER_LINK_EXCL: + /* + * Drivers making use of ap->excl_link cannot store the QC in + * ap->deferred_qc, because the ap->excl_link handling is + * incompatible with the ap->deferred_qc workqueue handling. + */ + ret = SCSI_MLQUEUE_DEVICE_BUSY; + goto free_qc; case ATA_DEFER_PORT: ret = SCSI_MLQUEUE_HOST_BUSY; goto free_qc; diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index d642ece9f07a..57f1081b86db 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -789,6 +789,7 @@ static int sil24_qc_defer(struct ata_queued_cmd *qc) struct ata_link *link = qc->dev->link; struct ata_port *ap = link->ap; u8 prot = qc->tf.protocol; + int ret; /* * There is a bug in the chip: @@ -826,7 +827,10 @@ static int sil24_qc_defer(struct ata_queued_cmd *qc) qc->flags |= ATA_QCFLAG_CLEAR_EXCL; } - return ata_std_qc_defer(qc); + ret = ata_std_qc_defer(qc); + if (ret == ATA_DEFER_LINK) + return ATA_DEFER_LINK_EXCL; + return ret; } static enum ata_completion_errors sil24_qc_prep(struct ata_queued_cmd *qc) diff --git a/include/linux/libata.h b/include/linux/libata.h index 5c085ef4eda7..360776016b50 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -371,6 +371,7 @@ enum { /* return values for ->qc_defer */ ATA_DEFER_LINK = 1, ATA_DEFER_PORT = 2, + ATA_DEFER_LINK_EXCL = 3, /* desc_len for ata_eh_info and context */ ATA_EH_DESC_LEN = 80, -- cgit v1.2.3 From 759e8756da00aa115d504a18155b1d1ee1cc12e8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 14 May 2026 09:39:02 +0200 Subject: ata: libata-scsi: do not needlessly defer commands when using PMP with FBS The ACS specification does not allow a non-NCQ command to be issued while an NCQ command is outstanding. Commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") introduced a feature where a deferred non-NCQ command gets issued from a workqueue. The design stores a single non-NCQ command per port. However, when using Port Multipliers (PMPs), specifically PMPs that support FIS-Based Switching (FBS), non-NCQ and NCQ commands can be mixed on the same port, just not for the same link, see e.g. ata_std_qc_defer() which is, and always has operated on a per-link basis. Therefore, move the deferred_qc from struct ata_port to struct ata_link. This way, when using a PMP with FBS, we will not needlessly defer commands to all other links, just because one link issued a non-NCQ command while having an NCQ command outstanding. Only commands for that specific link will be deferred. This is in line with how PMPs with FBS worked before commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation"). Fixes: 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") Tested-by: Tommy Kelly Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 9 ++++++--- drivers/ata/libata-eh.c | 8 ++++---- drivers/ata/libata-pmp.c | 5 ++++- drivers/ata/libata-scsi.c | 43 +++++++++++++++++++++++++------------------ include/linux/libata.h | 6 +++--- 5 files changed, 42 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index e76d15411e2a..3d0027ec33c2 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5584,6 +5584,7 @@ void ata_link_init(struct ata_port *ap, struct ata_link *link, int pmp) link->pmp = pmp; link->active_tag = ATA_TAG_POISON; link->hw_sata_spd_limit = UINT_MAX; + INIT_WORK(&link->deferred_qc_work, ata_scsi_deferred_qc_work); /* can't use iterator, ap isn't initialized yet */ for (i = 0; i < ATA_MAX_DEVICES; i++) { @@ -5666,7 +5667,6 @@ struct ata_port *ata_port_alloc(struct ata_host *host) mutex_init(&ap->scsi_scan_mutex); INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug); INIT_DELAYED_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan); - INIT_WORK(&ap->deferred_qc_work, ata_scsi_deferred_qc_work); INIT_LIST_HEAD(&ap->eh_done_q); init_waitqueue_head(&ap->eh_wait_q); init_completion(&ap->park_req_pending); @@ -6291,12 +6291,15 @@ static void ata_port_detach(struct ata_port *ap) /* It better be dead now and not have any remaining deferred qc. */ WARN_ON(!(ap->pflags & ATA_PFLAG_UNLOADED)); - WARN_ON(ap->deferred_qc); - cancel_work_sync(&ap->deferred_qc_work); cancel_delayed_work_sync(&ap->hotplug_task); cancel_delayed_work_sync(&ap->scsi_rescan_task); + ata_for_each_link(link, ap, PMP_FIRST) { + WARN_ON(link->deferred_qc); + cancel_work_sync(&link->deferred_qc_work); + } + /* Delete port multiplier link transport devices */ if (ap->pmp_link) { int i; diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 9a4b67b90b17..d623eb32ed8b 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -651,11 +651,11 @@ int ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, if (qc->scsicmd != scmd) continue; if ((qc->flags & ATA_QCFLAG_ACTIVE) || - qc == ap->deferred_qc) + qc == qc->dev->link->deferred_qc) break; } - if (i < ATA_MAX_QUEUE && qc == ap->deferred_qc) { + if (i < ATA_MAX_QUEUE && qc == qc->dev->link->deferred_qc) { /* * This is a deferred command that timed out while * waiting for the command queue to drain. Since the qc @@ -666,8 +666,8 @@ int ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, * deferred qc work from issuing this qc. */ WARN_ON_ONCE(qc->flags & ATA_QCFLAG_ACTIVE); - ap->deferred_qc = NULL; - cancel_work(&ap->deferred_qc_work); + qc->dev->link->deferred_qc = NULL; + cancel_work(&qc->dev->link->deferred_qc_work); set_host_byte(scmd, DID_TIME_OUT); scsi_eh_finish_cmd(scmd, &ap->eh_done_q); } else if (i < ATA_MAX_QUEUE) { diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c index 7e889534d73b..e8540931b4a1 100644 --- a/drivers/ata/libata-pmp.c +++ b/drivers/ata/libata-pmp.c @@ -582,8 +582,11 @@ static void sata_pmp_detach(struct ata_device *dev) if (ap->ops->pmp_detach) ap->ops->pmp_detach(ap); - ata_for_each_link(tlink, ap, EDGE) + ata_for_each_link(tlink, ap, EDGE) { + WARN_ON(tlink->deferred_qc); + cancel_work_sync(&tlink->deferred_qc_work); ata_eh_detach_dev(tlink->device); + } spin_lock_irqsave(ap->lock, flags); ap->nr_pmp_links = 0; diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index ca29744c57f9..d43207c6e467 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1664,8 +1664,9 @@ static void ata_scsi_qc_done(struct ata_queued_cmd *qc, bool set_result, void ata_scsi_deferred_qc_work(struct work_struct *work) { - struct ata_port *ap = - container_of(work, struct ata_port, deferred_qc_work); + struct ata_link *link = + container_of(work, struct ata_link, deferred_qc_work); + struct ata_port *ap = link->ap; struct ata_queued_cmd *qc; unsigned long flags; @@ -1676,10 +1677,10 @@ void ata_scsi_deferred_qc_work(struct work_struct *work) * such case, we should not need any more deferring the qc, so warn if * qc_defer() says otherwise. */ - qc = ap->deferred_qc; + qc = link->deferred_qc; if (qc && !ata_port_eh_scheduled(ap)) { WARN_ON_ONCE(ap->ops->qc_defer(qc)); - ap->deferred_qc = NULL; + link->deferred_qc = NULL; ata_qc_issue(qc); } @@ -1688,7 +1689,7 @@ void ata_scsi_deferred_qc_work(struct work_struct *work) void ata_scsi_requeue_deferred_qc(struct ata_port *ap) { - struct ata_queued_cmd *qc = ap->deferred_qc; + struct ata_link *link; lockdep_assert_held(ap->lock); @@ -1697,16 +1698,21 @@ void ata_scsi_requeue_deferred_qc(struct ata_port *ap) * do not try to be smart about what to do with this deferred command * and simply requeue it by completing it with DID_REQUEUE. */ - if (qc) { - ap->deferred_qc = NULL; - cancel_work(&ap->deferred_qc_work); - ata_scsi_qc_done(qc, true, DID_REQUEUE << 16); + ata_for_each_link(link, ap, PMP_FIRST) { + struct ata_queued_cmd *qc = link->deferred_qc; + + if (qc) { + link->deferred_qc = NULL; + cancel_work(&link->deferred_qc_work); + ata_scsi_qc_done(qc, true, DID_REQUEUE << 16); + } } } -static void ata_scsi_schedule_deferred_qc(struct ata_port *ap) +static void ata_scsi_schedule_deferred_qc(struct ata_link *link) { - struct ata_queued_cmd *qc = ap->deferred_qc; + struct ata_queued_cmd *qc = link->deferred_qc; + struct ata_port *ap = link->ap; lockdep_assert_held(ap->lock); @@ -1723,12 +1729,12 @@ static void ata_scsi_schedule_deferred_qc(struct ata_port *ap) return; } if (!ap->ops->qc_defer(qc)) - queue_work(system_highpri_wq, &ap->deferred_qc_work); + queue_work(system_highpri_wq, &link->deferred_qc_work); } static void ata_scsi_qc_complete(struct ata_queued_cmd *qc) { - struct ata_port *ap = qc->ap; + struct ata_link *link = qc->dev->link; struct scsi_cmnd *cmd = qc->scsicmd; u8 *cdb = cmd->cmnd; bool have_sense = qc->flags & ATA_QCFLAG_SENSE_VALID; @@ -1759,11 +1765,12 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc) ata_scsi_qc_done(qc, false, 0); - ata_scsi_schedule_deferred_qc(ap); + ata_scsi_schedule_deferred_qc(link); } static int ata_scsi_qc_issue(struct ata_port *ap, struct ata_queued_cmd *qc) { + struct ata_link *link = qc->dev->link; int ret; if (!ap->ops->qc_defer) @@ -1774,7 +1781,7 @@ static int ata_scsi_qc_issue(struct ata_port *ap, struct ata_queued_cmd *qc) * requeue and defer all incoming commands until the deferred qc is * processed, once all on-going commands complete. */ - if (ap->deferred_qc) { + if (link->deferred_qc) { ata_qc_free(qc); return SCSI_MLQUEUE_DEVICE_BUSY; } @@ -1790,8 +1797,8 @@ static int ata_scsi_qc_issue(struct ata_port *ap, struct ata_queued_cmd *qc) case ATA_DEFER_LINK_EXCL: /* * Drivers making use of ap->excl_link cannot store the QC in - * ap->deferred_qc, because the ap->excl_link handling is - * incompatible with the ap->deferred_qc workqueue handling. + * link->deferred_qc, because the ap->excl_link handling is + * incompatible with the link->deferred_qc workqueue handling. */ ret = SCSI_MLQUEUE_DEVICE_BUSY; goto free_qc; @@ -1817,7 +1824,7 @@ defer_qc: * commands complete. */ if (!ata_is_ncq(qc->tf.protocol)) { - ap->deferred_qc = qc; + link->deferred_qc = qc; return 0; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 360776016b50..127229fbd1a6 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -855,6 +855,9 @@ struct ata_link { unsigned int sata_spd; /* current SATA PHY speed */ enum ata_lpm_policy lpm_policy; + struct work_struct deferred_qc_work; + struct ata_queued_cmd *deferred_qc; + /* record runtime error info, protected by host_set lock */ struct ata_eh_info eh_info; /* EH context */ @@ -900,9 +903,6 @@ struct ata_port { u64 qc_active; int nr_active_links; /* #links with active qcs */ - struct work_struct deferred_qc_work; - struct ata_queued_cmd *deferred_qc; - struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ -- cgit v1.2.3 From 379e8f1ca5e919b130b40d8115d92a536e5f8d7a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 18 May 2026 13:41:45 +0200 Subject: drm/gem: Make the GEM LRU lock part of drm_device Recently, a few races have been discovered in the GEM LRU logic, all of them caused by the fact the LRU lock is accessed through gem->lru->lock, and that very same lock also protects changes to gem->lru, leading to situations where gem->lru needs to first be accessed without the lock held, to then get the lru to access the lock through and finally take the lock and do the expected operation. Currently, the only driver making use of this API (MSM) declares a device-wide lock, and the user we're about to add (panthor) will do the same. There's no evidence that we will ever have a driver that wants different pools of LRUs protected by different locks under the same drm_device. So we're better off moving this lock to drm_device and always locking it through obj->dev->gem_lru_mutex, or directly through dev->gem_lru_mutex. If anyone ever needs more fine-grained locking, this can be revisited to pass some drm_gem_lru_pool object representing the pool of LRUs under a specific lock, but for now, the per-device lock seems to be enough. Fixes: e7c2af13f811 ("drm/gem: Add LRU/shrinker helper") Reported-by: Chia-I Wu Closes: https://gitlab.freedesktop.org/panfrost/linux/-/work_items/86 Reviewed-by: Rob Clark Reviewed-by: Liviu Dudau Reviewed-by: Steven Price Reviewed-by: Chia-I Wu Link: https://patch.msgid.link/20260518-panthor-shrinker-fixes-v4-1-1920234470d5@collabora.com Signed-off-by: Boris Brezillon --- drivers/gpu/drm/drm_drv.c | 2 ++ drivers/gpu/drm/drm_gem.c | 36 +++++++++++++++------------------- drivers/gpu/drm/msm/msm_drv.c | 11 +++++------ drivers/gpu/drm/msm/msm_drv.h | 7 ------- drivers/gpu/drm/msm/msm_gem.c | 33 +++++++++++++++---------------- drivers/gpu/drm/msm/msm_gem_shrinker.c | 4 ++-- drivers/gpu/drm/msm/msm_gem_submit.c | 6 +++--- drivers/gpu/drm/msm/msm_gem_vma.c | 12 ++++++------ drivers/gpu/drm/msm/msm_ringbuffer.c | 6 +++--- include/drm/drm_device.h | 7 +++++++ include/drm/drm_gem.h | 20 +++++++++---------- 11 files changed, 69 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index 985c283cf59f..675675480da4 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -697,6 +697,7 @@ static void drm_dev_init_release(struct drm_device *dev, void *res) mutex_destroy(&dev->master_mutex); mutex_destroy(&dev->clientlist_mutex); mutex_destroy(&dev->filelist_mutex); + mutex_destroy(&dev->gem_lru_mutex); } static int drm_dev_init(struct drm_device *dev, @@ -738,6 +739,7 @@ static int drm_dev_init(struct drm_device *dev, INIT_LIST_HEAD(&dev->vblank_event_list); spin_lock_init(&dev->event_lock); + mutex_init(&dev->gem_lru_mutex); mutex_init(&dev->filelist_mutex); mutex_init(&dev->clientlist_mutex); mutex_init(&dev->master_mutex); diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index d6424267260b..b95b015d2983 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -1541,12 +1541,10 @@ EXPORT_SYMBOL(drm_gem_unlock_reservations); * drm_gem_lru_init - initialize a LRU * * @lru: The LRU to initialize - * @lock: The lock protecting the LRU */ void -drm_gem_lru_init(struct drm_gem_lru *lru, struct mutex *lock) +drm_gem_lru_init(struct drm_gem_lru *lru) { - lru->lock = lock; lru->count = 0; INIT_LIST_HEAD(&lru->list); } @@ -1571,14 +1569,10 @@ drm_gem_lru_remove_locked(struct drm_gem_object *obj) void drm_gem_lru_remove(struct drm_gem_object *obj) { - struct drm_gem_lru *lru = obj->lru; - - if (!lru) - return; - - mutex_lock(lru->lock); - drm_gem_lru_remove_locked(obj); - mutex_unlock(lru->lock); + mutex_lock(&obj->dev->gem_lru_mutex); + if (obj->lru) + drm_gem_lru_remove_locked(obj); + mutex_unlock(&obj->dev->gem_lru_mutex); } EXPORT_SYMBOL(drm_gem_lru_remove); @@ -1593,7 +1587,7 @@ EXPORT_SYMBOL(drm_gem_lru_remove); void drm_gem_lru_move_tail_locked(struct drm_gem_lru *lru, struct drm_gem_object *obj) { - lockdep_assert_held_once(lru->lock); + lockdep_assert_held_once(&obj->dev->gem_lru_mutex); if (obj->lru) drm_gem_lru_remove_locked(obj); @@ -1617,9 +1611,9 @@ EXPORT_SYMBOL(drm_gem_lru_move_tail_locked); void drm_gem_lru_move_tail(struct drm_gem_lru *lru, struct drm_gem_object *obj) { - mutex_lock(lru->lock); + mutex_lock(&obj->dev->gem_lru_mutex); drm_gem_lru_move_tail_locked(lru, obj); - mutex_unlock(lru->lock); + mutex_unlock(&obj->dev->gem_lru_mutex); } EXPORT_SYMBOL(drm_gem_lru_move_tail); @@ -1633,6 +1627,7 @@ EXPORT_SYMBOL(drm_gem_lru_move_tail); * of the shrink callback to check for this (ie. dma_resv_test_signaled()) * or if necessary block until the buffer becomes idle. * + * @dev: DRM device the LRU belongs to * @lru: The LRU to scan * @nr_to_scan: The number of pages to try to reclaim * @remaining: The number of pages left to reclaim, should be initialized by caller @@ -1640,7 +1635,8 @@ EXPORT_SYMBOL(drm_gem_lru_move_tail); * @ticket: Optional ww_acquire_ctx context to use for locking */ unsigned long -drm_gem_lru_scan(struct drm_gem_lru *lru, +drm_gem_lru_scan(struct drm_device *dev, + struct drm_gem_lru *lru, unsigned int nr_to_scan, unsigned long *remaining, bool (*shrink)(struct drm_gem_object *obj, struct ww_acquire_ctx *ticket), @@ -1650,9 +1646,9 @@ drm_gem_lru_scan(struct drm_gem_lru *lru, struct drm_gem_object *obj; unsigned freed = 0; - drm_gem_lru_init(&still_in_lru, lru->lock); + drm_gem_lru_init(&still_in_lru); - mutex_lock(lru->lock); + mutex_lock(&dev->gem_lru_mutex); while (freed < nr_to_scan) { obj = list_first_entry_or_null(&lru->list, typeof(*obj), lru_node); @@ -1675,7 +1671,7 @@ drm_gem_lru_scan(struct drm_gem_lru *lru, * rest of the loop body, to reduce contention with other * code paths that need the LRU lock */ - mutex_unlock(lru->lock); + mutex_unlock(&dev->gem_lru_mutex); if (ticket) ww_acquire_init(ticket, &reservation_ww_class); @@ -1709,7 +1705,7 @@ drm_gem_lru_scan(struct drm_gem_lru *lru, tail: drm_gem_object_put(obj); - mutex_lock(lru->lock); + mutex_lock(&dev->gem_lru_mutex); } /* @@ -1721,7 +1717,7 @@ tail: list_splice_tail(&still_in_lru.list, &lru->list); lru->count += still_in_lru.count; - mutex_unlock(lru->lock); + mutex_unlock(&dev->gem_lru_mutex); return freed; } diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 195f40e331e5..cc2bcd14b1c2 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -128,11 +128,10 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv, /* * Initialize the LRUs: */ - mutex_init(&priv->lru.lock); - drm_gem_lru_init(&priv->lru.unbacked, &priv->lru.lock); - drm_gem_lru_init(&priv->lru.pinned, &priv->lru.lock); - drm_gem_lru_init(&priv->lru.willneed, &priv->lru.lock); - drm_gem_lru_init(&priv->lru.dontneed, &priv->lru.lock); + drm_gem_lru_init(&priv->lru.unbacked); + drm_gem_lru_init(&priv->lru.pinned); + drm_gem_lru_init(&priv->lru.willneed); + drm_gem_lru_init(&priv->lru.dontneed); /* Initialize stall-on-fault */ spin_lock_init(&priv->fault_stall_lock); @@ -140,7 +139,7 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv, /* Teach lockdep about lock ordering wrt. shrinker: */ fs_reclaim_acquire(GFP_KERNEL); - might_lock(&priv->lru.lock); + might_lock(&ddev->gem_lru_mutex); fs_reclaim_release(GFP_KERNEL); if (priv->kms_init) { diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 6d847d593f1a..617b3c4b42c0 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -150,13 +150,6 @@ struct msm_drm_private { * DONTNEED state (ie. can be purged) */ struct drm_gem_lru dontneed; - - /** - * lock: - * - * Protects manipulation of all of the LRUs. - */ - struct mutex lock; } lru; struct notifier_block vmap_notifier; diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 2cb3ab04f125..efd3d3c9a449 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -177,11 +177,11 @@ static void update_lru_locked(struct drm_gem_object *obj) static void update_lru(struct drm_gem_object *obj) { - struct msm_drm_private *priv = obj->dev->dev_private; + struct drm_device *dev = obj->dev; - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); update_lru_locked(obj); - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); } static struct page **get_pages(struct drm_gem_object *obj) @@ -292,11 +292,11 @@ void msm_gem_pin_obj_locked(struct drm_gem_object *obj) static void pin_obj_locked(struct drm_gem_object *obj) { - struct msm_drm_private *priv = obj->dev->dev_private; + struct drm_device *dev = obj->dev; - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); msm_gem_pin_obj_locked(obj); - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); } struct page **msm_gem_pin_pages_locked(struct drm_gem_object *obj) @@ -487,16 +487,16 @@ int msm_gem_pin_vma_locked(struct drm_gem_object *obj, struct drm_gpuva *vma) void msm_gem_unpin_locked(struct drm_gem_object *obj) { - struct msm_drm_private *priv = obj->dev->dev_private; + struct drm_device *dev = obj->dev; struct msm_gem_object *msm_obj = to_msm_bo(obj); msm_gem_assert_locked(obj); - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); msm_obj->pin_count--; GEM_WARN_ON(msm_obj->pin_count < 0); update_lru_locked(obj); - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); } /* Special unpin path for use in fence-signaling path, avoiding the need @@ -507,10 +507,10 @@ void msm_gem_unpin_locked(struct drm_gem_object *obj) */ void msm_gem_unpin_active(struct drm_gem_object *obj) { - struct msm_drm_private *priv = obj->dev->dev_private; + struct drm_device *dev = obj->dev; struct msm_gem_object *msm_obj = to_msm_bo(obj); - GEM_WARN_ON(!mutex_is_locked(&priv->lru.lock)); + GEM_WARN_ON(!mutex_is_locked(&dev->gem_lru_mutex)); msm_obj->pin_count--; GEM_WARN_ON(msm_obj->pin_count < 0); @@ -797,12 +797,12 @@ void msm_gem_put_vaddr(struct drm_gem_object *obj) */ int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv) { - struct msm_drm_private *priv = obj->dev->dev_private; + struct drm_device *dev = obj->dev; struct msm_gem_object *msm_obj = to_msm_bo(obj); msm_gem_lock(obj); - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); if (msm_obj->madv != __MSM_MADV_PURGED) msm_obj->madv = madv; @@ -814,7 +814,7 @@ int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv) */ update_lru_locked(obj); - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); msm_gem_unlock(obj); @@ -824,7 +824,6 @@ int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv) void msm_gem_purge(struct drm_gem_object *obj) { struct drm_device *dev = obj->dev; - struct msm_drm_private *priv = obj->dev->dev_private; struct msm_gem_object *msm_obj = to_msm_bo(obj); msm_gem_assert_locked(obj); @@ -839,10 +838,10 @@ void msm_gem_purge(struct drm_gem_object *obj) put_pages(obj); - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); /* A one-way transition: */ msm_obj->madv = __MSM_MADV_PURGED; - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); drm_gem_free_mmap_offset(obj); diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c index 31fa51a44f86..c07af9602fee 100644 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c @@ -186,7 +186,7 @@ msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) if (!stages[i].cond) continue; stages[i].freed = - drm_gem_lru_scan(stages[i].lru, nr, + drm_gem_lru_scan(priv->dev, stages[i].lru, nr, &stages[i].remaining, stages[i].shrink, &ticket); @@ -255,7 +255,7 @@ msm_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr) unsigned long remaining = 0; for (idx = 0; lrus[idx] && unmapped < vmap_shrink_limit; idx++) { - unmapped += drm_gem_lru_scan(lrus[idx], + unmapped += drm_gem_lru_scan(priv->dev, lrus[idx], vmap_shrink_limit - unmapped, &remaining, vmap_shrink, diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 26ea8a28be47..3c6bc90c3d48 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -352,7 +352,7 @@ static int submit_fence_sync(struct msm_gem_submit *submit) static int submit_pin_objects(struct msm_gem_submit *submit) { - struct msm_drm_private *priv = submit->dev->dev_private; + struct drm_device *dev = submit->dev; int i, ret = 0; for (i = 0; i < submit->nr_bos; i++) { @@ -381,11 +381,11 @@ static int submit_pin_objects(struct msm_gem_submit *submit) * get_pages() which could trigger reclaim.. and if we held the LRU lock * could trigger deadlock with the shrinker). */ - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); for (i = 0; i < submit->nr_bos; i++) { msm_gem_pin_obj_locked(submit->bos[i].obj); } - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); submit->bos_pinned = true; diff --git a/drivers/gpu/drm/msm/msm_gem_vma.c b/drivers/gpu/drm/msm/msm_gem_vma.c index 1a952b171ed7..c4cfe036066b 100644 --- a/drivers/gpu/drm/msm/msm_gem_vma.c +++ b/drivers/gpu/drm/msm/msm_gem_vma.c @@ -702,7 +702,7 @@ static struct dma_fence * msm_vma_job_run(struct drm_sched_job *_job) { struct msm_vm_bind_job *job = to_msm_vm_bind_job(_job); - struct msm_drm_private *priv = job->vm->drm->dev_private; + struct drm_device *dev = job->vm->drm; struct msm_gem_vm *vm = to_msm_vm(job->vm); struct drm_gem_object *obj; int ret = vm->unusable ? -EINVAL : 0; @@ -745,13 +745,13 @@ msm_vma_job_run(struct drm_sched_job *_job) if (ret) msm_gem_vm_unusable(job->vm); - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); job_foreach_bo (obj, job) { msm_gem_unpin_active(obj); } - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); /* VM_BIND ops are synchronous, so no fence to wait on: */ return NULL; @@ -1305,7 +1305,7 @@ vm_bind_job_pin_objects(struct msm_vm_bind_job *job) return PTR_ERR(pages); } - struct msm_drm_private *priv = job->vm->drm->dev_private; + struct drm_device *dev = job->vm->drm; /* * A second loop while holding the LRU lock (a) avoids acquiring/dropping @@ -1314,10 +1314,10 @@ vm_bind_job_pin_objects(struct msm_vm_bind_job *job) * get_pages() which could trigger reclaim.. and if we held the LRU lock * could trigger deadlock with the shrinker). */ - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); job_foreach_bo (obj, job) msm_gem_pin_obj_locked(obj); - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); job->bos_pinned = true; diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 30ddb5351e98..2d6b930b766e 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -16,13 +16,13 @@ static struct dma_fence *msm_job_run(struct drm_sched_job *job) struct msm_gem_submit *submit = to_msm_submit(job); struct msm_fence_context *fctx = submit->ring->fctx; struct msm_gpu *gpu = submit->gpu; - struct msm_drm_private *priv = gpu->dev->dev_private; + struct drm_device *dev = gpu->dev; unsigned nr_cmds = submit->nr_cmds; int i; msm_fence_init(submit->hw_fence, fctx); - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); for (i = 0; i < submit->nr_bos; i++) { struct drm_gem_object *obj = submit->bos[i].obj; @@ -32,7 +32,7 @@ static struct dma_fence *msm_job_run(struct drm_sched_job *job) submit->bos_pinned = false; - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); /* TODO move submit path over to using a per-ring lock.. */ mutex_lock(&gpu->lock); diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h index bc78fb77cc27..768a8dae83c5 100644 --- a/include/drm/drm_device.h +++ b/include/drm/drm_device.h @@ -375,6 +375,13 @@ struct drm_device { * Root directory for debugfs files. */ struct dentry *debugfs_root; + + /** + * @gem_lru_mutex: + * + * Lock protecting movement of GEM objects between LRUs. + */ + struct mutex gem_lru_mutex; }; void drm_dev_set_dma_dev(struct drm_device *dev, struct device *dma_dev); diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h index 86f5846154f7..8a704f6a65c1 100644 --- a/include/drm/drm_gem.h +++ b/include/drm/drm_gem.h @@ -245,17 +245,11 @@ struct drm_gem_object_funcs { * for lockless &shrinker.count_objects, and provides * &drm_gem_lru_scan for driver's &shrinker.scan_objects * implementation. + * + * Any access to this kind of object must be done with + * drm_device::gem_lru_mutex held. */ struct drm_gem_lru { - /** - * @lock: - * - * Lock protecting movement of GEM objects between LRUs. All - * LRUs that the object can move between should be protected - * by the same lock. - */ - struct mutex *lock; - /** * @count: * @@ -453,6 +447,9 @@ struct drm_gem_object { * @lru: * * The current LRU list that the GEM object is on. + * + * Access to this field must be done with drm_device::gem_lru_mutex + * held. */ struct drm_gem_lru *lru; }; @@ -610,12 +607,13 @@ void drm_gem_unlock_reservations(struct drm_gem_object **objs, int count, int drm_gem_dumb_map_offset(struct drm_file *file, struct drm_device *dev, u32 handle, u64 *offset); -void drm_gem_lru_init(struct drm_gem_lru *lru, struct mutex *lock); +void drm_gem_lru_init(struct drm_gem_lru *lru); void drm_gem_lru_remove(struct drm_gem_object *obj); void drm_gem_lru_move_tail_locked(struct drm_gem_lru *lru, struct drm_gem_object *obj); void drm_gem_lru_move_tail(struct drm_gem_lru *lru, struct drm_gem_object *obj); unsigned long -drm_gem_lru_scan(struct drm_gem_lru *lru, +drm_gem_lru_scan(struct drm_device *dev, + struct drm_gem_lru *lru, unsigned int nr_to_scan, unsigned long *remaining, bool (*shrink)(struct drm_gem_object *obj, struct ww_acquire_ctx *ticket), -- cgit v1.2.3 From 8817005efbdfdf5d4e4814cb5dc52b53d12917d7 Mon Sep 17 00:00:00 2001 From: Qing Ming Date: Sat, 16 May 2026 15:08:49 +0800 Subject: cgroup/rstat: validate cpu before css_rstat_cpu() access css_rstat_updated() is exposed as a BPF kfunc and accepts a caller-provided cpu argument. The function uses cpu for per-cpu rstat lookups without checking whether it refers to a valid possible CPU. A BPF iter/cgroup program with CAP_BPF and CAP_PERFMON can pass an invalid cpu value. On an unfixed UBSCAN_BOUNDS test kernel, cpu == 0x7fffffff triggers: UBSAN: array-index-out-of-bounds in kernel/cgroup/rstat.c:31:9 index 2147483647 is out of range for type 'long unsigned int [64]' Call Trace: css_rstat_updated bpf_iter_run_prog cgroup_iter_seq_show bpf_seq_read Add cpu validation to the BPF-facing css_rstat_updated() kfunc and move the common implementation to __css_rstat_updated() for in-kernel callers. Fixes: a319185be9f5 ("cgroup: bpf: enable bpf programs to integrate with rstat") Signed-off-by: Qing Ming Signed-off-by: Tejun Heo --- block/blk-cgroup.c | 2 +- include/linux/cgroup.h | 1 + kernel/cgroup/rstat.c | 30 ++++++++++++++++++++---------- mm/memcontrol.c | 6 +++--- 4 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 554c87bb4a86..bc63bd220865 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -2241,7 +2241,7 @@ void blk_cgroup_bio_start(struct bio *bio) } u64_stats_update_end_irqrestore(&bis->sync, flags); - css_rstat_updated(&blkcg->css, cpu); + __css_rstat_updated(&blkcg->css, cpu); put_cpu(); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4..e011dc43fcf1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -776,6 +776,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) /* * cgroup scalable recursive statistics. */ +void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_flush(struct cgroup_subsys_state *css); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 150e5871e66f..ed60ba119c68 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" +#include #include #include @@ -53,7 +54,7 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) } /** - * css_rstat_updated - keep track of updated rstat_cpu + * __css_rstat_updated - keep track of updated rstat_cpu * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * @@ -63,20 +64,17 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) * * NOTE: if the user needs the guarantee that the updater either add itself in * the lockless list or the concurrent flusher flushes its updated stats, a - * memory barrier is needed before the call to css_rstat_updated() i.e. a + * memory barrier is needed before the call to __css_rstat_updated() i.e. a * barrier after updating the per-cpu stats and before calling - * css_rstat_updated(). + * __css_rstat_updated(). */ -__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) +void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { struct llist_head *lhead; struct css_rstat_cpu *rstatc; struct llist_node *self; - /* - * Since bpf programs can call this function, prevent access to - * uninitialized rstat pointers. - */ + /* Prevent access to uninitialized rstat pointers. */ if (!css_uses_rstat(css)) return; @@ -125,6 +123,18 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) llist_add(&rstatc->lnode, lhead); } +/* + * BPF-facing wrapper for __css_rstat_updated(). Validate the caller-provided + * CPU before passing it to the internal rstat updater. + */ +__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) +{ + if (unlikely(cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu))) + return; + + __css_rstat_updated(css, cpu); +} + static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) { /* put @css and all ancestors on the corresponding updated lists */ @@ -170,7 +180,7 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) * flusher flush the stats updated by the updater who have * observed that they are already on the list. The * corresponding barrier pair for this one should be before - * css_rstat_updated() by the user. + * __css_rstat_updated() by the user. * * For now, there aren't any such user, so not adding the * barrier here but if such a use-case arise, please add @@ -614,7 +624,7 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); - css_rstat_updated(&cgrp->self, smp_processor_id()); + __css_rstat_updated(&cgrp->self, smp_processor_id()); put_cpu_ptr(rstatbc); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 051b82ebf371..c7e60f26013c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -579,7 +579,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, if (!val) return; - css_rstat_updated(&memcg->css, cpu); + __css_rstat_updated(&memcg->css, cpu); statc_pcpu = memcg->vmstats_percpu; for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { statc = this_cpu_ptr(statc_pcpu); @@ -2608,7 +2608,7 @@ static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id]; /* preemption is disabled in_nmi(). */ - css_rstat_updated(&memcg->css, smp_processor_id()); + __css_rstat_updated(&memcg->css, smp_processor_id()); if (idx == NR_SLAB_RECLAIMABLE_B) atomic_add(nr, &pn->slab_reclaimable); else @@ -2832,7 +2832,7 @@ static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) mod_memcg_state(memcg, MEMCG_KMEM, val); } else { /* preemption is disabled in_nmi(). */ - css_rstat_updated(&memcg->css, smp_processor_id()); + __css_rstat_updated(&memcg->css, smp_processor_id()); atomic_add(val, &memcg->kmem_stat); } } -- cgit v1.2.3 From 8939562b16052c75b908d3c5f968bffb526fc6e9 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Mon, 18 May 2026 15:02:08 +0800 Subject: efi: efi.h: Remove extra semicolon Remove extra semicolons from comments. Signed-off-by: Rong Tao Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 72e76ec54641..ccbc35479684 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -61,7 +61,7 @@ typedef void *efi_handle_t; /* * The UEFI spec and EDK2 reference implementation both define EFI_GUID as - * struct { u32 a; u16; b; u16 c; u8 d[8]; }; and so the implied alignment + * struct { u32 a; u16 b; u16 c; u8 d[8]; }; and so the implied alignment * is 32 bits not 8 bits like our guid_t. In some cases (i.e., on 32-bit ARM), * this means that firmware services invoked by the kernel may assume that * efi_guid_t* arguments are 32-bit aligned, and use memory accessors that -- cgit v1.2.3 From 7122ff96068a03595bde2fbafaca82ca2ed8084e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 13 May 2026 12:00:16 -0300 Subject: RDMA/core: Do not read wild stack memory in uverbs_get_handler_fn() Sashiko points out the legacy write path in ib_uverbs_write() does allocate a struct uverbs_attr_bundle, but it doesn't wrap it in a bundle_priv so downcasting here isn't safe. Instead lift the method_elm out of the bundle_priv and use it for the debug function. The legacy write path will leave it set as NULL since the write method_elm uses a different type. Cc: stable@vger.kernel.org Fixes: 1de9287ece44 ("RDMA: Add ib_copy_validate_udata_in()") Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/ib_core_uverbs.c | 4 +--- drivers/infiniband/core/uverbs.h | 1 - drivers/infiniband/core/uverbs_ioctl.c | 26 ++++++++++++++------------ include/rdma/uverbs_ioctl.h | 1 + 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c index 685030e0c60f..8a0e6fa2a528 100644 --- a/drivers/infiniband/core/ib_core_uverbs.c +++ b/drivers/infiniband/core/ib_core_uverbs.c @@ -422,12 +422,10 @@ uverbs_api_ioctl_handler_fn uverbs_get_handler_fn(struct ib_udata *udata) { struct uverbs_attr_bundle *bundle = rdma_udata_to_uverbs_attr_bundle(udata); - struct bundle_priv *pbundle = - container_of(&bundle->hdr, struct bundle_priv, bundle); lockdep_assert_held(&bundle->ufile->device->disassociate_srcu); - return srcu_dereference(pbundle->method_elm->handler, + return srcu_dereference(bundle->method_elm->handler, &bundle->ufile->device->disassociate_srcu); } diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index a74a2dff1301..f2e192b51e60 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -244,7 +244,6 @@ struct bundle_priv { size_t internal_used; struct radix_tree_root *radix; - const struct uverbs_api_ioctl_method *method_elm; void __rcu **radix_slots; unsigned long radix_slots_len; u32 method_key; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 33feb88d652b..2552a7efe2fb 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -397,13 +397,13 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, struct uverbs_attr_bundle *bundle = container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs); - unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey; + unsigned int destroy_bkey = bundle->method_elm->destroy_bkey; unsigned int i; int ret; /* See uverbs_disassociate_api() */ handler = srcu_dereference( - pbundle->method_elm->handler, + bundle->method_elm->handler, &pbundle->bundle.ufile->device->disassociate_srcu); if (!handler) return -EIO; @@ -421,12 +421,12 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, } /* User space did not provide all the mandatory attributes */ - if (unlikely(!bitmap_subset(pbundle->method_elm->attr_mandatory, + if (unlikely(!bitmap_subset(bundle->method_elm->attr_mandatory, pbundle->bundle.attr_present, - pbundle->method_elm->key_bitmap_len))) + bundle->method_elm->key_bitmap_len))) return -EINVAL; - if (pbundle->method_elm->has_udata) + if (bundle->method_elm->has_udata) uverbs_fill_udata(bundle, &pbundle->bundle.driver_udata, UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT); else @@ -451,7 +451,7 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, * assume that the driver wrote to its UHW_OUT and flag userspace * appropriately. */ - if (!ret && pbundle->method_elm->has_udata) { + if (!ret && bundle->method_elm->has_udata) { const struct uverbs_attr *attr = uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT); @@ -472,7 +472,7 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, static void bundle_destroy(struct bundle_priv *pbundle, bool commit) { - unsigned int key_bitmap_len = pbundle->method_elm->key_bitmap_len; + unsigned int key_bitmap_len = pbundle->bundle.method_elm->key_bitmap_len; struct uverbs_attr_bundle *bundle = container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); struct bundle_alloc_head *memblock; @@ -560,7 +560,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, } /* Space for the pbundle->bundle.attrs flex array */ - pbundle->method_elm = method_elm; + pbundle->bundle.method_elm = method_elm; pbundle->method_key = attrs_iter.index; pbundle->bundle.ufile = ufile; pbundle->bundle.context = NULL; /* only valid if bundle has uobject */ @@ -569,10 +569,12 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter); pbundle->user_attrs = user_attrs; - pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len * - sizeof(*container_of(&pbundle->bundle, - struct uverbs_attr_bundle, hdr)->attrs), - sizeof(*pbundle->internal_buffer)); + pbundle->internal_used = ALIGN( + pbundle->bundle.method_elm->key_bitmap_len * + sizeof(*container_of(&pbundle->bundle, + struct uverbs_attr_bundle, hdr) + ->attrs), + sizeof(*pbundle->internal_buffer)); memset(pbundle->bundle.attr_present, 0, sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index e2af17da3e32..c89428030d61 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -635,6 +635,7 @@ struct uverbs_attr_bundle { struct ib_uverbs_file *ufile; struct ib_ucontext *context; struct ib_uobject *uobject; + const struct uverbs_api_ioctl_method *method_elm; DECLARE_BITMAP(attr_present, UVERBS_API_ATTR_BKEY_LEN); ); struct uverbs_attr attrs[]; -- cgit v1.2.3 From 0cb5a74faa3bdcfa3b18735d554e12c0f615e35d Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Mon, 18 May 2026 15:44:57 +0200 Subject: net: airoha: Fix NPU RX DMA descriptor bits In an internal review from Airoha, it was notice that the RX DMA descriptor bits and mask are wrong. These values probably refer to an old NPU firmware never published. The previous value works correctly but it was reported that in some specific condition in mixed scenario with both Ethernet and WiFi offload it's possible that RX DMA descriptor signal wrong value with the problem to the RX ring or packets getting dropped. To handle these specific scenario, apply the new suggested bits mask from Airoha. Correct functionality of both AN7581 NPU and MT7996 variant were verified and confirmed working. Fixes: a7fc8c641cab ("net: airoha: Fix npu rx DMA definitions") Signed-off-by: Christian Marangi Acked-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260518134530.3683-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/soc/airoha/airoha_offload.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index d01ef4a6b3d7..7589fccfeef6 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -71,9 +71,9 @@ static inline void airoha_ppe_dev_check_skb(struct airoha_ppe_dev *dev, #define NPU_RX1_DESC_NUM 512 /* CTRL */ -#define NPU_RX_DMA_DESC_LAST_MASK BIT(27) -#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(26, 14) -#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(13, 1) +#define NPU_RX_DMA_DESC_LAST_MASK BIT(29) +#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(28, 15) +#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(14, 1) #define NPU_RX_DMA_DESC_DONE_MASK BIT(0) /* INFO */ #define NPU_RX_DMA_PKT_COUNT_MASK GENMASK(31, 29) -- cgit v1.2.3 From b8d7519352ba8c6df83259295d4a3bad093cae90 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 15 May 2026 15:13:25 -0700 Subject: net: shaper: rework the VALID marking (again) Recent commit changed the semantics from NOT_VALID to VALID. I didn't realize that the flags are not stored atomically with the entry in XArray. There's still a race of reader observing a VALID mark for a slot, getting interrupted, writer replacing the entry with a different one, reader continuing, fetching the entry which is now a different pointer than the pointer for which VALID was meant. The biggest consequence of this is that we may see a UAF since net_shaper_rollback() assumed that entries without VALID can be freed without observing RCU. Looks like the XArray marks are buying us nothing at this point. Let's convert the code to an explicit valid field. The smp_load_acquire() / smp_store_release() barriers are marginally cleaner. Reported-by: Sashiko Fixes: 93954b40f6a4 ("net-shapers: implement NL set and delete operations") Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260515221325.1685455-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/net_shaper.h | 1 + net/shaper/shaper.c | 45 ++++++++++++++++++--------------------------- 2 files changed, 19 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h index 5c3f49b52fe9..3939b816b001 100644 --- a/include/net/net_shaper.h +++ b/include/net/net_shaper.h @@ -53,6 +53,7 @@ struct net_shaper { /* private: */ u32 leaves; /* accounted only for NODE scope */ + bool valid; struct rcu_head rcu; }; diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 520cefdc3d90..dea9270f3e57 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -306,31 +306,24 @@ static void net_shaper_default_parent(const struct net_shaper_handle *handle, parent->id = 0; } -/* MARK_0 is already in use due to XA_FLAGS_ALLOC. The VALID mark is set on - * an entry only after the device-side configuration has completed - * successfully (see net_shaper_commit()). Lookups and dumps must filter on - * this mark to avoid exposing tentative entries inserted by - * net_shaper_pre_insert() while the driver call is still in flight. - */ -#define NET_SHAPER_VALID XA_MARK_1 - static struct net_shaper * net_shaper_lookup(struct net_shaper_binding *binding, const struct net_shaper_handle *handle) { u32 index = net_shaper_handle_to_index(handle); struct net_shaper_hierarchy *hierarchy; + struct net_shaper *cur; hierarchy = net_shaper_hierarchy_rcu(binding); - if (!hierarchy || !xa_get_mark(&hierarchy->shapers, index, - NET_SHAPER_VALID)) + if (!hierarchy) return NULL; - /* Pairs with smp_wmb() in net_shaper_commit(): if the entry is - * valid, its contents must be visible too. - */ - smp_rmb(); - return xa_load(&hierarchy->shapers, index); + cur = xa_load(&hierarchy->shapers, index); + /* Check valid before reading fields */ + if (!cur || !smp_load_acquire(&cur->valid)) + return NULL; + + return cur; } /* Allocate on demand the per device shaper's hierarchy container. @@ -444,12 +437,10 @@ static void net_shaper_commit(struct net_shaper_binding *binding, if (WARN_ON_ONCE(!cur)) continue; - /* Successful update: drop the tentative mark - * and update the hierarchy container. - */ + /* Successful update: update the hierarchy container... */ net_shaper_copy(cur, &shapers[i]); - smp_wmb(); - __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_VALID); + /* ... publish to lockless readers. */ + smp_store_release(&cur->valid, true); } xa_unlock(&hierarchy->shapers); } @@ -466,10 +457,10 @@ static void net_shaper_rollback(struct net_shaper_binding *binding) xa_lock(&hierarchy->shapers); xa_for_each(&hierarchy->shapers, index, cur) { - if (xa_get_mark(&hierarchy->shapers, index, NET_SHAPER_VALID)) + if (cur->valid) continue; __xa_erase(&hierarchy->shapers, index); - kfree(cur); + kfree_rcu(cur, rcu); } xa_unlock(&hierarchy->shapers); } @@ -882,12 +873,12 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb, goto out_unlock; for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, - U32_MAX, NET_SHAPER_VALID)); + U32_MAX, XA_PRESENT)); ctx->start_index++) { - /* Pairs with smp_wmb() in net_shaper_commit(): the entry - * is marked VALID, so its contents must be visible too. - */ - smp_rmb(); + /* Check valid before reading fields */ + if (!smp_load_acquire(&shaper->valid)) + continue; + ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; -- cgit v1.2.3 From 2b50aceafe6606ea52ed42aadd1b4d44a188aade Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 16 May 2026 00:05:13 +0100 Subject: crypto/krb5, rxrpc: Fix lack of pre-decrypt/pre-verify length checks Change the krb5 crypto library to provide facilities to precheck the length of the message about to be decrypted or verified. Fix AF_RXRPC to make use of this to validate DATA packets secured with RxGK. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Closes: https://sashiko.dev/#/patchset/20260511160753.607296-1-dhowells%40redhat.com Signed-off-by: David Howells cc: Herbert Xu cc: Simon Horman cc: Chuck Lever cc: linux-afs@lists.infradead.org Reviewed-by: Jeffrey Altman Tested-by: Marc Dionne Link: https://patch.msgid.link/20260515230516.2718212-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/crypto/krb5.rst | 17 ++++++++++---- crypto/krb5/krb5_api.c | 54 ++++++++++++++++++++++++++++++++++++++----- include/crypto/krb5.h | 9 +++++--- include/trace/events/rxrpc.h | 1 + net/rxrpc/rxgk.c | 15 ++++++++++-- 5 files changed, 81 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/Documentation/crypto/krb5.rst b/Documentation/crypto/krb5.rst index beffa0133446..f62e07ac6811 100644 --- a/Documentation/crypto/krb5.rst +++ b/Documentation/crypto/krb5.rst @@ -158,13 +158,22 @@ returned. When a message has been received, the location and size of the data with the message can be determined by calling:: - void crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, - enum krb5_crypto_mode mode, - size_t *_offset, size_t *_len); + int crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t *_offset, size_t *_len); The caller provides the offset and length of the message to the function, which then alters those values to indicate the region containing the data (plus any -padding). It is up to the caller to determine how much padding there is. +padding). It is up to the caller to determine how much padding there is. The +function returns an error if the length is too small or if the mode is +unsupported. An additional function:: + + int crypto_krb5_check_data_len(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t len, size_t min_content); + +is provided to just do a basic check that the decrypted/verified message would +have a sufficient minimum payload. Preparation Functions --------------------- diff --git a/crypto/krb5/krb5_api.c b/crypto/krb5/krb5_api.c index 23026d4206c8..c7ea40f900a7 100644 --- a/crypto/krb5/krb5_api.c +++ b/crypto/krb5/krb5_api.c @@ -134,27 +134,69 @@ EXPORT_SYMBOL(crypto_krb5_how_much_data); * Find the offset and size of the data in a secure message so that this * information can be used in the metadata buffer which will get added to the * digest by crypto_krb5_verify_mic(). + * + * Return: 0 if successful, -EBADMSG if the message is too short or -EINVAL if + * the mode is unsupported. */ -void crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, - enum krb5_crypto_mode mode, - size_t *_offset, size_t *_len) +int crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t *_offset, size_t *_len) { switch (mode) { case KRB5_CHECKSUM_MODE: + if (*_len < krb5->cksum_len) + return -EBADMSG; *_offset += krb5->cksum_len; *_len -= krb5->cksum_len; - return; + return 0; case KRB5_ENCRYPT_MODE: + if (*_len < krb5->conf_len + krb5->cksum_len) + return -EBADMSG; *_offset += krb5->conf_len; *_len -= krb5->conf_len + krb5->cksum_len; - return; + return 0; default: WARN_ON_ONCE(1); - return; + return -EINVAL; } } EXPORT_SYMBOL(crypto_krb5_where_is_the_data); +/** + * crypto_krb5_check_data_len - Check a message is big enough + * @krb5: The encoding to use. + * @mode: Mode of operation. + * @len: The length of the secure blob. + * @min_content: Minimum length of the content inside the blob. + * + * Check that a message is large enough to hold whatever bits the encryption + * type wants to glue on (nonce, checksum) plus a minimum amount of content. + * + * Return: 0 if successful, -EBADMSG if the message is too short or -EINVAL if + * the mode is unsupported. + */ +int crypto_krb5_check_data_len(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t len, size_t min_content) +{ + switch (mode) { + case KRB5_CHECKSUM_MODE: + if (len < krb5->cksum_len || + len - krb5->cksum_len < min_content) + return -EBADMSG; + return 0; + case KRB5_ENCRYPT_MODE: + if (len < krb5->conf_len + krb5->cksum_len || + len - (krb5->conf_len + krb5->cksum_len) < min_content) + return -EBADMSG; + return 0; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } +} +EXPORT_SYMBOL(crypto_krb5_check_data_len); + /* * Prepare the encryption with derived key data. */ diff --git a/include/crypto/krb5.h b/include/crypto/krb5.h index 71dd38f59be1..aac3ecf88467 100644 --- a/include/crypto/krb5.h +++ b/include/crypto/krb5.h @@ -121,9 +121,12 @@ size_t crypto_krb5_how_much_buffer(const struct krb5_enctype *krb5, size_t crypto_krb5_how_much_data(const struct krb5_enctype *krb5, enum krb5_crypto_mode mode, size_t *_buffer_size, size_t *_offset); -void crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, - enum krb5_crypto_mode mode, - size_t *_offset, size_t *_len); +int crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t *_offset, size_t *_len); +int crypto_krb5_check_data_len(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t len, size_t min_content); struct crypto_aead *crypto_krb5_prepare_encryption(const struct krb5_enctype *krb5, const struct krb5_buffer *TK, u32 usage, gfp_t gfp); diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 573f2df3a2c9..704a10de6670 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -71,6 +71,7 @@ EM(rxkad_abort_resp_unknown_tkt, "rxkad-resp-unknown-tkt") \ EM(rxkad_abort_resp_version, "rxkad-resp-version") \ /* RxGK security errors */ \ + EM(rxgk_abort_1_short_header, "rxgk1-short-hdr") \ EM(rxgk_abort_1_verify_mic_eproto, "rxgk1-vfy-mic-eproto") \ EM(rxgk_abort_2_decrypt_eproto, "rxgk2-dec-eproto") \ EM(rxgk_abort_2_short_data, "rxgk2-short-data") \ diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 0d5e654da918..26e723052a37 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -480,8 +480,12 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, _enter(""); - crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, - &data_offset, &data_len); + if (crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, + &data_offset, &data_len) < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_1_short_header); + goto put_gk; + } hdr = kzalloc_obj(*hdr, GFP_NOFS); if (!hdr) @@ -529,6 +533,13 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, _enter(""); + if (crypto_krb5_check_data_len(gk->krb5, KRB5_ENCRYPT_MODE, + len, sizeof(hdr)) < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_header); + goto error; + } + ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); if (ret < 0) { if (ret != -ENOMEM) -- cgit v1.2.3 From 1bbf0ced1d9db73ac7893c2187f3459288603e0d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2026 08:46:11 +0000 Subject: tcp: fix stale per-CPU tcp_tw_isn leak enabling ISN prediction Blamed commit moved the TIME_WAIT-derived ISN from the skb control block to a per-CPU variable, assuming the value would always be consumed by tcp_conn_request() for the same packet that wrote it. That assumption is violated by multiple drop paths between the producer (__this_cpu_write(tcp_tw_isn, isn) in tcp_v{4,6}_rcv()) and the consumer (tcp_conn_request()): - min_ttl / min_hopcount check - xfrm policy check - tcp_inbound_hash() MD5/AO mismatch - tcp_filter() eBPF/SO_ATTACH_FILTER drop - th->syn && th->fin discard in tcp_rcv_state_process() TCP_LISTEN - psp_sk_rx_policy_check() in tcp_v{4,6}_do_rcv() - tcp_checksum_complete() in tcp_v{4,6}_do_rcv() - tcp_v{4,6}_cookie_check() returning NULL When a packet is dropped on any of these paths, tcp_tw_isn is left set. The next SYN processed on the same CPU then consumes the non zero value in tcp_conn_request(), receiving a potentially predictable ISN. This patch moves back tcp_tw_isn to skb->cb[], getting rid of the per-cpu variable. Note that tcp_v{4,6}_fill_cb() do not set it. Very litle impact on overall code size/complexity: $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/0 grow/shrink: 2/1 up/down: 8/-15 (-7) Function old new delta tcp_v6_rcv 3038 3042 +4 tcp_v4_rcv 3035 3039 +4 tcp_conn_request 2938 2923 -15 Total: Before=24436060, After=24436053, chg -0.00% Fixes: 41eecbd712b7 ("tcp: replace TCP_SKB_CB(skb)->tcp_tw_isn with a per-cpu field") Reported-by: Chris Mason Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260519084611.2485277-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 7 ++++--- net/ipv4/tcp.c | 3 --- net/ipv4/tcp_input.c | 15 ++++++--------- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 3 ++- 5 files changed, 14 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index ecbadcb3a744..98848db62894 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -65,8 +65,6 @@ static inline void tcp_orphan_count_dec(void) this_cpu_dec(tcp_orphan_count); } -DECLARE_PER_CPU(u32, tcp_tw_isn); - void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) @@ -1102,10 +1100,13 @@ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { - /* Note : + /* Notes : + * tcp_tw_isn is used in input path only + * (isn chosen by tcp_timewait_state_process()) * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ + u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 432fa28e47d4..389a7cc17110 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -299,9 +299,6 @@ enum { DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); -DEFINE_PER_CPU(u32, tcp_tw_isn); -EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); - long sysctl_tcp_mem[3] __read_mostly; DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d5c9e65d9760..de9f68a9c0cf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7589,6 +7589,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; + u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -7599,20 +7600,16 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct dst_entry *dst; struct flowi fl; u8 syncookies; - u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif - isn = __this_cpu_read(tcp_tw_isn); - if (isn) { - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - __this_cpu_write(tcp_tw_isn, 0); - } else { + /* If isn is non-zero, this SYN originally matched a TIME_WAIT socket. + * TW sockets are converted to open requests without limitations, + * we skip the queue limits and syncookie checks in the block below. + */ + if (!isn) { syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c0526cc03980..fdc81150ff6c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2198,6 +2198,7 @@ lookup: } } + isn = 0; process: if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ @@ -2227,6 +2228,7 @@ process: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); + TCP_SKB_CB(skb)->tcp_tw_isn = isn; skb->dev = NULL; @@ -2313,7 +2315,6 @@ do_time_wait: sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; - __this_cpu_write(tcp_tw_isn, isn); goto process; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d13d49bfef19..36d75fb50a70 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1839,6 +1839,7 @@ lookup: } } + isn = 0; process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ @@ -1868,6 +1869,7 @@ process: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); + TCP_SKB_CB(skb)->tcp_tw_isn = isn; skb->dev = NULL; @@ -1956,7 +1958,6 @@ do_time_wait: sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; - __this_cpu_write(tcp_tw_isn, isn); goto process; } -- cgit v1.2.3 From a494d3c8d5392bcdff83c2a593df0c160ff9f322 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 30 Apr 2026 12:28:16 +0900 Subject: ring-buffer: Flush and stop persistent ring buffer on panic On real hardware, panic and machine reboot may not flush hardware cache to memory. This means the persistent ring buffer, which relies on a coherent state of memory, may not have its events written to the buffer and they may be lost. Moreover, there may be inconsistency with the counters which are used for validation of the integrity of the persistent ring buffer which may cause all data to be discarded. To avoid this issue, stop recording of the ring buffer on panic and flush the cache of the ring buffer's memory. Fixes: e645535a954a ("tracing: Add option to use memmapped memory for trace boot instance") Cc: stable@vger.kernel.org Cc: Will Deacon Cc: Mathieu Desnoyers Cc: Ian Rogers Link: https://patch.msgid.link/177751969602.2136606.12031934362587643488.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Acked-by: Catalin Marinas Acked-by: Geert Uytterhoeven Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/Kbuild | 1 + arch/arc/include/asm/Kbuild | 1 + arch/arm/include/asm/Kbuild | 1 + arch/arm64/include/asm/ring_buffer.h | 10 ++++++++++ arch/csky/include/asm/Kbuild | 1 + arch/hexagon/include/asm/Kbuild | 1 + arch/loongarch/include/asm/Kbuild | 1 + arch/m68k/include/asm/Kbuild | 1 + arch/microblaze/include/asm/Kbuild | 1 + arch/mips/include/asm/Kbuild | 1 + arch/nios2/include/asm/Kbuild | 1 + arch/openrisc/include/asm/Kbuild | 1 + arch/parisc/include/asm/Kbuild | 1 + arch/powerpc/include/asm/Kbuild | 1 + arch/riscv/include/asm/Kbuild | 1 + arch/s390/include/asm/Kbuild | 1 + arch/sh/include/asm/Kbuild | 1 + arch/sparc/include/asm/Kbuild | 1 + arch/um/include/asm/Kbuild | 1 + arch/x86/include/asm/Kbuild | 1 + arch/xtensa/include/asm/Kbuild | 1 + include/asm-generic/ring_buffer.h | 13 +++++++++++++ kernel/trace/ring_buffer.c | 22 ++++++++++++++++++++++ 23 files changed, 65 insertions(+) create mode 100644 arch/arm64/include/asm/ring_buffer.h create mode 100644 include/asm-generic/ring_buffer.h (limited to 'include') diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 483965c5a4de..b154b4e3dfa8 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -5,4 +5,5 @@ generic-y += agp.h generic-y += asm-offsets.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 4c69522e0328..483caacc6988 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -5,5 +5,6 @@ generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += user.h generic-y += text-patching.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 03657ff8fbe3..decad5f2c826 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += early_ioremap.h generic-y += extable.h generic-y += flat.h generic-y += parport.h +generic-y += ring_buffer.h generated-y += mach-types.h generated-y += unistd-nr.h diff --git a/arch/arm64/include/asm/ring_buffer.h b/arch/arm64/include/asm/ring_buffer.h new file mode 100644 index 000000000000..62316c406888 --- /dev/null +++ b/arch/arm64/include/asm/ring_buffer.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_ARM64_RING_BUFFER_H +#define _ASM_ARM64_RING_BUFFER_H + +#include + +/* Flush D-cache on persistent ring buffer */ +#define arch_ring_buffer_flush_range(start, end) dcache_clean_pop(start, end) + +#endif /* _ASM_ARM64_RING_BUFFER_H */ diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 3a5c7f6e5aac..7dca0c6cdc84 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += qrwlock.h generic-y += qrwlock_types.h generic-y += qspinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += user.h generic-y += vmlinux.lds.h generic-y += text-patching.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 1efa1e993d4b..0f887d4238ed 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -5,4 +5,5 @@ generic-y += extable.h generic-y += iomap.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 9034b583a88a..7e92957baf6a 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -10,5 +10,6 @@ generic-y += qrwlock.h generic-y += user.h generic-y += ioctl.h generic-y += mmzone.h +generic-y += ring_buffer.h generic-y += statfs.h generic-y += text-patching.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index b282e0dd8dc1..62543bf305ff 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -3,5 +3,6 @@ generated-y += syscall_table.h generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += spinlock.h generic-y += text-patching.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 7178f990e8b3..0030309b47ad 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += syscalls.h generic-y += tlb.h generic-y += user.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 684569b2ecd6..9771c3d85074 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -12,5 +12,6 @@ generic-y += mcs_spinlock.h generic-y += parport.h generic-y += qrwlock.h generic-y += qspinlock.h +generic-y += ring_buffer.h generic-y += user.h generic-y += text-patching.h diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index 28004301c236..0a2530964413 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += cmpxchg.h generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += spinlock.h generic-y += user.h generic-y += text-patching.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index cef49d60d74c..8aa34621702d 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -8,4 +8,5 @@ generic-y += spinlock_types.h generic-y += spinlock.h generic-y += qrwlock_types.h generic-y += qrwlock.h +generic-y += ring_buffer.h generic-y += user.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 4fb596d94c89..d48d158f7241 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -4,4 +4,5 @@ generated-y += syscall_table_64.h generic-y += agp.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += user.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 2e23533b67e3..805b5aeebb6f 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -5,4 +5,5 @@ generated-y += syscall_table_spu.h generic-y += agp.h generic-y += mcs_spinlock.h generic-y += qrwlock.h +generic-y += ring_buffer.h generic-y += early_ioremap.h diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index bd5fc9403295..7721b63642f4 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -14,5 +14,6 @@ generic-y += ticket_spinlock.h generic-y += qrwlock.h generic-y += qrwlock_types.h generic-y += qspinlock.h +generic-y += ring_buffer.h generic-y += user.h generic-y += vmlinux.lds.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 80bad7de7a04..0c1fc47c3ba0 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -7,3 +7,4 @@ generated-y += unistd_nr.h generic-y += asm-offsets.h generic-y += mcs_spinlock.h generic-y += mmzone.h +generic-y += ring_buffer.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 4d3f10ed8275..f0403d3ee8ab 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -3,4 +3,5 @@ generated-y += syscall_table.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 17ee8a273aa6..49c6bb326b75 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -4,4 +4,5 @@ generated-y += syscall_table_64.h generic-y += agp.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 1b9b82bbe322..2a1629ba8140 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -17,6 +17,7 @@ generic-y += module.lds.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h +generic-y += ring_buffer.h generic-y += runtime-const.h generic-y += softirq_stack.h generic-y += switch_to.h diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4566000e15c4..078fd2c0d69d 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -14,3 +14,4 @@ generic-y += early_ioremap.h generic-y += fprobe.h generic-y += mcs_spinlock.h generic-y += mmzone.h +generic-y += ring_buffer.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 13fe45dea296..e57af619263a 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -6,5 +6,6 @@ generic-y += mcs_spinlock.h generic-y += parport.h generic-y += qrwlock.h generic-y += qspinlock.h +generic-y += ring_buffer.h generic-y += user.h generic-y += text-patching.h diff --git a/include/asm-generic/ring_buffer.h b/include/asm-generic/ring_buffer.h new file mode 100644 index 000000000000..201d2aee1005 --- /dev/null +++ b/include/asm-generic/ring_buffer.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generic arch dependent ring_buffer macros. + */ +#ifndef __ASM_GENERIC_RING_BUFFER_H__ +#define __ASM_GENERIC_RING_BUFFER_H__ + +#include + +/* Flush cache on ring buffer range if needed. Do nothing by default. */ +#define arch_ring_buffer_flush_range(start, end) do { } while (0) + +#endif /* __ASM_GENERIC_RING_BUFFER_H__ */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fcd93d49851e..7b07d2004cc6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include +#include #include #include #include @@ -559,6 +561,7 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; + struct notifier_block flush_nb; struct ring_buffer_meta *meta; @@ -2521,6 +2524,16 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } +/* Stop recording on a persistent buffer and flush cache if needed. */ +static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data) +{ + struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb); + + ring_buffer_record_off(buffer); + arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end); + return NOTIFY_DONE; +} + static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, @@ -2651,6 +2664,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); + /* Persistent ring buffer needs to flush cache before reboot. */ + if (start && end) { + buffer->flush_nb.notifier_call = rb_flush_buffer_cb; + atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb); + } + return_ptr(buffer); fail_free_buffers: @@ -2749,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer) { int cpu; + if (buffer->range_addr_start && buffer->range_addr_end) + atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb); + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); -- cgit v1.2.3 From 215c90ee656114f5e8c32408228d97082f8e0eef Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 6 May 2026 13:57:00 +0200 Subject: device property: set fwnode->secondary to NULL in fwnode_init() If a firmware node is allocated on the stack (for instance: temporary software node whose life-time we control) or on the heap - but using a non-zeroing allocation function - and initialized using fwnode_init(), its secondary pointer will contain uninitalized memory which likely will be neither NULL nor IS_ERR() and so may end up being dereferenced (for example: in dev_to_swnode()). Set fwnode->secondary to NULL on initialization. Cc: stable Fixes: 01bb86b380a3 ("driver core: Add fwnode_init()") Signed-off-by: Bartosz Golaszewski Reviewed-by: Rafael J. Wysocki (Intel) Reviewed-by: Andy Shevchenko Reviewed-by: Sakari Ailus Link: https://patch.msgid.link/20260506115701.23035-1-bartosz.golaszewski@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 80b38fbf2121..31df7608737e 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -208,6 +208,7 @@ struct fwnode_operations { static inline void fwnode_init(struct fwnode_handle *fwnode, const struct fwnode_operations *ops) { + fwnode->secondary = NULL; fwnode->ops = ops; INIT_LIST_HEAD(&fwnode->consumers); INIT_LIST_HEAD(&fwnode->suppliers); -- cgit v1.2.3