From ca45c84afb8c91a8d688b0012657099c24f59266 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 3 Dec 2025 19:32:15 -0800 Subject: bpf: Add bpf_has_frame_pointer() Introduce a bpf_has_frame_pointer() helper that unwinders can call to determine whether a given instruction pointer is within the valid frame pointer region of a BPF JIT program or trampoline (i.e., after the prologue, before the epilogue). This will enable livepatch (with the ORC unwinder) to reliably unwind through BPF JIT frames. Acked-by: Song Liu Acked-and-tested-by: Andrey Grodzovsky Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/fd2bc5b4e261a680774b28f6100509fd5ebad2f0.1764818927.git.jpoimboe@kernel.org Signed-off-by: Alexei Starovoitov Reviewed-by: Jiri Olsa --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6498be4c44f8..e5be698256d1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1283,6 +1283,8 @@ struct bpf_ksym { struct list_head lnode; struct latch_tree_node tnode; bool prog; + u32 fp_start; + u32 fp_end; }; enum bpf_tramp_prog_type { @@ -1511,6 +1513,7 @@ void bpf_image_ksym_add(struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); +bool bpf_has_frame_pointer(unsigned long ip); int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); -- cgit v1.2.3 From d9f514d3e6ee48c34d70d637479b4c9384832d4f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 23 Nov 2025 22:51:23 +0000 Subject: block: move around bio flagging helpers We'll need bio_flagged() earlier in bio.h for later patches, move it together with all related helpers, and mark the bio_flagged()'s bio argument as const. Signed-off-by: Pavel Begunkov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index ad2d57908c1c..c75a9b3672aa 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -46,6 +46,21 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs) #define bio_data_dir(bio) \ (op_is_write(bio_op(bio)) ? WRITE : READ) +static inline bool bio_flagged(const struct bio *bio, unsigned int bit) +{ + return bio->bi_flags & (1U << bit); +} + +static inline void bio_set_flag(struct bio *bio, unsigned int bit) +{ + bio->bi_flags |= (1U << bit); +} + +static inline void bio_clear_flag(struct bio *bio, unsigned int bit) +{ + bio->bi_flags &= ~(1U << bit); +} + /* * Check whether this bio carries any data or not. A NULL bio is allowed. */ @@ -225,21 +240,6 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count) atomic_set(&bio->__bi_cnt, count); } -static inline bool bio_flagged(struct bio *bio, unsigned int bit) -{ - return bio->bi_flags & (1U << bit); -} - -static inline void bio_set_flag(struct bio *bio, unsigned int bit) -{ - bio->bi_flags |= (1U << bit); -} - -static inline void bio_clear_flag(struct bio *bio, unsigned int bit) -{ - bio->bi_flags &= ~(1U << bit); -} - static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); -- cgit v1.2.3 From 0c01ea92f545ca7fcafdda6a8e29b65ef3a5ec74 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Fri, 12 Dec 2025 04:08:08 -0500 Subject: mm: Remove tlb_flush_reason::NR_TLB_FLUSH_REASONS from This has been unused since it was added 11 years ago in: d17d8f9dedb9 ("x86/mm: Add tracepoints for TLB flushes") Signed-off-by: Tal Zussman Signed-off-by: Ingo Molnar Reviewed-by: Rik van Riel Acked-by: David Hildenbrand Link: https://patch.msgid.link/20251212-tlb-trace-fix-v2-2-d322e0ad9b69@columbia.edu --- include/linux/mm_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9f6de068295d..42af2292951d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1631,7 +1631,6 @@ enum tlb_flush_reason { TLB_LOCAL_MM_SHOOTDOWN, TLB_REMOTE_SEND_IPI, TLB_REMOTE_WRONG_CPU, - NR_TLB_FLUSH_REASONS, }; /** -- cgit v1.2.3 From e1b4c6a58304fd490124cc2b454d80edc786665c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Dec 2025 17:50:23 -0500 Subject: shmem: fix recovery on rename failures maple_tree insertions can fail if we are seriously short on memory; simple_offset_rename() does not recover well if it runs into that. The same goes for simple_offset_rename_exchange(). Moreover, shmem_whiteout() expects that if it succeeds, the caller will progress to d_move(), i.e. that shmem_rename2() won't fail past the successful call of shmem_whiteout(). Not hard to fix, fortunately - mtree_store() can't fail if the index we are trying to store into is already present in the tree as a singleton. For simple_offset_rename_exchange() that's enough - we just need to be careful about the order of operations. For simple_offset_rename() solution is to preinsert the target into the tree for new_dir; the rest can be done without any potentially failing operations. That preinsertion has to be done in shmem_rename2() rather than in simple_offset_rename() itself - otherwise we'd need to deal with the possibility of failure after successful shmem_whiteout(). Fixes: a2e459555c5f ("shmem: stable directory offsets") Reviewed-by: Christian Brauner Reviewed-by: Chuck Lever Signed-off-by: Al Viro --- fs/libfs.c | 50 +++++++++++++++++++++----------------------------- include/linux/fs.h | 2 +- mm/shmem.c | 18 +++++++++++++----- 3 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index 9264523be85c..591eb649ebba 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -346,22 +346,22 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) * User space expects the directory offset value of the replaced * (new) directory entry to be unchanged after a rename. * - * Returns zero on success, a negative errno value on failure. + * Caller must have grabbed a slot for new_dentry in the maple_tree + * associated with new_dir, even if dentry is negative. */ -int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); long new_offset = dentry2offset(new_dentry); - simple_offset_remove(old_ctx, old_dentry); + if (WARN_ON(!new_offset)) + return; - if (new_offset) { - offset_set(new_dentry, 0); - return simple_offset_replace(new_ctx, old_dentry, new_offset); - } - return simple_offset_add(new_ctx, old_dentry); + simple_offset_remove(old_ctx, old_dentry); + offset_set(new_dentry, 0); + WARN_ON(simple_offset_replace(new_ctx, old_dentry, new_offset)); } /** @@ -388,31 +388,23 @@ int simple_offset_rename_exchange(struct inode *old_dir, long new_index = dentry2offset(new_dentry); int ret; - simple_offset_remove(old_ctx, old_dentry); - simple_offset_remove(new_ctx, new_dentry); + if (WARN_ON(!old_index || !new_index)) + return -EINVAL; - ret = simple_offset_replace(new_ctx, old_dentry, new_index); - if (ret) - goto out_restore; + ret = mtree_store(&new_ctx->mt, new_index, old_dentry, GFP_KERNEL); + if (WARN_ON(ret)) + return ret; - ret = simple_offset_replace(old_ctx, new_dentry, old_index); - if (ret) { - simple_offset_remove(new_ctx, old_dentry); - goto out_restore; + ret = mtree_store(&old_ctx->mt, old_index, new_dentry, GFP_KERNEL); + if (WARN_ON(ret)) { + mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL); + return ret; } - ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - if (ret) { - simple_offset_remove(new_ctx, old_dentry); - simple_offset_remove(old_ctx, new_dentry); - goto out_restore; - } + offset_set(old_dentry, new_index); + offset_set(new_dentry, old_index); + simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); return 0; - -out_restore: - (void)simple_offset_replace(old_ctx, old_dentry, old_index); - (void)simple_offset_replace(new_ctx, new_dentry, new_index); - return ret; } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..f5c9cf28c4dc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3247,7 +3247,7 @@ struct offset_ctx { void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); -int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, +void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, diff --git a/mm/shmem.c b/mm/shmem.c index d3edc809e2e7..a9666b0599a4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4038,6 +4038,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); + bool had_offset = false; int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) @@ -4050,16 +4051,23 @@ static int shmem_rename2(struct mnt_idmap *idmap, if (!simple_empty(new_dentry)) return -ENOTEMPTY; + error = simple_offset_add(shmem_get_offset_ctx(new_dir), new_dentry); + if (error == -EBUSY) + had_offset = true; + else if (unlikely(error)) + return error; + if (flags & RENAME_WHITEOUT) { error = shmem_whiteout(idmap, old_dir, old_dentry); - if (error) + if (error) { + if (!had_offset) + simple_offset_remove(shmem_get_offset_ctx(new_dir), + new_dentry); return error; + } } - error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - return error; - + simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { -- cgit v1.2.3 From dcd0b625fe440d68bb4b97c71d18ca48ecd6e594 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 17 Dec 2025 07:34:55 -0800 Subject: powercap: intel_rapl: Fix possible recursive lock warning With the RAPL PMU addition, there is a recursive locking when CPU online callback function calls rapl_package_add_pmu(). Here cpu_hotplug_lock is already acquired by cpuhp_thread_fun() and rapl_package_add_pmu() tries to acquire again. <4>[ 8.197433] ============================================ <4>[ 8.197437] WARNING: possible recursive locking detected <4>[ 8.197440] 6.19.0-rc1-lgci-xe-xe-4242-05b7c58b3367dca84+ #1 Not tainted <4>[ 8.197444] -------------------------------------------- <4>[ 8.197447] cpuhp/0/20 is trying to acquire lock: <4>[ 8.197450] ffffffff83487870 (cpu_hotplug_lock){++++}-{0:0}, at: rapl_package_add_pmu+0x37/0x370 [intel_rapl_common] <4>[ 8.197463] but task is already holding lock: <4>[ 8.197466] ffffffff83487870 (cpu_hotplug_lock){++++}-{0:0}, at: cpuhp_thread_fun+0x6d/0x290 <4>[ 8.197477] other info that might help us debug this: <4>[ 8.197480] Possible unsafe locking scenario: <4>[ 8.197483] CPU0 <4>[ 8.197485] ---- <4>[ 8.197487] lock(cpu_hotplug_lock); <4>[ 8.197490] lock(cpu_hotplug_lock); <4>[ 8.197493] *** DEADLOCK *** .. .. <4>[ 8.197542] __lock_acquire+0x146e/0x2790 <4>[ 8.197548] lock_acquire+0xc4/0x2c0 <4>[ 8.197550] ? rapl_package_add_pmu+0x37/0x370 [intel_rapl_common] <4>[ 8.197556] cpus_read_lock+0x41/0x110 <4>[ 8.197558] ? rapl_package_add_pmu+0x37/0x370 [intel_rapl_common] <4>[ 8.197561] rapl_package_add_pmu+0x37/0x370 [intel_rapl_common] <4>[ 8.197565] rapl_cpu_online+0x85/0x87 [intel_rapl_msr] <4>[ 8.197568] ? __pfx_rapl_cpu_online+0x10/0x10 [intel_rapl_msr] <4>[ 8.197570] cpuhp_invoke_callback+0x41f/0x6c0 <4>[ 8.197573] ? cpuhp_thread_fun+0x6d/0x290 <4>[ 8.197575] cpuhp_thread_fun+0x1e2/0x290 <4>[ 8.197578] ? smpboot_thread_fn+0x26/0x290 <4>[ 8.197581] smpboot_thread_fn+0x12f/0x290 <4>[ 8.197584] ? __pfx_smpboot_thread_fn+0x10/0x10 <4>[ 8.197586] kthread+0x11f/0x250 <4>[ 8.197589] ? __pfx_kthread+0x10/0x10 <4>[ 8.197592] ret_from_fork+0x344/0x3a0 <4>[ 8.197595] ? __pfx_kthread+0x10/0x10 <4>[ 8.197597] ret_from_fork_asm+0x1a/0x30 <4>[ 8.197604] Fix this issue in the same way as rapl powercap package domain is added from the same CPU online callback by introducing another interface which doesn't call cpus_read_lock(). Add rapl_package_add_pmu_locked() and rapl_package_remove_pmu_locked() which don't call cpus_read_lock(). Fixes: 748d6ba43afd ("powercap: intel_rapl: Enable MSR-based RAPL PMU support") Reported-by: Borah, Chaitanya Kumar Closes: https://lore.kernel.org/linux-pm/5427ede1-57a0-43d1-99f3-8ca4b0643e82@intel.com/T/#u Tested-by: Kuppuswamy Sathyanarayanan Tested-by: RavitejaX Veesam Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20251217153455.3560176-1-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 24 ++++++++++++++++++------ drivers/powercap/intel_rapl_msr.c | 4 ++-- include/linux/intel_rapl.h | 4 ++++ 3 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index b9d87e56cbbc..3ff6da3bf4e6 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -2032,7 +2032,7 @@ end: return ret; } -int rapl_package_add_pmu(struct rapl_package *rp) +int rapl_package_add_pmu_locked(struct rapl_package *rp) { struct rapl_package_pmu_data *data = &rp->pmu_data; int idx; @@ -2040,8 +2040,6 @@ int rapl_package_add_pmu(struct rapl_package *rp) if (rp->has_pmu) return -EEXIST; - guard(cpus_read_lock)(); - for (idx = 0; idx < rp->nr_domains; idx++) { struct rapl_domain *rd = &rp->domains[idx]; int domain = rd->id; @@ -2091,17 +2089,23 @@ int rapl_package_add_pmu(struct rapl_package *rp) return rapl_pmu_update(rp); } +EXPORT_SYMBOL_GPL(rapl_package_add_pmu_locked); + +int rapl_package_add_pmu(struct rapl_package *rp) +{ + guard(cpus_read_lock)(); + + return rapl_package_add_pmu_locked(rp); +} EXPORT_SYMBOL_GPL(rapl_package_add_pmu); -void rapl_package_remove_pmu(struct rapl_package *rp) +void rapl_package_remove_pmu_locked(struct rapl_package *rp) { struct rapl_package *pos; if (!rp->has_pmu) return; - guard(cpus_read_lock)(); - list_for_each_entry(pos, &rapl_packages, plist) { /* PMU is still needed */ if (pos->has_pmu && pos != rp) @@ -2111,6 +2115,14 @@ void rapl_package_remove_pmu(struct rapl_package *rp) perf_pmu_unregister(&rapl_pmu.pmu); memset(&rapl_pmu, 0, sizeof(struct rapl_pmu)); } +EXPORT_SYMBOL_GPL(rapl_package_remove_pmu_locked); + +void rapl_package_remove_pmu(struct rapl_package *rp) +{ + guard(cpus_read_lock)(); + + rapl_package_remove_pmu_locked(rp); +} EXPORT_SYMBOL_GPL(rapl_package_remove_pmu); #endif diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 0ce1096b6314..9a7e150b3536 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -82,7 +82,7 @@ static int rapl_cpu_online(unsigned int cpu) if (IS_ERR(rp)) return PTR_ERR(rp); if (rapl_msr_pmu) - rapl_package_add_pmu(rp); + rapl_package_add_pmu_locked(rp); } cpumask_set_cpu(cpu, &rp->cpumask); return 0; @@ -101,7 +101,7 @@ static int rapl_cpu_down_prep(unsigned int cpu) lead_cpu = cpumask_first(&rp->cpumask); if (lead_cpu >= nr_cpu_ids) { if (rapl_msr_pmu) - rapl_package_remove_pmu(rp); + rapl_package_remove_pmu_locked(rp); rapl_remove_package_cpuslocked(rp); } else if (rp->lead_cpu == cpu) { rp->lead_cpu = lead_cpu; diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index e9ade2ff4af6..f479ef5b3341 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -214,10 +214,14 @@ void rapl_remove_package(struct rapl_package *rp); #ifdef CONFIG_PERF_EVENTS int rapl_package_add_pmu(struct rapl_package *rp); +int rapl_package_add_pmu_locked(struct rapl_package *rp); void rapl_package_remove_pmu(struct rapl_package *rp); +void rapl_package_remove_pmu_locked(struct rapl_package *rp); #else static inline int rapl_package_add_pmu(struct rapl_package *rp) { return 0; } +static inline int rapl_package_add_pmu_locked(struct rapl_package *rp) { return 0; } static inline void rapl_package_remove_pmu(struct rapl_package *rp) { } +static inline void rapl_package_remove_pmu_locked(struct rapl_package *rp) { } #endif #endif /* __INTEL_RAPL_H__ */ -- cgit v1.2.3 From 4cc5373f2e749a6c96e8b9fa971931a4dd852860 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Dec 2025 11:20:06 +0000 Subject: clang: work around asm output constraint problems Work around clang problems with "=rm" asm constraint. clang seems to always chose the memory output, while it is almost always the worst choice. Add ASM_OUTPUT_RM so that we can replace "=rm" constraint where it matters for clang, while not penalizing gcc. Signed-off-by: Eric Dumazet Suggested-by: Uros Bizjak Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 1 + include/linux/compiler_types.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 107ce05bd16e..7edf1a07b535 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -145,6 +145,7 @@ */ #define ASM_INPUT_G "ir" #define ASM_INPUT_RM "r" +#define ASM_OUTPUT_RM "=r" /* * Declare compiler support for __typeof_unqual__() operator. diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 1280693766b9..d3318a3c2577 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -548,11 +548,12 @@ struct ftrace_likely_data { /* * Clang has trouble with constraints with multiple - * alternative behaviors (mainly "g" and "rm"). + * alternative behaviors ("g" , "rm" and "=rm"). */ #ifndef ASM_INPUT_G #define ASM_INPUT_G "g" #define ASM_INPUT_RM "rm" + #define ASM_OUTPUT_RM "=rm" #endif #ifdef CONFIG_CC_HAS_ASM_INLINE -- cgit v1.2.3 From 87e7f6019097746d1d06f98874a9f179b7a68f3e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 19 Dec 2025 10:36:38 +0200 Subject: software node: Also support referencing non-constant software nodes Fwnode references are be implemented differently if referenced node is a software node. _Generic() is used to differentiate between the two cases but only const software nodes were present in the selection. Also add non-const software nodes. Reported-by: Kenneth Crudup Closes: https://lore.kernel.org/all/af773b82-bef2-4209-baaf-526d4661b7fc@panix.com/ Fixes: d7cdbbc93c56 ("software node: allow referencing firmware nodes") Signed-off-by: Sakari Ailus Tested-By: Kenneth R. Crudup Tested-by: Mehdi Djait # Dell XPS 9315 Reviewed-by: Mehdi Djait Link: https://patch.msgid.link/20251219083638.2454138-1-sakari.ailus@linux.intel.com Signed-off-by: Danilo Krummrich --- include/linux/property.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 272bfbdea7bf..e30ef23a9af3 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -371,6 +371,7 @@ struct software_node_ref_args { (const struct software_node_ref_args) { \ .swnode = _Generic(_ref_, \ const struct software_node *: _ref_, \ + struct software_node *: _ref_, \ default: NULL), \ .fwnode = _Generic(_ref_, \ struct fwnode_handle *: _ref_, \ -- cgit v1.2.3 From f059588c552746e0fe299214f35c58effa715b74 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 4 Dec 2025 13:31:52 -0500 Subject: virtio: make it self-contained virtio.h uses struct module, add a forward declaration to make the header self-contained. Message-ID: <9171b5cac60793eb59ab044c96ee038bf1363bee.1764873799.git.mst@redhat.com> Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 132a474e5914..3626eb694728 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -13,6 +13,8 @@ #include #include +struct module; + /** * struct virtqueue - a queue to register buffers for sending or receiving. * @list: the chain of virtqueues for this device -- cgit v1.2.3 From e88dfb93311c81359b00c12e0b396bd0ea13ad6c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 4 Dec 2025 12:49:34 -0500 Subject: virtio_features: make it self-contained virtio_features.h uses WARN_ON_ONCE and memset so it must include linux/bug.h and linux/string.h Message-ID: <579986aa9b8d023844990d2a0e267382f8ad85d5.1764873799.git.mst@redhat.com> Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_features.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_features.h b/include/linux/virtio_features.h index ea2ad8717882..ce59ea91f474 100644 --- a/include/linux/virtio_features.h +++ b/include/linux/virtio_features.h @@ -3,6 +3,8 @@ #define _LINUX_VIRTIO_FEATURES_H #include +#include +#include #define VIRTIO_FEATURES_U64S 2 #define VIRTIO_FEATURES_BITS (VIRTIO_FEATURES_U64S * 64) -- cgit v1.2.3 From 3df5dd46fca4b20efc4767c61d8ecc7249e83f5b Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:57:58 +0200 Subject: net/mlx5: Add max_tx_speed and its CAP bit to IFC Introduce the max_tx_speed field to the query and modify_vport_state structures. Add the esw_vport_state_max_tx_speed capability bit, indicating the firmware support modifying the max_tx_speed field via the MODIFY_VPORT_STATE command. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e9dcd4bf355d..e844cfa4fe0a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1071,7 +1071,9 @@ struct mlx5_ifc_e_switch_cap_bits { u8 esw_shared_ingress_acl[0x1]; u8 esw_uplink_ingress_acl[0x1]; u8 root_ft_on_other_esw[0x1]; - u8 reserved_at_a[0xf]; + u8 reserved_at_a[0x1]; + u8 esw_vport_state_max_tx_speed[0x1]; + u8 reserved_at_c[0xd]; u8 esw_functions_changed[0x1]; u8 reserved_at_1a[0x1]; u8 ecpf_vport_exists[0x1]; @@ -5445,7 +5447,8 @@ struct mlx5_ifc_query_vport_state_out_bits { u8 reserved_at_40[0x20]; - u8 reserved_at_60[0x18]; + u8 max_tx_speed[0x10]; + u8 reserved_at_70[0x8]; u8 admin_state[0x4]; u8 state[0x4]; }; @@ -7778,7 +7781,7 @@ struct mlx5_ifc_modify_vport_state_in_bits { u8 reserved_at_41[0xf]; u8 vport_number[0x10]; - u8 reserved_at_60[0x10]; + u8 max_tx_speed[0x10]; u8 ingress_connect[0x1]; u8 egress_connect[0x1]; u8 ingress_connect_valid[0x1]; -- cgit v1.2.3 From 50f1d188c580222d7a73e96a338a5dc82360ccc0 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:05 +0200 Subject: net/mlx5: Propagate LAG effective max_tx_speed to vports Currently, vports report only their parent's uplink speed, which in LAG setups does not reflect the true aggregated bandwidth. This makes it hard for upper-layer software to optimize load balancing decisions based on accurate bandwidth information. Fix the issue by calculating the possible maximum speed of a LAG as the sum of speeds of all active uplinks that are part of the LAG. Propagate this effective max speed to vports associated with the LAG whenever a relevant event occurs, such as physical port link state changes or LAG creation/modification. With this change, upper-layer components receive accurate bandwidth information corresponding to the active members of the LAG and can make better load balancing decisions. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 158 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 9 ++ .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/port.c | 24 ++++ drivers/net/ethernet/mellanox/mlx5/core/vport.c | 45 ++++++ include/linux/mlx5/vport.h | 4 + 6 files changed, 241 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index a459a30f36ca..c9d943a230b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -996,6 +996,126 @@ static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond) ldev->mode != MLX5_LAG_MODE_MPESW; } +#ifdef CONFIG_MLX5_ESWITCH +static int +mlx5_lag_sum_devices_speed(struct mlx5_lag *ldev, u32 *sum_speed, + int (*get_speed)(struct mlx5_core_dev *, u32 *)) +{ + struct mlx5_core_dev *pf_mdev; + int pf_idx; + u32 speed; + int ret; + + *sum_speed = 0; + mlx5_ldev_for_each(pf_idx, 0, ldev) { + pf_mdev = ldev->pf[pf_idx].dev; + if (!pf_mdev) + continue; + + ret = get_speed(pf_mdev, &speed); + if (ret) { + mlx5_core_dbg(pf_mdev, + "Failed to get device speed using %ps. Device %s speed is not available (err=%d)\n", + get_speed, dev_name(pf_mdev->device), + ret); + return ret; + } + + *sum_speed += speed; + } + + return 0; +} + +static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed) +{ + return mlx5_lag_sum_devices_speed(ldev, max_speed, + mlx5_port_max_linkspeed); +} + +static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev, + u32 speed) +{ + u16 op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + struct mlx5_vport *vport; + unsigned long i; + int ret; + + if (!esw) + return; + + if (!MLX5_CAP_ESW(mdev, esw_vport_state_max_tx_speed)) + return; + + mlx5_esw_for_each_vport(esw, i, vport) { + if (!vport) + continue; + + if (vport->vport == MLX5_VPORT_UPLINK) + continue; + + ret = mlx5_modify_vport_max_tx_speed(mdev, op_mod, + vport->vport, true, speed); + if (ret) + mlx5_core_dbg(mdev, + "Failed to set vport %d speed %d, err=%d\n", + vport->vport, speed, ret); + } +} + +void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *mdev; + u32 speed; + int pf_idx; + + speed = ldev->tracker.bond_speed_mbps; + + if (speed == SPEED_UNKNOWN) + return; + + /* If speed is not set, use the sum of max speeds of all PFs */ + if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed)) + return; + + speed = speed / MLX5_MAX_TX_SPEED_UNIT; + + mlx5_ldev_for_each(pf_idx, 0, ldev) { + mdev = ldev->pf[pf_idx].dev; + if (!mdev) + continue; + + mlx5_lag_modify_device_vports_speed(mdev, speed); + } +} + +void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *mdev; + u32 speed; + int pf_idx; + int ret; + + mlx5_ldev_for_each(pf_idx, 0, ldev) { + mdev = ldev->pf[pf_idx].dev; + if (!mdev) + continue; + + ret = mlx5_port_oper_linkspeed(mdev, &speed); + if (ret) { + mlx5_core_dbg(mdev, + "Failed to reset vports speed for device %s. Oper speed is not available (err=%d)\n", + dev_name(mdev->device), ret); + continue; + } + + speed = speed / MLX5_MAX_TX_SPEED_UNIT; + mlx5_lag_modify_device_vports_speed(mdev, speed); + } +} +#endif + static void mlx5_do_bond(struct mlx5_lag *ldev) { int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); @@ -1083,9 +1203,12 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) ndev); dev_put(ndev); } + mlx5_lag_set_vports_agg_speed(ldev); } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) { mlx5_modify_lag(ldev, &tracker); + mlx5_lag_set_vports_agg_speed(ldev); } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) { + mlx5_lag_reset_vports_speed(ldev); mlx5_disable_lag(ldev); } } @@ -1286,6 +1409,38 @@ static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev, return 1; } +static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker, + struct net_device *ndev) +{ + struct ethtool_link_ksettings lksettings; + struct net_device *bond_dev; + int err; + + if (netif_is_lag_master(ndev)) + bond_dev = ndev; + else + bond_dev = netdev_master_upper_dev_get(ndev); + + if (!bond_dev) { + tracker->bond_speed_mbps = SPEED_UNKNOWN; + return; + } + + err = __ethtool_get_link_ksettings(bond_dev, &lksettings); + if (err) { + netdev_dbg(bond_dev, + "Failed to get speed for bond dev %s, err=%d\n", + bond_dev->name, err); + tracker->bond_speed_mbps = SPEED_UNKNOWN; + return; + } + + if (lksettings.base.speed == SPEED_UNKNOWN) + tracker->bond_speed_mbps = 0; + else + tracker->bond_speed_mbps = lksettings.base.speed; +} + /* this handler is always registered to netdev events */ static int mlx5_lag_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -1317,6 +1472,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this, break; } + if (changed) + mlx5_lag_update_tracker_speed(&tracker, ndev); + ldev->tracker = tracker; if (changed) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 4918eee2b3da..8de5640a0161 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -48,6 +48,7 @@ struct lag_tracker { unsigned int is_bonded:1; unsigned int has_inactive:1; enum netdev_lag_hash hash_type; + u32 bond_speed_mbps; }; /* LAG data of a ConnectX card. @@ -116,6 +117,14 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev); void mlx5_lag_add_devices(struct mlx5_lag *ldev); struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev); +#ifdef CONFIG_MLX5_ESWITCH +void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev); +void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev); +#else +static inline void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) {} +static inline void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) {} +#endif + static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev) { if (!MLX5_CAP_GEN(dev, vport_group_manager) || diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index cfebc110c02f..9fdb9a543cf1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -381,6 +381,7 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev, u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev, struct mlx5_link_info *info, bool force_legacy); +int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); #define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 85a9e534f442..83044c9b6b41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -1200,6 +1200,30 @@ u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev, return link_modes; } +int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) +{ + const struct mlx5_link_info *table; + struct mlx5_port_eth_proto eproto; + u32 oper_speed = 0; + u32 max_size; + bool ext; + int err; + int i; + + ext = mlx5_ptys_ext_supported(mdev); + err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); + if (err) + return err; + + mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, false); + for (i = 0; i < max_size; ++i) + if (eproto.oper & MLX5E_PROT_MASK(i)) + oper_speed = max(oper_speed, table[i].speed); + + *speed = oper_speed; + return 0; +} + int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) { const struct mlx5_link_info *table; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 306affbcfd3b..78b1b291cfa4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -62,6 +62,28 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) return MLX5_GET(query_vport_state_out, out, state); } +static int mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, + u8 *admin_state) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + MLX5_SET(query_vport_state_in, in, other_vport, other_vport); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return err; + + *admin_state = MLX5_GET(query_vport_state_out, out, admin_state); + return 0; +} + int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state) { @@ -77,6 +99,29 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, return mlx5_cmd_exec_in(mdev, modify_vport_state, in); } +int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, u16 max_tx_speed) +{ + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {}; + u8 admin_state; + int err; + + err = mlx5_query_vport_admin_state(mdev, opmod, vport, other_vport, + &admin_state); + if (err) + return err; + + MLX5_SET(modify_vport_state_in, in, opcode, + MLX5_CMD_OP_MODIFY_VPORT_STATE); + MLX5_SET(modify_vport_state_in, in, op_mod, opmod); + MLX5_SET(modify_vport_state_in, in, vport_number, vport); + MLX5_SET(modify_vport_state_in, in, other_vport, other_vport); + MLX5_SET(modify_vport_state_in, in, admin_state, admin_state); + MLX5_SET(modify_vport_state_in, in, max_tx_speed, max_tx_speed); + + return mlx5_cmd_exec_in(mdev, modify_vport_state, in); +} + static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, bool other_vport, u32 *out) { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index f876bfc0669c..2acf10e9f60a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -41,6 +41,8 @@ (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \ mlx5_core_is_pf(mdev)) +#define MLX5_MAX_TX_SPEED_UNIT 100 + enum { MLX5_CAP_INLINE_MODE_L2, MLX5_CAP_INLINE_MODE_VPORT_CONTEXT, @@ -58,6 +60,8 @@ enum { u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state); +int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, u16 max_tx_speed); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, bool other, u8 *addr); int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr); -- cgit v1.2.3 From 28ea6036dad268a055b693d9c06a6f3d126096d3 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:13 +0200 Subject: net/mlx5: Handle port and vport speed change events in MPESW Add port change event handling logic for MPESW LAG mode, ensuring VFs are updated when the speed of LAG physical ports changes. This triggers a speed update workflow when relevant port state changes occur, enabling consistent and accurate reporting of VF bandwidth. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 38 ++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 2 ++ .../net/ethernet/mellanox/mlx5/core/lag/mpesw.c | 39 ++++++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/lag/mpesw.h | 14 ++++++++ drivers/net/ethernet/mellanox/mlx5/core/vport.c | 29 ++++++++++++++++ include/linux/mlx5/driver.h | 1 + include/linux/mlx5/vport.h | 2 ++ 7 files changed, 121 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index c9d943a230b5..2d6024ebe346 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -233,14 +233,25 @@ static void mlx5_ldev_free(struct kref *ref) { struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref); struct net *net; + int i; if (ldev->nb.notifier_call) { net = read_pnet(&ldev->net); unregister_netdevice_notifier_net(net, &ldev->nb); } + mlx5_ldev_for_each(i, 0, ldev) { + if (ldev->pf[i].dev && + ldev->pf[i].port_change_nb.nb.notifier_call) { + struct mlx5_nb *nb = &ldev->pf[i].port_change_nb; + + mlx5_eq_notifier_unregister(ldev->pf[i].dev, nb); + } + } + mlx5_lag_mp_cleanup(ldev); cancel_delayed_work_sync(&ldev->bond_work); + cancel_work_sync(&ldev->speed_update_work); destroy_workqueue(ldev->wq); mutex_destroy(&ldev->lock); kfree(ldev); @@ -274,6 +285,7 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev) kref_init(&ldev->ref); mutex_init(&ldev->lock); INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); + INIT_WORK(&ldev->speed_update_work, mlx5_mpesw_speed_update_work); ldev->nb.notifier_call = mlx5_lag_netdev_event; write_pnet(&ldev->net, mlx5_core_net(dev)); @@ -1033,6 +1045,13 @@ static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed) mlx5_port_max_linkspeed); } +static int mlx5_lag_sum_devices_oper_speed(struct mlx5_lag *ldev, + u32 *oper_speed) +{ + return mlx5_lag_sum_devices_speed(ldev, oper_speed, + mlx5_port_oper_linkspeed); +} + static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev, u32 speed) { @@ -1070,10 +1089,14 @@ void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) u32 speed; int pf_idx; - speed = ldev->tracker.bond_speed_mbps; - - if (speed == SPEED_UNKNOWN) - return; + if (ldev->mode == MLX5_LAG_MODE_MPESW) { + if (mlx5_lag_sum_devices_oper_speed(ldev, &speed)) + return; + } else { + speed = ldev->tracker.bond_speed_mbps; + if (speed == SPEED_UNKNOWN) + return; + } /* If speed is not set, use the sum of max speeds of all PFs */ if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed)) @@ -1520,6 +1543,10 @@ static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev, ldev->pf[fn].dev = dev; dev->priv.lag = ldev; + + MLX5_NB_INIT(&ldev->pf[fn].port_change_nb, + mlx5_lag_mpesw_port_change_event, PORT_CHANGE); + mlx5_eq_notifier_register(dev, &ldev->pf[fn].port_change_nb); } static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, @@ -1531,6 +1558,9 @@ static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, if (ldev->pf[fn].dev != dev) return; + if (ldev->pf[fn].port_change_nb.nb.notifier_call) + mlx5_eq_notifier_unregister(dev, &ldev->pf[fn].port_change_nb); + ldev->pf[fn].dev = NULL; dev->priv.lag = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 8de5640a0161..be1afece5fdc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -39,6 +39,7 @@ struct lag_func { struct mlx5_core_dev *dev; struct net_device *netdev; bool has_drop; + struct mlx5_nb port_change_nb; }; /* Used for collection of netdev event info. */ @@ -67,6 +68,7 @@ struct mlx5_lag { struct lag_tracker tracker; struct workqueue_struct *wq; struct delayed_work bond_work; + struct work_struct speed_update_work; struct notifier_block nb; possible_net_t net; struct lag_mp lag_mp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c index 2d86af8f0d9b..04762562d7d9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -110,6 +110,8 @@ static int enable_mpesw(struct mlx5_lag *ldev) goto err_rescan_drivers; } + mlx5_lag_set_vports_agg_speed(ldev); + return 0; err_rescan_drivers: @@ -223,3 +225,40 @@ bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev) return ldev && ldev->mode == MLX5_LAG_MODE_MPESW; } EXPORT_SYMBOL(mlx5_lag_is_mpesw); + +void mlx5_mpesw_speed_update_work(struct work_struct *work) +{ + struct mlx5_lag *ldev = container_of(work, struct mlx5_lag, + speed_update_work); + + mutex_lock(&ldev->lock); + if (ldev->mode == MLX5_LAG_MODE_MPESW) { + if (ldev->mode_changes_in_progress) + queue_work(ldev->wq, &ldev->speed_update_work); + else + mlx5_lag_set_vports_agg_speed(ldev); + } + + mutex_unlock(&ldev->lock); +} + +int mlx5_lag_mpesw_port_change_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct mlx5_nb *mlx5_nb = container_of(nb, struct mlx5_nb, nb); + struct lag_func *lag_func = container_of(mlx5_nb, + struct lag_func, + port_change_nb); + struct mlx5_core_dev *dev = lag_func->dev; + struct mlx5_lag *ldev = dev->priv.lag; + struct mlx5_eqe *eqe = data; + + if (!ldev) + return NOTIFY_DONE; + + if (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_DOWN || + eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) + queue_work(ldev->wq, &ldev->speed_update_work); + + return NOTIFY_OK; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h index 02520f27a033..f5d9b5c97b0d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h @@ -32,4 +32,18 @@ bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev); void mlx5_lag_mpesw_disable(struct mlx5_core_dev *dev); int mlx5_lag_mpesw_enable(struct mlx5_core_dev *dev); +#ifdef CONFIG_MLX5_ESWITCH +void mlx5_mpesw_speed_update_work(struct work_struct *work); +int mlx5_lag_mpesw_port_change_event(struct notifier_block *nb, + unsigned long event, void *data); +#else +static inline void mlx5_mpesw_speed_update_work(struct work_struct *work) {} +static inline int mlx5_lag_mpesw_port_change_event(struct notifier_block *nb, + unsigned long event, + void *data) +{ + return NOTIFY_DONE; +} +#endif /* CONFIG_MLX5_ESWITCH */ + #endif /* __MLX5_LAG_MPESW_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 78b1b291cfa4..cb098d3eb2fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -122,6 +122,35 @@ int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, return mlx5_cmd_exec_in(mdev, modify_vport_state, in); } +int mlx5_query_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 op_mod, + u16 vport, u8 other_vport, u32 *max_tx_speed) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + u32 state; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, op_mod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + MLX5_SET(query_vport_state_in, in, other_vport, other_vport); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return err; + + state = MLX5_GET(query_vport_state_out, out, state); + if (state == VPORT_STATE_DOWN) { + *max_tx_speed = 0; + return 0; + } + + *max_tx_speed = MLX5_GET(query_vport_state_out, out, max_tx_speed); + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_vport_max_tx_speed); + static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, bool other_vport, u32 *out) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1c54aa6f74fb..9e0ab3cfab73 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1149,6 +1149,7 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); +int mlx5_lag_query_bond_speed(struct net_device *bond_dev, u32 *speed); bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 2acf10e9f60a..dfa2fe32217a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -60,6 +60,8 @@ enum { u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state); +int mlx5_query_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 op_mod, + u16 vport, u8 other_vport, u32 *max_tx_speed); int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u16 max_tx_speed); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, -- cgit v1.2.3 From f0b2fde98065e49795ce6824837b3f53fdf16e5d Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:20 +0200 Subject: net/mlx5: Add support for querying bond speed Add mlx5_lag_query_bond_speed() to query the aggregated speed of lag configurations with a bond device. Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 27 +++++++++++++++++++++++ include/linux/mlx5/driver.h | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 2d6024ebe346..9fe47c836ebd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -1464,6 +1464,33 @@ static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker, tracker->bond_speed_mbps = lksettings.base.speed; } +/* Returns speed in Mbps. */ +int mlx5_lag_query_bond_speed(struct mlx5_core_dev *mdev, u32 *speed) +{ + struct mlx5_lag *ldev; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(mdev); + if (!ldev) { + ret = -ENODEV; + goto unlock; + } + + *speed = ldev->tracker.bond_speed_mbps; + + if (*speed == SPEED_UNKNOWN) { + mlx5_core_dbg(mdev, "Bond speed is unknown\n"); + ret = -EINVAL; + } + +unlock: + spin_unlock_irqrestore(&lag_lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_lag_query_bond_speed); + /* this handler is always registered to netdev events */ static int mlx5_lag_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9e0ab3cfab73..e2d067b1e67b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1149,7 +1149,7 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); -int mlx5_lag_query_bond_speed(struct net_device *bond_dev, u32 *speed); +int mlx5_lag_query_bond_speed(struct mlx5_core_dev *dev, u32 *speed); bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); -- cgit v1.2.3