From 1e372b246199ca7a35f930177fea91b557dac16e Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Wed, 21 Jan 2026 10:10:47 +0100 Subject: drm, drm/xe: Fix xe userptr in the absence of CONFIG_DEVICE_PRIVATE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CONFIG_DEVICE_PRIVATE is not selected by default by some distros, for example Fedora, and that leads to a regression in the xe driver since userptr support gets compiled out. It turns out that DRM_GPUSVM, which is needed for xe userptr support compiles also without CONFIG_DEVICE_PRIVATE, but doesn't compile without CONFIG_ZONE_DEVICE. Exclude the drm_pagemap files from compilation with !CONFIG_ZONE_DEVICE, and remove the CONFIG_DEVICE_PRIVATE dependency from CONFIG_DRM_GPUSVM and the xe driver's selection of it, re-enabling xe userptr for those configs. v2: - Don't compile the drm_pagemap files unless CONFIG_ZONE_DEVICE is set. - Adjust the drm_pagemap.h header accordingly. Fixes: 9e9787414882 ("drm/xe/userptr: replace xe_hmm with gpusvm") Cc: Matthew Auld Cc: Himal Prasad Ghimiray Cc: Thomas Hellström Cc: Matthew Brost Cc: "Thomas Hellström" Cc: Rodrigo Vivi Cc: dri-devel@lists.freedesktop.org Cc: # v6.18+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Acked-by: Maarten Lankhorst Link: https://patch.msgid.link/20260121091048.41371-2-thomas.hellstrom@linux.intel.com --- include/drm/drm_pagemap.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h index 46e9c58f09e0..2baf0861f78f 100644 --- a/include/drm/drm_pagemap.h +++ b/include/drm/drm_pagemap.h @@ -243,6 +243,8 @@ struct drm_pagemap_devmem_ops { struct dma_fence *pre_migrate_fence); }; +#if IS_ENABLED(CONFIG_ZONE_DEVICE) + int drm_pagemap_init(struct drm_pagemap *dpagemap, struct dev_pagemap *pagemap, struct drm_device *drm, @@ -252,17 +254,22 @@ struct drm_pagemap *drm_pagemap_create(struct 
drm_device *drm, struct dev_pagemap *pagemap, const struct drm_pagemap_ops *ops); -#if IS_ENABLED(CONFIG_DRM_GPUSVM) +struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page); void drm_pagemap_put(struct drm_pagemap *dpagemap); #else +static inline struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page) +{ + return NULL; +} + static inline void drm_pagemap_put(struct drm_pagemap *dpagemap) { } -#endif /* IS_ENABLED(CONFIG_DRM_GPUSVM) */ +#endif /* IS_ENABLED(CONFIG_ZONE_DEVICE) */ /** * drm_pagemap_get() - Obtain a reference on a struct drm_pagemap @@ -334,6 +341,8 @@ struct drm_pagemap_migrate_details { u32 source_peer_migrates : 1; }; +#if IS_ENABLED(CONFIG_ZONE_DEVICE) + int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, struct mm_struct *mm, unsigned long start, unsigned long end, @@ -343,8 +352,6 @@ int drm_pagemap_evict_to_ram(struct drm_pagemap_devmem *devmem_allocation); const struct dev_pagemap_ops *drm_pagemap_pagemap_ops_get(void); -struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page); - void drm_pagemap_devmem_init(struct drm_pagemap_devmem *devmem_allocation, struct device *dev, struct mm_struct *mm, const struct drm_pagemap_devmem_ops *ops, @@ -359,4 +366,7 @@ int drm_pagemap_populate_mm(struct drm_pagemap *dpagemap, void drm_pagemap_destroy(struct drm_pagemap *dpagemap, bool is_atomic_or_reclaim); int drm_pagemap_reinit(struct drm_pagemap *dpagemap); + +#endif /* IS_ENABLED(CONFIG_ZONE_DEVICE) */ + #endif -- cgit v1.2.3 From 46e51fd0149a85e8165868761f7ede2fdec56654 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Tue, 9 Dec 2025 09:55:49 +0200 Subject: video/vga: Add VGA_IS0_R MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a proper name for the "Input status register 0" IO address. Currently we have some code that does reads using the aliasing VGA_MSR_W define, making it unclear what register we're actually reading. v2: Remove stray '?' 
Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Signed-off-by: Ville Syrjälä Link: https://patch.msgid.link/20251209075549.14051-1-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula Acked-by: Helge Deller --- include/video/vga.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/video/vga.h b/include/video/vga.h index 468764d6727a..2f13c371800b 100644 --- a/include/video/vga.h +++ b/include/video/vga.h @@ -46,6 +46,7 @@ #define VGA_MIS_R 0x3CC /* Misc Output Read Register */ #define VGA_MIS_W 0x3C2 /* Misc Output Write Register */ #define VGA_FTC_R 0x3CA /* Feature Control Read Register */ +#define VGA_IS0_R 0x3C2 /* Input Status Register 0 */ #define VGA_IS1_RC 0x3DA /* Input Status Register 1 - color emulation */ #define VGA_IS1_RM 0x3BA /* Input Status Register 1 - mono emulation */ #define VGA_PEL_D 0x3C9 /* PEL Data Register */ -- cgit v1.2.3 From 20382658ffbb98e1ea8bc22fd774f76d4f342dc9 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Mon, 8 Dec 2025 20:26:35 +0200 Subject: drm/i915: Get rid of the INTEL_GMCH_CTRL alias MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit INTEL_GMCH_CTRL and I830_GMCH_CTRL are the same register. Get rid of the INTEL_GMCH_CTRL name. Signed-off-by: Ville Syrjälä Link: https://patch.msgid.link/20251208182637.334-18-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_vga.c | 2 +- include/drm/intel/i915_drm.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_vga.c b/drivers/gpu/drm/i915/display/intel_vga.c index 9e1f3ab632d5..6fc3e3702cb8 100644 --- a/drivers/gpu/drm/i915/display/intel_vga.c +++ b/drivers/gpu/drm/i915/display/intel_vga.c @@ -20,7 +20,7 @@ static unsigned int intel_gmch_ctrl_reg(struct intel_display *display) { - return DISPLAY_VER(display) >= 6 ? 
SNB_GMCH_CTRL : INTEL_GMCH_CTRL; + return DISPLAY_VER(display) >= 6 ? SNB_GMCH_CTRL : I830_GMCH_CTRL; } static bool intel_vga_decode_is_enabled(struct intel_display *display) diff --git a/include/drm/intel/i915_drm.h b/include/drm/intel/i915_drm.h index adff68538484..91f628367f1f 100644 --- a/include/drm/intel/i915_drm.h +++ b/include/drm/intel/i915_drm.h @@ -44,8 +44,6 @@ extern struct resource intel_graphics_stolen_res; * This is all handled in the intel-gtt.ko module. i915.ko only * cares about the vga bit for the vga arbiter. */ -#define INTEL_GMCH_CTRL 0x52 -#define INTEL_GMCH_VGA_DISABLE (1 << 1) #define SNB_GMCH_CTRL 0x50 #define SNB_GMCH_GGMS_SHIFT 8 /* GTT Graphics Memory Size */ #define SNB_GMCH_GGMS_MASK 0x3 @@ -80,6 +78,9 @@ extern struct resource intel_graphics_stolen_res; #define INTEL_GMCH_GMS_STOLEN_224M (0xc << 4) #define INTEL_GMCH_GMS_STOLEN_352M (0xd << 4) +/* valid for both I830_GMCH_CTRL and SNB_GMCH_CTRL */ +#define INTEL_GMCH_VGA_DISABLE (1 << 1) + #define I830_DRB3 0x63 #define I85X_DRB3 0x43 #define I865_TOUD 0xc4 -- cgit v1.2.3 From 29ed5593ca202da679493c25f82bf89a59114092 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Mon, 8 Dec 2025 20:26:36 +0200 Subject: drm/i915: Clean up PCI config space reg defines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PCI config space register defines in i915_drm.h are a bit of a mess; Whitespace is all over the place, register masks and values are defined in inconsistent ways. Clean it up a bit. 
Signed-off-by: Ville Syrjälä Link: https://patch.msgid.link/20251208182637.334-19-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula --- include/drm/intel/i915_drm.h | 70 +++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/drm/intel/i915_drm.h b/include/drm/intel/i915_drm.h index 91f628367f1f..c633ce62f2bf 100644 --- a/include/drm/intel/i915_drm.h +++ b/include/drm/intel/i915_drm.h @@ -45,38 +45,36 @@ extern struct resource intel_graphics_stolen_res; * cares about the vga bit for the vga arbiter. */ #define SNB_GMCH_CTRL 0x50 -#define SNB_GMCH_GGMS_SHIFT 8 /* GTT Graphics Memory Size */ -#define SNB_GMCH_GGMS_MASK 0x3 -#define SNB_GMCH_GMS_SHIFT 3 /* Graphics Mode Select */ -#define SNB_GMCH_GMS_MASK 0x1f -#define BDW_GMCH_GGMS_SHIFT 6 -#define BDW_GMCH_GGMS_MASK 0x3 -#define BDW_GMCH_GMS_SHIFT 8 -#define BDW_GMCH_GMS_MASK 0xff +#define SNB_GMCH_GGMS_SHIFT 8 /* GTT Graphics Memory Size */ +#define SNB_GMCH_GGMS_MASK 0x3 +#define SNB_GMCH_GMS_SHIFT 3 /* Graphics Mode Select */ +#define SNB_GMCH_GMS_MASK 0x1f +#define BDW_GMCH_GGMS_SHIFT 6 +#define BDW_GMCH_GGMS_MASK 0x3 +#define BDW_GMCH_GMS_SHIFT 8 +#define BDW_GMCH_GMS_MASK 0xff #define I830_GMCH_CTRL 0x52 - -#define I830_GMCH_GMS_MASK 0x70 -#define I830_GMCH_GMS_LOCAL 0x10 -#define I830_GMCH_GMS_STOLEN_512 0x20 -#define I830_GMCH_GMS_STOLEN_1024 0x30 -#define I830_GMCH_GMS_STOLEN_8192 0x40 - -#define I855_GMCH_GMS_MASK 0xF0 -#define I855_GMCH_GMS_STOLEN_0M 0x0 -#define I855_GMCH_GMS_STOLEN_1M (0x1 << 4) -#define I855_GMCH_GMS_STOLEN_4M (0x2 << 4) -#define I855_GMCH_GMS_STOLEN_8M (0x3 << 4) -#define I855_GMCH_GMS_STOLEN_16M (0x4 << 4) -#define I855_GMCH_GMS_STOLEN_32M (0x5 << 4) -#define I915_GMCH_GMS_STOLEN_48M (0x6 << 4) -#define I915_GMCH_GMS_STOLEN_64M (0x7 << 4) -#define G33_GMCH_GMS_STOLEN_128M (0x8 << 4) -#define G33_GMCH_GMS_STOLEN_256M (0x9 << 4) -#define INTEL_GMCH_GMS_STOLEN_96M (0xa << 4) -#define 
INTEL_GMCH_GMS_STOLEN_160M (0xb << 4) -#define INTEL_GMCH_GMS_STOLEN_224M (0xc << 4) -#define INTEL_GMCH_GMS_STOLEN_352M (0xd << 4) +#define I830_GMCH_GMS_MASK (0x7 << 4) +#define I830_GMCH_GMS_LOCAL (0x1 << 4) +#define I830_GMCH_GMS_STOLEN_512 (0x2 << 4) +#define I830_GMCH_GMS_STOLEN_1024 (0x3 << 4) +#define I830_GMCH_GMS_STOLEN_8192 (0x4 << 4) +#define I855_GMCH_GMS_MASK (0xF << 4) +#define I855_GMCH_GMS_STOLEN_0M (0x0 << 4) +#define I855_GMCH_GMS_STOLEN_1M (0x1 << 4) +#define I855_GMCH_GMS_STOLEN_4M (0x2 << 4) +#define I855_GMCH_GMS_STOLEN_8M (0x3 << 4) +#define I855_GMCH_GMS_STOLEN_16M (0x4 << 4) +#define I855_GMCH_GMS_STOLEN_32M (0x5 << 4) +#define I915_GMCH_GMS_STOLEN_48M (0x6 << 4) +#define I915_GMCH_GMS_STOLEN_64M (0x7 << 4) +#define G33_GMCH_GMS_STOLEN_128M (0x8 << 4) +#define G33_GMCH_GMS_STOLEN_256M (0x9 << 4) +#define INTEL_GMCH_GMS_STOLEN_96M (0xa << 4) +#define INTEL_GMCH_GMS_STOLEN_160M (0xb << 4) +#define INTEL_GMCH_GMS_STOLEN_224M (0xc << 4) +#define INTEL_GMCH_GMS_STOLEN_352M (0xd << 4) /* valid for both I830_GMCH_CTRL and SNB_GMCH_CTRL */ #define INTEL_GMCH_VGA_DISABLE (1 << 1) @@ -88,12 +86,12 @@ extern struct resource intel_graphics_stolen_res; #define I830_ESMRAMC 0x91 #define I845_ESMRAMC 0x9e #define I85X_ESMRAMC 0x61 -#define TSEG_ENABLE (1 << 0) -#define I830_TSEG_SIZE_512K (0 << 1) -#define I830_TSEG_SIZE_1M (1 << 1) -#define I845_TSEG_SIZE_MASK (3 << 1) -#define I845_TSEG_SIZE_512K (2 << 1) -#define I845_TSEG_SIZE_1M (3 << 1) +#define TSEG_ENABLE (1 << 0) +#define I830_TSEG_SIZE_512K (0 << 1) +#define I830_TSEG_SIZE_1M (1 << 1) +#define I845_TSEG_SIZE_MASK (3 << 1) +#define I845_TSEG_SIZE_512K (2 << 1) +#define I845_TSEG_SIZE_1M (3 << 1) #define INTEL_BSM 0x5c #define INTEL_GEN11_BSM_DW0 0xc0 -- cgit v1.2.3 From 42bb7bdae97cc4e285fb741d9ebd1f5a36fe3f94 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Mon, 8 Dec 2025 20:26:37 +0200 Subject: drm/i915: Document the GMCH_CTRL register a bit MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit The actual GMCH_CTRL lives in the host bridge aka. device 0, but device 2 has a read-only mirror on i85x/i865+. Document that fact. Also remove the ancient tales about where the defines are used. Those haven't been true in a long time. Signed-off-by: Ville Syrjälä Link: https://patch.msgid.link/20251208182637.334-20-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula --- include/drm/intel/i915_drm.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/drm/intel/i915_drm.h b/include/drm/intel/i915_drm.h index c633ce62f2bf..1fdaabed1470 100644 --- a/include/drm/intel/i915_drm.h +++ b/include/drm/intel/i915_drm.h @@ -39,11 +39,11 @@ bool i915_gpu_turbo_disable(void); extern struct resource intel_graphics_stolen_res; /* - * The Bridge device's PCI config space has information about the - * fb aperture size and the amount of pre-reserved memory. - * This is all handled in the intel-gtt.ko module. i915.ko only - * cares about the vga bit for the vga arbiter. + * The bridge device's (device 0) PCI config space has information + * about the fb aperture size and the amount of pre-reserved memory. 
*/ + +/* device 2 has a read-only mirror */ #define SNB_GMCH_CTRL 0x50 #define SNB_GMCH_GGMS_SHIFT 8 /* GTT Graphics Memory Size */ #define SNB_GMCH_GGMS_MASK 0x3 @@ -54,6 +54,7 @@ extern struct resource intel_graphics_stolen_res; #define BDW_GMCH_GMS_SHIFT 8 #define BDW_GMCH_GMS_MASK 0xff +/* device 2 has a read-only mirror from i85x/i865 onwards */ #define I830_GMCH_CTRL 0x52 #define I830_GMCH_GMS_MASK (0x7 << 4) #define I830_GMCH_GMS_LOCAL (0x1 << 4) -- cgit v1.2.3 From cca7eda1c73045d6fb12b3db34f90de65412e742 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 20 Jan 2026 17:45:41 +0200 Subject: drm/{i915, xe}/dsb: move DSB buffer to parent interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the DSB buffer handling to the display parent interface, making display more independent of i915 and xe driver implementations. Since the DSB parent interface is only called from intel_dsb.c, add the wrappers there with smaller visibility instead of the usual intel_parent.[ch], and using struct intel_dsb as the context parameter for convenience. Unfortunately, memset() being a macro in linux/fortify-string.h, we can't use that as the function pointer name. dsb->memset() would be using the macro and leading to build failures. Therefore, use .fill() for the memset() functionality. 
v2: s/memset/fill/ Reviewed-by: Michał Grzelak Link: https://patch.msgid.link/df117c862a6d34dae340e4a85c2482b4e29c8884.1768923917.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dsb.c | 92 +++++++++++++++++++------ drivers/gpu/drm/i915/display/intel_dsb_buffer.h | 22 ------ drivers/gpu/drm/i915/i915_driver.c | 2 + drivers/gpu/drm/i915/i915_dsb_buffer.c | 28 +++++--- drivers/gpu/drm/i915/i915_dsb_buffer.h | 9 +++ drivers/gpu/drm/xe/display/xe_display.c | 2 + drivers/gpu/drm/xe/display/xe_dsb_buffer.c | 28 +++++--- drivers/gpu/drm/xe/display/xe_dsb_buffer.h | 9 +++ include/drm/intel/display_parent_interface.h | 14 ++++ 9 files changed, 146 insertions(+), 60 deletions(-) delete mode 100644 drivers/gpu/drm/i915/display/intel_dsb_buffer.h create mode 100644 drivers/gpu/drm/i915/i915_dsb_buffer.h create mode 100644 drivers/gpu/drm/xe/display/xe_dsb_buffer.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_dsb.c b/drivers/gpu/drm/i915/display/intel_dsb.c index 91060e2a5762..cf5fb30cab83 100644 --- a/drivers/gpu/drm/i915/display/intel_dsb.c +++ b/drivers/gpu/drm/i915/display/intel_dsb.c @@ -8,6 +8,7 @@ #include #include +#include #include "intel_crtc.h" #include "intel_de.h" @@ -15,7 +16,6 @@ #include "intel_display_rpm.h" #include "intel_display_types.h" #include "intel_dsb.h" -#include "intel_dsb_buffer.h" #include "intel_dsb_regs.h" #include "intel_vblank.h" #include "intel_vrr.h" @@ -75,6 +75,57 @@ struct intel_dsb { * writes). There are no registers reads possible with DSB HW engine. */ +/* + * DSB buffer parent interface calls are here instead of intel_parent.[ch] + * because they're not used outside of intel_dsb.c. 
+ */ +static u32 dsb_buffer_ggtt_offset(struct intel_dsb *dsb) +{ + struct intel_display *display = to_intel_display(dsb->crtc); + + return display->parent->dsb->ggtt_offset(dsb->dsb_buf); +} + +static void dsb_buffer_write(struct intel_dsb *dsb, u32 idx, u32 val) +{ + struct intel_display *display = to_intel_display(dsb->crtc); + + display->parent->dsb->write(dsb->dsb_buf, idx, val); +} + +static u32 dsb_buffer_read(struct intel_dsb *dsb, u32 idx) +{ + struct intel_display *display = to_intel_display(dsb->crtc); + + return display->parent->dsb->read(dsb->dsb_buf, idx); +} + +static void dsb_buffer_fill(struct intel_dsb *dsb, u32 idx, u32 val, size_t size) +{ + struct intel_display *display = to_intel_display(dsb->crtc); + + display->parent->dsb->fill(dsb->dsb_buf, idx, val, size); +} + +static struct intel_dsb_buffer *dsb_buffer_create(struct intel_display *display, size_t size) +{ + return display->parent->dsb->create(display->drm, size); +} + +static void dsb_buffer_cleanup(struct intel_dsb *dsb) +{ + struct intel_display *display = to_intel_display(dsb->crtc); + + display->parent->dsb->cleanup(dsb->dsb_buf); +} + +static void dsb_buffer_flush_map(struct intel_dsb *dsb) +{ + struct intel_display *display = to_intel_display(dsb->crtc); + + display->parent->dsb->flush_map(dsb->dsb_buf); +} + /* DSB opcodes. 
*/ #define DSB_OPCODE_SHIFT 24 #define DSB_OPCODE_NOOP 0x0 @@ -211,10 +262,10 @@ static void intel_dsb_dump(struct intel_dsb *dsb) for (i = 0; i < ALIGN(dsb->free_pos, 64 / 4); i += 4) drm_dbg_kms(display->drm, " 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n", i * 4, - intel_dsb_buffer_read(dsb->dsb_buf, i), - intel_dsb_buffer_read(dsb->dsb_buf, i + 1), - intel_dsb_buffer_read(dsb->dsb_buf, i + 2), - intel_dsb_buffer_read(dsb->dsb_buf, i + 3)); + dsb_buffer_read(dsb, i), + dsb_buffer_read(dsb, i + 1), + dsb_buffer_read(dsb, i + 2), + dsb_buffer_read(dsb, i + 3)); drm_dbg_kms(display->drm, "}\n"); } @@ -231,12 +282,12 @@ unsigned int intel_dsb_size(struct intel_dsb *dsb) unsigned int intel_dsb_head(struct intel_dsb *dsb) { - return intel_dsb_buffer_ggtt_offset(dsb->dsb_buf); + return dsb_buffer_ggtt_offset(dsb); } static unsigned int intel_dsb_tail(struct intel_dsb *dsb) { - return intel_dsb_buffer_ggtt_offset(dsb->dsb_buf) + intel_dsb_size(dsb); + return dsb_buffer_ggtt_offset(dsb) + intel_dsb_size(dsb); } static void intel_dsb_ins_align(struct intel_dsb *dsb) @@ -263,8 +314,8 @@ static void intel_dsb_emit(struct intel_dsb *dsb, u32 ldw, u32 udw) dsb->ins[0] = ldw; dsb->ins[1] = udw; - intel_dsb_buffer_write(dsb->dsb_buf, dsb->free_pos++, dsb->ins[0]); - intel_dsb_buffer_write(dsb->dsb_buf, dsb->free_pos++, dsb->ins[1]); + dsb_buffer_write(dsb, dsb->free_pos++, dsb->ins[0]); + dsb_buffer_write(dsb, dsb->free_pos++, dsb->ins[1]); } static bool intel_dsb_prev_ins_is_write(struct intel_dsb *dsb, @@ -335,13 +386,12 @@ void intel_dsb_reg_write_indexed(struct intel_dsb *dsb, /* Update the count */ dsb->ins[0]++; - intel_dsb_buffer_write(dsb->dsb_buf, dsb->ins_start_offset + 0, - dsb->ins[0]); + dsb_buffer_write(dsb, dsb->ins_start_offset + 0, dsb->ins[0]); - intel_dsb_buffer_write(dsb->dsb_buf, dsb->free_pos++, val); + dsb_buffer_write(dsb, dsb->free_pos++, val); /* if number of data words is odd, then the last dword should be 0.*/ if (dsb->free_pos & 0x1) - 
intel_dsb_buffer_write(dsb->dsb_buf, dsb->free_pos, 0); + dsb_buffer_write(dsb, dsb->free_pos, 0); } void intel_dsb_reg_write(struct intel_dsb *dsb, @@ -521,8 +571,7 @@ static void intel_dsb_align_tail(struct intel_dsb *dsb) aligned_tail = ALIGN(tail, CACHELINE_BYTES); if (aligned_tail > tail) - intel_dsb_buffer_memset(dsb->dsb_buf, dsb->free_pos, 0, - aligned_tail - tail); + dsb_buffer_fill(dsb, dsb->free_pos, 0, aligned_tail - tail); dsb->free_pos = aligned_tail / 4; } @@ -541,8 +590,7 @@ static void intel_dsb_gosub_align(struct intel_dsb *dsb) * "Ensure GOSUB is not placed in cacheline QW slot 6 or 7 (numbered 0-7)" */ if (aligned_tail - tail <= 2 * 8) - intel_dsb_buffer_memset(dsb->dsb_buf, dsb->free_pos, 0, - aligned_tail - tail); + dsb_buffer_fill(dsb, dsb->free_pos, 0, aligned_tail - tail); dsb->free_pos = aligned_tail / 4; } @@ -606,14 +654,14 @@ void intel_dsb_gosub_finish(struct intel_dsb *dsb) */ intel_dsb_noop(dsb, 8); - intel_dsb_buffer_flush_map(dsb->dsb_buf); + dsb_buffer_flush_map(dsb); } void intel_dsb_finish(struct intel_dsb *dsb) { intel_dsb_align_tail(dsb); - intel_dsb_buffer_flush_map(dsb->dsb_buf); + dsb_buffer_flush_map(dsb); } static u32 dsb_error_int_status(struct intel_display *display) @@ -917,7 +965,7 @@ void intel_dsb_wait(struct intel_dsb *dsb) !is_busy, 100, 1000, false); if (ret) { - u32 offset = intel_dsb_buffer_ggtt_offset(dsb->dsb_buf); + u32 offset = dsb_buffer_ggtt_offset(dsb); intel_de_write_fw(display, DSB_CTRL(pipe, dsb->id), DSB_ENABLE | DSB_HALT); @@ -983,7 +1031,7 @@ struct intel_dsb *intel_dsb_prepare(struct intel_atomic_state *state, /* ~1 qword per instruction, full cachelines */ size = ALIGN(max_cmds * 8, CACHELINE_BYTES); - dsb_buf = intel_dsb_buffer_create(display->drm, size); + dsb_buf = dsb_buffer_create(display, size); if (IS_ERR(dsb_buf)) goto out_put_rpm; @@ -1021,7 +1069,7 @@ out: */ void intel_dsb_cleanup(struct intel_dsb *dsb) { - intel_dsb_buffer_cleanup(dsb->dsb_buf); + dsb_buffer_cleanup(dsb); kfree(dsb); 
} diff --git a/drivers/gpu/drm/i915/display/intel_dsb_buffer.h b/drivers/gpu/drm/i915/display/intel_dsb_buffer.h deleted file mode 100644 index f4577d1f25cd..000000000000 --- a/drivers/gpu/drm/i915/display/intel_dsb_buffer.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: MIT */ -/* - * Copyright © 2023 Intel Corporation - */ - -#ifndef _INTEL_DSB_BUFFER_H -#define _INTEL_DSB_BUFFER_H - -#include - -struct drm_device; -struct intel_dsb_buffer; - -u32 intel_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf); -void intel_dsb_buffer_write(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val); -u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx); -void intel_dsb_buffer_memset(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val, size_t size); -struct intel_dsb_buffer *intel_dsb_buffer_create(struct drm_device *drm, size_t size); -void intel_dsb_buffer_cleanup(struct intel_dsb_buffer *dsb_buf); -void intel_dsb_buffer_flush_map(struct intel_dsb_buffer *dsb_buf); - -#endif diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index f0105c5b49a7..1e087dfe03d0 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -93,6 +93,7 @@ #include "i915_driver.h" #include "i915_drm_client.h" #include "i915_drv.h" +#include "i915_dsb_buffer.h" #include "i915_edram.h" #include "i915_file_private.h" #include "i915_getparam.h" @@ -764,6 +765,7 @@ static bool vgpu_active(struct drm_device *drm) } static const struct intel_display_parent_interface parent = { + .dsb = &i915_display_dsb_interface, .hdcp = &i915_display_hdcp_interface, .initial_plane = &i915_display_initial_plane_interface, .irq = &i915_display_irq_interface, diff --git a/drivers/gpu/drm/i915/i915_dsb_buffer.c b/drivers/gpu/drm/i915/i915_dsb_buffer.c index de30d3896f4a..884ccb2bc283 100644 --- a/drivers/gpu/drm/i915/i915_dsb_buffer.c +++ b/drivers/gpu/drm/i915/i915_dsb_buffer.c @@ -3,10 +3,12 @@ * Copyright 2023, Intel 
Corporation. */ -#include "display/intel_dsb_buffer.h" +#include + #include "gem/i915_gem_internal.h" #include "gem/i915_gem_lmem.h" #include "i915_drv.h" +#include "i915_dsb_buffer.h" #include "i915_vma.h" struct intel_dsb_buffer { @@ -15,29 +17,29 @@ struct intel_dsb_buffer { size_t buf_size; }; -u32 intel_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf) +static u32 intel_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf) { return i915_ggtt_offset(dsb_buf->vma); } -void intel_dsb_buffer_write(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val) +static void intel_dsb_buffer_write(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val) { dsb_buf->cmd_buf[idx] = val; } -u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx) +static u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx) { return dsb_buf->cmd_buf[idx]; } -void intel_dsb_buffer_memset(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val, size_t size) +static void intel_dsb_buffer_fill(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val, size_t size) { WARN_ON(idx > (dsb_buf->buf_size - size) / sizeof(*dsb_buf->cmd_buf)); memset(&dsb_buf->cmd_buf[idx], val, size); } -struct intel_dsb_buffer *intel_dsb_buffer_create(struct drm_device *drm, size_t size) +static struct intel_dsb_buffer *intel_dsb_buffer_create(struct drm_device *drm, size_t size) { struct drm_i915_private *i915 = to_i915(drm); struct intel_dsb_buffer *dsb_buf; @@ -93,13 +95,23 @@ err: return ERR_PTR(ret); } -void intel_dsb_buffer_cleanup(struct intel_dsb_buffer *dsb_buf) +static void intel_dsb_buffer_cleanup(struct intel_dsb_buffer *dsb_buf) { i915_vma_unpin_and_release(&dsb_buf->vma, I915_VMA_RELEASE_MAP); kfree(dsb_buf); } -void intel_dsb_buffer_flush_map(struct intel_dsb_buffer *dsb_buf) +static void intel_dsb_buffer_flush_map(struct intel_dsb_buffer *dsb_buf) { i915_gem_object_flush_map(dsb_buf->vma->obj); } + +const struct intel_display_dsb_interface i915_display_dsb_interface = { + .ggtt_offset = 
intel_dsb_buffer_ggtt_offset, + .write = intel_dsb_buffer_write, + .read = intel_dsb_buffer_read, + .fill = intel_dsb_buffer_fill, + .create = intel_dsb_buffer_create, + .cleanup = intel_dsb_buffer_cleanup, + .flush_map = intel_dsb_buffer_flush_map, +}; diff --git a/drivers/gpu/drm/i915/i915_dsb_buffer.h b/drivers/gpu/drm/i915/i915_dsb_buffer.h new file mode 100644 index 000000000000..a01b4d8de947 --- /dev/null +++ b/drivers/gpu/drm/i915/i915_dsb_buffer.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef __I915_DSB_BUFFER_H__ +#define __I915_DSB_BUFFER_H__ + +extern const struct intel_display_dsb_interface i915_display_dsb_interface; + +#endif /* __I915_DSB_BUFFER_H__ */ diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index f8a831b5dc7d..c640fe3d8490 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -36,6 +36,7 @@ #include "intel_opregion.h" #include "skl_watermark.h" #include "xe_display_rpm.h" +#include "xe_dsb_buffer.h" #include "xe_hdcp_gsc.h" #include "xe_initial_plane.h" #include "xe_module.h" @@ -538,6 +539,7 @@ static const struct intel_display_irq_interface xe_display_irq_interface = { }; static const struct intel_display_parent_interface parent = { + .dsb = &xe_display_dsb_interface, .hdcp = &xe_display_hdcp_interface, .initial_plane = &xe_display_initial_plane_interface, .irq = &xe_display_irq_interface, diff --git a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c index fa0acb11eaad..fdb0e8a93745 100644 --- a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c +++ b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c @@ -3,10 +3,12 @@ * Copyright 2023, Intel Corporation. 
*/ -#include "intel_dsb_buffer.h" +#include + #include "xe_bo.h" #include "xe_device.h" #include "xe_device_types.h" +#include "xe_dsb_buffer.h" struct intel_dsb_buffer { u32 *cmd_buf; @@ -14,29 +16,29 @@ struct intel_dsb_buffer { size_t buf_size; }; -u32 intel_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf) +static u32 xe_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf) { return xe_bo_ggtt_addr(dsb_buf->bo); } -void intel_dsb_buffer_write(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val) +static void xe_dsb_buffer_write(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val) { iosys_map_wr(&dsb_buf->bo->vmap, idx * 4, u32, val); } -u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx) +static u32 xe_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx) { return iosys_map_rd(&dsb_buf->bo->vmap, idx * 4, u32); } -void intel_dsb_buffer_memset(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val, size_t size) +static void xe_dsb_buffer_fill(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val, size_t size) { WARN_ON(idx > (dsb_buf->buf_size - size) / sizeof(*dsb_buf->cmd_buf)); iosys_map_memset(&dsb_buf->bo->vmap, idx * 4, val, size); } -struct intel_dsb_buffer *intel_dsb_buffer_create(struct drm_device *drm, size_t size) +static struct intel_dsb_buffer *xe_dsb_buffer_create(struct drm_device *drm, size_t size) { struct xe_device *xe = to_xe_device(drm); struct intel_dsb_buffer *dsb_buf; @@ -69,13 +71,13 @@ err_pin_map: return ERR_PTR(ret); } -void intel_dsb_buffer_cleanup(struct intel_dsb_buffer *dsb_buf) +static void xe_dsb_buffer_cleanup(struct intel_dsb_buffer *dsb_buf) { xe_bo_unpin_map_no_vm(dsb_buf->bo); kfree(dsb_buf); } -void intel_dsb_buffer_flush_map(struct intel_dsb_buffer *dsb_buf) +static void xe_dsb_buffer_flush_map(struct intel_dsb_buffer *dsb_buf) { struct xe_device *xe = dsb_buf->bo->tile->xe; @@ -86,3 +88,13 @@ void intel_dsb_buffer_flush_map(struct intel_dsb_buffer *dsb_buf) xe_device_wmb(xe); xe_device_l2_flush(xe); 
} + +const struct intel_display_dsb_interface xe_display_dsb_interface = { + .ggtt_offset = xe_dsb_buffer_ggtt_offset, + .write = xe_dsb_buffer_write, + .read = xe_dsb_buffer_read, + .fill = xe_dsb_buffer_fill, + .create = xe_dsb_buffer_create, + .cleanup = xe_dsb_buffer_cleanup, + .flush_map = xe_dsb_buffer_flush_map, +}; diff --git a/drivers/gpu/drm/xe/display/xe_dsb_buffer.h b/drivers/gpu/drm/xe/display/xe_dsb_buffer.h new file mode 100644 index 000000000000..2e4772187016 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_dsb_buffer.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef __XE_DSB_BUFFER_H__ +#define __XE_DSB_BUFFER_H__ + +extern const struct intel_display_dsb_interface xe_display_dsb_interface; + +#endif diff --git a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index ce946859a3a9..cd091120731c 100644 --- a/include/drm/intel/display_parent_interface.h +++ b/include/drm/intel/display_parent_interface.h @@ -14,6 +14,7 @@ struct drm_gem_object; struct drm_plane_state; struct drm_scanout_buffer; struct i915_vma; +struct intel_dsb_buffer; struct intel_hdcp_gsc_context; struct intel_initial_plane_config; struct intel_panic; @@ -22,6 +23,16 @@ struct ref_tracker; /* Keep struct definitions sorted */ +struct intel_display_dsb_interface { + u32 (*ggtt_offset)(struct intel_dsb_buffer *dsb_buf); + void (*write)(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val); + u32 (*read)(struct intel_dsb_buffer *dsb_buf, u32 idx); + void (*fill)(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val, size_t size); + struct intel_dsb_buffer *(*create)(struct drm_device *drm, size_t size); + void (*cleanup)(struct intel_dsb_buffer *dsb_buf); + void (*flush_map)(struct intel_dsb_buffer *dsb_buf); +}; + struct intel_display_hdcp_interface { ssize_t (*gsc_msg_send)(struct intel_hdcp_gsc_context *gsc_context, void *msg_in, size_t msg_in_len, @@ -106,6 +117,9 @@ struct 
intel_display_stolen_interface { * check the optional pointers. */ struct intel_display_parent_interface { + /** @dsb: DSB buffer interface */ + const struct intel_display_dsb_interface *dsb; + /** @hdcp: HDCP GSC interface */ const struct intel_display_hdcp_interface *hdcp; -- cgit v1.2.3 From ef246da8e63c486780dca4d9b4d79589cbebf5e5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 24 Jan 2026 21:14:13 +0200 Subject: dma-buf: Rename .move_notify() callback to a clearer identifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the .move_notify() callback to .invalidate_mappings() to make its purpose explicit and highlight that it is responsible for invalidating existing mappings. Suggested-by: Christian König Reviewed-by: Christian König Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20260124-dmabuf-revoke-v5-1-f98fca917e96@nvidia.com Signed-off-by: Christian König --- drivers/dma-buf/dma-buf.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 4 ++-- drivers/gpu/drm/virtio/virtgpu_prime.c | 2 +- drivers/gpu/drm/xe/tests/xe_dma_buf.c | 6 +++--- drivers/gpu/drm/xe/xe_dma_buf.c | 2 +- drivers/infiniband/core/umem_dmabuf.c | 4 ++-- drivers/infiniband/hw/mlx5/mr.c | 2 +- drivers/iommu/iommufd/pages.c | 2 +- include/linux/dma-buf.h | 6 +++--- 9 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 77555096e4c7..cc9b88214d97 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1017,7 +1017,7 @@ dma_buf_dynamic_attach(struct dma_buf *dmabuf, struct device *dev, if (WARN_ON(!dmabuf || !dev)) return ERR_PTR(-EINVAL); - if (WARN_ON(importer_ops && !importer_ops->move_notify)) + if (WARN_ON(importer_ops && !importer_ops->invalidate_mappings)) return ERR_PTR(-EINVAL); attach = kzalloc(sizeof(*attach), GFP_KERNEL); @@ -1130,7 +1130,7 @@ EXPORT_SYMBOL_NS_GPL(dma_buf_pin, "DMA_BUF"); * * 
This unpins a buffer pinned by dma_buf_pin() and allows the exporter to move * any mapping of @attach again and inform the importer through - * &dma_buf_attach_ops.move_notify. + * &dma_buf_attach_ops.invalidate_mappings. */ void dma_buf_unpin(struct dma_buf_attachment *attach) { @@ -1338,7 +1338,7 @@ void dma_buf_move_notify(struct dma_buf *dmabuf) list_for_each_entry(attach, &dmabuf->attachments, node) if (attach->importer_ops) - attach->importer_ops->move_notify(attach); + attach->importer_ops->invalidate_mappings(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_move_notify, "DMA_BUF"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index e22cfa7c6d32..863454148b28 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -450,7 +450,7 @@ error: } /** - * amdgpu_dma_buf_move_notify - &attach.move_notify implementation + * amdgpu_dma_buf_move_notify - &attach.invalidate_mappings implementation * * @attach: the DMA-buf attachment * @@ -521,7 +521,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach) static const struct dma_buf_attach_ops amdgpu_dma_buf_attach_ops = { .allow_peer2peer = true, - .move_notify = amdgpu_dma_buf_move_notify + .invalidate_mappings = amdgpu_dma_buf_move_notify }; /** diff --git a/drivers/gpu/drm/virtio/virtgpu_prime.c b/drivers/gpu/drm/virtio/virtgpu_prime.c index ce49282198cb..19c78dd2ca77 100644 --- a/drivers/gpu/drm/virtio/virtgpu_prime.c +++ b/drivers/gpu/drm/virtio/virtgpu_prime.c @@ -288,7 +288,7 @@ static void virtgpu_dma_buf_move_notify(struct dma_buf_attachment *attach) static const struct dma_buf_attach_ops virtgpu_dma_buf_attach_ops = { .allow_peer2peer = true, - .move_notify = virtgpu_dma_buf_move_notify + .invalidate_mappings = virtgpu_dma_buf_move_notify }; struct drm_gem_object *virtgpu_gem_prime_import(struct drm_device *dev, diff --git a/drivers/gpu/drm/xe/tests/xe_dma_buf.c 
b/drivers/gpu/drm/xe/tests/xe_dma_buf.c index 5df98de5ba3c..1f2cca5c2f81 100644 --- a/drivers/gpu/drm/xe/tests/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/tests/xe_dma_buf.c @@ -23,7 +23,7 @@ static bool p2p_enabled(struct dma_buf_test_params *params) static bool is_dynamic(struct dma_buf_test_params *params) { return IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY) && params->attach_ops && - params->attach_ops->move_notify; + params->attach_ops->invalidate_mappings; } static void check_residency(struct kunit *test, struct xe_bo *exported, @@ -60,7 +60,7 @@ static void check_residency(struct kunit *test, struct xe_bo *exported, /* * Evict exporter. Evicting the exported bo will - * evict also the imported bo through the move_notify() functionality if + * evict also the imported bo through the invalidate_mappings() functionality if * importer is on a different device. If they're on the same device, * the exporter and the importer should be the same bo. */ @@ -198,7 +198,7 @@ out: static const struct dma_buf_attach_ops nop2p_attach_ops = { .allow_peer2peer = false, - .move_notify = xe_dma_buf_move_notify + .invalidate_mappings = xe_dma_buf_move_notify }; /* diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index 54e42960daad..2e167b29d0c9 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -287,7 +287,7 @@ static void xe_dma_buf_move_notify(struct dma_buf_attachment *attach) static const struct dma_buf_attach_ops xe_dma_buf_attach_ops = { .allow_peer2peer = true, - .move_notify = xe_dma_buf_move_notify + .invalidate_mappings = xe_dma_buf_move_notify }; #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 0ec2e4120cc9..d77a739cfe7a 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -129,7 +129,7 @@ ib_umem_dmabuf_get_with_dma_device(struct ib_device *device, if (check_add_overflow(offset, 
(unsigned long)size, &end)) return ret; - if (unlikely(!ops || !ops->move_notify)) + if (unlikely(!ops || !ops->invalidate_mappings)) return ret; dmabuf = dma_buf_get(fd); @@ -195,7 +195,7 @@ ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach) static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { .allow_peer2peer = true, - .move_notify = ib_umem_dmabuf_unsupported_move_notify, + .invalidate_mappings = ib_umem_dmabuf_unsupported_move_notify, }; struct ib_umem_dmabuf * diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 325fa04cbe8a..97099d3b1688 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1620,7 +1620,7 @@ static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { .allow_peer2peer = 1, - .move_notify = mlx5_ib_dmabuf_invalidate_cb, + .invalidate_mappings = mlx5_ib_dmabuf_invalidate_cb, }; static struct ib_mr * diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index dbe51ecb9a20..76f900fa1687 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1451,7 +1451,7 @@ static void iopt_revoke_notify(struct dma_buf_attachment *attach) static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = { .allow_peer2peer = true, - .move_notify = iopt_revoke_notify, + .invalidate_mappings = iopt_revoke_notify, }; /* diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 91f4939db89b..d9ee4499b37d 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -407,7 +407,7 @@ struct dma_buf { * through the device. * * - Dynamic importers should set fences for any access that they can't - * disable immediately from their &dma_buf_attach_ops.move_notify + * disable immediately from their &dma_buf_attach_ops.invalidate_mappings * callback. 
* * IMPORTANT: @@ -446,7 +446,7 @@ struct dma_buf_attach_ops { bool allow_peer2peer; /** - * @move_notify: [optional] notification that the DMA-buf is moving + * @invalidate_mappings: [optional] notification that the DMA-buf is moving * * If this callback is provided the framework can avoid pinning the * backing store while mappings exists. @@ -463,7 +463,7 @@ struct dma_buf_attach_ops { * New mappings can be created after this callback returns, and will * point to the new location of the DMA-buf. */ - void (*move_notify)(struct dma_buf_attachment *attach); + void (*invalidate_mappings)(struct dma_buf_attachment *attach); }; /** -- cgit v1.2.3 From 95308225e5baeaae1e313816059c59a0036ab6b2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 24 Jan 2026 21:14:14 +0200 Subject: dma-buf: Rename dma_buf_move_notify() to dma_buf_invalidate_mappings() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Along with renaming the .move_notify() callback, rename the corresponding dma-buf core function. This makes the expected behavior clear to exporters calling this function. Signed-off-by: Leon Romanovsky Reviewed-by: Christian König Link: https://lore.kernel.org/r/20260124-dmabuf-revoke-v5-2-f98fca917e96@nvidia.com Signed-off-by: Christian König --- drivers/dma-buf/dma-buf.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 +- drivers/gpu/drm/xe/xe_bo.c | 2 +- drivers/iommu/iommufd/selftest.c | 2 +- drivers/vfio/pci/vfio_pci_dmabuf.c | 4 ++-- include/linux/dma-buf.h | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index cc9b88214d97..1c257607a623 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -981,7 +981,7 @@ dma_buf_pin_on_map(struct dma_buf_attachment *attach) * 3. 
Exporters must hold the dma-buf reservation lock when calling these * functions: * - * - dma_buf_move_notify() + * - dma_buf_invalidate_mappings() */ /** @@ -1323,14 +1323,14 @@ void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach, EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment_unlocked, "DMA_BUF"); /** - * dma_buf_move_notify - notify attachments that DMA-buf is moving + * dma_buf_invalidate_mappings - notify attachments that DMA-buf is moving * * @dmabuf: [in] buffer which is moving * * Informs all attachments that they need to destroy and recreate all their * mappings. */ -void dma_buf_move_notify(struct dma_buf *dmabuf) +void dma_buf_invalidate_mappings(struct dma_buf *dmabuf) { struct dma_buf_attachment *attach; @@ -1340,7 +1340,7 @@ void dma_buf_move_notify(struct dma_buf *dmabuf) if (attach->importer_ops) attach->importer_ops->invalidate_mappings(attach); } -EXPORT_SYMBOL_NS_GPL(dma_buf_move_notify, "DMA_BUF"); +EXPORT_SYMBOL_NS_GPL(dma_buf_invalidate_mappings, "DMA_BUF"); /** * DOC: cpu access diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index e08f58de4b17..f73dc99d1887 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1270,7 +1270,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, if (abo->tbo.base.dma_buf && !drm_gem_is_imported(&abo->tbo.base) && old_mem && old_mem->mem_type != TTM_PL_SYSTEM) - dma_buf_move_notify(abo->tbo.base.dma_buf); + dma_buf_invalidate_mappings(abo->tbo.base.dma_buf); /* move_notify is called before move happens */ trace_amdgpu_bo_move(abo, new_mem ? new_mem->mem_type : -1, diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index b0bd31d14bb9..94712b05edff 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -819,7 +819,7 @@ static int xe_bo_move_notify(struct xe_bo *bo, /* Don't call move_notify() for imported dma-bufs. 
*/ if (ttm_bo->base.dma_buf && !ttm_bo->base.import_attach) - dma_buf_move_notify(ttm_bo->base.dma_buf); + dma_buf_invalidate_mappings(ttm_bo->base.dma_buf); /* * TTM has already nuked the mmap for us (see ttm_bo_unmap_virtual), diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index c4322fd26f93..fd47953db4a3 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -2073,7 +2073,7 @@ static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd, priv = dmabuf->priv; dma_resv_lock(dmabuf->resv, NULL); priv->revoked = revoked; - dma_buf_move_notify(dmabuf); + dma_buf_invalidate_mappings(dmabuf); dma_resv_unlock(dmabuf->resv); err_put: diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index d4d0f7d08c53..362e3d149817 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -320,7 +320,7 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) if (priv->revoked != revoked) { dma_resv_lock(priv->dmabuf->resv, NULL); priv->revoked = revoked; - dma_buf_move_notify(priv->dmabuf); + dma_buf_invalidate_mappings(priv->dmabuf); dma_resv_unlock(priv->dmabuf->resv); } fput(priv->dmabuf->file); @@ -341,7 +341,7 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) list_del_init(&priv->dmabufs_elm); priv->vdev = NULL; priv->revoked = true; - dma_buf_move_notify(priv->dmabuf); + dma_buf_invalidate_mappings(priv->dmabuf); dma_resv_unlock(priv->dmabuf->resv); vfio_device_put_registration(&vdev->vdev); fput(priv->dmabuf->file); diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index d9ee4499b37d..d0470af8887e 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -588,7 +588,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *, enum dma_data_direction); void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *, enum dma_data_direction); -void 
dma_buf_move_notify(struct dma_buf *dma_buf); +void dma_buf_invalidate_mappings(struct dma_buf *dma_buf); int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); int dma_buf_end_cpu_access(struct dma_buf *dma_buf, -- cgit v1.2.3 From b3a2a91ae9b48c74e50833242af7d73f8a0ec3a6 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 26 Jan 2026 13:29:25 +0200 Subject: drm/{i915, xe}/pcode: move display pcode calls to parent interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Call the parent driver pcode functions through the parent interface function pointers instead of expecting both to have functions of the same name. In i915, add the interface to existing intel_pcode.[ch], while in xe move them to new display/xe_display_pcode.[ch] and build it only for CONFIG_DRM_XE_DISPLAY=y. Do not add separate write and write_timeout calls in the interface. Instead, handle the default 1 ms timeout in the intel_parent.c glue layer. This drops the last intel_pcode.h includes from display, and allows us to remove the corresponding xe compat header. 
v2: initialize .pcode in i915 Reviewed-by: Michał Grzelak Link: https://patch.msgid.link/20260126112925.2452171-1-jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/hsw_ips.c | 8 ++-- drivers/gpu/drm/i915/display/intel_bw.c | 22 ++++----- drivers/gpu/drm/i915/display/intel_cdclk.c | 54 +++++++++++----------- drivers/gpu/drm/i915/display/intel_display_power.c | 3 +- .../drm/i915/display/intel_display_power_well.c | 5 +- drivers/gpu/drm/i915/display/intel_dram.c | 6 +-- drivers/gpu/drm/i915/display/intel_hdcp.c | 3 +- drivers/gpu/drm/i915/display/intel_parent.c | 22 +++++++++ drivers/gpu/drm/i915/display/intel_parent.h | 7 +++ drivers/gpu/drm/i915/display/skl_watermark.c | 21 ++++----- drivers/gpu/drm/i915/i915_driver.c | 1 + drivers/gpu/drm/i915/intel_pcode.c | 16 +++++-- drivers/gpu/drm/i915/intel_pcode.h | 9 +--- drivers/gpu/drm/xe/Makefile | 1 + .../gpu/drm/xe/compat-i915-headers/intel_pcode.h | 11 ----- drivers/gpu/drm/xe/display/xe_display.c | 2 + drivers/gpu/drm/xe/display/xe_display_pcode.c | 38 +++++++++++++++ drivers/gpu/drm/xe/display/xe_display_pcode.h | 9 ++++ drivers/gpu/drm/xe/xe_pcode.c | 30 ------------ drivers/gpu/drm/xe/xe_pcode.h | 8 ---- include/drm/intel/display_parent_interface.h | 10 ++++ 21 files changed, 161 insertions(+), 125 deletions(-) delete mode 100644 drivers/gpu/drm/xe/compat-i915-headers/intel_pcode.h create mode 100644 drivers/gpu/drm/xe/display/xe_display_pcode.c create mode 100644 drivers/gpu/drm/xe/display/xe_display_pcode.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/hsw_ips.c b/drivers/gpu/drm/i915/display/hsw_ips.c index 008d339d5c21..0caaea2e64e1 100644 --- a/drivers/gpu/drm/i915/display/hsw_ips.c +++ b/drivers/gpu/drm/i915/display/hsw_ips.c @@ -14,7 +14,7 @@ #include "intel_display_regs.h" #include "intel_display_rpm.h" #include "intel_display_types.h" -#include "intel_pcode.h" +#include "intel_parent.h" static void hsw_ips_enable(const struct intel_crtc_state *crtc_state) 
{ @@ -39,8 +39,8 @@ static void hsw_ips_enable(const struct intel_crtc_state *crtc_state) if (display->platform.broadwell) { drm_WARN_ON(display->drm, - intel_pcode_write(display->drm, DISPLAY_IPS_CONTROL, - val | IPS_PCODE_CONTROL)); + intel_parent_pcode_write(display, DISPLAY_IPS_CONTROL, + val | IPS_PCODE_CONTROL)); /* * Quoting Art Runyan: "its not safe to expect any particular * value in IPS_CTL bit 31 after enabling IPS through the @@ -72,7 +72,7 @@ bool hsw_ips_disable(const struct intel_crtc_state *crtc_state) if (display->platform.broadwell) { drm_WARN_ON(display->drm, - intel_pcode_write(display->drm, DISPLAY_IPS_CONTROL, 0)); + intel_parent_pcode_write(display, DISPLAY_IPS_CONTROL, 0)); /* * Wait for PCODE to finish disabling IPS. The BSpec specified * 42ms timeout value leads to occasional timeouts so use 100ms diff --git a/drivers/gpu/drm/i915/display/intel_bw.c b/drivers/gpu/drm/i915/display/intel_bw.c index 4ee3f5172f4e..8d84445c69f1 100644 --- a/drivers/gpu/drm/i915/display/intel_bw.c +++ b/drivers/gpu/drm/i915/display/intel_bw.c @@ -15,7 +15,7 @@ #include "intel_display_utils.h" #include "intel_dram.h" #include "intel_mchbar_regs.h" -#include "intel_pcode.h" +#include "intel_parent.h" #include "intel_uncore.h" #include "skl_watermark.h" @@ -114,9 +114,9 @@ static int icl_pcode_read_qgv_point_info(struct intel_display *display, u16 dclk; int ret; - ret = intel_pcode_read(display->drm, ICL_PCODE_MEM_SUBSYSYSTEM_INFO | - ICL_PCODE_MEM_SS_READ_QGV_POINT_INFO(point), - &val, &val2); + ret = intel_parent_pcode_read(display, ICL_PCODE_MEM_SUBSYSYSTEM_INFO | + ICL_PCODE_MEM_SS_READ_QGV_POINT_INFO(point), + &val, &val2); if (ret) return ret; @@ -141,8 +141,8 @@ static int adls_pcode_read_psf_gv_point_info(struct intel_display *display, int ret; int i; - ret = intel_pcode_read(display->drm, ICL_PCODE_MEM_SUBSYSYSTEM_INFO | - ADL_PCODE_MEM_SS_READ_PSF_GV_INFO, &val, NULL); + ret = intel_parent_pcode_read(display, ICL_PCODE_MEM_SUBSYSYSTEM_INFO | + 
ADL_PCODE_MEM_SS_READ_PSF_GV_INFO, &val, NULL); if (ret) return ret; @@ -189,11 +189,11 @@ static int icl_pcode_restrict_qgv_points(struct intel_display *display, return 0; /* bspec says to keep retrying for at least 1 ms */ - ret = intel_pcode_request(display->drm, ICL_PCODE_SAGV_DE_MEM_SS_CONFIG, - points_mask, - ICL_PCODE_REP_QGV_MASK | ADLS_PCODE_REP_PSF_MASK, - ICL_PCODE_REP_QGV_SAFE | ADLS_PCODE_REP_PSF_SAFE, - 1); + ret = intel_parent_pcode_request(display, ICL_PCODE_SAGV_DE_MEM_SS_CONFIG, + points_mask, + ICL_PCODE_REP_QGV_MASK | ADLS_PCODE_REP_PSF_MASK, + ICL_PCODE_REP_QGV_SAFE | ADLS_PCODE_REP_PSF_SAFE, + 1); if (ret < 0) { drm_err(display->drm, diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index 9bfbfbf34dc0..9217050a76e0 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -42,8 +42,8 @@ #include "intel_display_wa.h" #include "intel_dram.h" #include "intel_mchbar_regs.h" +#include "intel_parent.h" #include "intel_pci_config.h" -#include "intel_pcode.h" #include "intel_plane.h" #include "intel_psr.h" #include "intel_step.h" @@ -888,7 +888,7 @@ static void bdw_set_cdclk(struct intel_display *display, "trying to change cdclk frequency with cdclk not enabled\n")) return; - ret = intel_pcode_write(display->drm, BDW_PCODE_DISPLAY_FREQ_CHANGE_REQ, 0x0); + ret = intel_parent_pcode_write(display, BDW_PCODE_DISPLAY_FREQ_CHANGE_REQ, 0x0); if (ret) { drm_err(display->drm, "failed to inform pcode about cdclk change\n"); @@ -918,8 +918,8 @@ static void bdw_set_cdclk(struct intel_display *display, if (ret) drm_err(display->drm, "Switching back to LCPLL failed\n"); - intel_pcode_write(display->drm, HSW_PCODE_DE_WRITE_FREQ_REQ, - cdclk_config->voltage_level); + intel_parent_pcode_write(display, HSW_PCODE_DE_WRITE_FREQ_REQ, + cdclk_config->voltage_level); intel_de_write(display, CDCLK_FREQ, DIV_ROUND_CLOSEST(cdclk, 1000) - 1); @@ -1175,10 +1175,10 @@ static void 
skl_set_cdclk(struct intel_display *display, drm_WARN_ON_ONCE(display->drm, display->platform.skylake && vco == 8640000); - ret = intel_pcode_request(display->drm, SKL_PCODE_CDCLK_CONTROL, - SKL_CDCLK_PREPARE_FOR_CHANGE, - SKL_CDCLK_READY_FOR_CHANGE, - SKL_CDCLK_READY_FOR_CHANGE, 3); + ret = intel_parent_pcode_request(display, SKL_PCODE_CDCLK_CONTROL, + SKL_CDCLK_PREPARE_FOR_CHANGE, + SKL_CDCLK_READY_FOR_CHANGE, + SKL_CDCLK_READY_FOR_CHANGE, 3); if (ret) { drm_err(display->drm, "Failed to inform PCU about cdclk change (%d)\n", ret); @@ -1221,8 +1221,8 @@ static void skl_set_cdclk(struct intel_display *display, intel_de_posting_read(display, CDCLK_CTL); /* inform PCU of the change */ - intel_pcode_write(display->drm, SKL_PCODE_CDCLK_CONTROL, - cdclk_config->voltage_level); + intel_parent_pcode_write(display, SKL_PCODE_CDCLK_CONTROL, + cdclk_config->voltage_level); intel_update_cdclk(display); } @@ -2247,18 +2247,18 @@ static void bxt_set_cdclk(struct intel_display *display, if (DISPLAY_VER(display) >= 14 || display->platform.dg2) ; /* NOOP */ else if (DISPLAY_VER(display) >= 11) - ret = intel_pcode_request(display->drm, SKL_PCODE_CDCLK_CONTROL, - SKL_CDCLK_PREPARE_FOR_CHANGE, - SKL_CDCLK_READY_FOR_CHANGE, - SKL_CDCLK_READY_FOR_CHANGE, 3); + ret = intel_parent_pcode_request(display, SKL_PCODE_CDCLK_CONTROL, + SKL_CDCLK_PREPARE_FOR_CHANGE, + SKL_CDCLK_READY_FOR_CHANGE, + SKL_CDCLK_READY_FOR_CHANGE, 3); else /* * BSpec requires us to wait up to 150usec, but that leads to * timeouts; the 2ms used here is based on experiment. 
*/ - ret = intel_pcode_write_timeout(display->drm, - HSW_PCODE_DE_WRITE_FREQ_REQ, - 0x80000000, 2); + ret = intel_parent_pcode_write_timeout(display, + HSW_PCODE_DE_WRITE_FREQ_REQ, + 0x80000000, 2); if (ret) { drm_err(display->drm, @@ -2287,8 +2287,8 @@ static void bxt_set_cdclk(struct intel_display *display, * Display versions 14 and beyond */; else if (DISPLAY_VER(display) >= 11 && !display->platform.dg2) - ret = intel_pcode_write(display->drm, SKL_PCODE_CDCLK_CONTROL, - cdclk_config->voltage_level); + ret = intel_parent_pcode_write(display, SKL_PCODE_CDCLK_CONTROL, + cdclk_config->voltage_level); if (DISPLAY_VER(display) < 11) { /* * The timeout isn't specified, the 2ms used here is based on @@ -2296,9 +2296,9 @@ static void bxt_set_cdclk(struct intel_display *display, * FIXME: Waiting for the request completion could be delayed * until the next PCODE request based on BSpec. */ - ret = intel_pcode_write_timeout(display->drm, - HSW_PCODE_DE_WRITE_FREQ_REQ, - cdclk_config->voltage_level, 2); + ret = intel_parent_pcode_write_timeout(display, + HSW_PCODE_DE_WRITE_FREQ_REQ, + cdclk_config->voltage_level, 2); } if (ret) { drm_err(display->drm, @@ -2598,11 +2598,11 @@ static void intel_pcode_notify(struct intel_display *display, if (pipe_count_update_valid) update_mask |= DISPLAY_TO_PCODE_PIPE_COUNT_VALID; - ret = intel_pcode_request(display->drm, SKL_PCODE_CDCLK_CONTROL, - SKL_CDCLK_PREPARE_FOR_CHANGE | - update_mask, - SKL_CDCLK_READY_FOR_CHANGE, - SKL_CDCLK_READY_FOR_CHANGE, 3); + ret = intel_parent_pcode_request(display, SKL_PCODE_CDCLK_CONTROL, + SKL_CDCLK_PREPARE_FOR_CHANGE | + update_mask, + SKL_CDCLK_READY_FOR_CHANGE, + SKL_CDCLK_READY_FOR_CHANGE, 3); if (ret) drm_err(display->drm, "Failed to inform PCU about display config (err %d)\n", diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c index d27397f43863..06adf6afbec0 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power.c +++ 
b/drivers/gpu/drm/i915/display/intel_display_power.c @@ -26,7 +26,6 @@ #include "intel_mchbar_regs.h" #include "intel_parent.h" #include "intel_pch_refclk.h" -#include "intel_pcode.h" #include "intel_pmdemand.h" #include "intel_pps_regs.h" #include "intel_snps_phy.h" @@ -1260,7 +1259,7 @@ static u32 hsw_read_dcomp(struct intel_display *display) static void hsw_write_dcomp(struct intel_display *display, u32 val) { if (display->platform.haswell) { - if (intel_pcode_write(display->drm, GEN6_PCODE_WRITE_D_COMP, val)) + if (intel_parent_pcode_write(display, GEN6_PCODE_WRITE_D_COMP, val)) drm_dbg_kms(display->drm, "Failed to write to D_COMP\n"); } else { intel_de_write(display, D_COMP_BDW, val); diff --git a/drivers/gpu/drm/i915/display/intel_display_power_well.c b/drivers/gpu/drm/i915/display/intel_display_power_well.c index 68f293c3ac01..6f9bc6f9615e 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power_well.c +++ b/drivers/gpu/drm/i915/display/intel_display_power_well.c @@ -27,7 +27,6 @@ #include "intel_dpll.h" #include "intel_hotplug.h" #include "intel_parent.h" -#include "intel_pcode.h" #include "intel_pps.h" #include "intel_psr.h" #include "intel_tc.h" @@ -518,7 +517,7 @@ static void icl_tc_cold_exit(struct intel_display *display) int ret, tries = 0; while (1) { - ret = intel_pcode_write(display->drm, ICL_PCODE_EXIT_TCCOLD, 0); + ret = intel_parent_pcode_write(display, ICL_PCODE_EXIT_TCCOLD, 0); if (ret != -EAGAIN || ++tries == 3) break; msleep(1); @@ -1791,7 +1790,7 @@ tgl_tc_cold_request(struct intel_display *display, bool block) * Spec states that we should timeout the request after 200us * but the function below will timeout after 500us */ - ret = intel_pcode_read(display->drm, TGL_PCODE_TCCOLD, &low_val, &high_val); + ret = intel_parent_pcode_read(display, TGL_PCODE_TCCOLD, &low_val, &high_val); if (ret == 0) { if (block && (low_val & TGL_PCODE_EXIT_TCCOLD_DATA_L_EXIT_FAILED)) diff --git a/drivers/gpu/drm/i915/display/intel_dram.c 
b/drivers/gpu/drm/i915/display/intel_dram.c index 170de304fe96..3b9879714ea9 100644 --- a/drivers/gpu/drm/i915/display/intel_dram.c +++ b/drivers/gpu/drm/i915/display/intel_dram.c @@ -13,7 +13,7 @@ #include "intel_display_utils.h" #include "intel_dram.h" #include "intel_mchbar_regs.h" -#include "intel_pcode.h" +#include "intel_parent.h" #include "intel_uncore.h" #include "vlv_iosf_sb.h" @@ -692,8 +692,8 @@ static int icl_pcode_read_mem_global_info(struct intel_display *display, u32 val = 0; int ret; - ret = intel_pcode_read(display->drm, ICL_PCODE_MEM_SUBSYSYSTEM_INFO | - ICL_PCODE_MEM_SS_READ_GLOBAL_INFO, &val, NULL); + ret = intel_parent_pcode_read(display, ICL_PCODE_MEM_SUBSYSYSTEM_INFO | + ICL_PCODE_MEM_SS_READ_GLOBAL_INFO, &val, NULL); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/display/intel_hdcp.c b/drivers/gpu/drm/i915/display/intel_hdcp.c index 7114fc405c29..8d3137067bf6 100644 --- a/drivers/gpu/drm/i915/display/intel_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_hdcp.c @@ -33,7 +33,6 @@ #include "intel_hdcp_regs.h" #include "intel_hdcp_shim.h" #include "intel_parent.h" -#include "intel_pcode.h" #include "intel_step.h" #define USE_HDCP_GSC(__display) (DISPLAY_VER(__display) >= 14) @@ -398,7 +397,7 @@ static int intel_hdcp_load_keys(struct intel_display *display) * Mailbox interface. 
*/ if (DISPLAY_VER(display) == 9 && !display->platform.broxton) { - ret = intel_pcode_write(display->drm, SKL_PCODE_LOAD_HDCP_KEYS, 1); + ret = intel_parent_pcode_write(display, SKL_PCODE_LOAD_HDCP_KEYS, 1); if (ret) { drm_err(display->drm, "Failed to initiate HDCP key load (%d)\n", diff --git a/drivers/gpu/drm/i915/display/intel_parent.c b/drivers/gpu/drm/i915/display/intel_parent.c index 72ae553f79a4..7f73695a0444 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.c +++ b/drivers/gpu/drm/i915/display/intel_parent.c @@ -92,6 +92,28 @@ void intel_parent_pc8_unblock(struct intel_display *display) display->parent->pc8->unblock(display->drm); } +/* pcode */ +int intel_parent_pcode_read(struct intel_display *display, u32 mbox, u32 *val, u32 *val1) +{ + return display->parent->pcode->read(display->drm, mbox, val, val1); +} + +int intel_parent_pcode_write_timeout(struct intel_display *display, u32 mbox, u32 val, int timeout_ms) +{ + return display->parent->pcode->write(display->drm, mbox, val, timeout_ms); +} + +int intel_parent_pcode_write(struct intel_display *display, u32 mbox, u32 val) +{ + return intel_parent_pcode_write_timeout(display, mbox, val, 1); +} + +int intel_parent_pcode_request(struct intel_display *display, u32 mbox, u32 request, + u32 reply_mask, u32 reply, int timeout_base_ms) +{ + return display->parent->pcode->request(display->drm, mbox, request, reply_mask, reply, timeout_base_ms); +} + /* rps */ bool intel_parent_rps_available(struct intel_display *display) { diff --git a/drivers/gpu/drm/i915/display/intel_parent.h b/drivers/gpu/drm/i915/display/intel_parent.h index 47cdc14f9aa2..04782bb26b61 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.h +++ b/drivers/gpu/drm/i915/display/intel_parent.h @@ -36,6 +36,13 @@ void intel_parent_panic_finish(struct intel_display *display, struct intel_panic void intel_parent_pc8_block(struct intel_display *display); void intel_parent_pc8_unblock(struct intel_display *display); +/* pcode */ +int 
intel_parent_pcode_read(struct intel_display *display, u32 mbox, u32 *val, u32 *val1); +int intel_parent_pcode_write_timeout(struct intel_display *display, u32 mbox, u32 val, int timeout_ms); +int intel_parent_pcode_write(struct intel_display *display, u32 mbox, u32 val); +int intel_parent_pcode_request(struct intel_display *display, u32 mbox, u32 request, + u32 reply_mask, u32 reply, int timeout_base_ms); + /* rps */ bool intel_parent_rps_available(struct intel_display *display); void intel_parent_rps_boost_if_not_started(struct intel_display *display, struct dma_fence *fence); diff --git a/drivers/gpu/drm/i915/display/skl_watermark.c b/drivers/gpu/drm/i915/display/skl_watermark.c index a6aab79812e5..b41da10f0f85 100644 --- a/drivers/gpu/drm/i915/display/skl_watermark.c +++ b/drivers/gpu/drm/i915/display/skl_watermark.c @@ -26,7 +26,7 @@ #include "intel_fb.h" #include "intel_fixed.h" #include "intel_flipq.h" -#include "intel_pcode.h" +#include "intel_parent.h" #include "intel_plane.h" #include "intel_vblank.h" #include "intel_wm.h" @@ -115,9 +115,8 @@ intel_sagv_block_time(struct intel_display *display) u32 val = 0; int ret; - ret = intel_pcode_read(display->drm, - GEN12_PCODE_READ_SAGV_BLOCK_TIME_US, - &val, NULL); + ret = intel_parent_pcode_read(display, GEN12_PCODE_READ_SAGV_BLOCK_TIME_US, + &val, NULL); if (ret) { drm_dbg_kms(display->drm, "Couldn't read SAGV block time!\n"); return 0; @@ -184,8 +183,8 @@ static void skl_sagv_enable(struct intel_display *display) return; drm_dbg_kms(display->drm, "Enabling SAGV\n"); - ret = intel_pcode_write(display->drm, GEN9_PCODE_SAGV_CONTROL, - GEN9_SAGV_ENABLE); + ret = intel_parent_pcode_write(display, GEN9_PCODE_SAGV_CONTROL, + GEN9_SAGV_ENABLE); /* We don't need to wait for SAGV when enabling */ @@ -217,9 +216,9 @@ static void skl_sagv_disable(struct intel_display *display) drm_dbg_kms(display->drm, "Disabling SAGV\n"); /* bspec says to keep retrying for at least 1 ms */ - ret = intel_pcode_request(display->drm, 
GEN9_PCODE_SAGV_CONTROL, - GEN9_SAGV_DISABLE, - GEN9_SAGV_IS_DISABLED, GEN9_SAGV_IS_DISABLED, 1); + ret = intel_parent_pcode_request(display, GEN9_PCODE_SAGV_CONTROL, + GEN9_SAGV_DISABLE, + GEN9_SAGV_IS_DISABLED, GEN9_SAGV_IS_DISABLED, 1); /* * Some skl systems, pre-release machines in particular, * don't actually have SAGV. @@ -3283,7 +3282,7 @@ static void skl_read_wm_latency(struct intel_display *display) /* read the first set of memory latencies[0:3] */ val = 0; /* data0 to be programmed to 0 for first set */ - ret = intel_pcode_read(display->drm, GEN9_PCODE_READ_MEM_LATENCY, &val, NULL); + ret = intel_parent_pcode_read(display, GEN9_PCODE_READ_MEM_LATENCY, &val, NULL); if (ret) { drm_err(display->drm, "SKL Mailbox read error = %d\n", ret); return; @@ -3296,7 +3295,7 @@ static void skl_read_wm_latency(struct intel_display *display) /* read the second set of memory latencies[4:7] */ val = 1; /* data0 to be programmed to 1 for second set */ - ret = intel_pcode_read(display->drm, GEN9_PCODE_READ_MEM_LATENCY, &val, NULL); + ret = intel_parent_pcode_read(display, GEN9_PCODE_READ_MEM_LATENCY, &val, NULL); if (ret) { drm_err(display->drm, "SKL Mailbox read error = %d\n", ret); return; diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index 1e087dfe03d0..f8a1f10d4874 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -771,6 +771,7 @@ static const struct intel_display_parent_interface parent = { .irq = &i915_display_irq_interface, .panic = &i915_display_panic_interface, .pc8 = &i915_display_pc8_interface, + .pcode = &i915_display_pcode_interface, .rpm = &i915_display_rpm_interface, .rps = &i915_display_rps_interface, .stolen = &i915_display_stolen_interface, diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c index 756652b8ec97..76c5916b28f4 100644 --- a/drivers/gpu/drm/i915/intel_pcode.c +++ b/drivers/gpu/drm/i915/intel_pcode.c @@ -4,6 +4,7 @@ */ #include 
+#include #include "i915_drv.h" #include "i915_reg.h" @@ -276,26 +277,31 @@ int snb_pcode_write_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u3 return err; } -/* Helpers with drm device */ -int intel_pcode_read(struct drm_device *drm, u32 mbox, u32 *val, u32 *val1) +static int intel_pcode_read(struct drm_device *drm, u32 mbox, u32 *val, u32 *val1) { struct drm_i915_private *i915 = to_i915(drm); return snb_pcode_read(&i915->uncore, mbox, val, val1); } -int intel_pcode_write_timeout(struct drm_device *drm, u32 mbox, u32 val, int timeout_ms) +static int intel_pcode_write_timeout(struct drm_device *drm, u32 mbox, u32 val, int timeout_ms) { struct drm_i915_private *i915 = to_i915(drm); return snb_pcode_write_timeout(&i915->uncore, mbox, val, timeout_ms); } -int intel_pcode_request(struct drm_device *drm, u32 mbox, u32 request, - u32 reply_mask, u32 reply, int timeout_base_ms) +static int intel_pcode_request(struct drm_device *drm, u32 mbox, u32 request, + u32 reply_mask, u32 reply, int timeout_base_ms) { struct drm_i915_private *i915 = to_i915(drm); return skl_pcode_request(&i915->uncore, mbox, request, reply_mask, reply, timeout_base_ms); } + +const struct intel_display_pcode_interface i915_display_pcode_interface = { + .read = intel_pcode_read, + .write = intel_pcode_write_timeout, + .request = intel_pcode_request, +}; diff --git a/drivers/gpu/drm/i915/intel_pcode.h b/drivers/gpu/drm/i915/intel_pcode.h index c91a821a88d4..19795ea8172e 100644 --- a/drivers/gpu/drm/i915/intel_pcode.h +++ b/drivers/gpu/drm/i915/intel_pcode.h @@ -27,13 +27,6 @@ int intel_pcode_init(struct intel_uncore *uncore); int snb_pcode_read_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 *val); int snb_pcode_write_p(struct intel_uncore *uncore, u32 mbcmd, u32 p1, u32 p2, u32 val); -/* Helpers with drm device */ -int intel_pcode_read(struct drm_device *drm, u32 mbox, u32 *val, u32 *val1); -int intel_pcode_write_timeout(struct drm_device *drm, u32 mbox, u32 val, int 
timeout_ms); -#define intel_pcode_write(drm, mbox, val) \ - intel_pcode_write_timeout((drm), (mbox), (val), 1) - -int intel_pcode_request(struct drm_device *drm, u32 mbox, u32 request, - u32 reply_mask, u32 reply, int timeout_base_ms); +extern const struct intel_display_pcode_interface i915_display_pcode_interface; #endif /* _INTEL_PCODE_H */ diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index e76224c848d0..999cbf18f3e5 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -209,6 +209,7 @@ xe-$(CONFIG_DRM_XE_DISPLAY) += \ display/intel_fb_bo.o \ display/intel_fbdev_fb.o \ display/xe_display.o \ + display/xe_display_pcode.o \ display/xe_display_rpm.o \ display/xe_display_wa.o \ display/xe_dsb_buffer.o \ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_pcode.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_pcode.h deleted file mode 100644 index 4fcd3bf6b76f..000000000000 --- a/drivers/gpu/drm/xe/compat-i915-headers/intel_pcode.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: MIT */ -/* - * Copyright © 2023 Intel Corporation - */ - -#ifndef __INTEL_PCODE_H__ -#define __INTEL_PCODE_H__ - -#include "xe_pcode.h" - -#endif /* __INTEL_PCODE_H__ */ diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index c640fe3d8490..c8dd3faa9b97 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -35,6 +35,7 @@ #include "intel_hotplug.h" #include "intel_opregion.h" #include "skl_watermark.h" +#include "xe_display_pcode.h" #include "xe_display_rpm.h" #include "xe_dsb_buffer.h" #include "xe_hdcp_gsc.h" @@ -544,6 +545,7 @@ static const struct intel_display_parent_interface parent = { .initial_plane = &xe_display_initial_plane_interface, .irq = &xe_display_irq_interface, .panic = &xe_display_panic_interface, + .pcode = &xe_display_pcode_interface, .rpm = &xe_display_rpm_interface, .stolen = &xe_display_stolen_interface, }; diff 
--git a/drivers/gpu/drm/xe/display/xe_display_pcode.c b/drivers/gpu/drm/xe/display/xe_display_pcode.c new file mode 100644 index 000000000000..f6820ef7e666 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_display_pcode.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +/* Copyright © 2026 Intel Corporation */ + +#include + +#include "xe_device.h" +#include "xe_pcode.h" + +static int xe_display_pcode_read(struct drm_device *drm, u32 mbox, u32 *val, u32 *val1) +{ + struct xe_device *xe = to_xe_device(drm); + struct xe_tile *tile = xe_device_get_root_tile(xe); + + return xe_pcode_read(tile, mbox, val, val1); +} + +static int xe_display_pcode_write_timeout(struct drm_device *drm, u32 mbox, u32 val, int timeout_ms) +{ + struct xe_device *xe = to_xe_device(drm); + struct xe_tile *tile = xe_device_get_root_tile(xe); + + return xe_pcode_write_timeout(tile, mbox, val, timeout_ms); +} + +static int xe_display_pcode_request(struct drm_device *drm, u32 mbox, u32 request, + u32 reply_mask, u32 reply, int timeout_base_ms) +{ + struct xe_device *xe = to_xe_device(drm); + struct xe_tile *tile = xe_device_get_root_tile(xe); + + return xe_pcode_request(tile, mbox, request, reply_mask, reply, timeout_base_ms); +} + +const struct intel_display_pcode_interface xe_display_pcode_interface = { + .read = xe_display_pcode_read, + .write = xe_display_pcode_write_timeout, + .request = xe_display_pcode_request, +}; diff --git a/drivers/gpu/drm/xe/display/xe_display_pcode.h b/drivers/gpu/drm/xe/display/xe_display_pcode.h new file mode 100644 index 000000000000..58bd2fb7fb79 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_display_pcode.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef __XE_DISPLAY_PCODE_H__ +#define __XE_DISPLAY_PCODE_H__ + +extern const struct intel_display_pcode_interface xe_display_pcode_interface; + +#endif diff --git a/drivers/gpu/drm/xe/xe_pcode.c b/drivers/gpu/drm/xe/xe_pcode.c index 0d33c14ea0cf..dc66d0c7ee06 
100644 --- a/drivers/gpu/drm/xe/xe_pcode.c +++ b/drivers/gpu/drm/xe/xe_pcode.c @@ -348,33 +348,3 @@ int xe_pcode_probe_early(struct xe_device *xe) return xe_pcode_ready(xe, false); } ALLOW_ERROR_INJECTION(xe_pcode_probe_early, ERRNO); /* See xe_pci_probe */ - -/* Helpers with drm device. These should only be called by the display side */ -#if IS_ENABLED(CONFIG_DRM_XE_DISPLAY) - -int intel_pcode_read(struct drm_device *drm, u32 mbox, u32 *val, u32 *val1) -{ - struct xe_device *xe = to_xe_device(drm); - struct xe_tile *tile = xe_device_get_root_tile(xe); - - return xe_pcode_read(tile, mbox, val, val1); -} - -int intel_pcode_write_timeout(struct drm_device *drm, u32 mbox, u32 val, int timeout_ms) -{ - struct xe_device *xe = to_xe_device(drm); - struct xe_tile *tile = xe_device_get_root_tile(xe); - - return xe_pcode_write_timeout(tile, mbox, val, timeout_ms); -} - -int intel_pcode_request(struct drm_device *drm, u32 mbox, u32 request, - u32 reply_mask, u32 reply, int timeout_base_ms) -{ - struct xe_device *xe = to_xe_device(drm); - struct xe_tile *tile = xe_device_get_root_tile(xe); - - return xe_pcode_request(tile, mbox, request, reply_mask, reply, timeout_base_ms); -} - -#endif diff --git a/drivers/gpu/drm/xe/xe_pcode.h b/drivers/gpu/drm/xe/xe_pcode.h index a5584c1c75f9..490e4f269607 100644 --- a/drivers/gpu/drm/xe/xe_pcode.h +++ b/drivers/gpu/drm/xe/xe_pcode.h @@ -34,12 +34,4 @@ int xe_pcode_request(struct xe_tile *tile, u32 mbox, u32 request, | FIELD_PREP(PCODE_MB_PARAM1, param1)\ | FIELD_PREP(PCODE_MB_PARAM2, param2)) -/* Helpers with drm device */ -int intel_pcode_read(struct drm_device *drm, u32 mbox, u32 *val, u32 *val1); -int intel_pcode_write_timeout(struct drm_device *drm, u32 mbox, u32 val, int timeout_ms); -#define intel_pcode_write(drm, mbox, val) \ - intel_pcode_write_timeout((drm), (mbox), (val), 1) -int intel_pcode_request(struct drm_device *drm, u32 mbox, u32 request, - u32 reply_mask, u32 reply, int timeout_base_ms); - #endif diff --git 
a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index cd091120731c..41f4afe7928c 100644 --- a/include/drm/intel/display_parent_interface.h +++ b/include/drm/intel/display_parent_interface.h @@ -66,6 +66,13 @@ struct intel_display_pc8_interface { void (*unblock)(struct drm_device *drm); }; +struct intel_display_pcode_interface { + int (*read)(struct drm_device *drm, u32 mbox, u32 *val, u32 *val1); + int (*write)(struct drm_device *drm, u32 mbox, u32 val, int timeout_ms); + int (*request)(struct drm_device *drm, u32 mbox, u32 request, + u32 reply_mask, u32 reply, int timeout_base_ms); +}; + struct intel_display_rpm_interface { struct ref_tracker *(*get)(const struct drm_device *drm); struct ref_tracker *(*get_raw)(const struct drm_device *drm); @@ -135,6 +142,9 @@ struct intel_display_parent_interface { /** @pc8: PC8 interface. Optional. */ const struct intel_display_pc8_interface *pc8; + /** @pcode: Pcode interface */ + const struct intel_display_pcode_interface *pcode; + /** @rpm: Runtime PM functions */ const struct intel_display_rpm_interface *rpm; -- cgit v1.2.3 From 42dab3138176a944b09996441d837986f9ef13f8 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Tue, 6 Jan 2026 11:00:16 -0600 Subject: drm/connector: Add a new 'panel_type' property If the driver can make an assertion whether a connected panel is an OLED panel or not then it can attach a property to the connector that userspace can use as a hint for color schemes. 
Reviewed-by: Leo Li Link: https://patch.msgid.link/20260106170017.68158-2-superm1@kernel.org Signed-off-by: Mario Limonciello (AMD) --- drivers/gpu/drm/drm_connector.c | 33 +++++++++++++++++++++++++++++++++ include/drm/drm_connector.h | 1 + include/drm/drm_mode_config.h | 4 ++++ include/uapi/drm/drm_mode.h | 4 ++++ 4 files changed, 42 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index 4f5b27fab475..aec05adbc889 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -1173,6 +1173,11 @@ static const struct drm_prop_enum_list drm_link_status_enum_list[] = { { DRM_MODE_LINK_STATUS_BAD, "Bad" }, }; +static const struct drm_prop_enum_list drm_panel_type_enum_list[] = { + { DRM_MODE_PANEL_TYPE_UNKNOWN, "unknown" }, + { DRM_MODE_PANEL_TYPE_OLED, "OLED" }, +}; + /** * drm_display_info_set_bus_formats - set the supported bus formats * @info: display info to store bus formats in @@ -1501,6 +1506,9 @@ EXPORT_SYMBOL(drm_hdmi_connector_get_output_format_name); * Summarizing: Only set "DPMS" when the connector is known to be enabled, * assume that a successful SETCONFIG call also sets "DPMS" to on, and * never read back the value of "DPMS" because it can be incorrect. + * panel_type: + * Immutable enum property to indicate the type of connected panel. + * Possible values are "unknown" (default) and "OLED". * PATH: * Connector path property to identify how this sink is physically * connected. Used by DP MST. 
This should be set by calling @@ -1851,6 +1859,13 @@ int drm_connector_create_standard_properties(struct drm_device *dev) return -ENOMEM; dev->mode_config.link_status_property = prop; + prop = drm_property_create_enum(dev, DRM_MODE_PROP_IMMUTABLE, "panel_type", + drm_panel_type_enum_list, + ARRAY_SIZE(drm_panel_type_enum_list)); + if (!prop) + return -ENOMEM; + dev->mode_config.panel_type_property = prop; + prop = drm_property_create_bool(dev, DRM_MODE_PROP_IMMUTABLE, "non-desktop"); if (!prop) return -ENOMEM; @@ -3626,3 +3641,21 @@ struct drm_tile_group *drm_mode_create_tile_group(struct drm_device *dev, return tg; } EXPORT_SYMBOL(drm_mode_create_tile_group); + +/** + * drm_connector_attach_panel_type_property - attaches panel type property + * @connector: connector to attach the property on. + * + * This is used to add support for panel type detection. + */ +void drm_connector_attach_panel_type_property(struct drm_connector *connector) +{ + struct drm_device *dev = connector->dev; + struct drm_property *prop = dev->mode_config.panel_type_property; + + if (!prop) + return; + + drm_object_attach_property(&connector->base, prop, DRM_MODE_PANEL_TYPE_UNKNOWN); +} +EXPORT_SYMBOL(drm_connector_attach_panel_type_property); diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index 7eaec37ae1c7..c18be8c19de0 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -2493,6 +2493,7 @@ int drm_connector_attach_scaling_mode_property(struct drm_connector *connector, u32 scaling_mode_mask); int drm_connector_attach_vrr_capable_property( struct drm_connector *connector); +void drm_connector_attach_panel_type_property(struct drm_connector *connector); int drm_connector_attach_broadcast_rgb_property(struct drm_connector *connector); int drm_connector_attach_colorspace_property(struct drm_connector *connector); int drm_connector_attach_hdr_output_metadata_property(struct drm_connector *connector); diff --git a/include/drm/drm_mode_config.h 
b/include/drm/drm_mode_config.h index 895fb820dba0..5e1dd0cfccde 100644 --- a/include/drm/drm_mode_config.h +++ b/include/drm/drm_mode_config.h @@ -600,6 +600,10 @@ struct drm_mode_config { * multiple CRTCs. */ struct drm_property *tile_property; + /** + * @panel_type_property: Default connector property for panel type + */ + struct drm_property *panel_type_property; /** * @link_status_property: Default connector property for link status * of a connector diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index cbbbfc1dfe2b..3693d82b5279 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -166,6 +166,10 @@ extern "C" { #define DRM_MODE_LINK_STATUS_GOOD 0 #define DRM_MODE_LINK_STATUS_BAD 1 +/* Panel type property */ +#define DRM_MODE_PANEL_TYPE_UNKNOWN 0 +#define DRM_MODE_PANEL_TYPE_OLED 1 + /* * DRM_MODE_ROTATE_ * -- cgit v1.2.3 From 3aecd55af5b83d16d84e3c333d4163999ee8ff51 Mon Sep 17 00:00:00 2001 From: Caterina Shablia Date: Wed, 28 Jan 2026 18:40:57 +0000 Subject: drm: add ARM interleaved 64k modifier This modifier is primarily intended to be used by panvk to implement sparse partially-resident images with better map and unmap performance, and no worse access performance, compared to implementing them in terms of U-interleaved. With this modifier, the plane is divided into 64k byte 1:1 or 2:1 -sided tiles. The 64k tiles are laid out linearly. Each 64k tile is divided into blocks of 16x16 texel blocks each, which themselves are laid out linearly within a 64k tile. Then within each such 16x16 block, texel blocks are laid out according to U order, similar to 16X16_BLOCK_U_INTERLEAVED. Unlike 16X16_BLOCK_U_INTERLEAVED, the layout does not depend on whether a format is compressed or not. The hardware features corresponding to this modifier are available starting with v10 (second gen Valhall.) 
The corresponding panvk MR can be found at: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38986 Previous version: https://lists.freedesktop.org/archives/dri-devel/2026-January/547072.html No changes since v2 Changes since v1: * Rewrite the description of the modifier to be hopefully unambiguous. Signed-off-by: Caterina Shablia Reviewed-by: Boris Brezillon Reviewed-by: Liviu Dudau Link: https://patch.msgid.link/20260128184058.807213-1-caterina.shablia@collabora.com Signed-off-by: Boris Brezillon --- include/uapi/drm/drm_fourcc.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index e527b24bd824..452f901513ad 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -1422,6 +1422,22 @@ drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) #define DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED \ DRM_FORMAT_MOD_ARM_CODE(DRM_FORMAT_MOD_ARM_TYPE_MISC, 1ULL) +/* + * ARM 64k interleaved modifier + * + * This is used by ARM Mali v10+ GPUs. With this modifier, the plane is divided + * into 64k byte 1:1 or 2:1 -sided tiles. The 64k tiles are laid out linearly. + * Each 64k tile is divided into blocks of 16x16 texel blocks, which are + * themselves laid out linearly within a 64k tile. Then within each 16x16 + * block, texel blocks are laid out according to U order, similar to + * 16X16_BLOCK_U_INTERLEAVED. + * + * Note that unlike 16X16_BLOCK_U_INTERLEAVED, the layout does not change + * depending on whether a format is compressed or not. 
+ */ +#define DRM_FORMAT_MOD_ARM_INTERLEAVED_64K \ + DRM_FORMAT_MOD_ARM_CODE(DRM_FORMAT_MOD_ARM_TYPE_MISC, 2ULL) + /* * Allwinner tiled modifier * -- cgit v1.2.3 From 2bcbc706dfa02ae50118173a6f6d8a12e735480c Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 19 Dec 2025 11:41:54 +0100 Subject: dma-buf: add dma_fence_was_initialized function v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some driver use fence->ops to test if a fence was initialized or not. The problem is that this utilizes internal behavior of the dma_fence implementation. So better abstract that into a function. v2: use a flag instead of testing fence->ops, rename the function, move to the beginning of the patch set. Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20260120105655.7134-2-christian.koenig@amd.com --- drivers/dma-buf/dma-fence.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 13 +++++++------ drivers/gpu/drm/qxl/qxl_release.c | 2 +- include/linux/dma-fence.h | 15 +++++++++++++++ 4 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 21c5c30b4f34..c9a036b0d592 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -1054,7 +1054,7 @@ __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, fence->lock = lock; fence->context = context; fence->seqno = seqno; - fence->flags = flags; + fence->flags = flags | BIT(DMA_FENCE_FLAG_INITIALIZED_BIT); fence->error = 0; trace_dma_fence_init(fence); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index aaf5477fcd7a..f05683d59f8b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -282,9 +282,10 @@ void amdgpu_job_free_resources(struct amdgpu_job *job) unsigned i; /* Check if any fences were initialized */ - if 
(job->base.s_fence && job->base.s_fence->finished.ops) + if (job->base.s_fence && + dma_fence_was_initialized(&job->base.s_fence->finished)) f = &job->base.s_fence->finished; - else if (job->hw_fence && job->hw_fence->base.ops) + else if (dma_fence_was_initialized(&job->hw_fence->base)) f = &job->hw_fence->base; else f = NULL; @@ -301,11 +302,11 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) amdgpu_sync_free(&job->explicit_sync); - if (job->hw_fence->base.ops) + if (dma_fence_was_initialized(&job->hw_fence->base)) dma_fence_put(&job->hw_fence->base); else kfree(job->hw_fence); - if (job->hw_vm_fence->base.ops) + if (dma_fence_was_initialized(&job->hw_vm_fence->base)) dma_fence_put(&job->hw_vm_fence->base); else kfree(job->hw_vm_fence); @@ -339,11 +340,11 @@ void amdgpu_job_free(struct amdgpu_job *job) if (job->gang_submit != &job->base.s_fence->scheduled) dma_fence_put(job->gang_submit); - if (job->hw_fence->base.ops) + if (dma_fence_was_initialized(&job->hw_fence->base)) dma_fence_put(&job->hw_fence->base); else kfree(job->hw_fence); - if (job->hw_vm_fence->base.ops) + if (dma_fence_was_initialized(&job->hw_vm_fence->base)) dma_fence_put(&job->hw_vm_fence->base); else kfree(job->hw_vm_fence); diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index 7b3c9a6016db..06b0b2aa7953 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -146,7 +146,7 @@ qxl_release_free(struct qxl_device *qdev, idr_remove(&qdev->release_idr, release->id); spin_unlock(&qdev->release_idr_lock); - if (release->base.ops) { + if (dma_fence_was_initialized(&release->base)) { WARN_ON(list_empty(&release->bos)); qxl_release_free_list(release); diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index d4c92fd35092..9c4d25289239 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -48,6 +48,7 @@ struct seq_file; * atomic ops (bit_*), so taking the spinlock will not be needed most * 
of the time. * + * DMA_FENCE_FLAG_INITIALIZED_BIT - fence was initialized * DMA_FENCE_FLAG_SIGNALED_BIT - fence is already signaled * DMA_FENCE_FLAG_TIMESTAMP_BIT - timestamp recorded for fence signaling * DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT - enable_signaling might have been called @@ -98,6 +99,7 @@ struct dma_fence { }; enum dma_fence_flag_bits { + DMA_FENCE_FLAG_INITIALIZED_BIT, DMA_FENCE_FLAG_SEQNO64_BIT, DMA_FENCE_FLAG_SIGNALED_BIT, DMA_FENCE_FLAG_TIMESTAMP_BIT, @@ -263,6 +265,19 @@ void dma_fence_release(struct kref *kref); void dma_fence_free(struct dma_fence *fence); void dma_fence_describe(struct dma_fence *fence, struct seq_file *seq); +/** + * dma_fence_was_initialized - test if fence was initialized + * @fence: fence to test + * + * Return: True if fence was ever initialized, false otherwise. Works correctly + * only when memory backing the fence structure is zero initialized on + * allocation. + */ +static inline bool dma_fence_was_initialized(struct dma_fence *fence) +{ + return fence && test_bit(DMA_FENCE_FLAG_INITIALIZED_BIT, &fence->flags); +} + /** * dma_fence_put - decreases refcount of the fence * @fence: fence to reduce refcount of -- cgit v1.2.3 From 4a9671a03f2be13acde0cb15c5208767a9cc56e4 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 6 Feb 2026 08:52:38 +1000 Subject: gpu: Move DRM buddy allocator one level up (part one) Move the DRM buddy allocator one level up so that it can be used by GPU drivers (example, nova-core) that have usecases other than DRM (such as VFIO vGPU support). Modify the API, structures and Kconfigs to use "gpu_buddy" terminology. Adapt the drivers and tests to use the new API. The commit cannot be split due to bisectability, however no functional change is intended. Verified by running K-UNIT tests and build tested various configurations. Signed-off-by: Joel Fernandes Reviewed-by: Dave Airlie [airlied: I've split this into two so git can find copies easier. 
I've also just nuked drm_random library, that stuff needs to be done elsewhere and only the buddy tests seem to be using it]. Signed-off-by: Dave Airlie --- Documentation/gpu/drm-mm.rst | 6 +- drivers/gpu/Makefile | 2 +- drivers/gpu/buddy.c | 1336 +++++++++++++++++++++++++ drivers/gpu/drm/Kconfig | 4 - drivers/gpu/drm/Kconfig.debug | 1 - drivers/gpu/drm/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 2 +- drivers/gpu/drm/drm_buddy.c | 1336 ------------------------- drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +- drivers/gpu/drm/i915/i915_scatterlist.c | 2 +- drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 2 +- drivers/gpu/drm/lib/drm_random.c | 44 - drivers/gpu/drm/lib/drm_random.h | 28 - drivers/gpu/drm/tests/Makefile | 1 - drivers/gpu/drm/tests/drm_buddy_test.c | 928 ----------------- drivers/gpu/drm/tests/drm_exec_test.c | 2 - drivers/gpu/drm/tests/drm_mm_test.c | 2 - drivers/gpu/drm/ttm/tests/ttm_mock_manager.h | 2 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h | 2 +- drivers/gpu/tests/Makefile | 4 + drivers/gpu/tests/gpu_buddy_test.c | 928 +++++++++++++++++ drivers/gpu/tests/gpu_random.c | 44 + drivers/gpu/tests/gpu_random.h | 28 + include/drm/drm_buddy.h | 171 ---- include/linux/gpu_buddy.h | 171 ++++ 25 files changed, 2522 insertions(+), 2529 deletions(-) create mode 100644 drivers/gpu/buddy.c delete mode 100644 drivers/gpu/drm/drm_buddy.c delete mode 100644 drivers/gpu/drm/lib/drm_random.c delete mode 100644 drivers/gpu/drm/lib/drm_random.h delete mode 100644 drivers/gpu/drm/tests/drm_buddy_test.c create mode 100644 drivers/gpu/tests/Makefile create mode 100644 drivers/gpu/tests/gpu_buddy_test.c create mode 100644 drivers/gpu/tests/gpu_random.c create mode 100644 drivers/gpu/tests/gpu_random.h delete mode 100644 include/drm/drm_buddy.h create mode 100644 include/linux/gpu_buddy.h (limited to 'include') diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst index f22433470c76..ceee0e663237 100644 --- 
a/Documentation/gpu/drm-mm.rst +++ b/Documentation/gpu/drm-mm.rst @@ -526,10 +526,10 @@ DRM GPUVM Function References DRM Buddy Allocator =================== -DRM Buddy Function References ------------------------------ +Buddy Allocator Function References (GPU buddy) +----------------------------------------------- -.. kernel-doc:: drivers/gpu/drm/drm_buddy.c +.. kernel-doc:: drivers/gpu/buddy.c :export: DRM Cache Handling and Fast WC memcpy() diff --git a/drivers/gpu/Makefile b/drivers/gpu/Makefile index 36a54d456630..c5292ee2c852 100644 --- a/drivers/gpu/Makefile +++ b/drivers/gpu/Makefile @@ -2,7 +2,7 @@ # drm/tegra depends on host1x, so if both drivers are built-in care must be # taken to initialize them in the correct order. Link order is the only way # to ensure this currently. -obj-y += host1x/ drm/ vga/ +obj-y += host1x/ drm/ vga/ tests/ obj-$(CONFIG_IMX_IPUV3_CORE) += ipu-v3/ obj-$(CONFIG_TRACE_GPU_MEM) += trace/ obj-$(CONFIG_NOVA_CORE) += nova-core/ diff --git a/drivers/gpu/buddy.c b/drivers/gpu/buddy.c new file mode 100644 index 000000000000..4cc63d961d26 --- /dev/null +++ b/drivers/gpu/buddy.c @@ -0,0 +1,1336 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include + +#include +#include +#include +#include + +#include +#include + +enum drm_buddy_free_tree { + DRM_BUDDY_CLEAR_TREE = 0, + DRM_BUDDY_DIRTY_TREE, + DRM_BUDDY_MAX_FREE_TREES, +}; + +static struct kmem_cache *slab_blocks; + +#define for_each_free_tree(tree) \ + for ((tree) = 0; (tree) < DRM_BUDDY_MAX_FREE_TREES; (tree)++) + +static struct drm_buddy_block *drm_block_alloc(struct drm_buddy *mm, + struct drm_buddy_block *parent, + unsigned int order, + u64 offset) +{ + struct drm_buddy_block *block; + + BUG_ON(order > DRM_BUDDY_MAX_ORDER); + + block = kmem_cache_zalloc(slab_blocks, GFP_KERNEL); + if (!block) + return NULL; + + block->header = offset; + block->header |= order; + block->parent = parent; + + RB_CLEAR_NODE(&block->rb); + + BUG_ON(block->header & 
DRM_BUDDY_HEADER_UNUSED); + return block; +} + +static void drm_block_free(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + kmem_cache_free(slab_blocks, block); +} + +static enum drm_buddy_free_tree +get_block_tree(struct drm_buddy_block *block) +{ + return drm_buddy_block_is_clear(block) ? + DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; +} + +static struct drm_buddy_block * +rbtree_get_free_block(const struct rb_node *node) +{ + return node ? rb_entry(node, struct drm_buddy_block, rb) : NULL; +} + +static struct drm_buddy_block * +rbtree_last_free_block(struct rb_root *root) +{ + return rbtree_get_free_block(rb_last(root)); +} + +static bool rbtree_is_empty(struct rb_root *root) +{ + return RB_EMPTY_ROOT(root); +} + +static bool drm_buddy_block_offset_less(const struct drm_buddy_block *block, + const struct drm_buddy_block *node) +{ + return drm_buddy_block_offset(block) < drm_buddy_block_offset(node); +} + +static bool rbtree_block_offset_less(struct rb_node *block, + const struct rb_node *node) +{ + return drm_buddy_block_offset_less(rbtree_get_free_block(block), + rbtree_get_free_block(node)); +} + +static void rbtree_insert(struct drm_buddy *mm, + struct drm_buddy_block *block, + enum drm_buddy_free_tree tree) +{ + rb_add(&block->rb, + &mm->free_trees[tree][drm_buddy_block_order(block)], + rbtree_block_offset_less); +} + +static void rbtree_remove(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + unsigned int order = drm_buddy_block_order(block); + enum drm_buddy_free_tree tree; + struct rb_root *root; + + tree = get_block_tree(block); + root = &mm->free_trees[tree][order]; + + rb_erase(&block->rb, root); + RB_CLEAR_NODE(&block->rb); +} + +static void clear_reset(struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_CLEAR; +} + +static void mark_cleared(struct drm_buddy_block *block) +{ + block->header |= DRM_BUDDY_HEADER_CLEAR; +} + +static void mark_allocated(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + 
block->header &= ~DRM_BUDDY_HEADER_STATE; + block->header |= DRM_BUDDY_ALLOCATED; + + rbtree_remove(mm, block); +} + +static void mark_free(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + enum drm_buddy_free_tree tree; + + block->header &= ~DRM_BUDDY_HEADER_STATE; + block->header |= DRM_BUDDY_FREE; + + tree = get_block_tree(block); + rbtree_insert(mm, block, tree); +} + +static void mark_split(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_STATE; + block->header |= DRM_BUDDY_SPLIT; + + rbtree_remove(mm, block); +} + +static inline bool overlaps(u64 s1, u64 e1, u64 s2, u64 e2) +{ + return s1 <= e2 && e1 >= s2; +} + +static inline bool contains(u64 s1, u64 e1, u64 s2, u64 e2) +{ + return s1 <= s2 && e1 >= e2; +} + +static struct drm_buddy_block * +__get_buddy(struct drm_buddy_block *block) +{ + struct drm_buddy_block *parent; + + parent = block->parent; + if (!parent) + return NULL; + + if (parent->left == block) + return parent->right; + + return parent->left; +} + +static unsigned int __drm_buddy_free(struct drm_buddy *mm, + struct drm_buddy_block *block, + bool force_merge) +{ + struct drm_buddy_block *parent; + unsigned int order; + + while ((parent = block->parent)) { + struct drm_buddy_block *buddy; + + buddy = __get_buddy(block); + + if (!drm_buddy_block_is_free(buddy)) + break; + + if (!force_merge) { + /* + * Check the block and its buddy clear state and exit + * the loop if they both have the dissimilar state. 
+ */ + if (drm_buddy_block_is_clear(block) != + drm_buddy_block_is_clear(buddy)) + break; + + if (drm_buddy_block_is_clear(block)) + mark_cleared(parent); + } + + rbtree_remove(mm, buddy); + if (force_merge && drm_buddy_block_is_clear(buddy)) + mm->clear_avail -= drm_buddy_block_size(mm, buddy); + + drm_block_free(mm, block); + drm_block_free(mm, buddy); + + block = parent; + } + + order = drm_buddy_block_order(block); + mark_free(mm, block); + + return order; +} + +static int __force_merge(struct drm_buddy *mm, + u64 start, + u64 end, + unsigned int min_order) +{ + unsigned int tree, order; + int i; + + if (!min_order) + return -ENOMEM; + + if (min_order > mm->max_order) + return -EINVAL; + + for_each_free_tree(tree) { + for (i = min_order - 1; i >= 0; i--) { + struct rb_node *iter = rb_last(&mm->free_trees[tree][i]); + + while (iter) { + struct drm_buddy_block *block, *buddy; + u64 block_start, block_end; + + block = rbtree_get_free_block(iter); + iter = rb_prev(iter); + + if (!block || !block->parent) + continue; + + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block) - 1; + + if (!contains(start, end, block_start, block_end)) + continue; + + buddy = __get_buddy(block); + if (!drm_buddy_block_is_free(buddy)) + continue; + + WARN_ON(drm_buddy_block_is_clear(block) == + drm_buddy_block_is_clear(buddy)); + + /* + * Advance to the next node when the current node is the buddy, + * as freeing the block will also remove its buddy from the tree. 
+ */ + if (iter == &buddy->rb) + iter = rb_prev(iter); + + rbtree_remove(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail -= drm_buddy_block_size(mm, block); + + order = __drm_buddy_free(mm, block, true); + if (order >= min_order) + return 0; + } + } + } + + return -ENOMEM; +} + +/** + * drm_buddy_init - init memory manager + * + * @mm: DRM buddy manager to initialize + * @size: size in bytes to manage + * @chunk_size: minimum page size in bytes for our allocations + * + * Initializes the memory manager and its resources. + * + * Returns: + * 0 on success, error code on failure. + */ +int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) +{ + unsigned int i, j, root_count = 0; + u64 offset = 0; + + if (size < chunk_size) + return -EINVAL; + + if (chunk_size < SZ_4K) + return -EINVAL; + + if (!is_power_of_2(chunk_size)) + return -EINVAL; + + size = round_down(size, chunk_size); + + mm->size = size; + mm->avail = size; + mm->clear_avail = 0; + mm->chunk_size = chunk_size; + mm->max_order = ilog2(size) - ilog2(chunk_size); + + BUG_ON(mm->max_order > DRM_BUDDY_MAX_ORDER); + + mm->free_trees = kmalloc_array(DRM_BUDDY_MAX_FREE_TREES, + sizeof(*mm->free_trees), + GFP_KERNEL); + if (!mm->free_trees) + return -ENOMEM; + + for_each_free_tree(i) { + mm->free_trees[i] = kmalloc_array(mm->max_order + 1, + sizeof(struct rb_root), + GFP_KERNEL); + if (!mm->free_trees[i]) + goto out_free_tree; + + for (j = 0; j <= mm->max_order; ++j) + mm->free_trees[i][j] = RB_ROOT; + } + + mm->n_roots = hweight64(size); + + mm->roots = kmalloc_array(mm->n_roots, + sizeof(struct drm_buddy_block *), + GFP_KERNEL); + if (!mm->roots) + goto out_free_tree; + + /* + * Split into power-of-two blocks, in case we are given a size that is + * not itself a power-of-two. 
+ */ + do { + struct drm_buddy_block *root; + unsigned int order; + u64 root_size; + + order = ilog2(size) - ilog2(chunk_size); + root_size = chunk_size << order; + + root = drm_block_alloc(mm, NULL, order, offset); + if (!root) + goto out_free_roots; + + mark_free(mm, root); + + BUG_ON(root_count > mm->max_order); + BUG_ON(drm_buddy_block_size(mm, root) < chunk_size); + + mm->roots[root_count] = root; + + offset += root_size; + size -= root_size; + root_count++; + } while (size); + + return 0; + +out_free_roots: + while (root_count--) + drm_block_free(mm, mm->roots[root_count]); + kfree(mm->roots); +out_free_tree: + while (i--) + kfree(mm->free_trees[i]); + kfree(mm->free_trees); + return -ENOMEM; +} +EXPORT_SYMBOL(drm_buddy_init); + +/** + * drm_buddy_fini - tear down the memory manager + * + * @mm: DRM buddy manager to free + * + * Cleanup memory manager resources and the freetree + */ +void drm_buddy_fini(struct drm_buddy *mm) +{ + u64 root_size, size, start; + unsigned int order; + int i; + + size = mm->size; + + for (i = 0; i < mm->n_roots; ++i) { + order = ilog2(size) - ilog2(mm->chunk_size); + start = drm_buddy_block_offset(mm->roots[i]); + __force_merge(mm, start, start + size, order); + + if (WARN_ON(!drm_buddy_block_is_free(mm->roots[i]))) + kunit_fail_current_test("buddy_fini() root"); + + drm_block_free(mm, mm->roots[i]); + + root_size = mm->chunk_size << order; + size -= root_size; + } + + WARN_ON(mm->avail != mm->size); + + for_each_free_tree(i) + kfree(mm->free_trees[i]); + kfree(mm->free_trees); + kfree(mm->roots); +} +EXPORT_SYMBOL(drm_buddy_fini); + +static int split_block(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + unsigned int block_order = drm_buddy_block_order(block) - 1; + u64 offset = drm_buddy_block_offset(block); + + BUG_ON(!drm_buddy_block_is_free(block)); + BUG_ON(!drm_buddy_block_order(block)); + + block->left = drm_block_alloc(mm, block, block_order, offset); + if (!block->left) + return -ENOMEM; + + block->right = 
drm_block_alloc(mm, block, block_order, + offset + (mm->chunk_size << block_order)); + if (!block->right) { + drm_block_free(mm, block->left); + return -ENOMEM; + } + + mark_split(mm, block); + + if (drm_buddy_block_is_clear(block)) { + mark_cleared(block->left); + mark_cleared(block->right); + clear_reset(block); + } + + mark_free(mm, block->left); + mark_free(mm, block->right); + + return 0; +} + +/** + * drm_get_buddy - get buddy address + * + * @block: DRM buddy block + * + * Returns the corresponding buddy block for @block, or NULL + * if this is a root block and can't be merged further. + * Requires some kind of locking to protect against + * any concurrent allocate and free operations. + */ +struct drm_buddy_block * +drm_get_buddy(struct drm_buddy_block *block) +{ + return __get_buddy(block); +} +EXPORT_SYMBOL(drm_get_buddy); + +/** + * drm_buddy_reset_clear - reset blocks clear state + * + * @mm: DRM buddy manager + * @is_clear: blocks clear state + * + * Reset the clear state based on @is_clear value for each block + * in the freetree. + */ +void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear) +{ + enum drm_buddy_free_tree src_tree, dst_tree; + u64 root_size, size, start; + unsigned int order; + int i; + + size = mm->size; + for (i = 0; i < mm->n_roots; ++i) { + order = ilog2(size) - ilog2(mm->chunk_size); + start = drm_buddy_block_offset(mm->roots[i]); + __force_merge(mm, start, start + size, order); + + root_size = mm->chunk_size << order; + size -= root_size; + } + + src_tree = is_clear ? DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; + dst_tree = is_clear ? 
DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; + + for (i = 0; i <= mm->max_order; ++i) { + struct rb_root *root = &mm->free_trees[src_tree][i]; + struct drm_buddy_block *block, *tmp; + + rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { + rbtree_remove(mm, block); + if (is_clear) { + mark_cleared(block); + mm->clear_avail += drm_buddy_block_size(mm, block); + } else { + clear_reset(block); + mm->clear_avail -= drm_buddy_block_size(mm, block); + } + + rbtree_insert(mm, block, dst_tree); + } + } +} +EXPORT_SYMBOL(drm_buddy_reset_clear); + +/** + * drm_buddy_free_block - free a block + * + * @mm: DRM buddy manager + * @block: block to be freed + */ +void drm_buddy_free_block(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + BUG_ON(!drm_buddy_block_is_allocated(block)); + mm->avail += drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail += drm_buddy_block_size(mm, block); + + __drm_buddy_free(mm, block, false); +} +EXPORT_SYMBOL(drm_buddy_free_block); + +static void __drm_buddy_free_list(struct drm_buddy *mm, + struct list_head *objects, + bool mark_clear, + bool mark_dirty) +{ + struct drm_buddy_block *block, *on; + + WARN_ON(mark_dirty && mark_clear); + + list_for_each_entry_safe(block, on, objects, link) { + if (mark_clear) + mark_cleared(block); + else if (mark_dirty) + clear_reset(block); + drm_buddy_free_block(mm, block); + cond_resched(); + } + INIT_LIST_HEAD(objects); +} + +static void drm_buddy_free_list_internal(struct drm_buddy *mm, + struct list_head *objects) +{ + /* + * Don't touch the clear/dirty bit, since allocation is still internal + * at this point. For example we might have just failed part of the + * allocation. 
+ */ + __drm_buddy_free_list(mm, objects, false, false); +} + +/** + * drm_buddy_free_list - free blocks + * + * @mm: DRM buddy manager + * @objects: input list head to free blocks + * @flags: optional flags like DRM_BUDDY_CLEARED + */ +void drm_buddy_free_list(struct drm_buddy *mm, + struct list_head *objects, + unsigned int flags) +{ + bool mark_clear = flags & DRM_BUDDY_CLEARED; + + __drm_buddy_free_list(mm, objects, mark_clear, !mark_clear); +} +EXPORT_SYMBOL(drm_buddy_free_list); + +static bool block_incompatible(struct drm_buddy_block *block, unsigned int flags) +{ + bool needs_clear = flags & DRM_BUDDY_CLEAR_ALLOCATION; + + return needs_clear != drm_buddy_block_is_clear(block); +} + +static struct drm_buddy_block * +__alloc_range_bias(struct drm_buddy *mm, + u64 start, u64 end, + unsigned int order, + unsigned long flags, + bool fallback) +{ + u64 req_size = mm->chunk_size << order; + struct drm_buddy_block *block; + struct drm_buddy_block *buddy; + LIST_HEAD(dfs); + int err; + int i; + + end = end - 1; + + for (i = 0; i < mm->n_roots; ++i) + list_add_tail(&mm->roots[i]->tmp_link, &dfs); + + do { + u64 block_start; + u64 block_end; + + block = list_first_entry_or_null(&dfs, + struct drm_buddy_block, + tmp_link); + if (!block) + break; + + list_del(&block->tmp_link); + + if (drm_buddy_block_order(block) < order) + continue; + + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block) - 1; + + if (!overlaps(start, end, block_start, block_end)) + continue; + + if (drm_buddy_block_is_allocated(block)) + continue; + + if (block_start < start || block_end > end) { + u64 adjusted_start = max(block_start, start); + u64 adjusted_end = min(block_end, end); + + if (round_down(adjusted_end + 1, req_size) <= + round_up(adjusted_start, req_size)) + continue; + } + + if (!fallback && block_incompatible(block, flags)) + continue; + + if (contains(start, end, block_start, block_end) && + order == drm_buddy_block_order(block)) { 
+ /* + * Find the free block within the range. + */ + if (drm_buddy_block_is_free(block)) + return block; + + continue; + } + + if (!drm_buddy_block_is_split(block)) { + err = split_block(mm, block); + if (unlikely(err)) + goto err_undo; + } + + list_add(&block->right->tmp_link, &dfs); + list_add(&block->left->tmp_link, &dfs); + } while (1); + + return ERR_PTR(-ENOSPC); + +err_undo: + /* + * We really don't want to leave around a bunch of split blocks, since + * bigger is better, so make sure we merge everything back before we + * free the allocated blocks. + */ + buddy = __get_buddy(block); + if (buddy && + (drm_buddy_block_is_free(block) && + drm_buddy_block_is_free(buddy))) + __drm_buddy_free(mm, block, false); + return ERR_PTR(err); +} + +static struct drm_buddy_block * +__drm_buddy_alloc_range_bias(struct drm_buddy *mm, + u64 start, u64 end, + unsigned int order, + unsigned long flags) +{ + struct drm_buddy_block *block; + bool fallback = false; + + block = __alloc_range_bias(mm, start, end, order, + flags, fallback); + if (IS_ERR(block)) + return __alloc_range_bias(mm, start, end, order, + flags, !fallback); + + return block; +} + +static struct drm_buddy_block * +get_maxblock(struct drm_buddy *mm, + unsigned int order, + enum drm_buddy_free_tree tree) +{ + struct drm_buddy_block *max_block = NULL, *block = NULL; + struct rb_root *root; + unsigned int i; + + for (i = order; i <= mm->max_order; ++i) { + root = &mm->free_trees[tree][i]; + block = rbtree_last_free_block(root); + if (!block) + continue; + + if (!max_block) { + max_block = block; + continue; + } + + if (drm_buddy_block_offset(block) > + drm_buddy_block_offset(max_block)) { + max_block = block; + } + } + + return max_block; +} + +static struct drm_buddy_block * +alloc_from_freetree(struct drm_buddy *mm, + unsigned int order, + unsigned long flags) +{ + struct drm_buddy_block *block = NULL; + struct rb_root *root; + enum drm_buddy_free_tree tree; + unsigned int tmp; + int err; + + tree = (flags & 
DRM_BUDDY_CLEAR_ALLOCATION) ? + DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; + + if (flags & DRM_BUDDY_TOPDOWN_ALLOCATION) { + block = get_maxblock(mm, order, tree); + if (block) + /* Store the obtained block order */ + tmp = drm_buddy_block_order(block); + } else { + for (tmp = order; tmp <= mm->max_order; ++tmp) { + /* Get RB tree root for this order and tree */ + root = &mm->free_trees[tree][tmp]; + block = rbtree_last_free_block(root); + if (block) + break; + } + } + + if (!block) { + /* Try allocating from the other tree */ + tree = (tree == DRM_BUDDY_CLEAR_TREE) ? + DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; + + for (tmp = order; tmp <= mm->max_order; ++tmp) { + root = &mm->free_trees[tree][tmp]; + block = rbtree_last_free_block(root); + if (block) + break; + } + + if (!block) + return ERR_PTR(-ENOSPC); + } + + BUG_ON(!drm_buddy_block_is_free(block)); + + while (tmp != order) { + err = split_block(mm, block); + if (unlikely(err)) + goto err_undo; + + block = block->right; + tmp--; + } + return block; + +err_undo: + if (tmp != order) + __drm_buddy_free(mm, block, false); + return ERR_PTR(err); +} + +static int __alloc_range(struct drm_buddy *mm, + struct list_head *dfs, + u64 start, u64 size, + struct list_head *blocks, + u64 *total_allocated_on_err) +{ + struct drm_buddy_block *block; + struct drm_buddy_block *buddy; + u64 total_allocated = 0; + LIST_HEAD(allocated); + u64 end; + int err; + + end = start + size - 1; + + do { + u64 block_start; + u64 block_end; + + block = list_first_entry_or_null(dfs, + struct drm_buddy_block, + tmp_link); + if (!block) + break; + + list_del(&block->tmp_link); + + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block) - 1; + + if (!overlaps(start, end, block_start, block_end)) + continue; + + if (drm_buddy_block_is_allocated(block)) { + err = -ENOSPC; + goto err_free; + } + + if (contains(start, end, block_start, block_end)) { + if (drm_buddy_block_is_free(block)) { + 
mark_allocated(mm, block); + total_allocated += drm_buddy_block_size(mm, block); + mm->avail -= drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail -= drm_buddy_block_size(mm, block); + list_add_tail(&block->link, &allocated); + continue; + } else if (!mm->clear_avail) { + err = -ENOSPC; + goto err_free; + } + } + + if (!drm_buddy_block_is_split(block)) { + err = split_block(mm, block); + if (unlikely(err)) + goto err_undo; + } + + list_add(&block->right->tmp_link, dfs); + list_add(&block->left->tmp_link, dfs); + } while (1); + + if (total_allocated < size) { + err = -ENOSPC; + goto err_free; + } + + list_splice_tail(&allocated, blocks); + + return 0; + +err_undo: + /* + * We really don't want to leave around a bunch of split blocks, since + * bigger is better, so make sure we merge everything back before we + * free the allocated blocks. + */ + buddy = __get_buddy(block); + if (buddy && + (drm_buddy_block_is_free(block) && + drm_buddy_block_is_free(buddy))) + __drm_buddy_free(mm, block, false); + +err_free: + if (err == -ENOSPC && total_allocated_on_err) { + list_splice_tail(&allocated, blocks); + *total_allocated_on_err = total_allocated; + } else { + drm_buddy_free_list_internal(mm, &allocated); + } + + return err; +} + +static int __drm_buddy_alloc_range(struct drm_buddy *mm, + u64 start, + u64 size, + u64 *total_allocated_on_err, + struct list_head *blocks) +{ + LIST_HEAD(dfs); + int i; + + for (i = 0; i < mm->n_roots; ++i) + list_add_tail(&mm->roots[i]->tmp_link, &dfs); + + return __alloc_range(mm, &dfs, start, size, + blocks, total_allocated_on_err); +} + +static int __alloc_contig_try_harder(struct drm_buddy *mm, + u64 size, + u64 min_block_size, + struct list_head *blocks) +{ + u64 rhs_offset, lhs_offset, lhs_size, filled; + struct drm_buddy_block *block; + unsigned int tree, order; + LIST_HEAD(blocks_lhs); + unsigned long pages; + u64 modify_size; + int err; + + modify_size = rounddown_pow_of_two(size); + pages = 
modify_size >> ilog2(mm->chunk_size); + order = fls(pages) - 1; + if (order == 0) + return -ENOSPC; + + for_each_free_tree(tree) { + struct rb_root *root; + struct rb_node *iter; + + root = &mm->free_trees[tree][order]; + if (rbtree_is_empty(root)) + continue; + + iter = rb_last(root); + while (iter) { + block = rbtree_get_free_block(iter); + + /* Allocate blocks traversing RHS */ + rhs_offset = drm_buddy_block_offset(block); + err = __drm_buddy_alloc_range(mm, rhs_offset, size, + &filled, blocks); + if (!err || err != -ENOSPC) + return err; + + lhs_size = max((size - filled), min_block_size); + if (!IS_ALIGNED(lhs_size, min_block_size)) + lhs_size = round_up(lhs_size, min_block_size); + + /* Allocate blocks traversing LHS */ + lhs_offset = drm_buddy_block_offset(block) - lhs_size; + err = __drm_buddy_alloc_range(mm, lhs_offset, lhs_size, + NULL, &blocks_lhs); + if (!err) { + list_splice(&blocks_lhs, blocks); + return 0; + } else if (err != -ENOSPC) { + drm_buddy_free_list_internal(mm, blocks); + return err; + } + /* Free blocks for the next iteration */ + drm_buddy_free_list_internal(mm, blocks); + + iter = rb_prev(iter); + } + } + + return -ENOSPC; +} + +/** + * drm_buddy_block_trim - free unused pages + * + * @mm: DRM buddy manager + * @start: start address to begin the trimming. + * @new_size: original size requested + * @blocks: Input and output list of allocated blocks. + * MUST contain single block as input to be trimmed. + * On success will contain the newly allocated blocks + * making up the @new_size. Blocks always appear in + * ascending order + * + * For contiguous allocation, we round up the size to the nearest + * power of two value, drivers consume *actual* size, so remaining + * portions are unused and can be optionally freed with this function + * + * Returns: + * 0 on success, error code on failure. 
+ */ +int drm_buddy_block_trim(struct drm_buddy *mm, + u64 *start, + u64 new_size, + struct list_head *blocks) +{ + struct drm_buddy_block *parent; + struct drm_buddy_block *block; + u64 block_start, block_end; + LIST_HEAD(dfs); + u64 new_start; + int err; + + if (!list_is_singular(blocks)) + return -EINVAL; + + block = list_first_entry(blocks, + struct drm_buddy_block, + link); + + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + + if (WARN_ON(!drm_buddy_block_is_allocated(block))) + return -EINVAL; + + if (new_size > drm_buddy_block_size(mm, block)) + return -EINVAL; + + if (!new_size || !IS_ALIGNED(new_size, mm->chunk_size)) + return -EINVAL; + + if (new_size == drm_buddy_block_size(mm, block)) + return 0; + + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + + list_del(&block->link); + mark_free(mm, block); + mm->avail += drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail += drm_buddy_block_size(mm, block); + + /* Prevent recursively freeing this node */ + parent = block->parent; + block->parent = NULL; + + list_add(&block->tmp_link, &dfs); + err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL); + if (err) { + mark_allocated(mm, block); + mm->avail -= drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail -= drm_buddy_block_size(mm, block); + list_add(&block->link, blocks); + } + + block->parent = parent; + return err; +} +EXPORT_SYMBOL(drm_buddy_block_trim); + +static struct drm_buddy_block * +__drm_buddy_alloc_blocks(struct drm_buddy *mm, + u64 start, u64 end, + unsigned int order, + unsigned long flags) +{ + if (flags & DRM_BUDDY_RANGE_ALLOCATION) + /* Allocate traversing within the range */ + return 
__drm_buddy_alloc_range_bias(mm, start, end, + order, flags); + else + /* Allocate from freetree */ + return alloc_from_freetree(mm, order, flags); +} + +/** + * drm_buddy_alloc_blocks - allocate power-of-two blocks + * + * @mm: DRM buddy manager to allocate from + * @start: start of the allowed range for this block + * @end: end of the allowed range for this block + * @size: size of the allocation in bytes + * @min_block_size: alignment of the allocation + * @blocks: output list head to add allocated blocks + * @flags: DRM_BUDDY_*_ALLOCATION flags + * + * alloc_range_bias() called on range limitations, which traverses + * the tree and returns the desired block. + * + * alloc_from_freetree() called when *no* range restrictions + * are enforced, which picks the block from the freetree. + * + * Returns: + * 0 on success, error code on failure. + */ +int drm_buddy_alloc_blocks(struct drm_buddy *mm, + u64 start, u64 end, u64 size, + u64 min_block_size, + struct list_head *blocks, + unsigned long flags) +{ + struct drm_buddy_block *block = NULL; + u64 original_size, original_min_size; + unsigned int min_order, order; + LIST_HEAD(allocated); + unsigned long pages; + int err; + + if (size < mm->chunk_size) + return -EINVAL; + + if (min_block_size < mm->chunk_size) + return -EINVAL; + + if (!is_power_of_2(min_block_size)) + return -EINVAL; + + if (!IS_ALIGNED(start | end | size, mm->chunk_size)) + return -EINVAL; + + if (end > mm->size) + return -EINVAL; + + if (range_overflows(start, size, mm->size)) + return -EINVAL; + + /* Actual range allocation */ + if (start + size == end) { + if (!IS_ALIGNED(start | end, min_block_size)) + return -EINVAL; + + return __drm_buddy_alloc_range(mm, start, size, NULL, blocks); + } + + original_size = size; + original_min_size = min_block_size; + + /* Roundup the size to power of 2 */ + if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) { + size = roundup_pow_of_two(size); + min_block_size = size; + /* Align size value to min_block_size */ + } 
else if (!IS_ALIGNED(size, min_block_size)) { + size = round_up(size, min_block_size); + } + + pages = size >> ilog2(mm->chunk_size); + order = fls(pages) - 1; + min_order = ilog2(min_block_size) - ilog2(mm->chunk_size); + + if (order > mm->max_order || size > mm->size) { + if ((flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) && + !(flags & DRM_BUDDY_RANGE_ALLOCATION)) + return __alloc_contig_try_harder(mm, original_size, + original_min_size, blocks); + + return -EINVAL; + } + + do { + order = min(order, (unsigned int)fls(pages) - 1); + BUG_ON(order > mm->max_order); + BUG_ON(order < min_order); + + do { + block = __drm_buddy_alloc_blocks(mm, start, + end, + order, + flags); + if (!IS_ERR(block)) + break; + + if (order-- == min_order) { + /* Try allocation through force merge method */ + if (mm->clear_avail && + !__force_merge(mm, start, end, min_order)) { + block = __drm_buddy_alloc_blocks(mm, start, + end, + min_order, + flags); + if (!IS_ERR(block)) { + order = min_order; + break; + } + } + + /* + * Try contiguous block allocation through + * try harder method. 
+ */ + if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION && + !(flags & DRM_BUDDY_RANGE_ALLOCATION)) + return __alloc_contig_try_harder(mm, + original_size, + original_min_size, + blocks); + err = -ENOSPC; + goto err_free; + } + } while (1); + + mark_allocated(mm, block); + mm->avail -= drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail -= drm_buddy_block_size(mm, block); + kmemleak_update_trace(block); + list_add_tail(&block->link, &allocated); + + pages -= BIT(order); + + if (!pages) + break; + } while (1); + + /* Trim the allocated block to the required size */ + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { + struct list_head *trim_list; + LIST_HEAD(temp); + u64 trim_size; + + trim_list = &allocated; + trim_size = original_size; + + if (!list_is_singular(&allocated)) { + block = list_last_entry(&allocated, typeof(*block), link); + list_move(&block->link, &temp); + trim_list = &temp; + trim_size = drm_buddy_block_size(mm, block) - + (size - original_size); + } + + drm_buddy_block_trim(mm, + NULL, + trim_size, + trim_list); + + if (!list_empty(&temp)) + list_splice_tail(trim_list, &allocated); + } + + list_splice_tail(&allocated, blocks); + return 0; + +err_free: + drm_buddy_free_list_internal(mm, &allocated); + return err; +} +EXPORT_SYMBOL(drm_buddy_alloc_blocks); + +/** + * drm_buddy_block_print - print block information + * + * @mm: DRM buddy manager + * @block: DRM buddy block + * @p: DRM printer to use + */ +void drm_buddy_block_print(struct drm_buddy *mm, + struct drm_buddy_block *block, + struct drm_printer *p) +{ + u64 start = drm_buddy_block_offset(block); + u64 size = drm_buddy_block_size(mm, block); + + drm_printf(p, "%#018llx-%#018llx: %llu\n", start, start + size, size); +} +EXPORT_SYMBOL(drm_buddy_block_print); + +/** + * drm_buddy_print - print allocator state + * + * @mm: DRM buddy manager + * @p: DRM printer to use + */ +void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p) +{ + 
int order; + + drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n", + mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20); + + for (order = mm->max_order; order >= 0; order--) { + struct drm_buddy_block *block, *tmp; + struct rb_root *root; + u64 count = 0, free; + unsigned int tree; + + for_each_free_tree(tree) { + root = &mm->free_trees[tree][order]; + + rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { + BUG_ON(!drm_buddy_block_is_free(block)); + count++; + } + } + + drm_printf(p, "order-%2d ", order); + + free = count * (mm->chunk_size << order); + if (free < SZ_1M) + drm_printf(p, "free: %8llu KiB", free >> 10); + else + drm_printf(p, "free: %8llu MiB", free >> 20); + + drm_printf(p, ", blocks: %llu\n", count); + } +} +EXPORT_SYMBOL(drm_buddy_print); + +static void drm_buddy_module_exit(void) +{ + kmem_cache_destroy(slab_blocks); +} + +static int __init drm_buddy_module_init(void) +{ + slab_blocks = KMEM_CACHE(drm_buddy_block, 0); + if (!slab_blocks) + return -ENOMEM; + + return 0; +} + +module_init(drm_buddy_module_init); +module_exit(drm_buddy_module_exit); + +MODULE_DESCRIPTION("DRM Buddy Allocator"); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 5888eb147ed1..862ff4000969 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -269,10 +269,6 @@ config DRM_SCHED config DRM_PANEL_BACKLIGHT_QUIRKS tristate -config DRM_LIB_RANDOM - bool - default n - config DRM_PRIVACY_SCREEN bool default n diff --git a/drivers/gpu/drm/Kconfig.debug b/drivers/gpu/drm/Kconfig.debug index 05dc43c0b8c5..3b7886865335 100644 --- a/drivers/gpu/drm/Kconfig.debug +++ b/drivers/gpu/drm/Kconfig.debug @@ -69,7 +69,6 @@ config DRM_KUNIT_TEST select DRM_EXPORT_FOR_TESTS if m select DRM_GEM_SHMEM_HELPER select DRM_KUNIT_TEST_HELPERS - select DRM_LIB_RANDOM select DRM_SYSFB_HELPER select PRIME_NUMBERS default KUNIT_ALL_TESTS diff --git 
a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile index 75840ec4d782..892859cfe95f 100644 --- a/drivers/gpu/drm/Makefile +++ b/drivers/gpu/drm/Makefile @@ -79,7 +79,6 @@ drm-$(CONFIG_DRM_CLIENT) += \ drm_client_event.o \ drm_client_modeset.o \ drm_client_sysrq.o -drm-$(CONFIG_DRM_LIB_RANDOM) += lib/drm_random.o drm-$(CONFIG_COMPAT) += drm_ioc32.o drm-$(CONFIG_DRM_PANEL) += drm_panel.o drm-$(CONFIG_OF) += drm_of.o @@ -115,7 +114,7 @@ drm_gpusvm_helper-$(CONFIG_ZONE_DEVICE) += \ obj-$(CONFIG_DRM_GPUSVM) += drm_gpusvm_helper.o -obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o +obj-$(CONFIG_DRM_BUDDY) += ../buddy.o drm_dma_helper-y := drm_gem_dma_helper.o drm_dma_helper-$(CONFIG_DRM_FBDEV_EMULATION) += drm_fbdev_dma.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h index 5f5fd9a911c2..874779618056 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h @@ -24,7 +24,7 @@ #ifndef __AMDGPU_VRAM_MGR_H__ #define __AMDGPU_VRAM_MGR_H__ -#include +#include struct amdgpu_vram_mgr { struct ttm_resource_manager manager; diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c deleted file mode 100644 index fd34d3755f7c..000000000000 --- a/drivers/gpu/drm/drm_buddy.c +++ /dev/null @@ -1,1336 +0,0 @@ -// SPDX-License-Identifier: MIT -/* - * Copyright © 2021 Intel Corporation - */ - -#include - -#include -#include -#include -#include - -#include -#include - -enum drm_buddy_free_tree { - DRM_BUDDY_CLEAR_TREE = 0, - DRM_BUDDY_DIRTY_TREE, - DRM_BUDDY_MAX_FREE_TREES, -}; - -static struct kmem_cache *slab_blocks; - -#define for_each_free_tree(tree) \ - for ((tree) = 0; (tree) < DRM_BUDDY_MAX_FREE_TREES; (tree)++) - -static struct drm_buddy_block *drm_block_alloc(struct drm_buddy *mm, - struct drm_buddy_block *parent, - unsigned int order, - u64 offset) -{ - struct drm_buddy_block *block; - - BUG_ON(order > DRM_BUDDY_MAX_ORDER); - - block = kmem_cache_zalloc(slab_blocks, 
GFP_KERNEL); - if (!block) - return NULL; - - block->header = offset; - block->header |= order; - block->parent = parent; - - RB_CLEAR_NODE(&block->rb); - - BUG_ON(block->header & DRM_BUDDY_HEADER_UNUSED); - return block; -} - -static void drm_block_free(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - kmem_cache_free(slab_blocks, block); -} - -static enum drm_buddy_free_tree -get_block_tree(struct drm_buddy_block *block) -{ - return drm_buddy_block_is_clear(block) ? - DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; -} - -static struct drm_buddy_block * -rbtree_get_free_block(const struct rb_node *node) -{ - return node ? rb_entry(node, struct drm_buddy_block, rb) : NULL; -} - -static struct drm_buddy_block * -rbtree_last_free_block(struct rb_root *root) -{ - return rbtree_get_free_block(rb_last(root)); -} - -static bool rbtree_is_empty(struct rb_root *root) -{ - return RB_EMPTY_ROOT(root); -} - -static bool drm_buddy_block_offset_less(const struct drm_buddy_block *block, - const struct drm_buddy_block *node) -{ - return drm_buddy_block_offset(block) < drm_buddy_block_offset(node); -} - -static bool rbtree_block_offset_less(struct rb_node *block, - const struct rb_node *node) -{ - return drm_buddy_block_offset_less(rbtree_get_free_block(block), - rbtree_get_free_block(node)); -} - -static void rbtree_insert(struct drm_buddy *mm, - struct drm_buddy_block *block, - enum drm_buddy_free_tree tree) -{ - rb_add(&block->rb, - &mm->free_trees[tree][drm_buddy_block_order(block)], - rbtree_block_offset_less); -} - -static void rbtree_remove(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - unsigned int order = drm_buddy_block_order(block); - enum drm_buddy_free_tree tree; - struct rb_root *root; - - tree = get_block_tree(block); - root = &mm->free_trees[tree][order]; - - rb_erase(&block->rb, root); - RB_CLEAR_NODE(&block->rb); -} - -static void clear_reset(struct drm_buddy_block *block) -{ - block->header &= ~DRM_BUDDY_HEADER_CLEAR; -} - -static void 
mark_cleared(struct drm_buddy_block *block) -{ - block->header |= DRM_BUDDY_HEADER_CLEAR; -} - -static void mark_allocated(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_ALLOCATED; - - rbtree_remove(mm, block); -} - -static void mark_free(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - enum drm_buddy_free_tree tree; - - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_FREE; - - tree = get_block_tree(block); - rbtree_insert(mm, block, tree); -} - -static void mark_split(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_SPLIT; - - rbtree_remove(mm, block); -} - -static inline bool overlaps(u64 s1, u64 e1, u64 s2, u64 e2) -{ - return s1 <= e2 && e1 >= s2; -} - -static inline bool contains(u64 s1, u64 e1, u64 s2, u64 e2) -{ - return s1 <= s2 && e1 >= e2; -} - -static struct drm_buddy_block * -__get_buddy(struct drm_buddy_block *block) -{ - struct drm_buddy_block *parent; - - parent = block->parent; - if (!parent) - return NULL; - - if (parent->left == block) - return parent->right; - - return parent->left; -} - -static unsigned int __drm_buddy_free(struct drm_buddy *mm, - struct drm_buddy_block *block, - bool force_merge) -{ - struct drm_buddy_block *parent; - unsigned int order; - - while ((parent = block->parent)) { - struct drm_buddy_block *buddy; - - buddy = __get_buddy(block); - - if (!drm_buddy_block_is_free(buddy)) - break; - - if (!force_merge) { - /* - * Check the block and its buddy clear state and exit - * the loop if they both have the dissimilar state. 
- */ - if (drm_buddy_block_is_clear(block) != - drm_buddy_block_is_clear(buddy)) - break; - - if (drm_buddy_block_is_clear(block)) - mark_cleared(parent); - } - - rbtree_remove(mm, buddy); - if (force_merge && drm_buddy_block_is_clear(buddy)) - mm->clear_avail -= drm_buddy_block_size(mm, buddy); - - drm_block_free(mm, block); - drm_block_free(mm, buddy); - - block = parent; - } - - order = drm_buddy_block_order(block); - mark_free(mm, block); - - return order; -} - -static int __force_merge(struct drm_buddy *mm, - u64 start, - u64 end, - unsigned int min_order) -{ - unsigned int tree, order; - int i; - - if (!min_order) - return -ENOMEM; - - if (min_order > mm->max_order) - return -EINVAL; - - for_each_free_tree(tree) { - for (i = min_order - 1; i >= 0; i--) { - struct rb_node *iter = rb_last(&mm->free_trees[tree][i]); - - while (iter) { - struct drm_buddy_block *block, *buddy; - u64 block_start, block_end; - - block = rbtree_get_free_block(iter); - iter = rb_prev(iter); - - if (!block || !block->parent) - continue; - - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; - - if (!contains(start, end, block_start, block_end)) - continue; - - buddy = __get_buddy(block); - if (!drm_buddy_block_is_free(buddy)) - continue; - - WARN_ON(drm_buddy_block_is_clear(block) == - drm_buddy_block_is_clear(buddy)); - - /* - * Advance to the next node when the current node is the buddy, - * as freeing the block will also remove its buddy from the tree. 
- */ - if (iter == &buddy->rb) - iter = rb_prev(iter); - - rbtree_remove(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); - - order = __drm_buddy_free(mm, block, true); - if (order >= min_order) - return 0; - } - } - } - - return -ENOMEM; -} - -/** - * drm_buddy_init - init memory manager - * - * @mm: DRM buddy manager to initialize - * @size: size in bytes to manage - * @chunk_size: minimum page size in bytes for our allocations - * - * Initializes the memory manager and its resources. - * - * Returns: - * 0 on success, error code on failure. - */ -int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) -{ - unsigned int i, j, root_count = 0; - u64 offset = 0; - - if (size < chunk_size) - return -EINVAL; - - if (chunk_size < SZ_4K) - return -EINVAL; - - if (!is_power_of_2(chunk_size)) - return -EINVAL; - - size = round_down(size, chunk_size); - - mm->size = size; - mm->avail = size; - mm->clear_avail = 0; - mm->chunk_size = chunk_size; - mm->max_order = ilog2(size) - ilog2(chunk_size); - - BUG_ON(mm->max_order > DRM_BUDDY_MAX_ORDER); - - mm->free_trees = kmalloc_array(DRM_BUDDY_MAX_FREE_TREES, - sizeof(*mm->free_trees), - GFP_KERNEL); - if (!mm->free_trees) - return -ENOMEM; - - for_each_free_tree(i) { - mm->free_trees[i] = kmalloc_array(mm->max_order + 1, - sizeof(struct rb_root), - GFP_KERNEL); - if (!mm->free_trees[i]) - goto out_free_tree; - - for (j = 0; j <= mm->max_order; ++j) - mm->free_trees[i][j] = RB_ROOT; - } - - mm->n_roots = hweight64(size); - - mm->roots = kmalloc_array(mm->n_roots, - sizeof(struct drm_buddy_block *), - GFP_KERNEL); - if (!mm->roots) - goto out_free_tree; - - /* - * Split into power-of-two blocks, in case we are given a size that is - * not itself a power-of-two. 
- */ - do { - struct drm_buddy_block *root; - unsigned int order; - u64 root_size; - - order = ilog2(size) - ilog2(chunk_size); - root_size = chunk_size << order; - - root = drm_block_alloc(mm, NULL, order, offset); - if (!root) - goto out_free_roots; - - mark_free(mm, root); - - BUG_ON(root_count > mm->max_order); - BUG_ON(drm_buddy_block_size(mm, root) < chunk_size); - - mm->roots[root_count] = root; - - offset += root_size; - size -= root_size; - root_count++; - } while (size); - - return 0; - -out_free_roots: - while (root_count--) - drm_block_free(mm, mm->roots[root_count]); - kfree(mm->roots); -out_free_tree: - while (i--) - kfree(mm->free_trees[i]); - kfree(mm->free_trees); - return -ENOMEM; -} -EXPORT_SYMBOL(drm_buddy_init); - -/** - * drm_buddy_fini - tear down the memory manager - * - * @mm: DRM buddy manager to free - * - * Cleanup memory manager resources and the freetree - */ -void drm_buddy_fini(struct drm_buddy *mm) -{ - u64 root_size, size, start; - unsigned int order; - int i; - - size = mm->size; - - for (i = 0; i < mm->n_roots; ++i) { - order = ilog2(size) - ilog2(mm->chunk_size); - start = drm_buddy_block_offset(mm->roots[i]); - __force_merge(mm, start, start + size, order); - - if (WARN_ON(!drm_buddy_block_is_free(mm->roots[i]))) - kunit_fail_current_test("buddy_fini() root"); - - drm_block_free(mm, mm->roots[i]); - - root_size = mm->chunk_size << order; - size -= root_size; - } - - WARN_ON(mm->avail != mm->size); - - for_each_free_tree(i) - kfree(mm->free_trees[i]); - kfree(mm->free_trees); - kfree(mm->roots); -} -EXPORT_SYMBOL(drm_buddy_fini); - -static int split_block(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - unsigned int block_order = drm_buddy_block_order(block) - 1; - u64 offset = drm_buddy_block_offset(block); - - BUG_ON(!drm_buddy_block_is_free(block)); - BUG_ON(!drm_buddy_block_order(block)); - - block->left = drm_block_alloc(mm, block, block_order, offset); - if (!block->left) - return -ENOMEM; - - block->right = 
drm_block_alloc(mm, block, block_order, - offset + (mm->chunk_size << block_order)); - if (!block->right) { - drm_block_free(mm, block->left); - return -ENOMEM; - } - - mark_split(mm, block); - - if (drm_buddy_block_is_clear(block)) { - mark_cleared(block->left); - mark_cleared(block->right); - clear_reset(block); - } - - mark_free(mm, block->left); - mark_free(mm, block->right); - - return 0; -} - -/** - * drm_get_buddy - get buddy address - * - * @block: DRM buddy block - * - * Returns the corresponding buddy block for @block, or NULL - * if this is a root block and can't be merged further. - * Requires some kind of locking to protect against - * any concurrent allocate and free operations. - */ -struct drm_buddy_block * -drm_get_buddy(struct drm_buddy_block *block) -{ - return __get_buddy(block); -} -EXPORT_SYMBOL(drm_get_buddy); - -/** - * drm_buddy_reset_clear - reset blocks clear state - * - * @mm: DRM buddy manager - * @is_clear: blocks clear state - * - * Reset the clear state based on @is_clear value for each block - * in the freetree. - */ -void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear) -{ - enum drm_buddy_free_tree src_tree, dst_tree; - u64 root_size, size, start; - unsigned int order; - int i; - - size = mm->size; - for (i = 0; i < mm->n_roots; ++i) { - order = ilog2(size) - ilog2(mm->chunk_size); - start = drm_buddy_block_offset(mm->roots[i]); - __force_merge(mm, start, start + size, order); - - root_size = mm->chunk_size << order; - size -= root_size; - } - - src_tree = is_clear ? DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; - dst_tree = is_clear ? 
DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; - - for (i = 0; i <= mm->max_order; ++i) { - struct rb_root *root = &mm->free_trees[src_tree][i]; - struct drm_buddy_block *block, *tmp; - - rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { - rbtree_remove(mm, block); - if (is_clear) { - mark_cleared(block); - mm->clear_avail += drm_buddy_block_size(mm, block); - } else { - clear_reset(block); - mm->clear_avail -= drm_buddy_block_size(mm, block); - } - - rbtree_insert(mm, block, dst_tree); - } - } -} -EXPORT_SYMBOL(drm_buddy_reset_clear); - -/** - * drm_buddy_free_block - free a block - * - * @mm: DRM buddy manager - * @block: block to be freed - */ -void drm_buddy_free_block(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - BUG_ON(!drm_buddy_block_is_allocated(block)); - mm->avail += drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail += drm_buddy_block_size(mm, block); - - __drm_buddy_free(mm, block, false); -} -EXPORT_SYMBOL(drm_buddy_free_block); - -static void __drm_buddy_free_list(struct drm_buddy *mm, - struct list_head *objects, - bool mark_clear, - bool mark_dirty) -{ - struct drm_buddy_block *block, *on; - - WARN_ON(mark_dirty && mark_clear); - - list_for_each_entry_safe(block, on, objects, link) { - if (mark_clear) - mark_cleared(block); - else if (mark_dirty) - clear_reset(block); - drm_buddy_free_block(mm, block); - cond_resched(); - } - INIT_LIST_HEAD(objects); -} - -static void drm_buddy_free_list_internal(struct drm_buddy *mm, - struct list_head *objects) -{ - /* - * Don't touch the clear/dirty bit, since allocation is still internal - * at this point. For example we might have just failed part of the - * allocation. 
- */ - __drm_buddy_free_list(mm, objects, false, false); -} - -/** - * drm_buddy_free_list - free blocks - * - * @mm: DRM buddy manager - * @objects: input list head to free blocks - * @flags: optional flags like DRM_BUDDY_CLEARED - */ -void drm_buddy_free_list(struct drm_buddy *mm, - struct list_head *objects, - unsigned int flags) -{ - bool mark_clear = flags & DRM_BUDDY_CLEARED; - - __drm_buddy_free_list(mm, objects, mark_clear, !mark_clear); -} -EXPORT_SYMBOL(drm_buddy_free_list); - -static bool block_incompatible(struct drm_buddy_block *block, unsigned int flags) -{ - bool needs_clear = flags & DRM_BUDDY_CLEAR_ALLOCATION; - - return needs_clear != drm_buddy_block_is_clear(block); -} - -static struct drm_buddy_block * -__alloc_range_bias(struct drm_buddy *mm, - u64 start, u64 end, - unsigned int order, - unsigned long flags, - bool fallback) -{ - u64 req_size = mm->chunk_size << order; - struct drm_buddy_block *block; - struct drm_buddy_block *buddy; - LIST_HEAD(dfs); - int err; - int i; - - end = end - 1; - - for (i = 0; i < mm->n_roots; ++i) - list_add_tail(&mm->roots[i]->tmp_link, &dfs); - - do { - u64 block_start; - u64 block_end; - - block = list_first_entry_or_null(&dfs, - struct drm_buddy_block, - tmp_link); - if (!block) - break; - - list_del(&block->tmp_link); - - if (drm_buddy_block_order(block) < order) - continue; - - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; - - if (!overlaps(start, end, block_start, block_end)) - continue; - - if (drm_buddy_block_is_allocated(block)) - continue; - - if (block_start < start || block_end > end) { - u64 adjusted_start = max(block_start, start); - u64 adjusted_end = min(block_end, end); - - if (round_down(adjusted_end + 1, req_size) <= - round_up(adjusted_start, req_size)) - continue; - } - - if (!fallback && block_incompatible(block, flags)) - continue; - - if (contains(start, end, block_start, block_end) && - order == drm_buddy_block_order(block)) { 
- /* - * Find the free block within the range. - */ - if (drm_buddy_block_is_free(block)) - return block; - - continue; - } - - if (!drm_buddy_block_is_split(block)) { - err = split_block(mm, block); - if (unlikely(err)) - goto err_undo; - } - - list_add(&block->right->tmp_link, &dfs); - list_add(&block->left->tmp_link, &dfs); - } while (1); - - return ERR_PTR(-ENOSPC); - -err_undo: - /* - * We really don't want to leave around a bunch of split blocks, since - * bigger is better, so make sure we merge everything back before we - * free the allocated blocks. - */ - buddy = __get_buddy(block); - if (buddy && - (drm_buddy_block_is_free(block) && - drm_buddy_block_is_free(buddy))) - __drm_buddy_free(mm, block, false); - return ERR_PTR(err); -} - -static struct drm_buddy_block * -__drm_buddy_alloc_range_bias(struct drm_buddy *mm, - u64 start, u64 end, - unsigned int order, - unsigned long flags) -{ - struct drm_buddy_block *block; - bool fallback = false; - - block = __alloc_range_bias(mm, start, end, order, - flags, fallback); - if (IS_ERR(block)) - return __alloc_range_bias(mm, start, end, order, - flags, !fallback); - - return block; -} - -static struct drm_buddy_block * -get_maxblock(struct drm_buddy *mm, - unsigned int order, - enum drm_buddy_free_tree tree) -{ - struct drm_buddy_block *max_block = NULL, *block = NULL; - struct rb_root *root; - unsigned int i; - - for (i = order; i <= mm->max_order; ++i) { - root = &mm->free_trees[tree][i]; - block = rbtree_last_free_block(root); - if (!block) - continue; - - if (!max_block) { - max_block = block; - continue; - } - - if (drm_buddy_block_offset(block) > - drm_buddy_block_offset(max_block)) { - max_block = block; - } - } - - return max_block; -} - -static struct drm_buddy_block * -alloc_from_freetree(struct drm_buddy *mm, - unsigned int order, - unsigned long flags) -{ - struct drm_buddy_block *block = NULL; - struct rb_root *root; - enum drm_buddy_free_tree tree; - unsigned int tmp; - int err; - - tree = (flags & 
DRM_BUDDY_CLEAR_ALLOCATION) ? - DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; - - if (flags & DRM_BUDDY_TOPDOWN_ALLOCATION) { - block = get_maxblock(mm, order, tree); - if (block) - /* Store the obtained block order */ - tmp = drm_buddy_block_order(block); - } else { - for (tmp = order; tmp <= mm->max_order; ++tmp) { - /* Get RB tree root for this order and tree */ - root = &mm->free_trees[tree][tmp]; - block = rbtree_last_free_block(root); - if (block) - break; - } - } - - if (!block) { - /* Try allocating from the other tree */ - tree = (tree == DRM_BUDDY_CLEAR_TREE) ? - DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; - - for (tmp = order; tmp <= mm->max_order; ++tmp) { - root = &mm->free_trees[tree][tmp]; - block = rbtree_last_free_block(root); - if (block) - break; - } - - if (!block) - return ERR_PTR(-ENOSPC); - } - - BUG_ON(!drm_buddy_block_is_free(block)); - - while (tmp != order) { - err = split_block(mm, block); - if (unlikely(err)) - goto err_undo; - - block = block->right; - tmp--; - } - return block; - -err_undo: - if (tmp != order) - __drm_buddy_free(mm, block, false); - return ERR_PTR(err); -} - -static int __alloc_range(struct drm_buddy *mm, - struct list_head *dfs, - u64 start, u64 size, - struct list_head *blocks, - u64 *total_allocated_on_err) -{ - struct drm_buddy_block *block; - struct drm_buddy_block *buddy; - u64 total_allocated = 0; - LIST_HEAD(allocated); - u64 end; - int err; - - end = start + size - 1; - - do { - u64 block_start; - u64 block_end; - - block = list_first_entry_or_null(dfs, - struct drm_buddy_block, - tmp_link); - if (!block) - break; - - list_del(&block->tmp_link); - - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; - - if (!overlaps(start, end, block_start, block_end)) - continue; - - if (drm_buddy_block_is_allocated(block)) { - err = -ENOSPC; - goto err_free; - } - - if (contains(start, end, block_start, block_end)) { - if (drm_buddy_block_is_free(block)) { - 
mark_allocated(mm, block); - total_allocated += drm_buddy_block_size(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); - list_add_tail(&block->link, &allocated); - continue; - } else if (!mm->clear_avail) { - err = -ENOSPC; - goto err_free; - } - } - - if (!drm_buddy_block_is_split(block)) { - err = split_block(mm, block); - if (unlikely(err)) - goto err_undo; - } - - list_add(&block->right->tmp_link, dfs); - list_add(&block->left->tmp_link, dfs); - } while (1); - - if (total_allocated < size) { - err = -ENOSPC; - goto err_free; - } - - list_splice_tail(&allocated, blocks); - - return 0; - -err_undo: - /* - * We really don't want to leave around a bunch of split blocks, since - * bigger is better, so make sure we merge everything back before we - * free the allocated blocks. - */ - buddy = __get_buddy(block); - if (buddy && - (drm_buddy_block_is_free(block) && - drm_buddy_block_is_free(buddy))) - __drm_buddy_free(mm, block, false); - -err_free: - if (err == -ENOSPC && total_allocated_on_err) { - list_splice_tail(&allocated, blocks); - *total_allocated_on_err = total_allocated; - } else { - drm_buddy_free_list_internal(mm, &allocated); - } - - return err; -} - -static int __drm_buddy_alloc_range(struct drm_buddy *mm, - u64 start, - u64 size, - u64 *total_allocated_on_err, - struct list_head *blocks) -{ - LIST_HEAD(dfs); - int i; - - for (i = 0; i < mm->n_roots; ++i) - list_add_tail(&mm->roots[i]->tmp_link, &dfs); - - return __alloc_range(mm, &dfs, start, size, - blocks, total_allocated_on_err); -} - -static int __alloc_contig_try_harder(struct drm_buddy *mm, - u64 size, - u64 min_block_size, - struct list_head *blocks) -{ - u64 rhs_offset, lhs_offset, lhs_size, filled; - struct drm_buddy_block *block; - unsigned int tree, order; - LIST_HEAD(blocks_lhs); - unsigned long pages; - u64 modify_size; - int err; - - modify_size = rounddown_pow_of_two(size); - pages = 
modify_size >> ilog2(mm->chunk_size); - order = fls(pages) - 1; - if (order == 0) - return -ENOSPC; - - for_each_free_tree(tree) { - struct rb_root *root; - struct rb_node *iter; - - root = &mm->free_trees[tree][order]; - if (rbtree_is_empty(root)) - continue; - - iter = rb_last(root); - while (iter) { - block = rbtree_get_free_block(iter); - - /* Allocate blocks traversing RHS */ - rhs_offset = drm_buddy_block_offset(block); - err = __drm_buddy_alloc_range(mm, rhs_offset, size, - &filled, blocks); - if (!err || err != -ENOSPC) - return err; - - lhs_size = max((size - filled), min_block_size); - if (!IS_ALIGNED(lhs_size, min_block_size)) - lhs_size = round_up(lhs_size, min_block_size); - - /* Allocate blocks traversing LHS */ - lhs_offset = drm_buddy_block_offset(block) - lhs_size; - err = __drm_buddy_alloc_range(mm, lhs_offset, lhs_size, - NULL, &blocks_lhs); - if (!err) { - list_splice(&blocks_lhs, blocks); - return 0; - } else if (err != -ENOSPC) { - drm_buddy_free_list_internal(mm, blocks); - return err; - } - /* Free blocks for the next iteration */ - drm_buddy_free_list_internal(mm, blocks); - - iter = rb_prev(iter); - } - } - - return -ENOSPC; -} - -/** - * drm_buddy_block_trim - free unused pages - * - * @mm: DRM buddy manager - * @start: start address to begin the trimming. - * @new_size: original size requested - * @blocks: Input and output list of allocated blocks. - * MUST contain single block as input to be trimmed. - * On success will contain the newly allocated blocks - * making up the @new_size. Blocks always appear in - * ascending order - * - * For contiguous allocation, we round up the size to the nearest - * power of two value, drivers consume *actual* size, so remaining - * portions are unused and can be optionally freed with this function - * - * Returns: - * 0 on success, error code on failure. 
- */ -int drm_buddy_block_trim(struct drm_buddy *mm, - u64 *start, - u64 new_size, - struct list_head *blocks) -{ - struct drm_buddy_block *parent; - struct drm_buddy_block *block; - u64 block_start, block_end; - LIST_HEAD(dfs); - u64 new_start; - int err; - - if (!list_is_singular(blocks)) - return -EINVAL; - - block = list_first_entry(blocks, - struct drm_buddy_block, - link); - - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block); - - if (WARN_ON(!drm_buddy_block_is_allocated(block))) - return -EINVAL; - - if (new_size > drm_buddy_block_size(mm, block)) - return -EINVAL; - - if (!new_size || !IS_ALIGNED(new_size, mm->chunk_size)) - return -EINVAL; - - if (new_size == drm_buddy_block_size(mm, block)) - return 0; - - new_start = block_start; - if (start) { - new_start = *start; - - if (new_start < block_start) - return -EINVAL; - - if (!IS_ALIGNED(new_start, mm->chunk_size)) - return -EINVAL; - - if (range_overflows(new_start, new_size, block_end)) - return -EINVAL; - } - - list_del(&block->link); - mark_free(mm, block); - mm->avail += drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail += drm_buddy_block_size(mm, block); - - /* Prevent recursively freeing this node */ - parent = block->parent; - block->parent = NULL; - - list_add(&block->tmp_link, &dfs); - err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL); - if (err) { - mark_allocated(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); - list_add(&block->link, blocks); - } - - block->parent = parent; - return err; -} -EXPORT_SYMBOL(drm_buddy_block_trim); - -static struct drm_buddy_block * -__drm_buddy_alloc_blocks(struct drm_buddy *mm, - u64 start, u64 end, - unsigned int order, - unsigned long flags) -{ - if (flags & DRM_BUDDY_RANGE_ALLOCATION) - /* Allocate traversing within the range */ - return 
__drm_buddy_alloc_range_bias(mm, start, end, - order, flags); - else - /* Allocate from freetree */ - return alloc_from_freetree(mm, order, flags); -} - -/** - * drm_buddy_alloc_blocks - allocate power-of-two blocks - * - * @mm: DRM buddy manager to allocate from - * @start: start of the allowed range for this block - * @end: end of the allowed range for this block - * @size: size of the allocation in bytes - * @min_block_size: alignment of the allocation - * @blocks: output list head to add allocated blocks - * @flags: DRM_BUDDY_*_ALLOCATION flags - * - * alloc_range_bias() called on range limitations, which traverses - * the tree and returns the desired block. - * - * alloc_from_freetree() called when *no* range restrictions - * are enforced, which picks the block from the freetree. - * - * Returns: - * 0 on success, error code on failure. - */ -int drm_buddy_alloc_blocks(struct drm_buddy *mm, - u64 start, u64 end, u64 size, - u64 min_block_size, - struct list_head *blocks, - unsigned long flags) -{ - struct drm_buddy_block *block = NULL; - u64 original_size, original_min_size; - unsigned int min_order, order; - LIST_HEAD(allocated); - unsigned long pages; - int err; - - if (size < mm->chunk_size) - return -EINVAL; - - if (min_block_size < mm->chunk_size) - return -EINVAL; - - if (!is_power_of_2(min_block_size)) - return -EINVAL; - - if (!IS_ALIGNED(start | end | size, mm->chunk_size)) - return -EINVAL; - - if (end > mm->size) - return -EINVAL; - - if (range_overflows(start, size, mm->size)) - return -EINVAL; - - /* Actual range allocation */ - if (start + size == end) { - if (!IS_ALIGNED(start | end, min_block_size)) - return -EINVAL; - - return __drm_buddy_alloc_range(mm, start, size, NULL, blocks); - } - - original_size = size; - original_min_size = min_block_size; - - /* Roundup the size to power of 2 */ - if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) { - size = roundup_pow_of_two(size); - min_block_size = size; - /* Align size value to min_block_size */ - } 
else if (!IS_ALIGNED(size, min_block_size)) { - size = round_up(size, min_block_size); - } - - pages = size >> ilog2(mm->chunk_size); - order = fls(pages) - 1; - min_order = ilog2(min_block_size) - ilog2(mm->chunk_size); - - if (order > mm->max_order || size > mm->size) { - if ((flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) && - !(flags & DRM_BUDDY_RANGE_ALLOCATION)) - return __alloc_contig_try_harder(mm, original_size, - original_min_size, blocks); - - return -EINVAL; - } - - do { - order = min(order, (unsigned int)fls(pages) - 1); - BUG_ON(order > mm->max_order); - BUG_ON(order < min_order); - - do { - block = __drm_buddy_alloc_blocks(mm, start, - end, - order, - flags); - if (!IS_ERR(block)) - break; - - if (order-- == min_order) { - /* Try allocation through force merge method */ - if (mm->clear_avail && - !__force_merge(mm, start, end, min_order)) { - block = __drm_buddy_alloc_blocks(mm, start, - end, - min_order, - flags); - if (!IS_ERR(block)) { - order = min_order; - break; - } - } - - /* - * Try contiguous block allocation through - * try harder method. 
- */ - if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION && - !(flags & DRM_BUDDY_RANGE_ALLOCATION)) - return __alloc_contig_try_harder(mm, - original_size, - original_min_size, - blocks); - err = -ENOSPC; - goto err_free; - } - } while (1); - - mark_allocated(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); - kmemleak_update_trace(block); - list_add_tail(&block->link, &allocated); - - pages -= BIT(order); - - if (!pages) - break; - } while (1); - - /* Trim the allocated block to the required size */ - if (!(flags & DRM_BUDDY_TRIM_DISABLE) && - original_size != size) { - struct list_head *trim_list; - LIST_HEAD(temp); - u64 trim_size; - - trim_list = &allocated; - trim_size = original_size; - - if (!list_is_singular(&allocated)) { - block = list_last_entry(&allocated, typeof(*block), link); - list_move(&block->link, &temp); - trim_list = &temp; - trim_size = drm_buddy_block_size(mm, block) - - (size - original_size); - } - - drm_buddy_block_trim(mm, - NULL, - trim_size, - trim_list); - - if (!list_empty(&temp)) - list_splice_tail(trim_list, &allocated); - } - - list_splice_tail(&allocated, blocks); - return 0; - -err_free: - drm_buddy_free_list_internal(mm, &allocated); - return err; -} -EXPORT_SYMBOL(drm_buddy_alloc_blocks); - -/** - * drm_buddy_block_print - print block information - * - * @mm: DRM buddy manager - * @block: DRM buddy block - * @p: DRM printer to use - */ -void drm_buddy_block_print(struct drm_buddy *mm, - struct drm_buddy_block *block, - struct drm_printer *p) -{ - u64 start = drm_buddy_block_offset(block); - u64 size = drm_buddy_block_size(mm, block); - - drm_printf(p, "%#018llx-%#018llx: %llu\n", start, start + size, size); -} -EXPORT_SYMBOL(drm_buddy_block_print); - -/** - * drm_buddy_print - print allocator state - * - * @mm: DRM buddy manager - * @p: DRM printer to use - */ -void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p) -{ - 
int order; - - drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n", - mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20); - - for (order = mm->max_order; order >= 0; order--) { - struct drm_buddy_block *block, *tmp; - struct rb_root *root; - u64 count = 0, free; - unsigned int tree; - - for_each_free_tree(tree) { - root = &mm->free_trees[tree][order]; - - rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { - BUG_ON(!drm_buddy_block_is_free(block)); - count++; - } - } - - drm_printf(p, "order-%2d ", order); - - free = count * (mm->chunk_size << order); - if (free < SZ_1M) - drm_printf(p, "free: %8llu KiB", free >> 10); - else - drm_printf(p, "free: %8llu MiB", free >> 20); - - drm_printf(p, ", blocks: %llu\n", count); - } -} -EXPORT_SYMBOL(drm_buddy_print); - -static void drm_buddy_module_exit(void) -{ - kmem_cache_destroy(slab_blocks); -} - -static int __init drm_buddy_module_init(void) -{ - slab_blocks = KMEM_CACHE(drm_buddy_block, 0); - if (!slab_blocks) - return -ENOMEM; - - return 0; -} - -module_init(drm_buddy_module_init); -module_exit(drm_buddy_module_exit); - -MODULE_DESCRIPTION("DRM Buddy Allocator"); -MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index f65fe86c02b5..eeda5daa544f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -5,7 +5,7 @@ #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c index 4d830740946d..30246f02bcfe 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.c +++ b/drivers/gpu/drm/i915/i915_scatterlist.c @@ -7,7 +7,7 @@ #include "i915_scatterlist.h" #include "i915_ttm_buddy_manager.h" -#include +#include #include #include diff --git a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c index 
d5c6e6605086..6b256d95badd 100644 --- a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c +++ b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c @@ -5,7 +5,7 @@ #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/lib/drm_random.c b/drivers/gpu/drm/lib/drm_random.c deleted file mode 100644 index 0e9dba1ef4af..000000000000 --- a/drivers/gpu/drm/lib/drm_random.c +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include - -#include "drm_random.h" - -u32 drm_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state) -{ - return upper_32_bits((u64)prandom_u32_state(state) * ep_ro); -} -EXPORT_SYMBOL(drm_prandom_u32_max_state); - -void drm_random_reorder(unsigned int *order, unsigned int count, - struct rnd_state *state) -{ - unsigned int i, j; - - for (i = 0; i < count; ++i) { - BUILD_BUG_ON(sizeof(unsigned int) > sizeof(u32)); - j = drm_prandom_u32_max_state(count, state); - swap(order[i], order[j]); - } -} -EXPORT_SYMBOL(drm_random_reorder); - -unsigned int *drm_random_order(unsigned int count, struct rnd_state *state) -{ - unsigned int *order, i; - - order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); - if (!order) - return order; - - for (i = 0; i < count; i++) - order[i] = i; - - drm_random_reorder(order, count, state); - return order; -} -EXPORT_SYMBOL(drm_random_order); diff --git a/drivers/gpu/drm/lib/drm_random.h b/drivers/gpu/drm/lib/drm_random.h deleted file mode 100644 index 9f827260a89d..000000000000 --- a/drivers/gpu/drm/lib/drm_random.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __DRM_RANDOM_H__ -#define __DRM_RANDOM_H__ - -/* This is a temporary home for a couple of utility functions that should - * be transposed to lib/ at the earliest convenience. 
- */ - -#include - -#define DRM_RND_STATE_INITIALIZER(seed__) ({ \ - struct rnd_state state__; \ - prandom_seed_state(&state__, (seed__)); \ - state__; \ -}) - -#define DRM_RND_STATE(name__, seed__) \ - struct rnd_state name__ = DRM_RND_STATE_INITIALIZER(seed__) - -unsigned int *drm_random_order(unsigned int count, - struct rnd_state *state); -void drm_random_reorder(unsigned int *order, - unsigned int count, - struct rnd_state *state); -u32 drm_prandom_u32_max_state(u32 ep_ro, - struct rnd_state *state); - -#endif /* !__DRM_RANDOM_H__ */ diff --git a/drivers/gpu/drm/tests/Makefile b/drivers/gpu/drm/tests/Makefile index 87d5d5f9332a..d2e2e3d8349a 100644 --- a/drivers/gpu/drm/tests/Makefile +++ b/drivers/gpu/drm/tests/Makefile @@ -7,7 +7,6 @@ obj-$(CONFIG_DRM_KUNIT_TEST) += \ drm_atomic_test.o \ drm_atomic_state_test.o \ drm_bridge_test.o \ - drm_buddy_test.o \ drm_cmdline_parser_test.o \ drm_connector_test.o \ drm_damage_helper_test.o \ diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c deleted file mode 100644 index e6f8459c6c54..000000000000 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ /dev/null @@ -1,928 +0,0 @@ -// SPDX-License-Identifier: MIT -/* - * Copyright © 2019 Intel Corporation - * Copyright © 2022 Maíra Canal - */ - -#include - -#include -#include -#include - -#include - -#include "../lib/drm_random.h" - -static unsigned int random_seed; - -static inline u64 get_size(int order, u64 chunk_size) -{ - return (1 << order) * chunk_size; -} - -static void drm_test_buddy_fragmentation_performance(struct kunit *test) -{ - struct drm_buddy_block *block, *tmp; - int num_blocks, i, ret, count = 0; - LIST_HEAD(allocated_blocks); - unsigned long elapsed_ms; - LIST_HEAD(reverse_list); - LIST_HEAD(test_blocks); - LIST_HEAD(clear_list); - LIST_HEAD(dirty_list); - LIST_HEAD(free_list); - struct drm_buddy mm; - u64 mm_size = SZ_4G; - ktime_t start, end; - - /* - * Allocation under severe fragmentation - * - * Create severe 
fragmentation by allocating the entire 4 GiB address space - * as tiny 8 KiB blocks but forcing a 64 KiB alignment. The resulting pattern - * leaves many scattered holes. Split the allocations into two groups and - * return them with different flags to block coalescing, then repeatedly - * allocate and free 64 KiB blocks while timing the loop. This stresses how - * quickly the allocator can satisfy larger, aligned requests from a pool of - * highly fragmented space. - */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), - "buddy_init failed\n"); - - num_blocks = mm_size / SZ_64K; - - start = ktime_get(); - /* Allocate with maximum fragmentation - 8K blocks with 64K alignment */ - for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, - &allocated_blocks, 0), - "buddy_alloc hit an error size=%u\n", SZ_8K); - - list_for_each_entry_safe(block, tmp, &allocated_blocks, link) { - if (count % 4 == 0 || count % 4 == 3) - list_move_tail(&block->link, &clear_list); - else - list_move_tail(&block->link, &dirty_list); - count++; - } - - /* Free with different flags to ensure no coalescing */ - drm_buddy_free_list(&mm, &clear_list, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty_list, 0); - - for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_64K, SZ_64K, - &test_blocks, 0), - "buddy_alloc hit an error size=%u\n", SZ_64K); - drm_buddy_free_list(&mm, &test_blocks, 0); - - end = ktime_get(); - elapsed_ms = ktime_to_ms(ktime_sub(end, start)); - - kunit_info(test, "Fragmented allocation took %lu ms\n", elapsed_ms); - - drm_buddy_fini(&mm); - - /* - * Reverse free order under fragmentation - * - * Construct a fragmented 4 GiB space by allocating every 8 KiB block with - * 64 KiB alignment, creating a dense scatter of small regions. 
Half of the - * blocks are selectively freed to form sparse gaps, while the remaining - * allocations are preserved, reordered in reverse, and released back with - * the cleared flag. This models a pathological reverse-ordered free pattern - * and measures how quickly the allocator can merge and reclaim space when - * deallocation occurs in the opposite order of allocation, exposing the - * cost difference between a linear freelist scan and an ordered tree lookup. - */ - ret = drm_buddy_init(&mm, mm_size, SZ_4K); - KUNIT_ASSERT_EQ(test, ret, 0); - - start = ktime_get(); - /* Allocate maximum fragmentation */ - for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, - &allocated_blocks, 0), - "buddy_alloc hit an error size=%u\n", SZ_8K); - - list_for_each_entry_safe(block, tmp, &allocated_blocks, link) { - if (count % 2 == 0) - list_move_tail(&block->link, &free_list); - count++; - } - drm_buddy_free_list(&mm, &free_list, DRM_BUDDY_CLEARED); - - list_for_each_entry_safe_reverse(block, tmp, &allocated_blocks, link) - list_move(&block->link, &reverse_list); - drm_buddy_free_list(&mm, &reverse_list, DRM_BUDDY_CLEARED); - - end = ktime_get(); - elapsed_ms = ktime_to_ms(ktime_sub(end, start)); - - kunit_info(test, "Reverse-ordered free took %lu ms\n", elapsed_ms); - - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_range_bias(struct kunit *test) -{ - u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem; - DRM_RND_STATE(prng, random_seed); - unsigned int i, count, *order; - struct drm_buddy_block *block; - unsigned long flags; - struct drm_buddy mm; - LIST_HEAD(allocated); - - bias_size = SZ_1M; - ps = roundup_pow_of_two(prandom_u32_state(&prng) % bias_size); - ps = max(SZ_4K, ps); - mm_size = (SZ_8M-1) & ~(ps-1); /* Multiple roots */ - - kunit_info(test, "mm_size=%u, ps=%u\n", mm_size, ps); - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), - "buddy_init failed\n"); - - 
count = mm_size / bias_size; - order = drm_random_order(count, &prng); - KUNIT_EXPECT_TRUE(test, order); - - /* - * Idea is to split the address space into uniform bias ranges, and then - * in some random order allocate within each bias, using various - * patterns within. This should detect if allocations leak out from a - * given bias, for example. - */ - - for (i = 0; i < count; i++) { - LIST_HEAD(tmp); - u32 size; - - bias_start = order[i] * bias_size; - bias_end = bias_start + bias_size; - bias_rem = bias_size; - - /* internal round_up too big */ - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, bias_size + ps, bias_size, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, bias_size, bias_size); - - /* size too big */ - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, bias_size + ps, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, bias_size + ps, ps); - - /* bias range too small for size */ - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start + ps, - bias_end, bias_size, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", - bias_start + ps, bias_end, bias_size, ps); - - /* bias misaligned */ - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start + ps, - bias_end - ps, - bias_size >> 1, bias_size >> 1, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc h didn't fail with bias(%x-%x), size=%u, ps=%u\n", - bias_start + ps, bias_end - ps, bias_size >> 1, bias_size >> 1); - - /* single big page */ - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, bias_size, bias_size, - &tmp, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc i failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, bias_size, 
bias_size); - drm_buddy_free_list(&mm, &tmp, 0); - - /* single page with internal round_up */ - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, ps, bias_size, - &tmp, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, ps, bias_size); - drm_buddy_free_list(&mm, &tmp, 0); - - /* random size within */ - size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); - if (size) - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, size, ps, - &tmp, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, size, ps); - - bias_rem -= size; - /* too big for current avail */ - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, bias_rem + ps, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, bias_rem + ps, ps); - - if (bias_rem) { - /* random fill of the remainder */ - size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); - size = max(size, ps); - - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, size, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, size, ps); - /* - * Intentionally allow some space to be left - * unallocated, and ideally not always on the bias - * boundaries. - */ - drm_buddy_free_list(&mm, &tmp, 0); - } else { - list_splice_tail(&tmp, &allocated); - } - } - - kfree(order); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); - - /* - * Something more free-form. Idea is to pick a random starting bias - * range within the address space and then start filling it up. Also - * randomly grow the bias range in both directions as we go along. 
This - * should give us bias start/end which is not always uniform like above, - * and in some cases will require the allocator to jump over already - * allocated nodes in the middle of the address space. - */ - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), - "buddy_init failed\n"); - - bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); - bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps); - bias_end = max(bias_end, bias_start + ps); - bias_rem = bias_end - bias_start; - - do { - u32 size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); - - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, size, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, size, ps); - bias_rem -= size; - - /* - * Try to randomly grow the bias range in both directions, or - * only one, or perhaps don't grow at all. - */ - do { - u32 old_bias_start = bias_start; - u32 old_bias_end = bias_end; - - if (bias_start) - bias_start -= round_up(prandom_u32_state(&prng) % bias_start, ps); - if (bias_end != mm_size) - bias_end += round_up(prandom_u32_state(&prng) % (mm_size - bias_end), ps); - - bias_rem += old_bias_start - bias_start; - bias_rem += bias_end - old_bias_end; - } while (!bias_rem && (bias_start || bias_end != mm_size)); - } while (bias_rem); - - KUNIT_ASSERT_EQ(test, bias_start, 0); - KUNIT_ASSERT_EQ(test, bias_end, mm_size); - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, bias_end, - ps, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc passed with bias(%x-%x), size=%u\n", - bias_start, bias_end, ps); - - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); - - /* - * Allocate cleared blocks in the bias range when the DRM buddy's clear avail is - * zero. 
This will validate the bias range allocation in scenarios like system boot - * when no cleared blocks are available and exercise the fallback path too. The resulting - * blocks should always be dirty. - */ - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), - "buddy_init failed\n"); - - bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); - bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps); - bias_end = max(bias_end, bias_start + ps); - bias_rem = bias_end - bias_start; - - flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION; - size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); - - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, size, ps, - &allocated, - flags), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, size, ps); - - list_for_each_entry(block, &allocated, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); - - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_clear(struct kunit *test) -{ - unsigned long n_pages, total, i = 0; - const unsigned long ps = SZ_4K; - struct drm_buddy_block *block; - const int max_order = 12; - LIST_HEAD(allocated); - struct drm_buddy mm; - unsigned int order; - u32 mm_size, size; - LIST_HEAD(dirty); - LIST_HEAD(clean); - - mm_size = SZ_4K << max_order; - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); - - KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - - /* - * Idea is to allocate and free some random portion of the address space, - * returning those pages as non-dirty and randomly alternate between - * requesting dirty and non-dirty pages (not going over the limit - * we freed as non-dirty), putting that into two separate lists. - * Loop over both lists at the end checking that the dirty list - * is indeed all dirty pages and vice versa. 
Free it all again, - * keeping the dirty/clear status. - */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 5 * ps, ps, &allocated, - DRM_BUDDY_TOPDOWN_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 5 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - - n_pages = 10; - do { - unsigned long flags; - struct list_head *list; - int slot = i % 2; - - if (slot == 0) { - list = &dirty; - flags = 0; - } else { - list = &clean; - flags = DRM_BUDDY_CLEAR_ALLOCATION; - } - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - ps, ps, list, - flags), - "buddy_alloc hit an error size=%lu\n", ps); - } while (++i < n_pages); - - list_for_each_entry(block, &clean, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true); - - list_for_each_entry(block, &dirty, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); - - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); - - /* - * Trying to go over the clear limit for some allocation. - * The allocation should never fail with reasonable page-size. - */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 10 * ps, ps, &clean, - DRM_BUDDY_CLEAR_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 10 * ps); - - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty, 0); - drm_buddy_fini(&mm); - - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); - - /* - * Create a new mm. Intentionally fragment the address space by creating - * two alternating lists. Free both lists, one as dirty the other as clean. - * Try to allocate double the previous size with matching min_page_size. The - * allocation should never fail as it calls the force_merge. Also check that - * the page is always dirty after force_merge. Free the page as dirty, then - * repeat the whole thing, increment the order until we hit the max_order. 
- */ - - i = 0; - n_pages = mm_size / ps; - do { - struct list_head *list; - int slot = i % 2; - - if (slot == 0) - list = &dirty; - else - list = &clean; - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - ps, ps, list, 0), - "buddy_alloc hit an error size=%lu\n", ps); - } while (++i < n_pages); - - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty, 0); - - order = 1; - do { - size = SZ_4K << order; - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - size, size, &allocated, - DRM_BUDDY_CLEAR_ALLOCATION), - "buddy_alloc hit an error size=%u\n", size); - total = 0; - list_for_each_entry(block, &allocated, link) { - if (size != mm_size) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); - total += drm_buddy_block_size(&mm, block); - } - KUNIT_EXPECT_EQ(test, total, size); - - drm_buddy_free_list(&mm, &allocated, 0); - } while (++order <= max_order); - - drm_buddy_fini(&mm); - - /* - * Create a new mm with a non power-of-two size. Allocate a random size from each - * root, free as cleared and then call fini. This will ensure the multi-root - * force merge during fini. 
- */ - mm_size = (SZ_4K << max_order) + (SZ_4K << (max_order - 2)); - - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); - KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, - 4 * ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 4 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, - 2 * ps, ps, &allocated, - DRM_BUDDY_CLEAR_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 2 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, SZ_4K << max_order, mm_size, - ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_contiguous(struct kunit *test) -{ - const unsigned long ps = SZ_4K, mm_size = 16 * 3 * SZ_4K; - unsigned long i, n_pages, total; - struct drm_buddy_block *block; - struct drm_buddy mm; - LIST_HEAD(left); - LIST_HEAD(middle); - LIST_HEAD(right); - LIST_HEAD(allocated); - - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); - - /* - * Idea is to fragment the address space by alternating block - * allocations between three different lists; one for left, middle and - * right. We can then free a list to simulate fragmentation. In - * particular we want to exercise the DRM_BUDDY_CONTIGUOUS_ALLOCATION, - * including the try_harder path. 
- */ - - i = 0; - n_pages = mm_size / ps; - do { - struct list_head *list; - int slot = i % 3; - - if (slot == 0) - list = &left; - else if (slot == 1) - list = &middle; - else - list = &right; - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, 0, mm_size, - ps, ps, list, 0), - "buddy_alloc hit an error size=%lu\n", - ps); - } while (++i < n_pages); - - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%lu\n", 3 * ps); - - drm_buddy_free_list(&mm, &middle, 0); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%lu\n", 3 * ps); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 2 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%lu\n", 2 * ps); - - drm_buddy_free_list(&mm, &right, 0); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%lu\n", 3 * ps); - /* - * At this point we should have enough contiguous space for 2 blocks, - * however they are never buddies (since we freed middle and right) so - * will require the try_harder logic to find them. 
- */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 2 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 2 * ps); - - drm_buddy_free_list(&mm, &left, 0); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 3 * ps); - - total = 0; - list_for_each_entry(block, &allocated, link) - total += drm_buddy_block_size(&mm, block); - - KUNIT_ASSERT_EQ(test, total, ps * 2 + ps * 3); - - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_pathological(struct kunit *test) -{ - u64 mm_size, size, start = 0; - struct drm_buddy_block *block; - const int max_order = 3; - unsigned long flags = 0; - int order, top; - struct drm_buddy mm; - LIST_HEAD(blocks); - LIST_HEAD(holes); - LIST_HEAD(tmp); - - /* - * Create a pot-sized mm, then allocate one of each possible - * order within. This should leave the mm with exactly one - * page left. Free the largest block, then whittle down again. - * Eventually we will have a fully 50% fragmented mm. 
- */ - - mm_size = SZ_4K << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), - "buddy_init failed\n"); - - KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - - for (top = max_order; top; top--) { - /* Make room by freeing the largest allocated block */ - block = list_first_entry_or_null(&blocks, typeof(*block), link); - if (block) { - list_del(&block->link); - drm_buddy_free_block(&mm, block); - } - - for (order = top; order--;) { - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, - mm_size, size, size, - &tmp, flags), - "buddy_alloc hit -ENOMEM with order=%d, top=%d\n", - order, top); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_move_tail(&block->link, &blocks); - } - - /* There should be one final page for this sub-allocation */ - size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc hit -ENOMEM for hole\n"); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_move_tail(&block->link, &holes); - - size = get_size(top, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc unexpectedly succeeded at top-order %d/%d, it should be full!", - top, max_order); - } - - drm_buddy_free_list(&mm, &holes, 0); - - /* Nothing larger than blocks of chunk_size now available */ - for (order = 1; order <= max_order; order++) { - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc unexpectedly succeeded at order %d, it should be full!", - order); - } - - list_splice_tail(&holes, &blocks); - drm_buddy_free_list(&mm, 
&blocks, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_pessimistic(struct kunit *test) -{ - u64 mm_size, size, start = 0; - struct drm_buddy_block *block, *bn; - const unsigned int max_order = 16; - unsigned long flags = 0; - struct drm_buddy mm; - unsigned int order; - LIST_HEAD(blocks); - LIST_HEAD(tmp); - - /* - * Create a pot-sized mm, then allocate one of each possible - * order within. This should leave the mm with exactly one - * page left. - */ - - mm_size = SZ_4K << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), - "buddy_init failed\n"); - - KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - - for (order = 0; order < max_order; order++) { - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc hit -ENOMEM with order=%d\n", - order); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_move_tail(&block->link, &blocks); - } - - /* And now the last remaining block available */ - size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc hit -ENOMEM on final alloc\n"); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_move_tail(&block->link, &blocks); - - /* Should be completely full! 
*/ - for (order = max_order; order--;) { - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc unexpectedly succeeded, it should be full!"); - } - - block = list_last_entry(&blocks, typeof(*block), link); - list_del(&block->link); - drm_buddy_free_block(&mm, block); - - /* As we free in increasing size, we make available larger blocks */ - order = 1; - list_for_each_entry_safe(block, bn, &blocks, link) { - list_del(&block->link); - drm_buddy_free_block(&mm, block); - - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc hit -ENOMEM with order=%d\n", - order); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_del(&block->link); - drm_buddy_free_block(&mm, block); - order++; - } - - /* To confirm, now the whole mm should be available */ - size = get_size(max_order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc (realloc) hit -ENOMEM with order=%d\n", - max_order); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_del(&block->link); - drm_buddy_free_block(&mm, block); - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_optimistic(struct kunit *test) -{ - u64 mm_size, size, start = 0; - struct drm_buddy_block *block; - unsigned long flags = 0; - const int max_order = 16; - struct drm_buddy mm; - LIST_HEAD(blocks); - LIST_HEAD(tmp); - int order; - - /* - * Create a mm with one block of each order available, and - * try to allocate them all. 
- */ - - mm_size = SZ_4K * ((1 << (max_order + 1)) - 1); - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), - "buddy_init failed\n"); - - KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - - for (order = 0; order <= max_order; order++) { - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc hit -ENOMEM with order=%d\n", - order); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_move_tail(&block->link, &blocks); - } - - /* Should be completely full! */ - size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc unexpectedly succeeded, it should be full!"); - - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_limit(struct kunit *test) -{ - u64 size = U64_MAX, start = 0; - struct drm_buddy_block *block; - unsigned long flags = 0; - LIST_HEAD(allocated); - struct drm_buddy mm; - - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, size, SZ_4K)); - - KUNIT_EXPECT_EQ_MSG(test, mm.max_order, DRM_BUDDY_MAX_ORDER, - "mm.max_order(%d) != %d\n", mm.max_order, - DRM_BUDDY_MAX_ORDER); - - size = mm.chunk_size << mm.max_order; - KUNIT_EXPECT_FALSE(test, drm_buddy_alloc_blocks(&mm, start, size, size, - mm.chunk_size, &allocated, flags)); - - block = list_first_entry_or_null(&allocated, struct drm_buddy_block, link); - KUNIT_EXPECT_TRUE(test, block); - - KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_order(block), mm.max_order, - "block order(%d) != %d\n", - drm_buddy_block_order(block), mm.max_order); - - KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_size(&mm, block), - BIT_ULL(mm.max_order) * mm.chunk_size, - "block size(%llu) != %llu\n", - drm_buddy_block_size(&mm, block), - BIT_ULL(mm.max_order) * mm.chunk_size); - - 
drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_exceeds_max_order(struct kunit *test) -{ - u64 mm_size = SZ_8G + SZ_2G, size = SZ_8G + SZ_1G, min_block_size = SZ_8G; - struct drm_buddy mm; - LIST_HEAD(blocks); - int err; - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), - "buddy_init failed\n"); - - /* CONTIGUOUS allocation should succeed via try_harder fallback */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, size, - SZ_4K, &blocks, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc hit an error size=%llu\n", size); - drm_buddy_free_list(&mm, &blocks, 0); - - /* Non-CONTIGUOUS with large min_block_size should return -EINVAL */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, 0); - KUNIT_EXPECT_EQ(test, err, -EINVAL); - - /* Non-CONTIGUOUS + RANGE with large min_block_size should return -EINVAL */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, - DRM_BUDDY_RANGE_ALLOCATION); - KUNIT_EXPECT_EQ(test, err, -EINVAL); - - /* CONTIGUOUS + RANGE should return -EINVAL (no try_harder for RANGE) */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, SZ_4K, &blocks, - DRM_BUDDY_CONTIGUOUS_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION); - KUNIT_EXPECT_EQ(test, err, -EINVAL); - - drm_buddy_fini(&mm); -} - -static int drm_buddy_suite_init(struct kunit_suite *suite) -{ - while (!random_seed) - random_seed = get_random_u32(); - - kunit_info(suite, "Testing DRM buddy manager, with random_seed=0x%x\n", - random_seed); - - return 0; -} - -static struct kunit_case drm_buddy_tests[] = { - KUNIT_CASE(drm_test_buddy_alloc_limit), - KUNIT_CASE(drm_test_buddy_alloc_optimistic), - KUNIT_CASE(drm_test_buddy_alloc_pessimistic), - KUNIT_CASE(drm_test_buddy_alloc_pathological), - KUNIT_CASE(drm_test_buddy_alloc_contiguous), - KUNIT_CASE(drm_test_buddy_alloc_clear), - KUNIT_CASE(drm_test_buddy_alloc_range_bias), - 
KUNIT_CASE(drm_test_buddy_fragmentation_performance), - KUNIT_CASE(drm_test_buddy_alloc_exceeds_max_order), - {} -}; - -static struct kunit_suite drm_buddy_test_suite = { - .name = "drm_buddy", - .suite_init = drm_buddy_suite_init, - .test_cases = drm_buddy_tests, -}; - -kunit_test_suite(drm_buddy_test_suite); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Kunit test for drm_buddy functions"); -MODULE_LICENSE("GPL"); diff --git a/drivers/gpu/drm/tests/drm_exec_test.c b/drivers/gpu/drm/tests/drm_exec_test.c index 3a20c788c51f..2fc47f3b463b 100644 --- a/drivers/gpu/drm/tests/drm_exec_test.c +++ b/drivers/gpu/drm/tests/drm_exec_test.c @@ -16,8 +16,6 @@ #include #include -#include "../lib/drm_random.h" - struct drm_exec_priv { struct device *dev; struct drm_device *drm; diff --git a/drivers/gpu/drm/tests/drm_mm_test.c b/drivers/gpu/drm/tests/drm_mm_test.c index aec9eccdeae9..e24a619059d8 100644 --- a/drivers/gpu/drm/tests/drm_mm_test.c +++ b/drivers/gpu/drm/tests/drm_mm_test.c @@ -16,8 +16,6 @@ #include #include -#include "../lib/drm_random.h" - enum { BEST, BOTTOMUP, diff --git a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h index e4c95f86a467..96ea8c9aae34 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h +++ b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h @@ -5,7 +5,7 @@ #ifndef TTM_MOCK_MANAGER_H #define TTM_MOCK_MANAGER_H -#include +#include struct ttm_mock_manager { struct ttm_resource_manager man; diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h index a71e14818ec2..babeec5511d9 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h @@ -6,7 +6,7 @@ #ifndef _XE_TTM_VRAM_MGR_TYPES_H_ #define _XE_TTM_VRAM_MGR_TYPES_H_ -#include +#include #include /** diff --git a/drivers/gpu/tests/Makefile b/drivers/gpu/tests/Makefile new file mode 100644 index 000000000000..8e7654e87d82 --- /dev/null +++ 
b/drivers/gpu/tests/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +gpu_buddy_tests-y = gpu_buddy_test.o gpu_random.o +obj-$(CONFIG_DRM_KUNIT_TEST) += gpu_buddy_tests.o diff --git a/drivers/gpu/tests/gpu_buddy_test.c b/drivers/gpu/tests/gpu_buddy_test.c new file mode 100644 index 000000000000..b905932da990 --- /dev/null +++ b/drivers/gpu/tests/gpu_buddy_test.c @@ -0,0 +1,928 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2019 Intel Corporation + * Copyright © 2022 Maíra Canal + */ + +#include + +#include +#include +#include + +#include + +#include "gpu_random.h" + +static unsigned int random_seed; + +static inline u64 get_size(int order, u64 chunk_size) +{ + return (1 << order) * chunk_size; +} + +static void drm_test_buddy_fragmentation_performance(struct kunit *test) +{ + struct drm_buddy_block *block, *tmp; + int num_blocks, i, ret, count = 0; + LIST_HEAD(allocated_blocks); + unsigned long elapsed_ms; + LIST_HEAD(reverse_list); + LIST_HEAD(test_blocks); + LIST_HEAD(clear_list); + LIST_HEAD(dirty_list); + LIST_HEAD(free_list); + struct drm_buddy mm; + u64 mm_size = SZ_4G; + ktime_t start, end; + + /* + * Allocation under severe fragmentation + * + * Create severe fragmentation by allocating the entire 4 GiB address space + * as tiny 8 KiB blocks but forcing a 64 KiB alignment. The resulting pattern + * leaves many scattered holes. Split the allocations into two groups and + * return them with different flags to block coalescing, then repeatedly + * allocate and free 64 KiB blocks while timing the loop. This stresses how + * quickly the allocator can satisfy larger, aligned requests from a pool of + * highly fragmented space. 
+ */ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + "buddy_init failed\n"); + + num_blocks = mm_size / SZ_64K; + + start = ktime_get(); + /* Allocate with maximum fragmentation - 8K blocks with 64K alignment */ + for (i = 0; i < num_blocks; i++) + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, + &allocated_blocks, 0), + "buddy_alloc hit an error size=%u\n", SZ_8K); + + list_for_each_entry_safe(block, tmp, &allocated_blocks, link) { + if (count % 4 == 0 || count % 4 == 3) + list_move_tail(&block->link, &clear_list); + else + list_move_tail(&block->link, &dirty_list); + count++; + } + + /* Free with different flags to ensure no coalescing */ + drm_buddy_free_list(&mm, &clear_list, DRM_BUDDY_CLEARED); + drm_buddy_free_list(&mm, &dirty_list, 0); + + for (i = 0; i < num_blocks; i++) + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_64K, SZ_64K, + &test_blocks, 0), + "buddy_alloc hit an error size=%u\n", SZ_64K); + drm_buddy_free_list(&mm, &test_blocks, 0); + + end = ktime_get(); + elapsed_ms = ktime_to_ms(ktime_sub(end, start)); + + kunit_info(test, "Fragmented allocation took %lu ms\n", elapsed_ms); + + drm_buddy_fini(&mm); + + /* + * Reverse free order under fragmentation + * + * Construct a fragmented 4 GiB space by allocating every 8 KiB block with + * 64 KiB alignment, creating a dense scatter of small regions. Half of the + * blocks are selectively freed to form sparse gaps, while the remaining + * allocations are preserved, reordered in reverse, and released back with + * the cleared flag. This models a pathological reverse-ordered free pattern + * and measures how quickly the allocator can merge and reclaim space when + * deallocation occurs in the opposite order of allocation, exposing the + * cost difference between a linear freelist scan and an ordered tree lookup. 
+ */ + ret = drm_buddy_init(&mm, mm_size, SZ_4K); + KUNIT_ASSERT_EQ(test, ret, 0); + + start = ktime_get(); + /* Allocate maximum fragmentation */ + for (i = 0; i < num_blocks; i++) + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, + &allocated_blocks, 0), + "buddy_alloc hit an error size=%u\n", SZ_8K); + + list_for_each_entry_safe(block, tmp, &allocated_blocks, link) { + if (count % 2 == 0) + list_move_tail(&block->link, &free_list); + count++; + } + drm_buddy_free_list(&mm, &free_list, DRM_BUDDY_CLEARED); + + list_for_each_entry_safe_reverse(block, tmp, &allocated_blocks, link) + list_move(&block->link, &reverse_list); + drm_buddy_free_list(&mm, &reverse_list, DRM_BUDDY_CLEARED); + + end = ktime_get(); + elapsed_ms = ktime_to_ms(ktime_sub(end, start)); + + kunit_info(test, "Reverse-ordered free took %lu ms\n", elapsed_ms); + + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_range_bias(struct kunit *test) +{ + u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem; + DRM_RND_STATE(prng, random_seed); + unsigned int i, count, *order; + struct drm_buddy_block *block; + unsigned long flags; + struct drm_buddy mm; + LIST_HEAD(allocated); + + bias_size = SZ_1M; + ps = roundup_pow_of_two(prandom_u32_state(&prng) % bias_size); + ps = max(SZ_4K, ps); + mm_size = (SZ_8M-1) & ~(ps-1); /* Multiple roots */ + + kunit_info(test, "mm_size=%u, ps=%u\n", mm_size, ps); + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + "buddy_init failed\n"); + + count = mm_size / bias_size; + order = drm_random_order(count, &prng); + KUNIT_EXPECT_TRUE(test, order); + + /* + * Idea is to split the address space into uniform bias ranges, and then + * in some random order allocate within each bias, using various + * patterns within. This should detect if allocations leak out from a + * given bias, for example. 
+ */ + + for (i = 0; i < count; i++) { + LIST_HEAD(tmp); + u32 size; + + bias_start = order[i] * bias_size; + bias_end = bias_start + bias_size; + bias_rem = bias_size; + + /* internal round_up too big */ + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, bias_size + ps, bias_size, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, bias_size, bias_size); + + /* size too big */ + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, bias_size + ps, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, bias_size + ps, ps); + + /* bias range too small for size */ + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start + ps, + bias_end, bias_size, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", + bias_start + ps, bias_end, bias_size, ps); + + /* bias misaligned */ + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start + ps, + bias_end - ps, + bias_size >> 1, bias_size >> 1, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc h didn't fail with bias(%x-%x), size=%u, ps=%u\n", + bias_start + ps, bias_end - ps, bias_size >> 1, bias_size >> 1); + + /* single big page */ + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, bias_size, bias_size, + &tmp, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc i failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, bias_size, bias_size); + drm_buddy_free_list(&mm, &tmp, 0); + + /* single page with internal round_up */ + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, ps, bias_size, + &tmp, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, ps, bias_size); + 
drm_buddy_free_list(&mm, &tmp, 0); + + /* random size within */ + size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); + if (size) + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, size, ps, + &tmp, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + + bias_rem -= size; + /* too big for current avail */ + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, bias_rem + ps, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, bias_rem + ps, ps); + + if (bias_rem) { + /* random fill of the remainder */ + size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); + size = max(size, ps); + + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, size, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + /* + * Intentionally allow some space to be left + * unallocated, and ideally not always on the bias + * boundaries. + */ + drm_buddy_free_list(&mm, &tmp, 0); + } else { + list_splice_tail(&tmp, &allocated); + } + } + + kfree(order); + drm_buddy_free_list(&mm, &allocated, 0); + drm_buddy_fini(&mm); + + /* + * Something more free-form. Idea is to pick a random starting bias + * range within the address space and then start filling it up. Also + * randomly grow the bias range in both directions as we go along. This + * should give us bias start/end which is not always uniform like above, + * and in some cases will require the allocator to jump over already + * allocated nodes in the middle of the address space. 
+ */ + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + "buddy_init failed\n"); + + bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); + bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps); + bias_end = max(bias_end, bias_start + ps); + bias_rem = bias_end - bias_start; + + do { + u32 size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); + + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, size, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + bias_rem -= size; + + /* + * Try to randomly grow the bias range in both directions, or + * only one, or perhaps don't grow at all. + */ + do { + u32 old_bias_start = bias_start; + u32 old_bias_end = bias_end; + + if (bias_start) + bias_start -= round_up(prandom_u32_state(&prng) % bias_start, ps); + if (bias_end != mm_size) + bias_end += round_up(prandom_u32_state(&prng) % (mm_size - bias_end), ps); + + bias_rem += old_bias_start - bias_start; + bias_rem += bias_end - old_bias_end; + } while (!bias_rem && (bias_start || bias_end != mm_size)); + } while (bias_rem); + + KUNIT_ASSERT_EQ(test, bias_start, 0); + KUNIT_ASSERT_EQ(test, bias_end, mm_size); + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, bias_end, + ps, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc passed with bias(%x-%x), size=%u\n", + bias_start, bias_end, ps); + + drm_buddy_free_list(&mm, &allocated, 0); + drm_buddy_fini(&mm); + + /* + * Allocate cleared blocks in the bias range when the DRM buddy's clear avail is + * zero. This will validate the bias range allocation in scenarios like system boot + * when no cleared blocks are available and exercise the fallback path too. The resulting + * blocks should always be dirty. 
+ */ + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + "buddy_init failed\n"); + + bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); + bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps); + bias_end = max(bias_end, bias_start + ps); + bias_rem = bias_end - bias_start; + + flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION; + size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); + + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, size, ps, + &allocated, + flags), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + + list_for_each_entry(block, &allocated, link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + + drm_buddy_free_list(&mm, &allocated, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_clear(struct kunit *test) +{ + unsigned long n_pages, total, i = 0; + const unsigned long ps = SZ_4K; + struct drm_buddy_block *block; + const int max_order = 12; + LIST_HEAD(allocated); + struct drm_buddy mm; + unsigned int order; + u32 mm_size, size; + LIST_HEAD(dirty); + LIST_HEAD(clean); + + mm_size = SZ_4K << max_order; + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + /* + * Idea is to allocate and free some random portion of the address space, + * returning those pages as non-dirty and randomly alternate between + * requesting dirty and non-dirty pages (not going over the limit + * we freed as non-dirty), putting that into two separate lists. + * Loop over both lists at the end checking that the dirty list + * is indeed all dirty pages and vice versa. Free it all again, + * keeping the dirty/clear status. 
+ */ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 5 * ps, ps, &allocated, + DRM_BUDDY_TOPDOWN_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 5 * ps); + drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); + + n_pages = 10; + do { + unsigned long flags; + struct list_head *list; + int slot = i % 2; + + if (slot == 0) { + list = &dirty; + flags = 0; + } else { + list = &clean; + flags = DRM_BUDDY_CLEAR_ALLOCATION; + } + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + ps, ps, list, + flags), + "buddy_alloc hit an error size=%lu\n", ps); + } while (++i < n_pages); + + list_for_each_entry(block, &clean, link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true); + + list_for_each_entry(block, &dirty, link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + + drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); + + /* + * Trying to go over the clear limit for some allocation. + * The allocation should never fail with reasonable page-size. + */ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 10 * ps, ps, &clean, + DRM_BUDDY_CLEAR_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 10 * ps); + + drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); + drm_buddy_free_list(&mm, &dirty, 0); + drm_buddy_fini(&mm); + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + + /* + * Create a new mm. Intentionally fragment the address space by creating + * two alternating lists. Free both lists, one as dirty the other as clean. + * Try to allocate double the previous size with matching min_page_size. The + * allocation should never fail as it calls the force_merge. Also check that + * the page is always dirty after force_merge. Free the page as dirty, then + * repeat the whole thing, increment the order until we hit the max_order. 
+ */ + + i = 0; + n_pages = mm_size / ps; + do { + struct list_head *list; + int slot = i % 2; + + if (slot == 0) + list = &dirty; + else + list = &clean; + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + ps, ps, list, 0), + "buddy_alloc hit an error size=%lu\n", ps); + } while (++i < n_pages); + + drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); + drm_buddy_free_list(&mm, &dirty, 0); + + order = 1; + do { + size = SZ_4K << order; + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + size, size, &allocated, + DRM_BUDDY_CLEAR_ALLOCATION), + "buddy_alloc hit an error size=%u\n", size); + total = 0; + list_for_each_entry(block, &allocated, link) { + if (size != mm_size) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + total += drm_buddy_block_size(&mm, block); + } + KUNIT_EXPECT_EQ(test, total, size); + + drm_buddy_free_list(&mm, &allocated, 0); + } while (++order <= max_order); + + drm_buddy_fini(&mm); + + /* + * Create a new mm with a non power-of-two size. Allocate a random size from each + * root, free as cleared and then call fini. This will ensure the multi-root + * force merge during fini. 
+ */ + mm_size = (SZ_4K << max_order) + (SZ_4K << (max_order - 2)); + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, + 4 * ps, ps, &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 4 * ps); + drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, + 2 * ps, ps, &allocated, + DRM_BUDDY_CLEAR_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 2 * ps); + drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, SZ_4K << max_order, mm_size, + ps, ps, &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", ps); + drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_contiguous(struct kunit *test) +{ + const unsigned long ps = SZ_4K, mm_size = 16 * 3 * SZ_4K; + unsigned long i, n_pages, total; + struct drm_buddy_block *block; + struct drm_buddy mm; + LIST_HEAD(left); + LIST_HEAD(middle); + LIST_HEAD(right); + LIST_HEAD(allocated); + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + + /* + * Idea is to fragment the address space by alternating block + * allocations between three different lists; one for left, middle and + * right. We can then free a list to simulate fragmentation. In + * particular we want to exercise the DRM_BUDDY_CONTIGUOUS_ALLOCATION, + * including the try_harder path. 
+ */ + + i = 0; + n_pages = mm_size / ps; + do { + struct list_head *list; + int slot = i % 3; + + if (slot == 0) + list = &left; + else if (slot == 1) + list = &middle; + else + list = &right; + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, 0, mm_size, + ps, ps, list, 0), + "buddy_alloc hit an error size=%lu\n", + ps); + } while (++i < n_pages); + + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 3 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc didn't error size=%lu\n", 3 * ps); + + drm_buddy_free_list(&mm, &middle, 0); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 3 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc didn't error size=%lu\n", 3 * ps); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 2 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc didn't error size=%lu\n", 2 * ps); + + drm_buddy_free_list(&mm, &right, 0); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 3 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc didn't error size=%lu\n", 3 * ps); + /* + * At this point we should have enough contiguous space for 2 blocks, + * however they are never buddies (since we freed middle and right) so + * will require the try_harder logic to find them. 
+ */ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 2 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 2 * ps); + + drm_buddy_free_list(&mm, &left, 0); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 3 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 3 * ps); + + total = 0; + list_for_each_entry(block, &allocated, link) + total += drm_buddy_block_size(&mm, block); + + KUNIT_ASSERT_EQ(test, total, ps * 2 + ps * 3); + + drm_buddy_free_list(&mm, &allocated, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_pathological(struct kunit *test) +{ + u64 mm_size, size, start = 0; + struct drm_buddy_block *block; + const int max_order = 3; + unsigned long flags = 0; + int order, top; + struct drm_buddy mm; + LIST_HEAD(blocks); + LIST_HEAD(holes); + LIST_HEAD(tmp); + + /* + * Create a pot-sized mm, then allocate one of each possible + * order within. This should leave the mm with exactly one + * page left. Free the largest block, then whittle down again. + * Eventually we will have a fully 50% fragmented mm. 
+ */ + + mm_size = SZ_4K << max_order; + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + "buddy_init failed\n"); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + for (top = max_order; top; top--) { + /* Make room by freeing the largest allocated block */ + block = list_first_entry_or_null(&blocks, typeof(*block), link); + if (block) { + list_del(&block->link); + drm_buddy_free_block(&mm, block); + } + + for (order = top; order--;) { + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, + mm_size, size, size, + &tmp, flags), + "buddy_alloc hit -ENOMEM with order=%d, top=%d\n", + order, top); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_move_tail(&block->link, &blocks); + } + + /* There should be one final page for this sub-allocation */ + size = get_size(0, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc hit -ENOMEM for hole\n"); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_move_tail(&block->link, &holes); + + size = get_size(top, mm.chunk_size); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc unexpectedly succeeded at top-order %d/%d, it should be full!", + top, max_order); + } + + drm_buddy_free_list(&mm, &holes, 0); + + /* Nothing larger than blocks of chunk_size now available */ + for (order = 1; order <= max_order; order++) { + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc unexpectedly succeeded at order %d, it should be full!", + order); + } + + list_splice_tail(&holes, &blocks); + drm_buddy_free_list(&mm, 
&blocks, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_pessimistic(struct kunit *test) +{ + u64 mm_size, size, start = 0; + struct drm_buddy_block *block, *bn; + const unsigned int max_order = 16; + unsigned long flags = 0; + struct drm_buddy mm; + unsigned int order; + LIST_HEAD(blocks); + LIST_HEAD(tmp); + + /* + * Create a pot-sized mm, then allocate one of each possible + * order within. This should leave the mm with exactly one + * page left. + */ + + mm_size = SZ_4K << max_order; + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + "buddy_init failed\n"); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + for (order = 0; order < max_order; order++) { + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc hit -ENOMEM with order=%d\n", + order); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_move_tail(&block->link, &blocks); + } + + /* And now the last remaining block available */ + size = get_size(0, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc hit -ENOMEM on final alloc\n"); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_move_tail(&block->link, &blocks); + + /* Should be completely full! 
*/ + for (order = max_order; order--;) { + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc unexpectedly succeeded, it should be full!"); + } + + block = list_last_entry(&blocks, typeof(*block), link); + list_del(&block->link); + drm_buddy_free_block(&mm, block); + + /* As we free in increasing size, we make available larger blocks */ + order = 1; + list_for_each_entry_safe(block, bn, &blocks, link) { + list_del(&block->link); + drm_buddy_free_block(&mm, block); + + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc hit -ENOMEM with order=%d\n", + order); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_del(&block->link); + drm_buddy_free_block(&mm, block); + order++; + } + + /* To confirm, now the whole mm should be available */ + size = get_size(max_order, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc (realloc) hit -ENOMEM with order=%d\n", + max_order); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_del(&block->link); + drm_buddy_free_block(&mm, block); + drm_buddy_free_list(&mm, &blocks, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_optimistic(struct kunit *test) +{ + u64 mm_size, size, start = 0; + struct drm_buddy_block *block; + unsigned long flags = 0; + const int max_order = 16; + struct drm_buddy mm; + LIST_HEAD(blocks); + LIST_HEAD(tmp); + int order; + + /* + * Create a mm with one block of each order available, and + * try to allocate them all. 
+ */ + + mm_size = SZ_4K * ((1 << (max_order + 1)) - 1); + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + "buddy_init failed\n"); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + for (order = 0; order <= max_order; order++) { + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc hit -ENOMEM with order=%d\n", + order); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_move_tail(&block->link, &blocks); + } + + /* Should be completely full! */ + size = get_size(0, mm.chunk_size); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc unexpectedly succeeded, it should be full!"); + + drm_buddy_free_list(&mm, &blocks, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_limit(struct kunit *test) +{ + u64 size = U64_MAX, start = 0; + struct drm_buddy_block *block; + unsigned long flags = 0; + LIST_HEAD(allocated); + struct drm_buddy mm; + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, size, SZ_4K)); + + KUNIT_EXPECT_EQ_MSG(test, mm.max_order, DRM_BUDDY_MAX_ORDER, + "mm.max_order(%d) != %d\n", mm.max_order, + DRM_BUDDY_MAX_ORDER); + + size = mm.chunk_size << mm.max_order; + KUNIT_EXPECT_FALSE(test, drm_buddy_alloc_blocks(&mm, start, size, size, + mm.chunk_size, &allocated, flags)); + + block = list_first_entry_or_null(&allocated, struct drm_buddy_block, link); + KUNIT_EXPECT_TRUE(test, block); + + KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_order(block), mm.max_order, + "block order(%d) != %d\n", + drm_buddy_block_order(block), mm.max_order); + + KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_size(&mm, block), + BIT_ULL(mm.max_order) * mm.chunk_size, + "block size(%llu) != %llu\n", + drm_buddy_block_size(&mm, block), + BIT_ULL(mm.max_order) * mm.chunk_size); + + 
drm_buddy_free_list(&mm, &allocated, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_exceeds_max_order(struct kunit *test) +{ + u64 mm_size = SZ_8G + SZ_2G, size = SZ_8G + SZ_1G, min_block_size = SZ_8G; + struct drm_buddy mm; + LIST_HEAD(blocks); + int err; + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + "buddy_init failed\n"); + + /* CONTIGUOUS allocation should succeed via try_harder fallback */ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, size, + SZ_4K, &blocks, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc hit an error size=%llu\n", size); + drm_buddy_free_list(&mm, &blocks, 0); + + /* Non-CONTIGUOUS with large min_block_size should return -EINVAL */ + err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, 0); + KUNIT_EXPECT_EQ(test, err, -EINVAL); + + /* Non-CONTIGUOUS + RANGE with large min_block_size should return -EINVAL */ + err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, + DRM_BUDDY_RANGE_ALLOCATION); + KUNIT_EXPECT_EQ(test, err, -EINVAL); + + /* CONTIGUOUS + RANGE should return -EINVAL (no try_harder for RANGE) */ + err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, SZ_4K, &blocks, + DRM_BUDDY_CONTIGUOUS_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION); + KUNIT_EXPECT_EQ(test, err, -EINVAL); + + drm_buddy_fini(&mm); +} + +static int drm_buddy_suite_init(struct kunit_suite *suite) +{ + while (!random_seed) + random_seed = get_random_u32(); + + kunit_info(suite, "Testing DRM buddy manager, with random_seed=0x%x\n", + random_seed); + + return 0; +} + +static struct kunit_case drm_buddy_tests[] = { + KUNIT_CASE(drm_test_buddy_alloc_limit), + KUNIT_CASE(drm_test_buddy_alloc_optimistic), + KUNIT_CASE(drm_test_buddy_alloc_pessimistic), + KUNIT_CASE(drm_test_buddy_alloc_pathological), + KUNIT_CASE(drm_test_buddy_alloc_contiguous), + KUNIT_CASE(drm_test_buddy_alloc_clear), + KUNIT_CASE(drm_test_buddy_alloc_range_bias), + 
KUNIT_CASE(drm_test_buddy_fragmentation_performance), + KUNIT_CASE(drm_test_buddy_alloc_exceeds_max_order), + {} +}; + +static struct kunit_suite drm_buddy_test_suite = { + .name = "drm_buddy", + .suite_init = drm_buddy_suite_init, + .test_cases = drm_buddy_tests, +}; + +kunit_test_suite(drm_buddy_test_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Kunit test for drm_buddy functions"); +MODULE_LICENSE("GPL"); diff --git a/drivers/gpu/tests/gpu_random.c b/drivers/gpu/tests/gpu_random.c new file mode 100644 index 000000000000..ddd1f594b5d5 --- /dev/null +++ b/drivers/gpu/tests/gpu_random.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +#include "gpu_random.h" + +u32 drm_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state) +{ + return upper_32_bits((u64)prandom_u32_state(state) * ep_ro); +} +EXPORT_SYMBOL(drm_prandom_u32_max_state); + +void drm_random_reorder(unsigned int *order, unsigned int count, + struct rnd_state *state) +{ + unsigned int i, j; + + for (i = 0; i < count; ++i) { + BUILD_BUG_ON(sizeof(unsigned int) > sizeof(u32)); + j = drm_prandom_u32_max_state(count, state); + swap(order[i], order[j]); + } +} +EXPORT_SYMBOL(drm_random_reorder); + +unsigned int *drm_random_order(unsigned int count, struct rnd_state *state) +{ + unsigned int *order, i; + + order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); + if (!order) + return order; + + for (i = 0; i < count; i++) + order[i] = i; + + drm_random_reorder(order, count, state); + return order; +} +EXPORT_SYMBOL(drm_random_order); diff --git a/drivers/gpu/tests/gpu_random.h b/drivers/gpu/tests/gpu_random.h new file mode 100644 index 000000000000..9f827260a89d --- /dev/null +++ b/drivers/gpu/tests/gpu_random.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __DRM_RANDOM_H__ +#define __DRM_RANDOM_H__ + +/* This is a temporary home for a couple of utility functions that should + * be transposed to 
lib/ at the earliest convenience. + */ + +#include + +#define DRM_RND_STATE_INITIALIZER(seed__) ({ \ + struct rnd_state state__; \ + prandom_seed_state(&state__, (seed__)); \ + state__; \ +}) + +#define DRM_RND_STATE(name__, seed__) \ + struct rnd_state name__ = DRM_RND_STATE_INITIALIZER(seed__) + +unsigned int *drm_random_order(unsigned int count, + struct rnd_state *state); +void drm_random_reorder(unsigned int *order, + unsigned int count, + struct rnd_state *state); +u32 drm_prandom_u32_max_state(u32 ep_ro, + struct rnd_state *state); + +#endif /* !__DRM_RANDOM_H__ */ diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h deleted file mode 100644 index b909fa8f810a..000000000000 --- a/include/drm/drm_buddy.h +++ /dev/null @@ -1,171 +0,0 @@ -/* SPDX-License-Identifier: MIT */ -/* - * Copyright © 2021 Intel Corporation - */ - -#ifndef __DRM_BUDDY_H__ -#define __DRM_BUDDY_H__ - -#include -#include -#include -#include -#include - -struct drm_printer; - -#define DRM_BUDDY_RANGE_ALLOCATION BIT(0) -#define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1) -#define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) -#define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) -#define DRM_BUDDY_CLEARED BIT(4) -#define DRM_BUDDY_TRIM_DISABLE BIT(5) - -struct drm_buddy_block { -#define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) -#define DRM_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) -#define DRM_BUDDY_ALLOCATED (1 << 10) -#define DRM_BUDDY_FREE (2 << 10) -#define DRM_BUDDY_SPLIT (3 << 10) -#define DRM_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) -/* Free to be used, if needed in the future */ -#define DRM_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) -#define DRM_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) - u64 header; - - struct drm_buddy_block *left; - struct drm_buddy_block *right; - struct drm_buddy_block *parent; - - void *private; /* owned by creator */ - - /* - * While the block is allocated by the user through drm_buddy_alloc*, - * the user has ownership of the link, for example to maintain within - * a list, if so desired. 
As soon as the block is freed with - * drm_buddy_free* ownership is given back to the mm. - */ - union { - struct rb_node rb; - struct list_head link; - }; - - struct list_head tmp_link; -}; - -/* Order-zero must be at least SZ_4K */ -#define DRM_BUDDY_MAX_ORDER (63 - 12) - -/* - * Binary Buddy System. - * - * Locking should be handled by the user, a simple mutex around - * drm_buddy_alloc* and drm_buddy_free* should suffice. - */ -struct drm_buddy { - /* Maintain a free list for each order. */ - struct rb_root **free_trees; - - /* - * Maintain explicit binary tree(s) to track the allocation of the - * address space. This gives us a simple way of finding a buddy block - * and performing the potentially recursive merge step when freeing a - * block. Nodes are either allocated or free, in which case they will - * also exist on the respective free list. - */ - struct drm_buddy_block **roots; - - /* - * Anything from here is public, and remains static for the lifetime of - * the mm. Everything above is considered do-not-touch. 
- */ - unsigned int n_roots; - unsigned int max_order; - - /* Must be at least SZ_4K */ - u64 chunk_size; - u64 size; - u64 avail; - u64 clear_avail; -}; - -static inline u64 -drm_buddy_block_offset(const struct drm_buddy_block *block) -{ - return block->header & DRM_BUDDY_HEADER_OFFSET; -} - -static inline unsigned int -drm_buddy_block_order(struct drm_buddy_block *block) -{ - return block->header & DRM_BUDDY_HEADER_ORDER; -} - -static inline unsigned int -drm_buddy_block_state(struct drm_buddy_block *block) -{ - return block->header & DRM_BUDDY_HEADER_STATE; -} - -static inline bool -drm_buddy_block_is_allocated(struct drm_buddy_block *block) -{ - return drm_buddy_block_state(block) == DRM_BUDDY_ALLOCATED; -} - -static inline bool -drm_buddy_block_is_clear(struct drm_buddy_block *block) -{ - return block->header & DRM_BUDDY_HEADER_CLEAR; -} - -static inline bool -drm_buddy_block_is_free(struct drm_buddy_block *block) -{ - return drm_buddy_block_state(block) == DRM_BUDDY_FREE; -} - -static inline bool -drm_buddy_block_is_split(struct drm_buddy_block *block) -{ - return drm_buddy_block_state(block) == DRM_BUDDY_SPLIT; -} - -static inline u64 -drm_buddy_block_size(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - return mm->chunk_size << drm_buddy_block_order(block); -} - -int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size); - -void drm_buddy_fini(struct drm_buddy *mm); - -struct drm_buddy_block * -drm_get_buddy(struct drm_buddy_block *block); - -int drm_buddy_alloc_blocks(struct drm_buddy *mm, - u64 start, u64 end, u64 size, - u64 min_page_size, - struct list_head *blocks, - unsigned long flags); - -int drm_buddy_block_trim(struct drm_buddy *mm, - u64 *start, - u64 new_size, - struct list_head *blocks); - -void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear); - -void drm_buddy_free_block(struct drm_buddy *mm, struct drm_buddy_block *block); - -void drm_buddy_free_list(struct drm_buddy *mm, - struct list_head *objects, - unsigned 
int flags); - -void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p); -void drm_buddy_block_print(struct drm_buddy *mm, - struct drm_buddy_block *block, - struct drm_printer *p); -#endif diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h new file mode 100644 index 000000000000..b909fa8f810a --- /dev/null +++ b/include/linux/gpu_buddy.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef __DRM_BUDDY_H__ +#define __DRM_BUDDY_H__ + +#include +#include +#include +#include +#include + +struct drm_printer; + +#define DRM_BUDDY_RANGE_ALLOCATION BIT(0) +#define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1) +#define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) +#define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) +#define DRM_BUDDY_CLEARED BIT(4) +#define DRM_BUDDY_TRIM_DISABLE BIT(5) + +struct drm_buddy_block { +#define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) +#define DRM_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) +#define DRM_BUDDY_ALLOCATED (1 << 10) +#define DRM_BUDDY_FREE (2 << 10) +#define DRM_BUDDY_SPLIT (3 << 10) +#define DRM_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) +/* Free to be used, if needed in the future */ +#define DRM_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) +#define DRM_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) + u64 header; + + struct drm_buddy_block *left; + struct drm_buddy_block *right; + struct drm_buddy_block *parent; + + void *private; /* owned by creator */ + + /* + * While the block is allocated by the user through drm_buddy_alloc*, + * the user has ownership of the link, for example to maintain within + * a list, if so desired. As soon as the block is freed with + * drm_buddy_free* ownership is given back to the mm. + */ + union { + struct rb_node rb; + struct list_head link; + }; + + struct list_head tmp_link; +}; + +/* Order-zero must be at least SZ_4K */ +#define DRM_BUDDY_MAX_ORDER (63 - 12) + +/* + * Binary Buddy System. 
+ * + * Locking should be handled by the user, a simple mutex around + * drm_buddy_alloc* and drm_buddy_free* should suffice. + */ +struct drm_buddy { + /* Maintain a free list for each order. */ + struct rb_root **free_trees; + + /* + * Maintain explicit binary tree(s) to track the allocation of the + * address space. This gives us a simple way of finding a buddy block + * and performing the potentially recursive merge step when freeing a + * block. Nodes are either allocated or free, in which case they will + * also exist on the respective free list. + */ + struct drm_buddy_block **roots; + + /* + * Anything from here is public, and remains static for the lifetime of + * the mm. Everything above is considered do-not-touch. + */ + unsigned int n_roots; + unsigned int max_order; + + /* Must be at least SZ_4K */ + u64 chunk_size; + u64 size; + u64 avail; + u64 clear_avail; +}; + +static inline u64 +drm_buddy_block_offset(const struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_OFFSET; +} + +static inline unsigned int +drm_buddy_block_order(struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_ORDER; +} + +static inline unsigned int +drm_buddy_block_state(struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_STATE; +} + +static inline bool +drm_buddy_block_is_allocated(struct drm_buddy_block *block) +{ + return drm_buddy_block_state(block) == DRM_BUDDY_ALLOCATED; +} + +static inline bool +drm_buddy_block_is_clear(struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_CLEAR; +} + +static inline bool +drm_buddy_block_is_free(struct drm_buddy_block *block) +{ + return drm_buddy_block_state(block) == DRM_BUDDY_FREE; +} + +static inline bool +drm_buddy_block_is_split(struct drm_buddy_block *block) +{ + return drm_buddy_block_state(block) == DRM_BUDDY_SPLIT; +} + +static inline u64 +drm_buddy_block_size(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + return mm->chunk_size 
<< drm_buddy_block_order(block); +} + +int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size); + +void drm_buddy_fini(struct drm_buddy *mm); + +struct drm_buddy_block * +drm_get_buddy(struct drm_buddy_block *block); + +int drm_buddy_alloc_blocks(struct drm_buddy *mm, + u64 start, u64 end, u64 size, + u64 min_page_size, + struct list_head *blocks, + unsigned long flags); + +int drm_buddy_block_trim(struct drm_buddy *mm, + u64 *start, + u64 new_size, + struct list_head *blocks); + +void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear); + +void drm_buddy_free_block(struct drm_buddy *mm, struct drm_buddy_block *block); + +void drm_buddy_free_list(struct drm_buddy *mm, + struct list_head *objects, + unsigned int flags); + +void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p); +void drm_buddy_block_print(struct drm_buddy *mm, + struct drm_buddy_block *block, + struct drm_printer *p); +#endif -- cgit v1.2.3 From ba110db8e1bc206c13fd7d985e79b033f53bfdea Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 6 Feb 2026 08:52:38 +1000 Subject: gpu: Move DRM buddy allocator one level up (part two) Move the DRM buddy allocator one level up so that it can be used by GPU drivers (example, nova-core) that have usecases other than DRM (such as VFIO vGPU support). Modify the API, structures and Kconfigs to use "gpu_buddy" terminology. Adapt the drivers and tests to use the new API. The commit cannot be split due to bisectability, however no functional change is intended. Verified by running K-UNIT tests and build tested various configurations. Signed-off-by: Joel Fernandes Reviewed-by: Dave Airlie [airlied: I've split this into two so git can find copies easier. I've also just nuked drm_random library, that stuff needs to be done elsewhere and only the buddy tests seem to be using it]. 
Signed-off-by: Dave Airlie --- Documentation/gpu/drm-mm.rst | 6 + MAINTAINERS | 8 +- drivers/gpu/Kconfig | 13 + drivers/gpu/Makefile | 1 + drivers/gpu/buddy.c | 556 ++++++++++----------- drivers/gpu/drm/Kconfig | 1 + drivers/gpu/drm/Makefile | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h | 12 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 79 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 18 +- drivers/gpu/drm/drm_buddy.c | 77 +++ drivers/gpu/drm/i915/i915_scatterlist.c | 8 +- drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 55 +- drivers/gpu/drm/i915/i915_ttm_buddy_manager.h | 4 +- .../gpu/drm/i915/selftests/intel_memory_region.c | 20 +- drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c | 4 +- drivers/gpu/drm/ttm/tests/ttm_mock_manager.c | 18 +- drivers/gpu/drm/ttm/tests/ttm_mock_manager.h | 2 +- drivers/gpu/drm/xe/xe_res_cursor.h | 34 +- drivers/gpu/drm/xe/xe_svm.c | 12 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 71 +-- drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h | 2 +- drivers/gpu/tests/Makefile | 2 +- drivers/gpu/tests/gpu_buddy_test.c | 412 +++++++-------- drivers/gpu/tests/gpu_random.c | 16 +- drivers/gpu/tests/gpu_random.h | 18 +- drivers/video/Kconfig | 1 + include/drm/drm_buddy.h | 18 + include/linux/gpu_buddy.h | 124 ++--- 30 files changed, 855 insertions(+), 741 deletions(-) create mode 100644 drivers/gpu/Kconfig create mode 100644 drivers/gpu/drm/drm_buddy.c create mode 100644 include/drm/drm_buddy.h (limited to 'include') diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst index ceee0e663237..32fb506db05b 100644 --- a/Documentation/gpu/drm-mm.rst +++ b/Documentation/gpu/drm-mm.rst @@ -532,6 +532,12 @@ Buddy Allocator Function References (GPU buddy) .. kernel-doc:: drivers/gpu/buddy.c :export: +DRM Buddy Specific Logging Function References +---------------------------------------------- + +.. 
kernel-doc:: drivers/gpu/drm/drm_buddy.c + :export: + DRM Cache Handling and Fast WC memcpy() ======================================= diff --git a/MAINTAINERS b/MAINTAINERS index 086cbf5c36b3..f2bec2c0d7e3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8797,15 +8797,17 @@ T: git https://gitlab.freedesktop.org/drm/misc/kernel.git F: drivers/gpu/drm/ttm/ F: include/drm/ttm/ -DRM BUDDY ALLOCATOR +GPU BUDDY ALLOCATOR M: Matthew Auld M: Arun Pravin R: Christian Koenig L: dri-devel@lists.freedesktop.org S: Maintained T: git https://gitlab.freedesktop.org/drm/misc/kernel.git -F: drivers/gpu/drm/drm_buddy.c -F: drivers/gpu/drm/tests/drm_buddy_test.c +F: drivers/gpu/drm_buddy.c +F: drivers/gpu/buddy.c +F: drivers/gpu/tests/gpu_buddy_test.c +F: include/linux/gpu_buddy.h F: include/drm/drm_buddy.h DRM AUTOMATED TESTING diff --git a/drivers/gpu/Kconfig b/drivers/gpu/Kconfig new file mode 100644 index 000000000000..ebb2ad4b7ea0 --- /dev/null +++ b/drivers/gpu/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 + +config GPU_BUDDY + bool + help + A page based buddy allocator for GPU memory. + +config GPU_BUDDY_KUNIT_TEST + tristate "KUnit tests for GPU buddy allocator" if !KUNIT_ALL_TESTS + depends on GPU_BUDDY && KUNIT + default KUNIT_ALL_TESTS + help + KUnit tests for the GPU buddy allocator. 
diff --git a/drivers/gpu/Makefile b/drivers/gpu/Makefile index c5292ee2c852..5cd54d06e262 100644 --- a/drivers/gpu/Makefile +++ b/drivers/gpu/Makefile @@ -6,3 +6,4 @@ obj-y += host1x/ drm/ vga/ tests/ obj-$(CONFIG_IMX_IPUV3_CORE) += ipu-v3/ obj-$(CONFIG_TRACE_GPU_MEM) += trace/ obj-$(CONFIG_NOVA_CORE) += nova-core/ +obj-$(CONFIG_GPU_BUDDY) += buddy.o diff --git a/drivers/gpu/buddy.c b/drivers/gpu/buddy.c index 4cc63d961d26..603c59a2013a 100644 --- a/drivers/gpu/buddy.c +++ b/drivers/gpu/buddy.c @@ -11,27 +11,17 @@ #include #include -#include - -enum drm_buddy_free_tree { - DRM_BUDDY_CLEAR_TREE = 0, - DRM_BUDDY_DIRTY_TREE, - DRM_BUDDY_MAX_FREE_TREES, -}; static struct kmem_cache *slab_blocks; -#define for_each_free_tree(tree) \ - for ((tree) = 0; (tree) < DRM_BUDDY_MAX_FREE_TREES; (tree)++) - -static struct drm_buddy_block *drm_block_alloc(struct drm_buddy *mm, - struct drm_buddy_block *parent, +static struct gpu_buddy_block *gpu_block_alloc(struct gpu_buddy *mm, + struct gpu_buddy_block *parent, unsigned int order, u64 offset) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; - BUG_ON(order > DRM_BUDDY_MAX_ORDER); + BUG_ON(order > GPU_BUDDY_MAX_ORDER); block = kmem_cache_zalloc(slab_blocks, GFP_KERNEL); if (!block) @@ -43,30 +33,30 @@ static struct drm_buddy_block *drm_block_alloc(struct drm_buddy *mm, RB_CLEAR_NODE(&block->rb); - BUG_ON(block->header & DRM_BUDDY_HEADER_UNUSED); + BUG_ON(block->header & GPU_BUDDY_HEADER_UNUSED); return block; } -static void drm_block_free(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void gpu_block_free(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { kmem_cache_free(slab_blocks, block); } -static enum drm_buddy_free_tree -get_block_tree(struct drm_buddy_block *block) +static enum gpu_buddy_free_tree +get_block_tree(struct gpu_buddy_block *block) { - return drm_buddy_block_is_clear(block) ? - DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; + return gpu_buddy_block_is_clear(block) ? 
+ GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE; } -static struct drm_buddy_block * +static struct gpu_buddy_block * rbtree_get_free_block(const struct rb_node *node) { - return node ? rb_entry(node, struct drm_buddy_block, rb) : NULL; + return node ? rb_entry(node, struct gpu_buddy_block, rb) : NULL; } -static struct drm_buddy_block * +static struct gpu_buddy_block * rbtree_last_free_block(struct rb_root *root) { return rbtree_get_free_block(rb_last(root)); @@ -77,33 +67,33 @@ static bool rbtree_is_empty(struct rb_root *root) return RB_EMPTY_ROOT(root); } -static bool drm_buddy_block_offset_less(const struct drm_buddy_block *block, - const struct drm_buddy_block *node) +static bool gpu_buddy_block_offset_less(const struct gpu_buddy_block *block, + const struct gpu_buddy_block *node) { - return drm_buddy_block_offset(block) < drm_buddy_block_offset(node); + return gpu_buddy_block_offset(block) < gpu_buddy_block_offset(node); } static bool rbtree_block_offset_less(struct rb_node *block, const struct rb_node *node) { - return drm_buddy_block_offset_less(rbtree_get_free_block(block), + return gpu_buddy_block_offset_less(rbtree_get_free_block(block), rbtree_get_free_block(node)); } -static void rbtree_insert(struct drm_buddy *mm, - struct drm_buddy_block *block, - enum drm_buddy_free_tree tree) +static void rbtree_insert(struct gpu_buddy *mm, + struct gpu_buddy_block *block, + enum gpu_buddy_free_tree tree) { rb_add(&block->rb, - &mm->free_trees[tree][drm_buddy_block_order(block)], + &mm->free_trees[tree][gpu_buddy_block_order(block)], rbtree_block_offset_less); } -static void rbtree_remove(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void rbtree_remove(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - unsigned int order = drm_buddy_block_order(block); - enum drm_buddy_free_tree tree; + unsigned int order = gpu_buddy_block_order(block); + enum gpu_buddy_free_tree tree; struct rb_root *root; tree = get_block_tree(block); @@ -113,42 +103,42 @@ 
static void rbtree_remove(struct drm_buddy *mm, RB_CLEAR_NODE(&block->rb); } -static void clear_reset(struct drm_buddy_block *block) +static void clear_reset(struct gpu_buddy_block *block) { - block->header &= ~DRM_BUDDY_HEADER_CLEAR; + block->header &= ~GPU_BUDDY_HEADER_CLEAR; } -static void mark_cleared(struct drm_buddy_block *block) +static void mark_cleared(struct gpu_buddy_block *block) { - block->header |= DRM_BUDDY_HEADER_CLEAR; + block->header |= GPU_BUDDY_HEADER_CLEAR; } -static void mark_allocated(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void mark_allocated(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_ALLOCATED; + block->header &= ~GPU_BUDDY_HEADER_STATE; + block->header |= GPU_BUDDY_ALLOCATED; rbtree_remove(mm, block); } -static void mark_free(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void mark_free(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - enum drm_buddy_free_tree tree; + enum gpu_buddy_free_tree tree; - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_FREE; + block->header &= ~GPU_BUDDY_HEADER_STATE; + block->header |= GPU_BUDDY_FREE; tree = get_block_tree(block); rbtree_insert(mm, block, tree); } -static void mark_split(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void mark_split(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_SPLIT; + block->header &= ~GPU_BUDDY_HEADER_STATE; + block->header |= GPU_BUDDY_SPLIT; rbtree_remove(mm, block); } @@ -163,10 +153,10 @@ static inline bool contains(u64 s1, u64 e1, u64 s2, u64 e2) return s1 <= s2 && e1 >= e2; } -static struct drm_buddy_block * -__get_buddy(struct drm_buddy_block *block) +static struct gpu_buddy_block * +__get_buddy(struct gpu_buddy_block *block) { - struct drm_buddy_block *parent; + struct gpu_buddy_block *parent; parent = 
block->parent; if (!parent) @@ -178,19 +168,19 @@ __get_buddy(struct drm_buddy_block *block) return parent->left; } -static unsigned int __drm_buddy_free(struct drm_buddy *mm, - struct drm_buddy_block *block, +static unsigned int __gpu_buddy_free(struct gpu_buddy *mm, + struct gpu_buddy_block *block, bool force_merge) { - struct drm_buddy_block *parent; + struct gpu_buddy_block *parent; unsigned int order; while ((parent = block->parent)) { - struct drm_buddy_block *buddy; + struct gpu_buddy_block *buddy; buddy = __get_buddy(block); - if (!drm_buddy_block_is_free(buddy)) + if (!gpu_buddy_block_is_free(buddy)) break; if (!force_merge) { @@ -198,31 +188,31 @@ static unsigned int __drm_buddy_free(struct drm_buddy *mm, * Check the block and its buddy clear state and exit * the loop if they both have the dissimilar state. */ - if (drm_buddy_block_is_clear(block) != - drm_buddy_block_is_clear(buddy)) + if (gpu_buddy_block_is_clear(block) != + gpu_buddy_block_is_clear(buddy)) break; - if (drm_buddy_block_is_clear(block)) + if (gpu_buddy_block_is_clear(block)) mark_cleared(parent); } rbtree_remove(mm, buddy); - if (force_merge && drm_buddy_block_is_clear(buddy)) - mm->clear_avail -= drm_buddy_block_size(mm, buddy); + if (force_merge && gpu_buddy_block_is_clear(buddy)) + mm->clear_avail -= gpu_buddy_block_size(mm, buddy); - drm_block_free(mm, block); - drm_block_free(mm, buddy); + gpu_block_free(mm, block); + gpu_block_free(mm, buddy); block = parent; } - order = drm_buddy_block_order(block); + order = gpu_buddy_block_order(block); mark_free(mm, block); return order; } -static int __force_merge(struct drm_buddy *mm, +static int __force_merge(struct gpu_buddy *mm, u64 start, u64 end, unsigned int min_order) @@ -241,7 +231,7 @@ static int __force_merge(struct drm_buddy *mm, struct rb_node *iter = rb_last(&mm->free_trees[tree][i]); while (iter) { - struct drm_buddy_block *block, *buddy; + struct gpu_buddy_block *block, *buddy; u64 block_start, block_end; block = 
rbtree_get_free_block(iter); @@ -250,18 +240,18 @@ static int __force_merge(struct drm_buddy *mm, if (!block || !block->parent) continue; - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; + block_start = gpu_buddy_block_offset(block); + block_end = block_start + gpu_buddy_block_size(mm, block) - 1; if (!contains(start, end, block_start, block_end)) continue; buddy = __get_buddy(block); - if (!drm_buddy_block_is_free(buddy)) + if (!gpu_buddy_block_is_free(buddy)) continue; - WARN_ON(drm_buddy_block_is_clear(block) == - drm_buddy_block_is_clear(buddy)); + WARN_ON(gpu_buddy_block_is_clear(block) == + gpu_buddy_block_is_clear(buddy)); /* * Advance to the next node when the current node is the buddy, @@ -271,10 +261,10 @@ static int __force_merge(struct drm_buddy *mm, iter = rb_prev(iter); rbtree_remove(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail -= gpu_buddy_block_size(mm, block); - order = __drm_buddy_free(mm, block, true); + order = __gpu_buddy_free(mm, block, true); if (order >= min_order) return 0; } @@ -285,9 +275,9 @@ static int __force_merge(struct drm_buddy *mm, } /** - * drm_buddy_init - init memory manager + * gpu_buddy_init - init memory manager * - * @mm: DRM buddy manager to initialize + * @mm: GPU buddy manager to initialize * @size: size in bytes to manage * @chunk_size: minimum page size in bytes for our allocations * @@ -296,7 +286,7 @@ static int __force_merge(struct drm_buddy *mm, * Returns: * 0 on success, error code on failure. 
*/ -int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) +int gpu_buddy_init(struct gpu_buddy *mm, u64 size, u64 chunk_size) { unsigned int i, j, root_count = 0; u64 offset = 0; @@ -318,9 +308,9 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) mm->chunk_size = chunk_size; mm->max_order = ilog2(size) - ilog2(chunk_size); - BUG_ON(mm->max_order > DRM_BUDDY_MAX_ORDER); + BUG_ON(mm->max_order > GPU_BUDDY_MAX_ORDER); - mm->free_trees = kmalloc_array(DRM_BUDDY_MAX_FREE_TREES, + mm->free_trees = kmalloc_array(GPU_BUDDY_MAX_FREE_TREES, sizeof(*mm->free_trees), GFP_KERNEL); if (!mm->free_trees) @@ -340,7 +330,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) mm->n_roots = hweight64(size); mm->roots = kmalloc_array(mm->n_roots, - sizeof(struct drm_buddy_block *), + sizeof(struct gpu_buddy_block *), GFP_KERNEL); if (!mm->roots) goto out_free_tree; @@ -350,21 +340,21 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) * not itself a power-of-two. 
*/ do { - struct drm_buddy_block *root; + struct gpu_buddy_block *root; unsigned int order; u64 root_size; order = ilog2(size) - ilog2(chunk_size); root_size = chunk_size << order; - root = drm_block_alloc(mm, NULL, order, offset); + root = gpu_block_alloc(mm, NULL, order, offset); if (!root) goto out_free_roots; mark_free(mm, root); BUG_ON(root_count > mm->max_order); - BUG_ON(drm_buddy_block_size(mm, root) < chunk_size); + BUG_ON(gpu_buddy_block_size(mm, root) < chunk_size); mm->roots[root_count] = root; @@ -377,7 +367,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) out_free_roots: while (root_count--) - drm_block_free(mm, mm->roots[root_count]); + gpu_block_free(mm, mm->roots[root_count]); kfree(mm->roots); out_free_tree: while (i--) @@ -385,16 +375,16 @@ out_free_tree: kfree(mm->free_trees); return -ENOMEM; } -EXPORT_SYMBOL(drm_buddy_init); +EXPORT_SYMBOL(gpu_buddy_init); /** - * drm_buddy_fini - tear down the memory manager + * gpu_buddy_fini - tear down the memory manager * - * @mm: DRM buddy manager to free + * @mm: GPU buddy manager to free * * Cleanup memory manager resources and the freetree */ -void drm_buddy_fini(struct drm_buddy *mm) +void gpu_buddy_fini(struct gpu_buddy *mm) { u64 root_size, size, start; unsigned int order; @@ -404,13 +394,13 @@ void drm_buddy_fini(struct drm_buddy *mm) for (i = 0; i < mm->n_roots; ++i) { order = ilog2(size) - ilog2(mm->chunk_size); - start = drm_buddy_block_offset(mm->roots[i]); + start = gpu_buddy_block_offset(mm->roots[i]); __force_merge(mm, start, start + size, order); - if (WARN_ON(!drm_buddy_block_is_free(mm->roots[i]))) + if (WARN_ON(!gpu_buddy_block_is_free(mm->roots[i]))) kunit_fail_current_test("buddy_fini() root"); - drm_block_free(mm, mm->roots[i]); + gpu_block_free(mm, mm->roots[i]); root_size = mm->chunk_size << order; size -= root_size; @@ -423,31 +413,31 @@ void drm_buddy_fini(struct drm_buddy *mm) kfree(mm->free_trees); kfree(mm->roots); } -EXPORT_SYMBOL(drm_buddy_fini); 
+EXPORT_SYMBOL(gpu_buddy_fini); -static int split_block(struct drm_buddy *mm, - struct drm_buddy_block *block) +static int split_block(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - unsigned int block_order = drm_buddy_block_order(block) - 1; - u64 offset = drm_buddy_block_offset(block); + unsigned int block_order = gpu_buddy_block_order(block) - 1; + u64 offset = gpu_buddy_block_offset(block); - BUG_ON(!drm_buddy_block_is_free(block)); - BUG_ON(!drm_buddy_block_order(block)); + BUG_ON(!gpu_buddy_block_is_free(block)); + BUG_ON(!gpu_buddy_block_order(block)); - block->left = drm_block_alloc(mm, block, block_order, offset); + block->left = gpu_block_alloc(mm, block, block_order, offset); if (!block->left) return -ENOMEM; - block->right = drm_block_alloc(mm, block, block_order, + block->right = gpu_block_alloc(mm, block, block_order, offset + (mm->chunk_size << block_order)); if (!block->right) { - drm_block_free(mm, block->left); + gpu_block_free(mm, block->left); return -ENOMEM; } mark_split(mm, block); - if (drm_buddy_block_is_clear(block)) { + if (gpu_buddy_block_is_clear(block)) { mark_cleared(block->left); mark_cleared(block->right); clear_reset(block); @@ -460,34 +450,34 @@ static int split_block(struct drm_buddy *mm, } /** - * drm_get_buddy - get buddy address + * gpu_get_buddy - get buddy address * - * @block: DRM buddy block + * @block: GPU buddy block * * Returns the corresponding buddy block for @block, or NULL * if this is a root block and can't be merged further. * Requires some kind of locking to protect against * any concurrent allocate and free operations. 
*/ -struct drm_buddy_block * -drm_get_buddy(struct drm_buddy_block *block) +struct gpu_buddy_block * +gpu_get_buddy(struct gpu_buddy_block *block) { return __get_buddy(block); } -EXPORT_SYMBOL(drm_get_buddy); +EXPORT_SYMBOL(gpu_get_buddy); /** - * drm_buddy_reset_clear - reset blocks clear state + * gpu_buddy_reset_clear - reset blocks clear state * - * @mm: DRM buddy manager + * @mm: GPU buddy manager * @is_clear: blocks clear state * * Reset the clear state based on @is_clear value for each block * in the freetree. */ -void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear) +void gpu_buddy_reset_clear(struct gpu_buddy *mm, bool is_clear) { - enum drm_buddy_free_tree src_tree, dst_tree; + enum gpu_buddy_free_tree src_tree, dst_tree; u64 root_size, size, start; unsigned int order; int i; @@ -495,60 +485,60 @@ void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear) size = mm->size; for (i = 0; i < mm->n_roots; ++i) { order = ilog2(size) - ilog2(mm->chunk_size); - start = drm_buddy_block_offset(mm->roots[i]); + start = gpu_buddy_block_offset(mm->roots[i]); __force_merge(mm, start, start + size, order); root_size = mm->chunk_size << order; size -= root_size; } - src_tree = is_clear ? DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; - dst_tree = is_clear ? DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; + src_tree = is_clear ? GPU_BUDDY_DIRTY_TREE : GPU_BUDDY_CLEAR_TREE; + dst_tree = is_clear ? 
GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE; for (i = 0; i <= mm->max_order; ++i) { struct rb_root *root = &mm->free_trees[src_tree][i]; - struct drm_buddy_block *block, *tmp; + struct gpu_buddy_block *block, *tmp; rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { rbtree_remove(mm, block); if (is_clear) { mark_cleared(block); - mm->clear_avail += drm_buddy_block_size(mm, block); + mm->clear_avail += gpu_buddy_block_size(mm, block); } else { clear_reset(block); - mm->clear_avail -= drm_buddy_block_size(mm, block); + mm->clear_avail -= gpu_buddy_block_size(mm, block); } rbtree_insert(mm, block, dst_tree); } } } -EXPORT_SYMBOL(drm_buddy_reset_clear); +EXPORT_SYMBOL(gpu_buddy_reset_clear); /** - * drm_buddy_free_block - free a block + * gpu_buddy_free_block - free a block * - * @mm: DRM buddy manager + * @mm: GPU buddy manager * @block: block to be freed */ -void drm_buddy_free_block(struct drm_buddy *mm, - struct drm_buddy_block *block) +void gpu_buddy_free_block(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - BUG_ON(!drm_buddy_block_is_allocated(block)); - mm->avail += drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail += drm_buddy_block_size(mm, block); + BUG_ON(!gpu_buddy_block_is_allocated(block)); + mm->avail += gpu_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail += gpu_buddy_block_size(mm, block); - __drm_buddy_free(mm, block, false); + __gpu_buddy_free(mm, block, false); } -EXPORT_SYMBOL(drm_buddy_free_block); +EXPORT_SYMBOL(gpu_buddy_free_block); -static void __drm_buddy_free_list(struct drm_buddy *mm, +static void __gpu_buddy_free_list(struct gpu_buddy *mm, struct list_head *objects, bool mark_clear, bool mark_dirty) { - struct drm_buddy_block *block, *on; + struct gpu_buddy_block *block, *on; WARN_ON(mark_dirty && mark_clear); @@ -557,13 +547,13 @@ static void __drm_buddy_free_list(struct drm_buddy *mm, mark_cleared(block); else if (mark_dirty) 
clear_reset(block); - drm_buddy_free_block(mm, block); + gpu_buddy_free_block(mm, block); cond_resched(); } INIT_LIST_HEAD(objects); } -static void drm_buddy_free_list_internal(struct drm_buddy *mm, +static void gpu_buddy_free_list_internal(struct gpu_buddy *mm, struct list_head *objects) { /* @@ -571,43 +561,43 @@ static void drm_buddy_free_list_internal(struct drm_buddy *mm, * at this point. For example we might have just failed part of the * allocation. */ - __drm_buddy_free_list(mm, objects, false, false); + __gpu_buddy_free_list(mm, objects, false, false); } /** - * drm_buddy_free_list - free blocks + * gpu_buddy_free_list - free blocks * - * @mm: DRM buddy manager + * @mm: GPU buddy manager * @objects: input list head to free blocks - * @flags: optional flags like DRM_BUDDY_CLEARED + * @flags: optional flags like GPU_BUDDY_CLEARED */ -void drm_buddy_free_list(struct drm_buddy *mm, +void gpu_buddy_free_list(struct gpu_buddy *mm, struct list_head *objects, unsigned int flags) { - bool mark_clear = flags & DRM_BUDDY_CLEARED; + bool mark_clear = flags & GPU_BUDDY_CLEARED; - __drm_buddy_free_list(mm, objects, mark_clear, !mark_clear); + __gpu_buddy_free_list(mm, objects, mark_clear, !mark_clear); } -EXPORT_SYMBOL(drm_buddy_free_list); +EXPORT_SYMBOL(gpu_buddy_free_list); -static bool block_incompatible(struct drm_buddy_block *block, unsigned int flags) +static bool block_incompatible(struct gpu_buddy_block *block, unsigned int flags) { - bool needs_clear = flags & DRM_BUDDY_CLEAR_ALLOCATION; + bool needs_clear = flags & GPU_BUDDY_CLEAR_ALLOCATION; - return needs_clear != drm_buddy_block_is_clear(block); + return needs_clear != gpu_buddy_block_is_clear(block); } -static struct drm_buddy_block * -__alloc_range_bias(struct drm_buddy *mm, +static struct gpu_buddy_block * +__alloc_range_bias(struct gpu_buddy *mm, u64 start, u64 end, unsigned int order, unsigned long flags, bool fallback) { u64 req_size = mm->chunk_size << order; - struct drm_buddy_block *block; - 
struct drm_buddy_block *buddy; + struct gpu_buddy_block *block; + struct gpu_buddy_block *buddy; LIST_HEAD(dfs); int err; int i; @@ -622,23 +612,23 @@ __alloc_range_bias(struct drm_buddy *mm, u64 block_end; block = list_first_entry_or_null(&dfs, - struct drm_buddy_block, + struct gpu_buddy_block, tmp_link); if (!block) break; list_del(&block->tmp_link); - if (drm_buddy_block_order(block) < order) + if (gpu_buddy_block_order(block) < order) continue; - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; + block_start = gpu_buddy_block_offset(block); + block_end = block_start + gpu_buddy_block_size(mm, block) - 1; if (!overlaps(start, end, block_start, block_end)) continue; - if (drm_buddy_block_is_allocated(block)) + if (gpu_buddy_block_is_allocated(block)) continue; if (block_start < start || block_end > end) { @@ -654,17 +644,17 @@ __alloc_range_bias(struct drm_buddy *mm, continue; if (contains(start, end, block_start, block_end) && - order == drm_buddy_block_order(block)) { + order == gpu_buddy_block_order(block)) { /* * Find the free block within the range. 
*/ - if (drm_buddy_block_is_free(block)) + if (gpu_buddy_block_is_free(block)) return block; continue; } - if (!drm_buddy_block_is_split(block)) { + if (!gpu_buddy_block_is_split(block)) { err = split_block(mm, block); if (unlikely(err)) goto err_undo; @@ -684,19 +674,19 @@ err_undo: */ buddy = __get_buddy(block); if (buddy && - (drm_buddy_block_is_free(block) && - drm_buddy_block_is_free(buddy))) - __drm_buddy_free(mm, block, false); + (gpu_buddy_block_is_free(block) && + gpu_buddy_block_is_free(buddy))) + __gpu_buddy_free(mm, block, false); return ERR_PTR(err); } -static struct drm_buddy_block * -__drm_buddy_alloc_range_bias(struct drm_buddy *mm, +static struct gpu_buddy_block * +__gpu_buddy_alloc_range_bias(struct gpu_buddy *mm, u64 start, u64 end, unsigned int order, unsigned long flags) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; bool fallback = false; block = __alloc_range_bias(mm, start, end, order, @@ -708,12 +698,12 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm, return block; } -static struct drm_buddy_block * -get_maxblock(struct drm_buddy *mm, +static struct gpu_buddy_block * +get_maxblock(struct gpu_buddy *mm, unsigned int order, - enum drm_buddy_free_tree tree) + enum gpu_buddy_free_tree tree) { - struct drm_buddy_block *max_block = NULL, *block = NULL; + struct gpu_buddy_block *max_block = NULL, *block = NULL; struct rb_root *root; unsigned int i; @@ -728,8 +718,8 @@ get_maxblock(struct drm_buddy *mm, continue; } - if (drm_buddy_block_offset(block) > - drm_buddy_block_offset(max_block)) { + if (gpu_buddy_block_offset(block) > + gpu_buddy_block_offset(max_block)) { max_block = block; } } @@ -737,25 +727,25 @@ get_maxblock(struct drm_buddy *mm, return max_block; } -static struct drm_buddy_block * -alloc_from_freetree(struct drm_buddy *mm, +static struct gpu_buddy_block * +alloc_from_freetree(struct gpu_buddy *mm, unsigned int order, unsigned long flags) { - struct drm_buddy_block *block = NULL; + struct gpu_buddy_block 
*block = NULL; struct rb_root *root; - enum drm_buddy_free_tree tree; + enum gpu_buddy_free_tree tree; unsigned int tmp; int err; - tree = (flags & DRM_BUDDY_CLEAR_ALLOCATION) ? - DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; + tree = (flags & GPU_BUDDY_CLEAR_ALLOCATION) ? + GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE; - if (flags & DRM_BUDDY_TOPDOWN_ALLOCATION) { + if (flags & GPU_BUDDY_TOPDOWN_ALLOCATION) { block = get_maxblock(mm, order, tree); if (block) /* Store the obtained block order */ - tmp = drm_buddy_block_order(block); + tmp = gpu_buddy_block_order(block); } else { for (tmp = order; tmp <= mm->max_order; ++tmp) { /* Get RB tree root for this order and tree */ @@ -768,8 +758,8 @@ alloc_from_freetree(struct drm_buddy *mm, if (!block) { /* Try allocating from the other tree */ - tree = (tree == DRM_BUDDY_CLEAR_TREE) ? - DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; + tree = (tree == GPU_BUDDY_CLEAR_TREE) ? + GPU_BUDDY_DIRTY_TREE : GPU_BUDDY_CLEAR_TREE; for (tmp = order; tmp <= mm->max_order; ++tmp) { root = &mm->free_trees[tree][tmp]; @@ -782,7 +772,7 @@ alloc_from_freetree(struct drm_buddy *mm, return ERR_PTR(-ENOSPC); } - BUG_ON(!drm_buddy_block_is_free(block)); + BUG_ON(!gpu_buddy_block_is_free(block)); while (tmp != order) { err = split_block(mm, block); @@ -796,18 +786,18 @@ alloc_from_freetree(struct drm_buddy *mm, err_undo: if (tmp != order) - __drm_buddy_free(mm, block, false); + __gpu_buddy_free(mm, block, false); return ERR_PTR(err); } -static int __alloc_range(struct drm_buddy *mm, +static int __alloc_range(struct gpu_buddy *mm, struct list_head *dfs, u64 start, u64 size, struct list_head *blocks, u64 *total_allocated_on_err) { - struct drm_buddy_block *block; - struct drm_buddy_block *buddy; + struct gpu_buddy_block *block; + struct gpu_buddy_block *buddy; u64 total_allocated = 0; LIST_HEAD(allocated); u64 end; @@ -820,31 +810,31 @@ static int __alloc_range(struct drm_buddy *mm, u64 block_end; block = list_first_entry_or_null(dfs, - struct 
drm_buddy_block, + struct gpu_buddy_block, tmp_link); if (!block) break; list_del(&block->tmp_link); - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; + block_start = gpu_buddy_block_offset(block); + block_end = block_start + gpu_buddy_block_size(mm, block) - 1; if (!overlaps(start, end, block_start, block_end)) continue; - if (drm_buddy_block_is_allocated(block)) { + if (gpu_buddy_block_is_allocated(block)) { err = -ENOSPC; goto err_free; } if (contains(start, end, block_start, block_end)) { - if (drm_buddy_block_is_free(block)) { + if (gpu_buddy_block_is_free(block)) { mark_allocated(mm, block); - total_allocated += drm_buddy_block_size(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); + total_allocated += gpu_buddy_block_size(mm, block); + mm->avail -= gpu_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail -= gpu_buddy_block_size(mm, block); list_add_tail(&block->link, &allocated); continue; } else if (!mm->clear_avail) { @@ -853,7 +843,7 @@ static int __alloc_range(struct drm_buddy *mm, } } - if (!drm_buddy_block_is_split(block)) { + if (!gpu_buddy_block_is_split(block)) { err = split_block(mm, block); if (unlikely(err)) goto err_undo; @@ -880,22 +870,22 @@ err_undo: */ buddy = __get_buddy(block); if (buddy && - (drm_buddy_block_is_free(block) && - drm_buddy_block_is_free(buddy))) - __drm_buddy_free(mm, block, false); + (gpu_buddy_block_is_free(block) && + gpu_buddy_block_is_free(buddy))) + __gpu_buddy_free(mm, block, false); err_free: if (err == -ENOSPC && total_allocated_on_err) { list_splice_tail(&allocated, blocks); *total_allocated_on_err = total_allocated; } else { - drm_buddy_free_list_internal(mm, &allocated); + gpu_buddy_free_list_internal(mm, &allocated); } return err; } -static int __drm_buddy_alloc_range(struct drm_buddy *mm, +static int 
__gpu_buddy_alloc_range(struct gpu_buddy *mm, u64 start, u64 size, u64 *total_allocated_on_err, @@ -911,13 +901,13 @@ static int __drm_buddy_alloc_range(struct drm_buddy *mm, blocks, total_allocated_on_err); } -static int __alloc_contig_try_harder(struct drm_buddy *mm, +static int __alloc_contig_try_harder(struct gpu_buddy *mm, u64 size, u64 min_block_size, struct list_head *blocks) { u64 rhs_offset, lhs_offset, lhs_size, filled; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; unsigned int tree, order; LIST_HEAD(blocks_lhs); unsigned long pages; @@ -943,8 +933,8 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, block = rbtree_get_free_block(iter); /* Allocate blocks traversing RHS */ - rhs_offset = drm_buddy_block_offset(block); - err = __drm_buddy_alloc_range(mm, rhs_offset, size, + rhs_offset = gpu_buddy_block_offset(block); + err = __gpu_buddy_alloc_range(mm, rhs_offset, size, &filled, blocks); if (!err || err != -ENOSPC) return err; @@ -954,18 +944,18 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, lhs_size = round_up(lhs_size, min_block_size); /* Allocate blocks traversing LHS */ - lhs_offset = drm_buddy_block_offset(block) - lhs_size; - err = __drm_buddy_alloc_range(mm, lhs_offset, lhs_size, + lhs_offset = gpu_buddy_block_offset(block) - lhs_size; + err = __gpu_buddy_alloc_range(mm, lhs_offset, lhs_size, NULL, &blocks_lhs); if (!err) { list_splice(&blocks_lhs, blocks); return 0; } else if (err != -ENOSPC) { - drm_buddy_free_list_internal(mm, blocks); + gpu_buddy_free_list_internal(mm, blocks); return err; } /* Free blocks for the next iteration */ - drm_buddy_free_list_internal(mm, blocks); + gpu_buddy_free_list_internal(mm, blocks); iter = rb_prev(iter); } @@ -975,9 +965,9 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, } /** - * drm_buddy_block_trim - free unused pages + * gpu_buddy_block_trim - free unused pages * - * @mm: DRM buddy manager + * @mm: GPU buddy manager * @start: start address to begin 
the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. @@ -993,13 +983,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * Returns: * 0 on success, error code on failure. */ -int drm_buddy_block_trim(struct drm_buddy *mm, +int gpu_buddy_block_trim(struct gpu_buddy *mm, u64 *start, u64 new_size, struct list_head *blocks) { - struct drm_buddy_block *parent; - struct drm_buddy_block *block; + struct gpu_buddy_block *parent; + struct gpu_buddy_block *block; u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; @@ -1009,22 +999,22 @@ int drm_buddy_block_trim(struct drm_buddy *mm, return -EINVAL; block = list_first_entry(blocks, - struct drm_buddy_block, + struct gpu_buddy_block, link); - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block); + block_start = gpu_buddy_block_offset(block); + block_end = block_start + gpu_buddy_block_size(mm, block); - if (WARN_ON(!drm_buddy_block_is_allocated(block))) + if (WARN_ON(!gpu_buddy_block_is_allocated(block))) return -EINVAL; - if (new_size > drm_buddy_block_size(mm, block)) + if (new_size > gpu_buddy_block_size(mm, block)) return -EINVAL; if (!new_size || !IS_ALIGNED(new_size, mm->chunk_size)) return -EINVAL; - if (new_size == drm_buddy_block_size(mm, block)) + if (new_size == gpu_buddy_block_size(mm, block)) return 0; new_start = block_start; @@ -1043,9 +1033,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, list_del(&block->link); mark_free(mm, block); - mm->avail += drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail += drm_buddy_block_size(mm, block); + mm->avail += gpu_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail += gpu_buddy_block_size(mm, block); /* Prevent recursively freeing this node */ parent = block->parent; @@ -1055,26 +1045,26 @@ int drm_buddy_block_trim(struct drm_buddy *mm, err = __alloc_range(mm, &dfs, new_start, 
new_size, blocks, NULL); if (err) { mark_allocated(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); + mm->avail -= gpu_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail -= gpu_buddy_block_size(mm, block); list_add(&block->link, blocks); } block->parent = parent; return err; } -EXPORT_SYMBOL(drm_buddy_block_trim); +EXPORT_SYMBOL(gpu_buddy_block_trim); -static struct drm_buddy_block * -__drm_buddy_alloc_blocks(struct drm_buddy *mm, +static struct gpu_buddy_block * +__gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, unsigned int order, unsigned long flags) { - if (flags & DRM_BUDDY_RANGE_ALLOCATION) + if (flags & GPU_BUDDY_RANGE_ALLOCATION) /* Allocate traversing within the range */ - return __drm_buddy_alloc_range_bias(mm, start, end, + return __gpu_buddy_alloc_range_bias(mm, start, end, order, flags); else /* Allocate from freetree */ @@ -1082,15 +1072,15 @@ __drm_buddy_alloc_blocks(struct drm_buddy *mm, } /** - * drm_buddy_alloc_blocks - allocate power-of-two blocks + * gpu_buddy_alloc_blocks - allocate power-of-two blocks * - * @mm: DRM buddy manager to allocate from + * @mm: GPU buddy manager to allocate from * @start: start of the allowed range for this block * @end: end of the allowed range for this block * @size: size of the allocation in bytes * @min_block_size: alignment of the allocation * @blocks: output list head to add allocated blocks - * @flags: DRM_BUDDY_*_ALLOCATION flags + * @flags: GPU_BUDDY_*_ALLOCATION flags * * alloc_range_bias() called on range limitations, which traverses * the tree and returns the desired block. @@ -1101,13 +1091,13 @@ __drm_buddy_alloc_blocks(struct drm_buddy *mm, * Returns: * 0 on success, error code on failure. 
*/ -int drm_buddy_alloc_blocks(struct drm_buddy *mm, +int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, u64 size, u64 min_block_size, struct list_head *blocks, unsigned long flags) { - struct drm_buddy_block *block = NULL; + struct gpu_buddy_block *block = NULL; u64 original_size, original_min_size; unsigned int min_order, order; LIST_HEAD(allocated); @@ -1137,14 +1127,14 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, if (!IS_ALIGNED(start | end, min_block_size)) return -EINVAL; - return __drm_buddy_alloc_range(mm, start, size, NULL, blocks); + return __gpu_buddy_alloc_range(mm, start, size, NULL, blocks); } original_size = size; original_min_size = min_block_size; /* Roundup the size to power of 2 */ - if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) { + if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION) { size = roundup_pow_of_two(size); min_block_size = size; /* Align size value to min_block_size */ @@ -1157,8 +1147,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, min_order = ilog2(min_block_size) - ilog2(mm->chunk_size); if (order > mm->max_order || size > mm->size) { - if ((flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) && - !(flags & DRM_BUDDY_RANGE_ALLOCATION)) + if ((flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION) && + !(flags & GPU_BUDDY_RANGE_ALLOCATION)) return __alloc_contig_try_harder(mm, original_size, original_min_size, blocks); @@ -1171,7 +1161,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, BUG_ON(order < min_order); do { - block = __drm_buddy_alloc_blocks(mm, start, + block = __gpu_buddy_alloc_blocks(mm, start, end, order, flags); @@ -1182,7 +1172,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, /* Try allocation through force merge method */ if (mm->clear_avail && !__force_merge(mm, start, end, min_order)) { - block = __drm_buddy_alloc_blocks(mm, start, + block = __gpu_buddy_alloc_blocks(mm, start, end, min_order, flags); @@ -1196,8 +1186,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, * Try contiguous block allocation 
through * try harder method. */ - if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION && - !(flags & DRM_BUDDY_RANGE_ALLOCATION)) + if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION && + !(flags & GPU_BUDDY_RANGE_ALLOCATION)) return __alloc_contig_try_harder(mm, original_size, original_min_size, @@ -1208,9 +1198,9 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); mark_allocated(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); + mm->avail -= gpu_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail -= gpu_buddy_block_size(mm, block); kmemleak_update_trace(block); list_add_tail(&block->link, &allocated); @@ -1221,7 +1211,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + if (!(flags & GPU_BUDDY_TRIM_DISABLE) && original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); @@ -1234,11 +1224,11 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, block = list_last_entry(&allocated, typeof(*block), link); list_move(&block->link, &temp); trim_list = &temp; - trim_size = drm_buddy_block_size(mm, block) - + trim_size = gpu_buddy_block_size(mm, block) - (size - original_size); } - drm_buddy_block_trim(mm, + gpu_buddy_block_trim(mm, NULL, trim_size, trim_list); @@ -1251,44 +1241,42 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, return 0; err_free: - drm_buddy_free_list_internal(mm, &allocated); + gpu_buddy_free_list_internal(mm, &allocated); return err; } -EXPORT_SYMBOL(drm_buddy_alloc_blocks); +EXPORT_SYMBOL(gpu_buddy_alloc_blocks); /** - * drm_buddy_block_print - print block information + * gpu_buddy_block_print - print block information * - * @mm: DRM buddy manager - * @block: DRM buddy block - * @p: DRM printer to use + * @mm: GPU buddy manager + * @block: GPU buddy block */ -void drm_buddy_block_print(struct 
drm_buddy *mm, - struct drm_buddy_block *block, - struct drm_printer *p) +void gpu_buddy_block_print(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - u64 start = drm_buddy_block_offset(block); - u64 size = drm_buddy_block_size(mm, block); + u64 start = gpu_buddy_block_offset(block); + u64 size = gpu_buddy_block_size(mm, block); - drm_printf(p, "%#018llx-%#018llx: %llu\n", start, start + size, size); + pr_info("%#018llx-%#018llx: %llu\n", start, start + size, size); } -EXPORT_SYMBOL(drm_buddy_block_print); +EXPORT_SYMBOL(gpu_buddy_block_print); /** - * drm_buddy_print - print allocator state + * gpu_buddy_print - print allocator state * - * @mm: DRM buddy manager - * @p: DRM printer to use + * @mm: GPU buddy manager + * @p: GPU printer to use */ -void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p) +void gpu_buddy_print(struct gpu_buddy *mm) { int order; - drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n", - mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20); + pr_info("chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n", + mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20); for (order = mm->max_order; order >= 0; order--) { - struct drm_buddy_block *block, *tmp; + struct gpu_buddy_block *block, *tmp; struct rb_root *root; u64 count = 0, free; unsigned int tree; @@ -1297,40 +1285,38 @@ void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p) root = &mm->free_trees[tree][order]; rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { - BUG_ON(!drm_buddy_block_is_free(block)); + BUG_ON(!gpu_buddy_block_is_free(block)); count++; } } - drm_printf(p, "order-%2d ", order); - free = count * (mm->chunk_size << order); if (free < SZ_1M) - drm_printf(p, "free: %8llu KiB", free >> 10); + pr_info("order-%2d free: %8llu KiB, blocks: %llu\n", + order, free >> 10, count); else - drm_printf(p, "free: %8llu MiB", free >> 20); - - 
drm_printf(p, ", blocks: %llu\n", count); + pr_info("order-%2d free: %8llu MiB, blocks: %llu\n", + order, free >> 20, count); } } -EXPORT_SYMBOL(drm_buddy_print); +EXPORT_SYMBOL(gpu_buddy_print); -static void drm_buddy_module_exit(void) +static void gpu_buddy_module_exit(void) { kmem_cache_destroy(slab_blocks); } -static int __init drm_buddy_module_init(void) +static int __init gpu_buddy_module_init(void) { - slab_blocks = KMEM_CACHE(drm_buddy_block, 0); + slab_blocks = KMEM_CACHE(gpu_buddy_block, 0); if (!slab_blocks) return -ENOMEM; return 0; } -module_init(drm_buddy_module_init); -module_exit(drm_buddy_module_exit); +module_init(gpu_buddy_module_init); +module_exit(gpu_buddy_module_exit); -MODULE_DESCRIPTION("DRM Buddy Allocator"); +MODULE_DESCRIPTION("GPU Buddy Allocator"); MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 862ff4000969..758f2eb3d588 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -220,6 +220,7 @@ config DRM_GPUSVM config DRM_BUDDY tristate depends on DRM + select GPU_BUDDY help A page based buddy allocator diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile index 892859cfe95f..d0e37f8c2a46 100644 --- a/drivers/gpu/drm/Makefile +++ b/drivers/gpu/drm/Makefile @@ -114,7 +114,7 @@ drm_gpusvm_helper-$(CONFIG_ZONE_DEVICE) += \ obj-$(CONFIG_DRM_GPUSVM) += drm_gpusvm_helper.o -obj-$(CONFIG_DRM_BUDDY) += ../buddy.o +obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o drm_dma_helper-y := drm_gem_dma_helper.o drm_dma_helper-$(CONFIG_DRM_FBDEV_EMULATION) += drm_fbdev_dma.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f582113d78b7..149f8f942eae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -5663,7 +5663,7 @@ int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_vram_mgr_resource *vres; struct 
ras_critical_region *region; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; int ret = 0; if (!bo || !bo->tbo.resource) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index be2e56ce1355..8908d9e08a30 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -55,7 +55,7 @@ static inline void amdgpu_res_first(struct ttm_resource *res, uint64_t start, uint64_t size, struct amdgpu_res_cursor *cur) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct list_head *head, *next; struct drm_mm_node *node; @@ -71,7 +71,7 @@ static inline void amdgpu_res_first(struct ttm_resource *res, head = &to_amdgpu_vram_mgr_resource(res)->blocks; block = list_first_entry_or_null(head, - struct drm_buddy_block, + struct gpu_buddy_block, link); if (!block) goto fallback; @@ -81,7 +81,7 @@ static inline void amdgpu_res_first(struct ttm_resource *res, next = block->link.next; if (next != head) - block = list_entry(next, struct drm_buddy_block, link); + block = list_entry(next, struct gpu_buddy_block, link); } cur->start = amdgpu_vram_mgr_block_start(block) + start; @@ -125,7 +125,7 @@ fallback: */ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct drm_mm_node *node; struct list_head *next; @@ -146,7 +146,7 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) block = cur->node; next = block->link.next; - block = list_entry(next, struct drm_buddy_block, link); + block = list_entry(next, struct gpu_buddy_block, link); cur->node = block; cur->start = amdgpu_vram_mgr_block_start(block); @@ -175,7 +175,7 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) */ static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) { - struct drm_buddy_block *block; + struct 
gpu_buddy_block *block; switch (cur->mem_type) { case TTM_PL_VRAM: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 9d934c07fa6b..cd94f6efb7cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "amdgpu.h" #include "amdgpu_vm.h" @@ -52,15 +53,15 @@ to_amdgpu_device(struct amdgpu_vram_mgr *mgr) return container_of(mgr, struct amdgpu_device, mman.vram_mgr); } -static inline struct drm_buddy_block * +static inline struct gpu_buddy_block * amdgpu_vram_mgr_first_block(struct list_head *list) { - return list_first_entry_or_null(list, struct drm_buddy_block, link); + return list_first_entry_or_null(list, struct gpu_buddy_block, link); } static inline bool amdgpu_is_vram_mgr_blocks_contiguous(struct list_head *head) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; u64 start, size; block = amdgpu_vram_mgr_first_block(head); @@ -71,7 +72,7 @@ static inline bool amdgpu_is_vram_mgr_blocks_contiguous(struct list_head *head) start = amdgpu_vram_mgr_block_start(block); size = amdgpu_vram_mgr_block_size(block); - block = list_entry(block->link.next, struct drm_buddy_block, link); + block = list_entry(block->link.next, struct gpu_buddy_block, link); if (start + size != amdgpu_vram_mgr_block_start(block)) return false; } @@ -81,7 +82,7 @@ static inline bool amdgpu_is_vram_mgr_blocks_contiguous(struct list_head *head) static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; u64 size = 0; list_for_each_entry(block, head, link) @@ -254,7 +255,7 @@ const struct attribute_group amdgpu_vram_mgr_attr_group = { * Calculate how many bytes of the DRM BUDDY block are inside visible VRAM */ static u64 amdgpu_vram_mgr_vis_size(struct amdgpu_device *adev, - struct drm_buddy_block *block) + struct gpu_buddy_block 
*block) { u64 start = amdgpu_vram_mgr_block_start(block); u64 end = start + amdgpu_vram_mgr_block_size(block); @@ -279,7 +280,7 @@ u64 amdgpu_vram_mgr_bo_visible_size(struct amdgpu_bo *bo) struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct ttm_resource *res = bo->tbo.resource; struct amdgpu_vram_mgr_resource *vres = to_amdgpu_vram_mgr_resource(res); - struct drm_buddy_block *block; + struct gpu_buddy_block *block; u64 usage = 0; if (amdgpu_gmc_vram_full_visible(&adev->gmc)) @@ -299,15 +300,15 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man) { struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; struct amdgpu_vram_reservation *rsv, *temp; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; uint64_t vis_usage; list_for_each_entry_safe(rsv, temp, &mgr->reservations_pending, blocks) { - if (drm_buddy_alloc_blocks(mm, rsv->start, rsv->start + rsv->size, + if (gpu_buddy_alloc_blocks(mm, rsv->start, rsv->start + rsv->size, rsv->size, mm->chunk_size, &rsv->allocated, - DRM_BUDDY_RANGE_ALLOCATION)) + GPU_BUDDY_RANGE_ALLOCATION)) continue; block = amdgpu_vram_mgr_first_block(&rsv->allocated); @@ -403,7 +404,7 @@ int amdgpu_vram_mgr_query_address_block_info(struct amdgpu_vram_mgr *mgr, uint64_t address, struct amdgpu_vram_block_info *info) { struct amdgpu_vram_mgr_resource *vres; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; u64 start, size; int ret = -ENOENT; @@ -450,8 +451,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; unsigned int adjust_dcc_size = 0; - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &mgr->mm; + struct gpu_buddy_block *block; unsigned long pages_per_block; int r; @@ -493,17 +494,17 @@ static int amdgpu_vram_mgr_new(struct 
ttm_resource_manager *man, INIT_LIST_HEAD(&vres->blocks); if (place->flags & TTM_PL_FLAG_TOPDOWN) - vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; + vres->flags |= GPU_BUDDY_TOPDOWN_ALLOCATION; if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) - vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION; + vres->flags |= GPU_BUDDY_CONTIGUOUS_ALLOCATION; if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED) - vres->flags |= DRM_BUDDY_CLEAR_ALLOCATION; + vres->flags |= GPU_BUDDY_CLEAR_ALLOCATION; if (fpfn || lpfn != mgr->mm.size) /* Allocate blocks in desired range */ - vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; + vres->flags |= GPU_BUDDY_RANGE_ALLOCATION; if (bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC && adev->gmc.gmc_funcs->get_dcc_alignment) @@ -516,7 +517,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, dcc_size = roundup_pow_of_two(vres->base.size + adjust_dcc_size); remaining_size = (u64)dcc_size; - vres->flags |= DRM_BUDDY_TRIM_DISABLE; + vres->flags |= GPU_BUDDY_TRIM_DISABLE; } mutex_lock(&mgr->lock); @@ -536,7 +537,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - r = drm_buddy_alloc_blocks(mm, fpfn, + r = gpu_buddy_alloc_blocks(mm, fpfn, lpfn, size, min_block_size, @@ -545,7 +546,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, if (unlikely(r == -ENOSPC) && pages_per_block == ~0ul && !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) { - vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION; + vres->flags &= ~GPU_BUDDY_CONTIGUOUS_ALLOCATION; pages_per_block = max_t(u32, 2UL << (20UL - PAGE_SHIFT), tbo->page_alignment); @@ -566,7 +567,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, list_add_tail(&vres->vres_node, &mgr->allocated_vres_list); if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && adjust_dcc_size) { - struct drm_buddy_block *dcc_block; + struct gpu_buddy_block *dcc_block; unsigned long dcc_start; u64 trim_start; @@ -576,7 +577,7 @@ static int 
amdgpu_vram_mgr_new(struct ttm_resource_manager *man, roundup((unsigned long)amdgpu_vram_mgr_block_start(dcc_block), adjust_dcc_size); trim_start = (u64)dcc_start; - drm_buddy_block_trim(mm, &trim_start, + gpu_buddy_block_trim(mm, &trim_start, (u64)vres->base.size, &vres->blocks); } @@ -614,7 +615,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, &vres->blocks, 0); + gpu_buddy_free_list(mm, &vres->blocks, 0); mutex_unlock(&mgr->lock); error_fini: ttm_resource_fini(man, &vres->base); @@ -637,8 +638,8 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, struct amdgpu_vram_mgr_resource *vres = to_amdgpu_vram_mgr_resource(res); struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &mgr->mm; + struct gpu_buddy_block *block; uint64_t vis_usage = 0; mutex_lock(&mgr->lock); @@ -649,7 +650,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, list_for_each_entry(block, &vres->blocks, link) vis_usage += amdgpu_vram_mgr_vis_size(adev, block); - drm_buddy_free_list(mm, &vres->blocks, vres->flags); + gpu_buddy_free_list(mm, &vres->blocks, vres->flags); amdgpu_vram_mgr_do_reserve(man); mutex_unlock(&mgr->lock); @@ -688,7 +689,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, if (!*sgt) return -ENOMEM; - /* Determine the number of DRM_BUDDY blocks to export */ + /* Determine the number of GPU_BUDDY blocks to export */ amdgpu_res_first(res, offset, length, &cursor); while (cursor.remaining) { num_entries++; @@ -704,10 +705,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg->length = 0; /* - * Walk down DRM_BUDDY blocks to populate scatterlist nodes - * @note: Use iterator api to get first the DRM_BUDDY block + * Walk down GPU_BUDDY blocks to populate scatterlist nodes + * @note: Use iterator api to get first 
the GPU_BUDDY block * and the number of bytes from it. Access the following - * DRM_BUDDY block(s) if more buffer needs to exported + * GPU_BUDDY block(s) if more buffer needs to exported */ amdgpu_res_first(res, offset, length, &cursor); for_each_sgtable_sg((*sgt), sg, i) { @@ -792,10 +793,10 @@ uint64_t amdgpu_vram_mgr_vis_usage(struct amdgpu_vram_mgr *mgr) void amdgpu_vram_mgr_clear_reset_blocks(struct amdgpu_device *adev) { struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; mutex_lock(&mgr->lock); - drm_buddy_reset_clear(mm, false); + gpu_buddy_reset_clear(mm, false); mutex_unlock(&mgr->lock); } @@ -815,7 +816,7 @@ static bool amdgpu_vram_mgr_intersects(struct ttm_resource_manager *man, size_t size) { struct amdgpu_vram_mgr_resource *mgr = to_amdgpu_vram_mgr_resource(res); - struct drm_buddy_block *block; + struct gpu_buddy_block *block; /* Check each drm buddy block individually */ list_for_each_entry(block, &mgr->blocks, link) { @@ -848,7 +849,7 @@ static bool amdgpu_vram_mgr_compatible(struct ttm_resource_manager *man, size_t size) { struct amdgpu_vram_mgr_resource *mgr = to_amdgpu_vram_mgr_resource(res); - struct drm_buddy_block *block; + struct gpu_buddy_block *block; /* Check each drm buddy block individually */ list_for_each_entry(block, &mgr->blocks, link) { @@ -877,7 +878,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man, struct drm_printer *printer) { struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; struct amdgpu_vram_reservation *rsv; drm_printf(printer, " vis usage:%llu\n", @@ -930,7 +931,7 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev) mgr->default_page_size = PAGE_SIZE; man->func = &amdgpu_vram_mgr_func; - err = drm_buddy_init(&mgr->mm, man->size, PAGE_SIZE); + err = gpu_buddy_init(&mgr->mm, man->size, PAGE_SIZE); if (err) return err; @@ -965,11 +966,11 @@ void 
amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) { - drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0); + gpu_buddy_free_list(&mgr->mm, &rsv->allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) - drm_buddy_fini(&mgr->mm); + gpu_buddy_fini(&mgr->mm); mutex_unlock(&mgr->lock); ttm_resource_manager_cleanup(man); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h index 874779618056..429a21a2e9b2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h @@ -28,7 +28,7 @@ struct amdgpu_vram_mgr { struct ttm_resource_manager manager; - struct drm_buddy mm; + struct gpu_buddy mm; /* protects access to buffer objects */ struct mutex lock; struct list_head reservations_pending; @@ -57,19 +57,19 @@ struct amdgpu_vram_mgr_resource { struct amdgpu_vres_task task; }; -static inline u64 amdgpu_vram_mgr_block_start(struct drm_buddy_block *block) +static inline u64 amdgpu_vram_mgr_block_start(struct gpu_buddy_block *block) { - return drm_buddy_block_offset(block); + return gpu_buddy_block_offset(block); } -static inline u64 amdgpu_vram_mgr_block_size(struct drm_buddy_block *block) +static inline u64 amdgpu_vram_mgr_block_size(struct gpu_buddy_block *block) { - return (u64)PAGE_SIZE << drm_buddy_block_order(block); + return (u64)PAGE_SIZE << gpu_buddy_block_order(block); } -static inline bool amdgpu_vram_mgr_is_cleared(struct drm_buddy_block *block) +static inline bool amdgpu_vram_mgr_is_cleared(struct gpu_buddy_block *block) { - return drm_buddy_block_is_clear(block); + return gpu_buddy_block_is_clear(block); } static inline struct amdgpu_vram_mgr_resource * @@ -82,8 +82,8 @@ static inline void amdgpu_vram_mgr_set_cleared(struct ttm_resource *res) { struct amdgpu_vram_mgr_resource *ares = to_amdgpu_vram_mgr_resource(res); - WARN_ON(ares->flags & DRM_BUDDY_CLEARED); - ares->flags |= 
DRM_BUDDY_CLEARED; + WARN_ON(ares->flags & GPU_BUDDY_CLEARED); + ares->flags |= GPU_BUDDY_CLEARED; } int amdgpu_vram_mgr_query_address_block_info(struct amdgpu_vram_mgr *mgr, diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c new file mode 100644 index 000000000000..841f3de5f307 --- /dev/null +++ b/drivers/gpu/drm/drm_buddy.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include + +/** + * drm_buddy_block_print - print block information + * + * @mm: DRM buddy manager + * @block: DRM buddy block + * @p: DRM printer to use + */ +void drm_buddy_block_print(struct gpu_buddy *mm, + struct gpu_buddy_block *block, + struct drm_printer *p) +{ + u64 start = gpu_buddy_block_offset(block); + u64 size = gpu_buddy_block_size(mm, block); + + drm_printf(p, "%#018llx-%#018llx: %llu\n", start, start + size, size); +} +EXPORT_SYMBOL(drm_buddy_block_print); + +/** + * drm_buddy_print - print allocator state + * + * @mm: DRM buddy manager + * @p: DRM printer to use + */ +void drm_buddy_print(struct gpu_buddy *mm, struct drm_printer *p) +{ + int order; + + drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n", + mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20); + + for (order = mm->max_order; order >= 0; order--) { + struct gpu_buddy_block *block, *tmp; + struct rb_root *root; + u64 count = 0, free; + unsigned int tree; + + for_each_free_tree(tree) { + root = &mm->free_trees[tree][order]; + + rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { + BUG_ON(!gpu_buddy_block_is_free(block)); + count++; + } + } + + drm_printf(p, "order-%2d ", order); + + free = count * (mm->chunk_size << order); + if (free < SZ_1M) + drm_printf(p, "free: %8llu KiB", free >> 10); + else + drm_printf(p, "free: %8llu MiB", free >> 20); + + drm_printf(p, ", blocks: %llu\n", count); + } +} 
+EXPORT_SYMBOL(drm_buddy_print); + +MODULE_DESCRIPTION("DRM-specific GPU Buddy Allocator Print Helpers"); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c index 30246f02bcfe..6a34dae13769 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.c +++ b/drivers/gpu/drm/i915/i915_scatterlist.c @@ -167,9 +167,9 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(res); const u64 size = res->size; const u32 max_segment = round_down(UINT_MAX, page_alignment); - struct drm_buddy *mm = bman_res->mm; + struct gpu_buddy *mm = bman_res->mm; struct list_head *blocks = &bman_res->blocks; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct i915_refct_sgt *rsgt; struct scatterlist *sg; struct sg_table *st; @@ -202,8 +202,8 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, list_for_each_entry(block, blocks, link) { u64 block_size, offset; - block_size = min_t(u64, size, drm_buddy_block_size(mm, block)); - offset = drm_buddy_block_offset(block); + block_size = min_t(u64, size, gpu_buddy_block_size(mm, block)); + offset = gpu_buddy_block_offset(block); while (block_size) { u64 len; diff --git a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c index 6b256d95badd..c5ca90088705 100644 --- a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c +++ b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -16,7 +17,7 @@ struct i915_ttm_buddy_manager { struct ttm_resource_manager manager; - struct drm_buddy mm; + struct gpu_buddy mm; struct list_head reserved; struct mutex lock; unsigned long visible_size; @@ -38,7 +39,7 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man, { struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); struct 
i915_ttm_buddy_resource *bman_res; - struct drm_buddy *mm = &bman->mm; + struct gpu_buddy *mm = &bman->mm; unsigned long n_pages, lpfn; u64 min_page_size; u64 size; @@ -57,13 +58,13 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man, bman_res->mm = mm; if (place->flags & TTM_PL_FLAG_TOPDOWN) - bman_res->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; + bman_res->flags |= GPU_BUDDY_TOPDOWN_ALLOCATION; if (place->flags & TTM_PL_FLAG_CONTIGUOUS) - bman_res->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION; + bman_res->flags |= GPU_BUDDY_CONTIGUOUS_ALLOCATION; if (place->fpfn || lpfn != man->size) - bman_res->flags |= DRM_BUDDY_RANGE_ALLOCATION; + bman_res->flags |= GPU_BUDDY_RANGE_ALLOCATION; GEM_BUG_ON(!bman_res->base.size); size = bman_res->base.size; @@ -89,7 +90,7 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man, goto err_free_res; } - err = drm_buddy_alloc_blocks(mm, (u64)place->fpfn << PAGE_SHIFT, + err = gpu_buddy_alloc_blocks(mm, (u64)place->fpfn << PAGE_SHIFT, (u64)lpfn << PAGE_SHIFT, (u64)n_pages << PAGE_SHIFT, min_page_size, @@ -101,15 +102,15 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man, if (lpfn <= bman->visible_size) { bman_res->used_visible_size = PFN_UP(bman_res->base.size); } else { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; list_for_each_entry(block, &bman_res->blocks, link) { unsigned long start = - drm_buddy_block_offset(block) >> PAGE_SHIFT; + gpu_buddy_block_offset(block) >> PAGE_SHIFT; if (start < bman->visible_size) { unsigned long end = start + - (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + (gpu_buddy_block_size(mm, block) >> PAGE_SHIFT); bman_res->used_visible_size += min(end, bman->visible_size) - start; @@ -126,7 +127,7 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man, return 0; err_free_blocks: - drm_buddy_free_list(mm, &bman_res->blocks, 0); + gpu_buddy_free_list(mm, &bman_res->blocks, 0); mutex_unlock(&bman->lock); err_free_res: 
ttm_resource_fini(man, &bman_res->base); @@ -141,7 +142,7 @@ static void i915_ttm_buddy_man_free(struct ttm_resource_manager *man, struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); mutex_lock(&bman->lock); - drm_buddy_free_list(&bman->mm, &bman_res->blocks, 0); + gpu_buddy_free_list(&bman->mm, &bman_res->blocks, 0); bman->visible_avail += bman_res->used_visible_size; mutex_unlock(&bman->lock); @@ -156,8 +157,8 @@ static bool i915_ttm_buddy_man_intersects(struct ttm_resource_manager *man, { struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(res); struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); - struct drm_buddy *mm = &bman->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &bman->mm; + struct gpu_buddy_block *block; if (!place->fpfn && !place->lpfn) return true; @@ -176,9 +177,9 @@ static bool i915_ttm_buddy_man_intersects(struct ttm_resource_manager *man, /* Check each drm buddy block individually */ list_for_each_entry(block, &bman_res->blocks, link) { unsigned long fpfn = - drm_buddy_block_offset(block) >> PAGE_SHIFT; + gpu_buddy_block_offset(block) >> PAGE_SHIFT; unsigned long lpfn = fpfn + - (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + (gpu_buddy_block_size(mm, block) >> PAGE_SHIFT); if (place->fpfn < lpfn && place->lpfn > fpfn) return true; @@ -194,8 +195,8 @@ static bool i915_ttm_buddy_man_compatible(struct ttm_resource_manager *man, { struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(res); struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); - struct drm_buddy *mm = &bman->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &bman->mm; + struct gpu_buddy_block *block; if (!place->fpfn && !place->lpfn) return true; @@ -209,9 +210,9 @@ static bool i915_ttm_buddy_man_compatible(struct ttm_resource_manager *man, /* Check each drm buddy block individually */ list_for_each_entry(block, &bman_res->blocks, link) { unsigned long fpfn = - drm_buddy_block_offset(block) >> 
PAGE_SHIFT; + gpu_buddy_block_offset(block) >> PAGE_SHIFT; unsigned long lpfn = fpfn + - (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + (gpu_buddy_block_size(mm, block) >> PAGE_SHIFT); if (fpfn < place->fpfn || lpfn > place->lpfn) return false; @@ -224,7 +225,7 @@ static void i915_ttm_buddy_man_debug(struct ttm_resource_manager *man, struct drm_printer *printer) { struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); - struct drm_buddy_block *block; + struct gpu_buddy_block *block; mutex_lock(&bman->lock); drm_printf(printer, "default_page_size: %lluKiB\n", @@ -293,7 +294,7 @@ int i915_ttm_buddy_man_init(struct ttm_device *bdev, if (!bman) return -ENOMEM; - err = drm_buddy_init(&bman->mm, size, chunk_size); + err = gpu_buddy_init(&bman->mm, size, chunk_size); if (err) goto err_free_bman; @@ -333,7 +334,7 @@ int i915_ttm_buddy_man_fini(struct ttm_device *bdev, unsigned int type) { struct ttm_resource_manager *man = ttm_manager_type(bdev, type); struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); - struct drm_buddy *mm = &bman->mm; + struct gpu_buddy *mm = &bman->mm; int ret; ttm_resource_manager_set_used(man, false); @@ -345,8 +346,8 @@ int i915_ttm_buddy_man_fini(struct ttm_device *bdev, unsigned int type) ttm_set_driver_manager(bdev, type, NULL); mutex_lock(&bman->lock); - drm_buddy_free_list(mm, &bman->reserved, 0); - drm_buddy_fini(mm); + gpu_buddy_free_list(mm, &bman->reserved, 0); + gpu_buddy_fini(mm); bman->visible_avail += bman->visible_reserved; WARN_ON_ONCE(bman->visible_avail != bman->visible_size); mutex_unlock(&bman->lock); @@ -371,15 +372,15 @@ int i915_ttm_buddy_man_reserve(struct ttm_resource_manager *man, u64 start, u64 size) { struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); - struct drm_buddy *mm = &bman->mm; + struct gpu_buddy *mm = &bman->mm; unsigned long fpfn = start >> PAGE_SHIFT; unsigned long flags = 0; int ret; - flags |= DRM_BUDDY_RANGE_ALLOCATION; + flags |= GPU_BUDDY_RANGE_ALLOCATION; 
mutex_lock(&bman->lock); - ret = drm_buddy_alloc_blocks(mm, start, + ret = gpu_buddy_alloc_blocks(mm, start, start + size, size, mm->chunk_size, &bman->reserved, diff --git a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.h b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.h index d64620712830..1cff018c1689 100644 --- a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.h +++ b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.h @@ -13,7 +13,7 @@ struct ttm_device; struct ttm_resource_manager; -struct drm_buddy; +struct gpu_buddy; /** * struct i915_ttm_buddy_resource @@ -33,7 +33,7 @@ struct i915_ttm_buddy_resource { struct list_head blocks; unsigned long flags; unsigned long used_visible_size; - struct drm_buddy *mm; + struct gpu_buddy *mm; }; /** diff --git a/drivers/gpu/drm/i915/selftests/intel_memory_region.c b/drivers/gpu/drm/i915/selftests/intel_memory_region.c index 7b856b5090f9..8307390943a2 100644 --- a/drivers/gpu/drm/i915/selftests/intel_memory_region.c +++ b/drivers/gpu/drm/i915/selftests/intel_memory_region.c @@ -6,7 +6,7 @@ #include #include -#include +#include #include "../i915_selftest.h" @@ -371,7 +371,7 @@ static int igt_mock_splintered_region(void *arg) struct drm_i915_private *i915 = mem->i915; struct i915_ttm_buddy_resource *res; struct drm_i915_gem_object *obj; - struct drm_buddy *mm; + struct gpu_buddy *mm; unsigned int expected_order; LIST_HEAD(objects); u64 size; @@ -447,8 +447,8 @@ static int igt_mock_max_segment(void *arg) struct drm_i915_private *i915 = mem->i915; struct i915_ttm_buddy_resource *res; struct drm_i915_gem_object *obj; - struct drm_buddy_block *block; - struct drm_buddy *mm; + struct gpu_buddy_block *block; + struct gpu_buddy *mm; struct list_head *blocks; struct scatterlist *sg; I915_RND_STATE(prng); @@ -487,8 +487,8 @@ static int igt_mock_max_segment(void *arg) mm = res->mm; size = 0; list_for_each_entry(block, blocks, link) { - if (drm_buddy_block_size(mm, block) > size) - size = drm_buddy_block_size(mm, block); + if 
(gpu_buddy_block_size(mm, block) > size) + size = gpu_buddy_block_size(mm, block); } if (size < max_segment) { pr_err("%s: Failed to create a huge contiguous block [> %u], largest block %lld\n", @@ -527,14 +527,14 @@ static u64 igt_object_mappable_total(struct drm_i915_gem_object *obj) struct intel_memory_region *mr = obj->mm.region; struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(obj->mm.res); - struct drm_buddy *mm = bman_res->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = bman_res->mm; + struct gpu_buddy_block *block; u64 total; total = 0; list_for_each_entry(block, &bman_res->blocks, link) { - u64 start = drm_buddy_block_offset(block); - u64 end = start + drm_buddy_block_size(mm, block); + u64 start = gpu_buddy_block_offset(block); + u64 end = start + gpu_buddy_block_size(mm, block); if (start < resource_size(&mr->io)) total += min_t(u64, end, resource_size(&mr->io)) - start; diff --git a/drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c b/drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c index 6d95447a989d..e32f3c8d7b84 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c +++ b/drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c @@ -251,7 +251,7 @@ static void ttm_bo_validate_basic(struct kunit *test) NULL, &dummy_ttm_bo_destroy); KUNIT_EXPECT_EQ(test, err, 0); - snd_place = ttm_place_kunit_init(test, snd_mem, DRM_BUDDY_TOPDOWN_ALLOCATION); + snd_place = ttm_place_kunit_init(test, snd_mem, GPU_BUDDY_TOPDOWN_ALLOCATION); snd_placement = ttm_placement_kunit_init(test, snd_place, 1); err = ttm_bo_validate(bo, snd_placement, &ctx_val); @@ -263,7 +263,7 @@ static void ttm_bo_validate_basic(struct kunit *test) KUNIT_EXPECT_TRUE(test, ttm_tt_is_populated(bo->ttm)); KUNIT_EXPECT_EQ(test, bo->resource->mem_type, snd_mem); KUNIT_EXPECT_EQ(test, bo->resource->placement, - DRM_BUDDY_TOPDOWN_ALLOCATION); + GPU_BUDDY_TOPDOWN_ALLOCATION); ttm_bo_fini(bo); ttm_mock_manager_fini(priv->ttm_dev, snd_mem); diff --git 
a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.c b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.c index dd395229e388..294d56d9067e 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.c +++ b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.c @@ -31,7 +31,7 @@ static int ttm_mock_manager_alloc(struct ttm_resource_manager *man, { struct ttm_mock_manager *manager = to_mock_mgr(man); struct ttm_mock_resource *mock_res; - struct drm_buddy *mm = &manager->mm; + struct gpu_buddy *mm = &manager->mm; u64 lpfn, fpfn, alloc_size; int err; @@ -47,14 +47,14 @@ static int ttm_mock_manager_alloc(struct ttm_resource_manager *man, INIT_LIST_HEAD(&mock_res->blocks); if (place->flags & TTM_PL_FLAG_TOPDOWN) - mock_res->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; + mock_res->flags |= GPU_BUDDY_TOPDOWN_ALLOCATION; if (place->flags & TTM_PL_FLAG_CONTIGUOUS) - mock_res->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION; + mock_res->flags |= GPU_BUDDY_CONTIGUOUS_ALLOCATION; alloc_size = (uint64_t)mock_res->base.size; mutex_lock(&manager->lock); - err = drm_buddy_alloc_blocks(mm, fpfn, lpfn, alloc_size, + err = gpu_buddy_alloc_blocks(mm, fpfn, lpfn, alloc_size, manager->default_page_size, &mock_res->blocks, mock_res->flags); @@ -67,7 +67,7 @@ static int ttm_mock_manager_alloc(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, &mock_res->blocks, 0); + gpu_buddy_free_list(mm, &mock_res->blocks, 0); ttm_resource_fini(man, &mock_res->base); mutex_unlock(&manager->lock); @@ -79,10 +79,10 @@ static void ttm_mock_manager_free(struct ttm_resource_manager *man, { struct ttm_mock_manager *manager = to_mock_mgr(man); struct ttm_mock_resource *mock_res = to_mock_mgr_resource(res); - struct drm_buddy *mm = &manager->mm; + struct gpu_buddy *mm = &manager->mm; mutex_lock(&manager->lock); - drm_buddy_free_list(mm, &mock_res->blocks, 0); + gpu_buddy_free_list(mm, &mock_res->blocks, 0); mutex_unlock(&manager->lock); ttm_resource_fini(man, res); @@ -106,7 +106,7 @@ int 
ttm_mock_manager_init(struct ttm_device *bdev, u32 mem_type, u32 size) mutex_init(&manager->lock); - err = drm_buddy_init(&manager->mm, size, PAGE_SIZE); + err = gpu_buddy_init(&manager->mm, size, PAGE_SIZE); if (err) { kfree(manager); @@ -142,7 +142,7 @@ void ttm_mock_manager_fini(struct ttm_device *bdev, u32 mem_type) ttm_resource_manager_set_used(man, false); mutex_lock(&mock_man->lock); - drm_buddy_fini(&mock_man->mm); + gpu_buddy_fini(&mock_man->mm); mutex_unlock(&mock_man->lock); ttm_set_driver_manager(bdev, mem_type, NULL); diff --git a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h index 96ea8c9aae34..08710756fd8e 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h +++ b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h @@ -9,7 +9,7 @@ struct ttm_mock_manager { struct ttm_resource_manager man; - struct drm_buddy mm; + struct gpu_buddy mm; u64 default_page_size; /* protects allocations of mock buffer objects */ struct mutex lock; diff --git a/drivers/gpu/drm/xe/xe_res_cursor.h b/drivers/gpu/drm/xe/xe_res_cursor.h index 4e00008b7081..5f4ab08c0686 100644 --- a/drivers/gpu/drm/xe/xe_res_cursor.h +++ b/drivers/gpu/drm/xe/xe_res_cursor.h @@ -58,7 +58,7 @@ struct xe_res_cursor { /** @dma_addr: Current element in a struct drm_pagemap_addr array */ const struct drm_pagemap_addr *dma_addr; /** @mm: Buddy allocator for VRAM cursor */ - struct drm_buddy *mm; + struct gpu_buddy *mm; /** * @dma_start: DMA start address for the current segment. 
* This may be different to @dma_addr.addr since elements in @@ -69,7 +69,7 @@ struct xe_res_cursor { u64 dma_seg_size; }; -static struct drm_buddy *xe_res_get_buddy(struct ttm_resource *res) +static struct gpu_buddy *xe_res_get_buddy(struct ttm_resource *res) { struct ttm_resource_manager *mgr; @@ -104,30 +104,30 @@ static inline void xe_res_first(struct ttm_resource *res, case XE_PL_STOLEN: case XE_PL_VRAM0: case XE_PL_VRAM1: { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct list_head *head, *next; - struct drm_buddy *mm = xe_res_get_buddy(res); + struct gpu_buddy *mm = xe_res_get_buddy(res); head = &to_xe_ttm_vram_mgr_resource(res)->blocks; block = list_first_entry_or_null(head, - struct drm_buddy_block, + struct gpu_buddy_block, link); if (!block) goto fallback; - while (start >= drm_buddy_block_size(mm, block)) { - start -= drm_buddy_block_size(mm, block); + while (start >= gpu_buddy_block_size(mm, block)) { + start -= gpu_buddy_block_size(mm, block); next = block->link.next; if (next != head) - block = list_entry(next, struct drm_buddy_block, + block = list_entry(next, struct gpu_buddy_block, link); } cur->mm = mm; - cur->start = drm_buddy_block_offset(block) + start; - cur->size = min(drm_buddy_block_size(mm, block) - start, + cur->start = gpu_buddy_block_offset(block) + start; + cur->size = min(gpu_buddy_block_size(mm, block) - start, size); cur->remaining = size; cur->node = block; @@ -259,7 +259,7 @@ static inline void xe_res_first_dma(const struct drm_pagemap_addr *dma_addr, */ static inline void xe_res_next(struct xe_res_cursor *cur, u64 size) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct list_head *next; u64 start; @@ -295,18 +295,18 @@ static inline void xe_res_next(struct xe_res_cursor *cur, u64 size) block = cur->node; next = block->link.next; - block = list_entry(next, struct drm_buddy_block, link); + block = list_entry(next, struct gpu_buddy_block, link); - while (start >= 
drm_buddy_block_size(cur->mm, block)) { - start -= drm_buddy_block_size(cur->mm, block); + while (start >= gpu_buddy_block_size(cur->mm, block)) { + start -= gpu_buddy_block_size(cur->mm, block); next = block->link.next; - block = list_entry(next, struct drm_buddy_block, link); + block = list_entry(next, struct gpu_buddy_block, link); } - cur->start = drm_buddy_block_offset(block) + start; - cur->size = min(drm_buddy_block_size(cur->mm, block) - start, + cur->start = gpu_buddy_block_offset(block) + start; + cur->size = min(gpu_buddy_block_size(cur->mm, block) - start, cur->remaining); cur->node = block; break; diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index 213f0334518a..cda3bf7e2418 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -747,7 +747,7 @@ static u64 block_offset_to_pfn(struct drm_pagemap *dpagemap, u64 offset) return PHYS_PFN(offset + xpagemap->hpa_base); } -static struct drm_buddy *vram_to_buddy(struct xe_vram_region *vram) +static struct gpu_buddy *vram_to_buddy(struct xe_vram_region *vram) { return &vram->ttm.mm; } @@ -758,17 +758,17 @@ static int xe_svm_populate_devmem_pfn(struct drm_pagemap_devmem *devmem_allocati struct xe_bo *bo = to_xe_bo(devmem_allocation); struct ttm_resource *res = bo->ttm.resource; struct list_head *blocks = &to_xe_ttm_vram_mgr_resource(res)->blocks; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; int j = 0; list_for_each_entry(block, blocks, link) { struct xe_vram_region *vr = block->private; - struct drm_buddy *buddy = vram_to_buddy(vr); + struct gpu_buddy *buddy = vram_to_buddy(vr); u64 block_pfn = block_offset_to_pfn(devmem_allocation->dpagemap, - drm_buddy_block_offset(block)); + gpu_buddy_block_offset(block)); int i; - for (i = 0; i < drm_buddy_block_size(buddy, block) >> PAGE_SHIFT; ++i) + for (i = 0; i < gpu_buddy_block_size(buddy, block) >> PAGE_SHIFT; ++i) pfn[j++] = block_pfn + i; } @@ -1033,7 +1033,7 @@ static int 
xe_drm_pagemap_populate_mm(struct drm_pagemap *dpagemap, struct dma_fence *pre_migrate_fence = NULL; struct xe_device *xe = vr->xe; struct device *dev = xe->drm.dev; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct xe_validation_ctx vctx; struct list_head *blocks; struct drm_exec exec; diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index 6553a19f7cf2..d119217d566a 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -16,16 +17,16 @@ #include "xe_ttm_vram_mgr.h" #include "xe_vram_types.h" -static inline struct drm_buddy_block * +static inline struct gpu_buddy_block * xe_ttm_vram_mgr_first_block(struct list_head *list) { - return list_first_entry_or_null(list, struct drm_buddy_block, link); + return list_first_entry_or_null(list, struct gpu_buddy_block, link); } -static inline bool xe_is_vram_mgr_blocks_contiguous(struct drm_buddy *mm, +static inline bool xe_is_vram_mgr_blocks_contiguous(struct gpu_buddy *mm, struct list_head *head) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; u64 start, size; block = xe_ttm_vram_mgr_first_block(head); @@ -33,12 +34,12 @@ static inline bool xe_is_vram_mgr_blocks_contiguous(struct drm_buddy *mm, return false; while (head != block->link.next) { - start = drm_buddy_block_offset(block); - size = drm_buddy_block_size(mm, block); + start = gpu_buddy_block_offset(block); + size = gpu_buddy_block_size(mm, block); - block = list_entry(block->link.next, struct drm_buddy_block, + block = list_entry(block->link.next, struct gpu_buddy_block, link); - if (start + size != drm_buddy_block_offset(block)) + if (start + size != gpu_buddy_block_offset(block)) return false; } @@ -52,7 +53,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, { struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); struct xe_ttm_vram_mgr_resource *vres; - struct 
drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; u64 size, min_page_size; unsigned long lpfn; int err; @@ -79,10 +80,10 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, INIT_LIST_HEAD(&vres->blocks); if (place->flags & TTM_PL_FLAG_TOPDOWN) - vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; + vres->flags |= GPU_BUDDY_TOPDOWN_ALLOCATION; if (place->fpfn || lpfn != man->size >> PAGE_SHIFT) - vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; + vres->flags |= GPU_BUDDY_RANGE_ALLOCATION; if (WARN_ON(!vres->base.size)) { err = -EINVAL; @@ -118,27 +119,27 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, lpfn = max_t(unsigned long, place->fpfn + (size >> PAGE_SHIFT), lpfn); } - err = drm_buddy_alloc_blocks(mm, (u64)place->fpfn << PAGE_SHIFT, + err = gpu_buddy_alloc_blocks(mm, (u64)place->fpfn << PAGE_SHIFT, (u64)lpfn << PAGE_SHIFT, size, min_page_size, &vres->blocks, vres->flags); if (err) goto error_unlock; if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks)) + if (!gpu_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks)) size = vres->base.size; } if (lpfn <= mgr->visible_size >> PAGE_SHIFT) { vres->used_visible_size = size; } else { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; list_for_each_entry(block, &vres->blocks, link) { - u64 start = drm_buddy_block_offset(block); + u64 start = gpu_buddy_block_offset(block); if (start < mgr->visible_size) { - u64 end = start + drm_buddy_block_size(mm, block); + u64 end = start + gpu_buddy_block_size(mm, block); vres->used_visible_size += min(end, mgr->visible_size) - start; @@ -158,11 +159,11 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, * the object. 
*/ if (vres->base.placement & TTM_PL_FLAG_CONTIGUOUS) { - struct drm_buddy_block *block = list_first_entry(&vres->blocks, + struct gpu_buddy_block *block = list_first_entry(&vres->blocks, typeof(*block), link); - vres->base.start = drm_buddy_block_offset(block) >> PAGE_SHIFT; + vres->base.start = gpu_buddy_block_offset(block) >> PAGE_SHIFT; } else { vres->base.start = XE_BO_INVALID_OFFSET; } @@ -184,10 +185,10 @@ static void xe_ttm_vram_mgr_del(struct ttm_resource_manager *man, struct xe_ttm_vram_mgr_resource *vres = to_xe_ttm_vram_mgr_resource(res); struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; mutex_lock(&mgr->lock); - drm_buddy_free_list(mm, &vres->blocks, 0); + gpu_buddy_free_list(mm, &vres->blocks, 0); mgr->visible_avail += vres->used_visible_size; mutex_unlock(&mgr->lock); @@ -200,7 +201,7 @@ static void xe_ttm_vram_mgr_debug(struct ttm_resource_manager *man, struct drm_printer *printer) { struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; mutex_lock(&mgr->lock); drm_printf(printer, "default_page_size: %lluKiB\n", @@ -223,8 +224,8 @@ static bool xe_ttm_vram_mgr_intersects(struct ttm_resource_manager *man, struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); struct xe_ttm_vram_mgr_resource *vres = to_xe_ttm_vram_mgr_resource(res); - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &mgr->mm; + struct gpu_buddy_block *block; if (!place->fpfn && !place->lpfn) return true; @@ -234,9 +235,9 @@ static bool xe_ttm_vram_mgr_intersects(struct ttm_resource_manager *man, list_for_each_entry(block, &vres->blocks, link) { unsigned long fpfn = - drm_buddy_block_offset(block) >> PAGE_SHIFT; + gpu_buddy_block_offset(block) >> PAGE_SHIFT; unsigned long lpfn = fpfn + - (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + (gpu_buddy_block_size(mm, block) >> PAGE_SHIFT); if (place->fpfn < 
lpfn && place->lpfn > fpfn) return true; @@ -253,8 +254,8 @@ static bool xe_ttm_vram_mgr_compatible(struct ttm_resource_manager *man, struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); struct xe_ttm_vram_mgr_resource *vres = to_xe_ttm_vram_mgr_resource(res); - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &mgr->mm; + struct gpu_buddy_block *block; if (!place->fpfn && !place->lpfn) return true; @@ -264,9 +265,9 @@ static bool xe_ttm_vram_mgr_compatible(struct ttm_resource_manager *man, list_for_each_entry(block, &vres->blocks, link) { unsigned long fpfn = - drm_buddy_block_offset(block) >> PAGE_SHIFT; + gpu_buddy_block_offset(block) >> PAGE_SHIFT; unsigned long lpfn = fpfn + - (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + (gpu_buddy_block_size(mm, block) >> PAGE_SHIFT); if (fpfn < place->fpfn || lpfn > place->lpfn) return false; @@ -296,7 +297,7 @@ static void xe_ttm_vram_mgr_fini(struct drm_device *dev, void *arg) WARN_ON_ONCE(mgr->visible_avail != mgr->visible_size); - drm_buddy_fini(&mgr->mm); + gpu_buddy_fini(&mgr->mm); ttm_resource_manager_cleanup(&mgr->manager); @@ -327,7 +328,7 @@ int __xe_ttm_vram_mgr_init(struct xe_device *xe, struct xe_ttm_vram_mgr *mgr, mgr->visible_avail = io_size; ttm_resource_manager_init(man, &xe->ttm, size); - err = drm_buddy_init(&mgr->mm, man->size, default_page_size); + err = gpu_buddy_init(&mgr->mm, man->size, default_page_size); if (err) return err; @@ -375,7 +376,7 @@ int xe_ttm_vram_mgr_alloc_sgt(struct xe_device *xe, if (!*sgt) return -ENOMEM; - /* Determine the number of DRM_BUDDY blocks to export */ + /* Determine the number of GPU_BUDDY blocks to export */ xe_res_first(res, offset, length, &cursor); while (cursor.remaining) { num_entries++; @@ -392,10 +393,10 @@ int xe_ttm_vram_mgr_alloc_sgt(struct xe_device *xe, sg->length = 0; /* - * Walk down DRM_BUDDY blocks to populate scatterlist nodes - * @note: Use iterator api to get first the DRM_BUDDY block + * Walk down 
GPU_BUDDY blocks to populate scatterlist nodes + * @note: Use iterator api to get first the GPU_BUDDY block * and the number of bytes from it. Access the following - * DRM_BUDDY block(s) if more buffer needs to exported + * GPU_BUDDY block(s) if more buffer needs to exported */ xe_res_first(res, offset, length, &cursor); for_each_sgtable_sg((*sgt), sg, i) { diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h index babeec5511d9..9106da056b49 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h @@ -18,7 +18,7 @@ struct xe_ttm_vram_mgr { /** @manager: Base TTM resource manager */ struct ttm_resource_manager manager; /** @mm: DRM buddy allocator which manages the VRAM */ - struct drm_buddy mm; + struct gpu_buddy mm; /** @visible_size: Proped size of the CPU visible portion */ u64 visible_size; /** @visible_avail: CPU visible portion still unallocated */ diff --git a/drivers/gpu/tests/Makefile b/drivers/gpu/tests/Makefile index 8e7654e87d82..4183e6e2de45 100644 --- a/drivers/gpu/tests/Makefile +++ b/drivers/gpu/tests/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 gpu_buddy_tests-y = gpu_buddy_test.o gpu_random.o -obj-$(CONFIG_DRM_KUNIT_TEST) += gpu_buddy_tests.o +obj-$(CONFIG_GPU_BUDDY_KUNIT_TEST) += gpu_buddy_tests.o diff --git a/drivers/gpu/tests/gpu_buddy_test.c b/drivers/gpu/tests/gpu_buddy_test.c index b905932da990..450e71deed90 100644 --- a/drivers/gpu/tests/gpu_buddy_test.c +++ b/drivers/gpu/tests/gpu_buddy_test.c @@ -21,9 +21,9 @@ static inline u64 get_size(int order, u64 chunk_size) return (1 << order) * chunk_size; } -static void drm_test_buddy_fragmentation_performance(struct kunit *test) +static void gpu_test_buddy_fragmentation_performance(struct kunit *test) { - struct drm_buddy_block *block, *tmp; + struct gpu_buddy_block *block, *tmp; int num_blocks, i, ret, count = 0; LIST_HEAD(allocated_blocks); unsigned long elapsed_ms; @@ -32,7 +32,7 @@ 
static void drm_test_buddy_fragmentation_performance(struct kunit *test) LIST_HEAD(clear_list); LIST_HEAD(dirty_list); LIST_HEAD(free_list); - struct drm_buddy mm; + struct gpu_buddy mm; u64 mm_size = SZ_4G; ktime_t start, end; @@ -47,7 +47,7 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) * quickly the allocator can satisfy larger, aligned requests from a pool of * highly fragmented space. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); num_blocks = mm_size / SZ_64K; @@ -55,7 +55,7 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) start = ktime_get(); /* Allocate with maximum fragmentation - 8K blocks with 64K alignment */ for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, &allocated_blocks, 0), "buddy_alloc hit an error size=%u\n", SZ_8K); @@ -68,21 +68,21 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) } /* Free with different flags to ensure no coalescing */ - drm_buddy_free_list(&mm, &clear_list, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty_list, 0); + gpu_buddy_free_list(&mm, &clear_list, GPU_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &dirty_list, 0); for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_64K, SZ_64K, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, SZ_64K, SZ_64K, &test_blocks, 0), "buddy_alloc hit an error size=%u\n", SZ_64K); - drm_buddy_free_list(&mm, &test_blocks, 0); + gpu_buddy_free_list(&mm, &test_blocks, 0); end = ktime_get(); elapsed_ms = ktime_to_ms(ktime_sub(end, start)); kunit_info(test, "Fragmented allocation took %lu ms\n", elapsed_ms); - drm_buddy_fini(&mm); + gpu_buddy_fini(&mm); /* * Reverse free 
order under fragmentation @@ -96,13 +96,13 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) * deallocation occurs in the opposite order of allocation, exposing the * cost difference between a linear freelist scan and an ordered tree lookup. */ - ret = drm_buddy_init(&mm, mm_size, SZ_4K); + ret = gpu_buddy_init(&mm, mm_size, SZ_4K); KUNIT_ASSERT_EQ(test, ret, 0); start = ktime_get(); /* Allocate maximum fragmentation */ for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, &allocated_blocks, 0), "buddy_alloc hit an error size=%u\n", SZ_8K); @@ -111,28 +111,28 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) list_move_tail(&block->link, &free_list); count++; } - drm_buddy_free_list(&mm, &free_list, DRM_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &free_list, GPU_BUDDY_CLEARED); list_for_each_entry_safe_reverse(block, tmp, &allocated_blocks, link) list_move(&block->link, &reverse_list); - drm_buddy_free_list(&mm, &reverse_list, DRM_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &reverse_list, GPU_BUDDY_CLEARED); end = ktime_get(); elapsed_ms = ktime_to_ms(ktime_sub(end, start)); kunit_info(test, "Reverse-ordered free took %lu ms\n", elapsed_ms); - drm_buddy_fini(&mm); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_range_bias(struct kunit *test) +static void gpu_test_buddy_alloc_range_bias(struct kunit *test) { u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem; - DRM_RND_STATE(prng, random_seed); + GPU_RND_STATE(prng, random_seed); unsigned int i, count, *order; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; unsigned long flags; - struct drm_buddy mm; + struct gpu_buddy mm; LIST_HEAD(allocated); bias_size = SZ_1M; @@ -142,11 +142,11 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) kunit_info(test, "mm_size=%u, 
ps=%u\n", mm_size, ps); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, ps), "buddy_init failed\n"); count = mm_size / bias_size; - order = drm_random_order(count, &prng); + order = gpu_random_order(count, &prng); KUNIT_EXPECT_TRUE(test, order); /* @@ -166,79 +166,79 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) /* internal round_up too big */ KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, bias_size + ps, bias_size, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, bias_size, bias_size); /* size too big */ KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, bias_size + ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, bias_size + ps, ps); /* bias range too small for size */ KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start + ps, + gpu_buddy_alloc_blocks(&mm, bias_start + ps, bias_end, bias_size, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", bias_start + ps, bias_end, bias_size, ps); /* bias misaligned */ KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start + ps, + gpu_buddy_alloc_blocks(&mm, bias_start + ps, bias_end - ps, bias_size >> 1, bias_size >> 1, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc h didn't fail with bias(%x-%x), size=%u, ps=%u\n", bias_start + ps, bias_end - ps, bias_size >> 1, bias_size >> 1); /* single big page */ KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, 
bias_end, bias_size, bias_size, &tmp, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc i failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, bias_size, bias_size); - drm_buddy_free_list(&mm, &tmp, 0); + gpu_buddy_free_list(&mm, &tmp, 0); /* single page with internal round_up */ KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, ps, bias_size, &tmp, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, ps, bias_size); - drm_buddy_free_list(&mm, &tmp, 0); + gpu_buddy_free_list(&mm, &tmp, 0); /* random size within */ size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); if (size) KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, size, ps, &tmp, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, size, ps); bias_rem -= size; /* too big for current avail */ KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, bias_rem + ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, bias_rem + ps, ps); @@ -248,10 +248,10 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) size = max(size, ps); KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, size, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, size, ps); /* @@ -259,15 +259,15 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) * unallocated, and ideally not always on the bias 
* boundaries. */ - drm_buddy_free_list(&mm, &tmp, 0); + gpu_buddy_free_list(&mm, &tmp, 0); } else { list_splice_tail(&tmp, &allocated); } } kfree(order); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_fini(&mm); /* * Something more free-form. Idea is to pick a random starting bias @@ -278,7 +278,7 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) * allocated nodes in the middle of the address space. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, ps), "buddy_init failed\n"); bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); @@ -290,10 +290,10 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) u32 size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, size, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, size, ps); bias_rem -= size; @@ -319,24 +319,24 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) KUNIT_ASSERT_EQ(test, bias_start, 0); KUNIT_ASSERT_EQ(test, bias_end, mm_size); KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, bias_end, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc passed with bias(%x-%x), size=%u\n", bias_start, bias_end, ps); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_fini(&mm); /* - * Allocate cleared blocks in the bias range when the DRM buddy's clear avail is + * Allocate cleared blocks in the bias range when the GPU buddy's clear avail is * zero. 
This will validate the bias range allocation in scenarios like system boot * when no cleared blocks are available and exercise the fallback path too. The resulting * blocks should always be dirty. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, ps), "buddy_init failed\n"); bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); @@ -344,11 +344,11 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) bias_end = max(bias_end, bias_start + ps); bias_rem = bias_end - bias_start; - flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION; + flags = GPU_BUDDY_CLEAR_ALLOCATION | GPU_BUDDY_RANGE_ALLOCATION; size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, size, ps, &allocated, flags), @@ -356,27 +356,27 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) bias_start, bias_end, size, ps); list_for_each_entry(block, &allocated, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + KUNIT_EXPECT_EQ(test, gpu_buddy_block_is_clear(block), false); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_clear(struct kunit *test) +static void gpu_test_buddy_alloc_clear(struct kunit *test) { unsigned long n_pages, total, i = 0; const unsigned long ps = SZ_4K; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; const int max_order = 12; LIST_HEAD(allocated); - struct drm_buddy mm; + struct gpu_buddy mm; unsigned int order; u32 mm_size, size; LIST_HEAD(dirty); LIST_HEAD(clean); mm_size = SZ_4K << max_order; - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + KUNIT_EXPECT_FALSE(test, gpu_buddy_init(&mm, mm_size, ps)); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); @@ 
-389,11 +389,11 @@ static void drm_test_buddy_alloc_clear(struct kunit *test) * is indeed all dirty pages and vice versa. Free it all again, * keeping the dirty/clear status. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 5 * ps, ps, &allocated, - DRM_BUDDY_TOPDOWN_ALLOCATION), + GPU_BUDDY_TOPDOWN_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 5 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &allocated, GPU_BUDDY_CLEARED); n_pages = 10; do { @@ -406,37 +406,37 @@ static void drm_test_buddy_alloc_clear(struct kunit *test) flags = 0; } else { list = &clean; - flags = DRM_BUDDY_CLEAR_ALLOCATION; + flags = GPU_BUDDY_CLEAR_ALLOCATION; } - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, ps, ps, list, flags), "buddy_alloc hit an error size=%lu\n", ps); } while (++i < n_pages); list_for_each_entry(block, &clean, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true); + KUNIT_EXPECT_EQ(test, gpu_buddy_block_is_clear(block), true); list_for_each_entry(block, &dirty, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + KUNIT_EXPECT_EQ(test, gpu_buddy_block_is_clear(block), false); - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &clean, GPU_BUDDY_CLEARED); /* * Trying to go over the clear limit for some allocation. * The allocation should never fail with reasonable page-size. 
*/ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 10 * ps, ps, &clean, - DRM_BUDDY_CLEAR_ALLOCATION), + GPU_BUDDY_CLEAR_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 10 * ps); - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &clean, GPU_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &dirty, 0); + gpu_buddy_fini(&mm); - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + KUNIT_EXPECT_FALSE(test, gpu_buddy_init(&mm, mm_size, ps)); /* * Create a new mm. Intentionally fragment the address space by creating @@ -458,34 +458,34 @@ static void drm_test_buddy_alloc_clear(struct kunit *test) else list = &clean; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, ps, ps, list, 0), "buddy_alloc hit an error size=%lu\n", ps); } while (++i < n_pages); - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty, 0); + gpu_buddy_free_list(&mm, &clean, GPU_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &dirty, 0); order = 1; do { size = SZ_4K << order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, size, size, &allocated, - DRM_BUDDY_CLEAR_ALLOCATION), + GPU_BUDDY_CLEAR_ALLOCATION), "buddy_alloc hit an error size=%u\n", size); total = 0; list_for_each_entry(block, &allocated, link) { if (size != mm_size) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); - total += drm_buddy_block_size(&mm, block); + KUNIT_EXPECT_EQ(test, gpu_buddy_block_is_clear(block), false); + total += gpu_buddy_block_size(&mm, block); } KUNIT_EXPECT_EQ(test, total, size); - drm_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_free_list(&mm, &allocated, 0); } while (++order <= max_order); - 
drm_buddy_fini(&mm); + gpu_buddy_fini(&mm); /* * Create a new mm with a non power-of-two size. Allocate a random size from each @@ -494,44 +494,44 @@ static void drm_test_buddy_alloc_clear(struct kunit *test) */ mm_size = (SZ_4K << max_order) + (SZ_4K << (max_order - 2)); - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + KUNIT_EXPECT_FALSE(test, gpu_buddy_init(&mm, mm_size, ps)); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, 4 * ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 4 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, + gpu_buddy_free_list(&mm, &allocated, GPU_BUDDY_CLEARED); + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, 2 * ps, ps, &allocated, - DRM_BUDDY_CLEAR_ALLOCATION), + GPU_BUDDY_CLEAR_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 2 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, SZ_4K << max_order, mm_size, + gpu_buddy_free_list(&mm, &allocated, GPU_BUDDY_CLEARED); + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, SZ_4K << max_order, mm_size, ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc hit an error size=%lu\n", ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, GPU_BUDDY_CLEARED); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_contiguous(struct kunit *test) +static void gpu_test_buddy_alloc_contiguous(struct kunit *test) { const unsigned long ps = SZ_4K, mm_size = 16 * 3 * SZ_4K; unsigned long i, n_pages, total; - struct drm_buddy_block 
*block; - struct drm_buddy mm; + struct gpu_buddy_block *block; + struct gpu_buddy mm; LIST_HEAD(left); LIST_HEAD(middle); LIST_HEAD(right); LIST_HEAD(allocated); - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + KUNIT_EXPECT_FALSE(test, gpu_buddy_init(&mm, mm_size, ps)); /* * Idea is to fragment the address space by alternating block * allocations between three different lists; one for left, middle and * right. We can then free a list to simulate fragmentation. In - * particular we want to exercise the DRM_BUDDY_CONTIGUOUS_ALLOCATION, + * particular we want to exercise the GPU_BUDDY_CONTIGUOUS_ALLOCATION, * including the try_harder path. */ @@ -548,66 +548,66 @@ static void drm_test_buddy_alloc_contiguous(struct kunit *test) else list = &right; KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, 0, mm_size, + gpu_buddy_alloc_blocks(&mm, 0, mm_size, ps, ps, list, 0), "buddy_alloc hit an error size=%lu\n", ps); } while (++i < n_pages); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc didn't error size=%lu\n", 3 * ps); - drm_buddy_free_list(&mm, &middle, 0); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + gpu_buddy_free_list(&mm, &middle, 0); + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc didn't error size=%lu\n", 3 * ps); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 2 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc didn't error size=%lu\n", 2 * ps); - drm_buddy_free_list(&mm, &right, 0); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 
0, mm_size, + gpu_buddy_free_list(&mm, &right, 0); + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc didn't error size=%lu\n", 3 * ps); /* * At this point we should have enough contiguous space for 2 blocks, * however they are never buddies (since we freed middle and right) so * will require the try_harder logic to find them. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 2 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 2 * ps); - drm_buddy_free_list(&mm, &left, 0); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + gpu_buddy_free_list(&mm, &left, 0); + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 3 * ps); total = 0; list_for_each_entry(block, &allocated, link) - total += drm_buddy_block_size(&mm, block); + total += gpu_buddy_block_size(&mm, block); KUNIT_ASSERT_EQ(test, total, ps * 2 + ps * 3); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_pathological(struct kunit *test) +static void gpu_test_buddy_alloc_pathological(struct kunit *test) { u64 mm_size, size, start = 0; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; const int max_order = 3; unsigned long flags = 0; int order, top; - struct drm_buddy mm; + struct gpu_buddy mm; LIST_HEAD(blocks); LIST_HEAD(holes); LIST_HEAD(tmp); @@ -620,7 +620,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) */ mm_size = SZ_4K << max_order; - KUNIT_ASSERT_FALSE_MSG(test, 
drm_buddy_init(&mm, mm_size, SZ_4K), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); @@ -630,18 +630,18 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) block = list_first_entry_or_null(&blocks, typeof(*block), link); if (block) { list_del(&block->link); - drm_buddy_free_block(&mm, block); + gpu_buddy_free_block(&mm, block); } for (order = top; order--;) { size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d, top=%d\n", order, top); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_move_tail(&block->link, &blocks); @@ -649,45 +649,45 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) /* There should be one final page for this sub-allocation */ size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM for hole\n"); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_move_tail(&block->link, &holes); size = get_size(top, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded at top-order %d/%d, it should be full!", top, max_order); } - drm_buddy_free_list(&mm, &holes, 0); + 
gpu_buddy_free_list(&mm, &holes, 0); /* Nothing larger than blocks of chunk_size now available */ for (order = 1; order <= max_order; order++) { size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded at order %d, it should be full!", order); } list_splice_tail(&holes, &blocks); - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &blocks, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_pessimistic(struct kunit *test) +static void gpu_test_buddy_alloc_pessimistic(struct kunit *test) { u64 mm_size, size, start = 0; - struct drm_buddy_block *block, *bn; + struct gpu_buddy_block *block, *bn; const unsigned int max_order = 16; unsigned long flags = 0; - struct drm_buddy mm; + struct gpu_buddy mm; unsigned int order; LIST_HEAD(blocks); LIST_HEAD(tmp); @@ -699,19 +699,19 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) */ mm_size = SZ_4K << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); for (order = 0; order < max_order; order++) { size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", order); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_move_tail(&block->link, &blocks); @@ -719,11 +719,11 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) /* And now the last 
remaining block available */ size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM on final alloc\n"); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_move_tail(&block->link, &blocks); @@ -731,58 +731,58 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) /* Should be completely full! */ for (order = max_order; order--;) { size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded, it should be full!"); } block = list_last_entry(&blocks, typeof(*block), link); list_del(&block->link); - drm_buddy_free_block(&mm, block); + gpu_buddy_free_block(&mm, block); /* As we free in increasing size, we make available larger blocks */ order = 1; list_for_each_entry_safe(block, bn, &blocks, link) { list_del(&block->link); - drm_buddy_free_block(&mm, block); + gpu_buddy_free_block(&mm, block); size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", order); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_del(&block->link); - drm_buddy_free_block(&mm, block); + gpu_buddy_free_block(&mm, block); order++; } /* To confirm, now the whole mm should be available */ size = 
get_size(max_order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc (realloc) hit -ENOMEM with order=%d\n", max_order); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_del(&block->link); - drm_buddy_free_block(&mm, block); - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_block(&mm, block); + gpu_buddy_free_list(&mm, &blocks, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_optimistic(struct kunit *test) +static void gpu_test_buddy_alloc_optimistic(struct kunit *test) { u64 mm_size, size, start = 0; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; unsigned long flags = 0; const int max_order = 16; - struct drm_buddy mm; + struct gpu_buddy mm; LIST_HEAD(blocks); LIST_HEAD(tmp); int order; @@ -794,19 +794,19 @@ static void drm_test_buddy_alloc_optimistic(struct kunit *test) mm_size = SZ_4K * ((1 << (max_order + 1)) - 1); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); for (order = 0; order <= max_order; order++) { size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", order); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_move_tail(&block->link, &blocks); @@ 
-814,115 +814,115 @@ static void drm_test_buddy_alloc_optimistic(struct kunit *test) /* Should be completely full! */ size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded, it should be full!"); - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &blocks, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_limit(struct kunit *test) +static void gpu_test_buddy_alloc_limit(struct kunit *test) { u64 size = U64_MAX, start = 0; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; unsigned long flags = 0; LIST_HEAD(allocated); - struct drm_buddy mm; + struct gpu_buddy mm; - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, size, SZ_4K)); + KUNIT_EXPECT_FALSE(test, gpu_buddy_init(&mm, size, SZ_4K)); - KUNIT_EXPECT_EQ_MSG(test, mm.max_order, DRM_BUDDY_MAX_ORDER, + KUNIT_EXPECT_EQ_MSG(test, mm.max_order, GPU_BUDDY_MAX_ORDER, "mm.max_order(%d) != %d\n", mm.max_order, - DRM_BUDDY_MAX_ORDER); + GPU_BUDDY_MAX_ORDER); size = mm.chunk_size << mm.max_order; - KUNIT_EXPECT_FALSE(test, drm_buddy_alloc_blocks(&mm, start, size, size, + KUNIT_EXPECT_FALSE(test, gpu_buddy_alloc_blocks(&mm, start, size, size, mm.chunk_size, &allocated, flags)); - block = list_first_entry_or_null(&allocated, struct drm_buddy_block, link); + block = list_first_entry_or_null(&allocated, struct gpu_buddy_block, link); KUNIT_EXPECT_TRUE(test, block); - KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_order(block), mm.max_order, + KUNIT_EXPECT_EQ_MSG(test, gpu_buddy_block_order(block), mm.max_order, "block order(%d) != %d\n", - drm_buddy_block_order(block), mm.max_order); + gpu_buddy_block_order(block), mm.max_order); - KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_size(&mm, block), + KUNIT_EXPECT_EQ_MSG(test, gpu_buddy_block_size(&mm, block), BIT_ULL(mm.max_order) * 
mm.chunk_size, "block size(%llu) != %llu\n", - drm_buddy_block_size(&mm, block), + gpu_buddy_block_size(&mm, block), BIT_ULL(mm.max_order) * mm.chunk_size); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_exceeds_max_order(struct kunit *test) +static void gpu_test_buddy_alloc_exceeds_max_order(struct kunit *test) { u64 mm_size = SZ_8G + SZ_2G, size = SZ_8G + SZ_1G, min_block_size = SZ_8G; - struct drm_buddy mm; + struct gpu_buddy mm; LIST_HEAD(blocks); int err; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); /* CONTIGUOUS allocation should succeed via try_harder fallback */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, size, SZ_4K, &blocks, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc hit an error size=%llu\n", size); - drm_buddy_free_list(&mm, &blocks, 0); + gpu_buddy_free_list(&mm, &blocks, 0); /* Non-CONTIGUOUS with large min_block_size should return -EINVAL */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, 0); + err = gpu_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, 0); KUNIT_EXPECT_EQ(test, err, -EINVAL); /* Non-CONTIGUOUS + RANGE with large min_block_size should return -EINVAL */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, - DRM_BUDDY_RANGE_ALLOCATION); + err = gpu_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, + GPU_BUDDY_RANGE_ALLOCATION); KUNIT_EXPECT_EQ(test, err, -EINVAL); /* CONTIGUOUS + RANGE should return -EINVAL (no try_harder for RANGE) */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, SZ_4K, &blocks, - DRM_BUDDY_CONTIGUOUS_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION); + 
err = gpu_buddy_alloc_blocks(&mm, 0, mm_size, size, SZ_4K, &blocks, + GPU_BUDDY_CONTIGUOUS_ALLOCATION | GPU_BUDDY_RANGE_ALLOCATION); KUNIT_EXPECT_EQ(test, err, -EINVAL); - drm_buddy_fini(&mm); + gpu_buddy_fini(&mm); } -static int drm_buddy_suite_init(struct kunit_suite *suite) +static int gpu_buddy_suite_init(struct kunit_suite *suite) { while (!random_seed) random_seed = get_random_u32(); - kunit_info(suite, "Testing DRM buddy manager, with random_seed=0x%x\n", + kunit_info(suite, "Testing GPU buddy manager, with random_seed=0x%x\n", random_seed); return 0; } -static struct kunit_case drm_buddy_tests[] = { - KUNIT_CASE(drm_test_buddy_alloc_limit), - KUNIT_CASE(drm_test_buddy_alloc_optimistic), - KUNIT_CASE(drm_test_buddy_alloc_pessimistic), - KUNIT_CASE(drm_test_buddy_alloc_pathological), - KUNIT_CASE(drm_test_buddy_alloc_contiguous), - KUNIT_CASE(drm_test_buddy_alloc_clear), - KUNIT_CASE(drm_test_buddy_alloc_range_bias), - KUNIT_CASE(drm_test_buddy_fragmentation_performance), - KUNIT_CASE(drm_test_buddy_alloc_exceeds_max_order), +static struct kunit_case gpu_buddy_tests[] = { + KUNIT_CASE(gpu_test_buddy_alloc_limit), + KUNIT_CASE(gpu_test_buddy_alloc_optimistic), + KUNIT_CASE(gpu_test_buddy_alloc_pessimistic), + KUNIT_CASE(gpu_test_buddy_alloc_pathological), + KUNIT_CASE(gpu_test_buddy_alloc_contiguous), + KUNIT_CASE(gpu_test_buddy_alloc_clear), + KUNIT_CASE(gpu_test_buddy_alloc_range_bias), + KUNIT_CASE(gpu_test_buddy_fragmentation_performance), + KUNIT_CASE(gpu_test_buddy_alloc_exceeds_max_order), {} }; -static struct kunit_suite drm_buddy_test_suite = { - .name = "drm_buddy", - .suite_init = drm_buddy_suite_init, - .test_cases = drm_buddy_tests, +static struct kunit_suite gpu_buddy_test_suite = { + .name = "gpu_buddy", + .suite_init = gpu_buddy_suite_init, + .test_cases = gpu_buddy_tests, }; -kunit_test_suite(drm_buddy_test_suite); +kunit_test_suite(gpu_buddy_test_suite); MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Kunit test for drm_buddy 
functions"); +MODULE_DESCRIPTION("Kunit test for gpu_buddy functions"); MODULE_LICENSE("GPL"); diff --git a/drivers/gpu/tests/gpu_random.c b/drivers/gpu/tests/gpu_random.c index ddd1f594b5d5..6356372f7e52 100644 --- a/drivers/gpu/tests/gpu_random.c +++ b/drivers/gpu/tests/gpu_random.c @@ -8,26 +8,26 @@ #include "gpu_random.h" -u32 drm_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state) +u32 gpu_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state) { return upper_32_bits((u64)prandom_u32_state(state) * ep_ro); } -EXPORT_SYMBOL(drm_prandom_u32_max_state); +EXPORT_SYMBOL(gpu_prandom_u32_max_state); -void drm_random_reorder(unsigned int *order, unsigned int count, +void gpu_random_reorder(unsigned int *order, unsigned int count, struct rnd_state *state) { unsigned int i, j; for (i = 0; i < count; ++i) { BUILD_BUG_ON(sizeof(unsigned int) > sizeof(u32)); - j = drm_prandom_u32_max_state(count, state); + j = gpu_prandom_u32_max_state(count, state); swap(order[i], order[j]); } } -EXPORT_SYMBOL(drm_random_reorder); +EXPORT_SYMBOL(gpu_random_reorder); -unsigned int *drm_random_order(unsigned int count, struct rnd_state *state) +unsigned int *gpu_random_order(unsigned int count, struct rnd_state *state) { unsigned int *order, i; @@ -38,7 +38,7 @@ unsigned int *drm_random_order(unsigned int count, struct rnd_state *state) for (i = 0; i < count; i++) order[i] = i; - drm_random_reorder(order, count, state); + gpu_random_reorder(order, count, state); return order; } -EXPORT_SYMBOL(drm_random_order); +EXPORT_SYMBOL(gpu_random_order); diff --git a/drivers/gpu/tests/gpu_random.h b/drivers/gpu/tests/gpu_random.h index 9f827260a89d..b68cf3448264 100644 --- a/drivers/gpu/tests/gpu_random.h +++ b/drivers/gpu/tests/gpu_random.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __DRM_RANDOM_H__ -#define __DRM_RANDOM_H__ +#ifndef __GPU_RANDOM_H__ +#define __GPU_RANDOM_H__ /* This is a temporary home for a couple of utility functions that should * be transposed to 
lib/ at the earliest convenience. @@ -8,21 +8,21 @@ #include -#define DRM_RND_STATE_INITIALIZER(seed__) ({ \ +#define GPU_RND_STATE_INITIALIZER(seed__) ({ \ struct rnd_state state__; \ prandom_seed_state(&state__, (seed__)); \ state__; \ }) -#define DRM_RND_STATE(name__, seed__) \ - struct rnd_state name__ = DRM_RND_STATE_INITIALIZER(seed__) +#define GPU_RND_STATE(name__, seed__) \ + struct rnd_state name__ = GPU_RND_STATE_INITIALIZER(seed__) -unsigned int *drm_random_order(unsigned int count, +unsigned int *gpu_random_order(unsigned int count, struct rnd_state *state); -void drm_random_reorder(unsigned int *order, +void gpu_random_reorder(unsigned int *order, unsigned int count, struct rnd_state *state); -u32 drm_prandom_u32_max_state(u32 ep_ro, +u32 gpu_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state); -#endif /* !__DRM_RANDOM_H__ */ +#endif /* !__GPU_RANDOM_H__ */ diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index d51777df12d1..0adb1e2fa533 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -37,6 +37,7 @@ source "drivers/char/agp/Kconfig" source "drivers/gpu/vga/Kconfig" +source "drivers/gpu/Kconfig" source "drivers/gpu/host1x/Kconfig" source "drivers/gpu/ipu-v3/Kconfig" source "drivers/gpu/nova-core/Kconfig" diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h new file mode 100644 index 000000000000..3054369bebff --- /dev/null +++ b/include/drm/drm_buddy.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef __DRM_BUDDY_H__ +#define __DRM_BUDDY_H__ + +#include + +struct drm_printer; + +/* DRM-specific GPU Buddy Allocator print helpers */ +void drm_buddy_print(struct gpu_buddy *mm, struct drm_printer *p); +void drm_buddy_block_print(struct gpu_buddy *mm, + struct gpu_buddy_block *block, + struct drm_printer *p); +#endif diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index b909fa8f810a..07ac65db6d2e 100644 --- a/include/linux/gpu_buddy.h 
+++ b/include/linux/gpu_buddy.h @@ -3,8 +3,8 @@ * Copyright © 2021 Intel Corporation */ -#ifndef __DRM_BUDDY_H__ -#define __DRM_BUDDY_H__ +#ifndef __GPU_BUDDY_H__ +#define __GPU_BUDDY_H__ #include #include @@ -12,38 +12,45 @@ #include #include -struct drm_printer; - -#define DRM_BUDDY_RANGE_ALLOCATION BIT(0) -#define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1) -#define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) -#define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) -#define DRM_BUDDY_CLEARED BIT(4) -#define DRM_BUDDY_TRIM_DISABLE BIT(5) - -struct drm_buddy_block { -#define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) -#define DRM_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) -#define DRM_BUDDY_ALLOCATED (1 << 10) -#define DRM_BUDDY_FREE (2 << 10) -#define DRM_BUDDY_SPLIT (3 << 10) -#define DRM_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) +#define GPU_BUDDY_RANGE_ALLOCATION BIT(0) +#define GPU_BUDDY_TOPDOWN_ALLOCATION BIT(1) +#define GPU_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) +#define GPU_BUDDY_CLEAR_ALLOCATION BIT(3) +#define GPU_BUDDY_CLEARED BIT(4) +#define GPU_BUDDY_TRIM_DISABLE BIT(5) + +enum gpu_buddy_free_tree { + GPU_BUDDY_CLEAR_TREE = 0, + GPU_BUDDY_DIRTY_TREE, + GPU_BUDDY_MAX_FREE_TREES, +}; + +#define for_each_free_tree(tree) \ + for ((tree) = 0; (tree) < GPU_BUDDY_MAX_FREE_TREES; (tree)++) + +struct gpu_buddy_block { +#define GPU_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) +#define GPU_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) +#define GPU_BUDDY_ALLOCATED (1 << 10) +#define GPU_BUDDY_FREE (2 << 10) +#define GPU_BUDDY_SPLIT (3 << 10) +#define GPU_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) /* Free to be used, if needed in the future */ -#define DRM_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) -#define DRM_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) +#define GPU_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) +#define GPU_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) u64 header; - struct drm_buddy_block *left; - struct drm_buddy_block *right; - struct drm_buddy_block *parent; + struct gpu_buddy_block *left; + struct gpu_buddy_block *right; + 
struct gpu_buddy_block *parent; void *private; /* owned by creator */ /* - * While the block is allocated by the user through drm_buddy_alloc*, + * While the block is allocated by the user through gpu_buddy_alloc*, * the user has ownership of the link, for example to maintain within * a list, if so desired. As soon as the block is freed with - * drm_buddy_free* ownership is given back to the mm. + * gpu_buddy_free* ownership is given back to the mm. */ union { struct rb_node rb; @@ -54,15 +61,15 @@ struct drm_buddy_block { }; /* Order-zero must be at least SZ_4K */ -#define DRM_BUDDY_MAX_ORDER (63 - 12) +#define GPU_BUDDY_MAX_ORDER (63 - 12) /* * Binary Buddy System. * * Locking should be handled by the user, a simple mutex around - * drm_buddy_alloc* and drm_buddy_free* should suffice. + * gpu_buddy_alloc* and gpu_buddy_free* should suffice. */ -struct drm_buddy { +struct gpu_buddy { /* Maintain a free list for each order. */ struct rb_root **free_trees; @@ -73,7 +80,7 @@ struct drm_buddy { * block. Nodes are either allocated or free, in which case they will * also exist on the respective free list. 
*/ - struct drm_buddy_block **roots; + struct gpu_buddy_block **roots; /* * Anything from here is public, and remains static for the lifetime of @@ -90,82 +97,81 @@ struct drm_buddy { }; static inline u64 -drm_buddy_block_offset(const struct drm_buddy_block *block) +gpu_buddy_block_offset(const struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_OFFSET; + return block->header & GPU_BUDDY_HEADER_OFFSET; } static inline unsigned int -drm_buddy_block_order(struct drm_buddy_block *block) +gpu_buddy_block_order(struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_ORDER; + return block->header & GPU_BUDDY_HEADER_ORDER; } static inline unsigned int -drm_buddy_block_state(struct drm_buddy_block *block) +gpu_buddy_block_state(struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_STATE; + return block->header & GPU_BUDDY_HEADER_STATE; } static inline bool -drm_buddy_block_is_allocated(struct drm_buddy_block *block) +gpu_buddy_block_is_allocated(struct gpu_buddy_block *block) { - return drm_buddy_block_state(block) == DRM_BUDDY_ALLOCATED; + return gpu_buddy_block_state(block) == GPU_BUDDY_ALLOCATED; } static inline bool -drm_buddy_block_is_clear(struct drm_buddy_block *block) +gpu_buddy_block_is_clear(struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_CLEAR; + return block->header & GPU_BUDDY_HEADER_CLEAR; } static inline bool -drm_buddy_block_is_free(struct drm_buddy_block *block) +gpu_buddy_block_is_free(struct gpu_buddy_block *block) { - return drm_buddy_block_state(block) == DRM_BUDDY_FREE; + return gpu_buddy_block_state(block) == GPU_BUDDY_FREE; } static inline bool -drm_buddy_block_is_split(struct drm_buddy_block *block) +gpu_buddy_block_is_split(struct gpu_buddy_block *block) { - return drm_buddy_block_state(block) == DRM_BUDDY_SPLIT; + return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT; } static inline u64 -drm_buddy_block_size(struct drm_buddy *mm, - struct drm_buddy_block 
*block) +gpu_buddy_block_size(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - return mm->chunk_size << drm_buddy_block_order(block); + return mm->chunk_size << gpu_buddy_block_order(block); } -int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size); +int gpu_buddy_init(struct gpu_buddy *mm, u64 size, u64 chunk_size); -void drm_buddy_fini(struct drm_buddy *mm); +void gpu_buddy_fini(struct gpu_buddy *mm); -struct drm_buddy_block * -drm_get_buddy(struct drm_buddy_block *block); +struct gpu_buddy_block * +gpu_get_buddy(struct gpu_buddy_block *block); -int drm_buddy_alloc_blocks(struct drm_buddy *mm, +int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, u64 size, u64 min_page_size, struct list_head *blocks, unsigned long flags); -int drm_buddy_block_trim(struct drm_buddy *mm, +int gpu_buddy_block_trim(struct gpu_buddy *mm, u64 *start, u64 new_size, struct list_head *blocks); -void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear); +void gpu_buddy_reset_clear(struct gpu_buddy *mm, bool is_clear); -void drm_buddy_free_block(struct drm_buddy *mm, struct drm_buddy_block *block); +void gpu_buddy_free_block(struct gpu_buddy *mm, struct gpu_buddy_block *block); -void drm_buddy_free_list(struct drm_buddy *mm, +void gpu_buddy_free_list(struct gpu_buddy *mm, struct list_head *objects, unsigned int flags); -void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p); -void drm_buddy_block_print(struct drm_buddy *mm, - struct drm_buddy_block *block, - struct drm_printer *p); +void gpu_buddy_print(struct gpu_buddy *mm); +void gpu_buddy_block_print(struct gpu_buddy *mm, + struct gpu_buddy_block *block); #endif -- cgit v1.2.3 From 6d438685340df6ac8570326aaa51c3603a2fe25c Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 5 Feb 2026 15:10:44 +0100 Subject: drm/fbdev-emulation: Remove empty placeholders Only DRM clients for fbdev emulation invoke fbdev helpers. 
Hence remove the empty placeholders for non-fbdev builds, as they are unused. Signed-off-by: Thomas Zimmermann Reviewed-by: Maarten Lankhorst Link: https://patch.msgid.link/20260205141142.412048-1-tzimmermann@suse.de --- include/drm/drm_fb_helper.h | 105 -------------------------------------------- 1 file changed, 105 deletions(-) (limited to 'include') diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h index 05cca77b7249..15274b8a1d97 100644 --- a/include/drm/drm_fb_helper.h +++ b/include/drm/drm_fb_helper.h @@ -271,111 +271,6 @@ int drm_fb_helper_ioctl(struct fb_info *info, unsigned int cmd, int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper); int drm_fb_helper_initial_config(struct drm_fb_helper *fb_helper); -#else -static inline void drm_fb_helper_prepare(struct drm_device *dev, - struct drm_fb_helper *helper, - unsigned int preferred_bpp, - const struct drm_fb_helper_funcs *funcs) -{ -} - -static inline void drm_fb_helper_unprepare(struct drm_fb_helper *fb_helper) -{ -} - -static inline int drm_fb_helper_init(struct drm_device *dev, - struct drm_fb_helper *helper) -{ - /* So drivers can use it to free the struct */ - helper->dev = dev; - dev->fb_helper = helper; - - return 0; -} - -static inline void drm_fb_helper_fini(struct drm_fb_helper *helper) -{ - if (helper && helper->dev) - helper->dev->fb_helper = NULL; -} - -static inline int drm_fb_helper_blank(int blank, struct fb_info *info) -{ - return 0; -} - -static inline int drm_fb_helper_pan_display(struct fb_var_screeninfo *var, - struct fb_info *info) -{ - return 0; -} - -static inline int drm_fb_helper_set_par(struct fb_info *info) -{ - return 0; -} - -static inline int drm_fb_helper_check_var(struct fb_var_screeninfo *var, - struct fb_info *info) -{ - return 0; -} - -static inline int -drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) -{ - return 0; -} - -static inline void drm_fb_helper_unregister_info(struct drm_fb_helper *fb_helper) -{ -} - 
-static inline void -drm_fb_helper_fill_info(struct fb_info *info, - struct drm_fb_helper *fb_helper, - struct drm_fb_helper_surface_size *sizes) -{ -} - -static inline int drm_fb_helper_setcmap(struct fb_cmap *cmap, - struct fb_info *info) -{ - return 0; -} - -static inline int drm_fb_helper_ioctl(struct fb_info *info, unsigned int cmd, - unsigned long arg) -{ - return 0; -} - -#ifdef CONFIG_FB_DEFERRED_IO -static inline void drm_fb_helper_deferred_io(struct fb_info *info, - struct list_head *pagelist) -{ -} -#endif - -static inline void drm_fb_helper_set_suspend(struct drm_fb_helper *fb_helper, - bool suspend) -{ -} - -static inline void -drm_fb_helper_set_suspend_unlocked(struct drm_fb_helper *fb_helper, bool suspend) -{ -} - -static inline int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper) -{ - return 0; -} - -static inline int drm_fb_helper_initial_config(struct drm_fb_helper *fb_helper) -{ - return 0; -} #endif #endif -- cgit v1.2.3 From dc90ead44054736131f73b1dd319b8be06088d36 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 30 Jan 2026 12:51:06 +0000 Subject: drm/xe/uapi: update used tracking kernel-doc In commit 4d0b035fd6da ("drm/xe/uapi: loosen used tracking restriction") we dropped the CAP_PERMON restriction but missed updating the corresponding kernel-doc. Fix that. v2 (Sanjay): - Don't drop the note around the extra cpu_visible_used expectations. 
Reported-by: Ulisses Furquim Fixes: 4d0b035fd6da ("drm/xe/uapi: loosen used tracking restriction") Signed-off-by: Matthew Auld Cc: Sanjay Yadav Reviewed-by: Sanjay Yadav Link: https://patch.msgid.link/20260130125105.451229-2-matthew.auld@intel.com --- include/uapi/drm/xe_drm.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 077e66a682e2..c9e70f78e723 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -335,10 +335,6 @@ struct drm_xe_mem_region { __u64 total_size; /** * @used: Estimate of the memory used in bytes for this region. - * - * Requires CAP_PERFMON or CAP_SYS_ADMIN to get reliable - * accounting. Without this the value here will always equal - * zero. */ __u64 used; /** @@ -363,9 +359,7 @@ struct drm_xe_mem_region { * @cpu_visible_used: Estimate of CPU visible memory used, in * bytes. * - * Requires CAP_PERFMON or CAP_SYS_ADMIN to get reliable - * accounting. Without this the value here will always equal - * zero. Note this is only currently tracked for + * Note this is only currently tracked for * DRM_XE_MEM_REGION_CLASS_VRAM regions (for other types the value * here will always be zero). */ -- cgit v1.2.3 From 62918542b7bf08860a60ebbde7654486e0ac0776 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Mon, 16 Jun 2025 16:59:52 +0100 Subject: dma-fence: Fix sparse warnings due __rcu annotations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __rcu annotations on the return types from dma_fence_driver_name() and dma_fence_timeline_name() cause sparse to complain because both the constant signaled strings, and the strings returned by the dma_fence_ops are not __rcu annotated. For a simple fix it is easiest to cast them with __rcu added and undo the smarts from the tracepoints side of things. There is no functional change since the rest is left in place. 
Later we can consider changing the dma_fence_ops return types too, and handle all the individual drivers which define them. Signed-off-by: Tvrtko Ursulin Fixes: 506aa8b02a8d ("dma-fence: Add safe access helpers and document the rules") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506162214.1eA69hLe-lkp@intel.com/ Reviewed-by: Christian König Link: https://lore.kernel.org/r/20250616155952.24259-1-tvrtko.ursulin@igalia.com Signed-off-by: Christian König --- drivers/dma-buf/dma-fence.c | 8 ++++---- include/trace/events/dma_fence.h | 35 +++++------------------------------ 2 files changed, 9 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index c9a036b0d592..e05beae6e407 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -1133,9 +1133,9 @@ const char __rcu *dma_fence_driver_name(struct dma_fence *fence) "RCU protection is required for safe access to returned string"); if (!dma_fence_test_signaled_flag(fence)) - return fence->ops->get_driver_name(fence); + return (const char __rcu *)fence->ops->get_driver_name(fence); else - return "detached-driver"; + return (const char __rcu *)"detached-driver"; } EXPORT_SYMBOL(dma_fence_driver_name); @@ -1165,8 +1165,8 @@ const char __rcu *dma_fence_timeline_name(struct dma_fence *fence) "RCU protection is required for safe access to returned string"); if (!dma_fence_test_signaled_flag(fence)) - return fence->ops->get_timeline_name(fence); + return (const char __rcu *)fence->ops->get_timeline_name(fence); else - return "signaled-timeline"; + return (const char __rcu *)"signaled-timeline"; } EXPORT_SYMBOL(dma_fence_timeline_name); diff --git a/include/trace/events/dma_fence.h b/include/trace/events/dma_fence.h index 4814a65b68dc..3abba45c0601 100644 --- a/include/trace/events/dma_fence.h +++ b/include/trace/events/dma_fence.h @@ -9,37 +9,12 @@ struct dma_fence; -DECLARE_EVENT_CLASS(dma_fence, - - 
TP_PROTO(struct dma_fence *fence), - - TP_ARGS(fence), - - TP_STRUCT__entry( - __string(driver, dma_fence_driver_name(fence)) - __string(timeline, dma_fence_timeline_name(fence)) - __field(unsigned int, context) - __field(unsigned int, seqno) - ), - - TP_fast_assign( - __assign_str(driver); - __assign_str(timeline); - __entry->context = fence->context; - __entry->seqno = fence->seqno; - ), - - TP_printk("driver=%s timeline=%s context=%u seqno=%u", - __get_str(driver), __get_str(timeline), __entry->context, - __entry->seqno) ); /* * Safe only for call sites which are guaranteed to not race with fence * signaling, holding the fence->lock and having checked for not signaled, or the * signaling path itself. */ -DECLARE_EVENT_CLASS(dma_fence_unsignaled, +DECLARE_EVENT_CLASS(dma_fence, TP_PROTO(struct dma_fence *fence), @@ -64,14 +39,14 @@ DECLARE_EVENT_CLASS(dma_fence_unsignaled, __entry->seqno) ); -DEFINE_EVENT(dma_fence_unsignaled, dma_fence_emit, +DEFINE_EVENT(dma_fence, dma_fence_emit, TP_PROTO(struct dma_fence *fence), TP_ARGS(fence) ); -DEFINE_EVENT(dma_fence_unsignaled, dma_fence_init, +DEFINE_EVENT(dma_fence, dma_fence_init, TP_PROTO(struct dma_fence *fence), @@ -85,14 +60,14 @@ DEFINE_EVENT(dma_fence, dma_fence_destroy, TP_ARGS(fence) ); -DEFINE_EVENT(dma_fence_unsignaled, dma_fence_enable_signal, +DEFINE_EVENT(dma_fence, dma_fence_enable_signal, TP_PROTO(struct dma_fence *fence), TP_ARGS(fence) ); -DEFINE_EVENT(dma_fence_unsignaled, dma_fence_signaled, +DEFINE_EVENT(dma_fence, dma_fence_signaled, TP_PROTO(struct dma_fence *fence), -- cgit v1.2.3 From 24a4241995ab7456c6751e0bd63382a95e70757f Mon Sep 17 00:00:00 2001 From: Chaitanya Kumar Borah Date: Mon, 2 Feb 2026 15:11:54 +0530 Subject: drm/colorop: Add destroy helper for colorop objects Add a helper that performs common cleanup and frees the associated object. This can be used by drivers if they do not require any driver-specific teardown. 
v2: - Add function documentation only before definition (Jani) Signed-off-by: Chaitanya Kumar Borah Reviewed-by: Suraj Kandpal Reviewed-by: Uma Shankar Reviewed-by: Alex Hung Acked-by: Jani Nikula Signed-off-by: Suraj Kandpal Link: https://patch.msgid.link/20260202094202.2871478-2-chaitanya.kumar.borah@intel.com --- drivers/gpu/drm/drm_colorop.c | 15 +++++++++++++++ include/drm/drm_colorop.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/drm_colorop.c b/drivers/gpu/drm/drm_colorop.c index 44eb823585d2..c226870fde9e 100644 --- a/drivers/gpu/drm/drm_colorop.c +++ b/drivers/gpu/drm/drm_colorop.c @@ -178,6 +178,21 @@ void drm_colorop_cleanup(struct drm_colorop *colorop) } EXPORT_SYMBOL(drm_colorop_cleanup); +/** + * drm_colorop_destroy - destroy colorop + * @colorop: drm colorop + * + * Destroys @colorop by performing common DRM cleanup and freeing the + * colorop object. This can be used by drivers if they do not + * require any driver-specific teardown. 
+ */ +void drm_colorop_destroy(struct drm_colorop *colorop) +{ + drm_colorop_cleanup(colorop); + kfree(colorop); +} +EXPORT_SYMBOL(drm_colorop_destroy); + /** * drm_colorop_pipeline_destroy - Helper for color pipeline destruction * diff --git a/include/drm/drm_colorop.h b/include/drm/drm_colorop.h index a3a32f9f918c..3056f3f02597 100644 --- a/include/drm/drm_colorop.h +++ b/include/drm/drm_colorop.h @@ -420,6 +420,8 @@ void drm_colorop_atomic_destroy_state(struct drm_colorop *colorop, */ void drm_colorop_reset(struct drm_colorop *colorop); +void drm_colorop_destroy(struct drm_colorop *colorop); + /** * drm_colorop_index - find the index of a registered colorop * @colorop: colorop to find index for -- cgit v1.2.3 From 2864667476a40525511a1e854bcfa7c90392a990 Mon Sep 17 00:00:00 2001 From: Chaitanya Kumar Borah Date: Mon, 2 Feb 2026 15:11:55 +0530 Subject: drm: Allow driver-managed destruction of colorop objects Some drivers might want to embed struct drm_colorop inside driver-specific objects, similar to planes or CRTCs. In such cases, freeing only the drm_colorop is incorrect. Add a drm_colorop_funcs callback to allow drivers to provide a destroy hook that cleans up the full enclosing object. Make changes in helper functions to accept helper functions as argument. Pass NULL for now to retain current behavior. 
Signed-off-by: Chaitanya Kumar Borah Reviewed-by: Suraj Kandpal Reviewed-by: Uma Shankar Reviewed-by: Alex Hung Acked-by: Jani Nikula Signed-off-by: Suraj Kandpal Link: https://patch.msgid.link/20260202094202.2871478-3-chaitanya.kumar.borah@intel.com --- .../drm/amd/display/amdgpu_dm/amdgpu_dm_colorop.c | 18 +++++++------ drivers/gpu/drm/drm_colorop.c | 31 +++++++++++++++------- .../gpu/drm/i915/display/intel_color_pipeline.c | 8 +++--- drivers/gpu/drm/vkms/vkms_colorop.c | 10 ++++--- include/drm/drm_colorop.h | 30 +++++++++++++++++---- 5 files changed, 66 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_colorop.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_colorop.c index a2de3bba8346..dfdb4fb4219f 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_colorop.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_colorop.c @@ -72,7 +72,7 @@ int amdgpu_dm_initialize_default_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = drm_plane_colorop_curve_1d_init(dev, ops[i], plane, + ret = drm_plane_colorop_curve_1d_init(dev, ops[i], plane, NULL, amdgpu_dm_supported_degam_tfs, DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) @@ -89,7 +89,7 @@ int amdgpu_dm_initialize_default_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = drm_plane_colorop_mult_init(dev, ops[i], plane, DRM_COLOROP_FLAG_ALLOW_BYPASS); + ret = drm_plane_colorop_mult_init(dev, ops[i], plane, NULL, DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) goto cleanup; @@ -104,7 +104,8 @@ int amdgpu_dm_initialize_default_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = drm_plane_colorop_ctm_3x4_init(dev, ops[i], plane, DRM_COLOROP_FLAG_ALLOW_BYPASS); + ret = drm_plane_colorop_ctm_3x4_init(dev, ops[i], plane, NULL, + DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) goto cleanup; @@ -120,7 +121,7 @@ int amdgpu_dm_initialize_default_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - 
ret = drm_plane_colorop_curve_1d_init(dev, ops[i], plane, + ret = drm_plane_colorop_curve_1d_init(dev, ops[i], plane, NULL, amdgpu_dm_supported_shaper_tfs, DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) @@ -137,7 +138,8 @@ int amdgpu_dm_initialize_default_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = drm_plane_colorop_curve_1d_lut_init(dev, ops[i], plane, MAX_COLOR_LUT_ENTRIES, + ret = drm_plane_colorop_curve_1d_lut_init(dev, ops[i], plane, NULL, + MAX_COLOR_LUT_ENTRIES, DRM_COLOROP_LUT1D_INTERPOLATION_LINEAR, DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) @@ -154,7 +156,7 @@ int amdgpu_dm_initialize_default_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = drm_plane_colorop_3dlut_init(dev, ops[i], plane, LUT3D_SIZE, + ret = drm_plane_colorop_3dlut_init(dev, ops[i], plane, NULL, LUT3D_SIZE, DRM_COLOROP_LUT3D_INTERPOLATION_TETRAHEDRAL, DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) @@ -172,7 +174,7 @@ int amdgpu_dm_initialize_default_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = drm_plane_colorop_curve_1d_init(dev, ops[i], plane, + ret = drm_plane_colorop_curve_1d_init(dev, ops[i], plane, NULL, amdgpu_dm_supported_blnd_tfs, DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) @@ -189,7 +191,7 @@ int amdgpu_dm_initialize_default_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = drm_plane_colorop_curve_1d_lut_init(dev, ops[i], plane, MAX_COLOR_LUT_ENTRIES, + ret = drm_plane_colorop_curve_1d_lut_init(dev, ops[i], plane, NULL, MAX_COLOR_LUT_ENTRIES, DRM_COLOROP_LUT1D_INTERPOLATION_LINEAR, DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) diff --git a/drivers/gpu/drm/drm_colorop.c b/drivers/gpu/drm/drm_colorop.c index c226870fde9e..2bce29176ab3 100644 --- a/drivers/gpu/drm/drm_colorop.c +++ b/drivers/gpu/drm/drm_colorop.c @@ -93,7 +93,8 @@ static const struct drm_prop_enum_list drm_colorop_lut3d_interpolation_list[] = /* Init Helpers */ static int drm_plane_colorop_init(struct drm_device *dev, struct drm_colorop 
*colorop, - struct drm_plane *plane, enum drm_colorop_type type, + struct drm_plane *plane, const struct drm_colorop_funcs *funcs, + enum drm_colorop_type type, uint32_t flags) { struct drm_mode_config *config = &dev->mode_config; @@ -109,6 +110,7 @@ static int drm_plane_colorop_init(struct drm_device *dev, struct drm_colorop *co colorop->type = type; colorop->plane = plane; colorop->next = NULL; + colorop->funcs = funcs; list_add_tail(&colorop->head, &config->colorop_list); colorop->index = config->num_colorop++; @@ -218,6 +220,7 @@ EXPORT_SYMBOL(drm_colorop_pipeline_destroy); * @dev: DRM device * @colorop: The drm_colorop object to initialize * @plane: The associated drm_plane + * @funcs: control functions for the new colorop * @supported_tfs: A bitfield of supported drm_plane_colorop_curve_1d_init enum values, * created using BIT(curve_type) and combined with the OR '|' * operator. @@ -225,7 +228,8 @@ EXPORT_SYMBOL(drm_colorop_pipeline_destroy); * @return zero on success, -E value on failure */ int drm_plane_colorop_curve_1d_init(struct drm_device *dev, struct drm_colorop *colorop, - struct drm_plane *plane, u64 supported_tfs, uint32_t flags) + struct drm_plane *plane, const struct drm_colorop_funcs *funcs, + u64 supported_tfs, uint32_t flags) { struct drm_prop_enum_list enum_list[DRM_COLOROP_1D_CURVE_COUNT]; int i, len; @@ -246,7 +250,7 @@ int drm_plane_colorop_curve_1d_init(struct drm_device *dev, struct drm_colorop * return -EINVAL; } - ret = drm_plane_colorop_init(dev, colorop, plane, DRM_COLOROP_1D_CURVE, flags); + ret = drm_plane_colorop_init(dev, colorop, plane, funcs, DRM_COLOROP_1D_CURVE, flags); if (ret) return ret; @@ -303,20 +307,23 @@ static int drm_colorop_create_data_prop(struct drm_device *dev, struct drm_color * @dev: DRM device * @colorop: The drm_colorop object to initialize * @plane: The associated drm_plane + * @funcs: control functions for new colorop * @lut_size: LUT size supported by driver * @interpolation: 1D LUT interpolation type * 
@flags: bitmask of misc, see DRM_COLOROP_FLAG_* defines. * @return zero on success, -E value on failure */ int drm_plane_colorop_curve_1d_lut_init(struct drm_device *dev, struct drm_colorop *colorop, - struct drm_plane *plane, uint32_t lut_size, + struct drm_plane *plane, + const struct drm_colorop_funcs *funcs, + uint32_t lut_size, enum drm_colorop_lut1d_interpolation_type interpolation, uint32_t flags) { struct drm_property *prop; int ret; - ret = drm_plane_colorop_init(dev, colorop, plane, DRM_COLOROP_1D_LUT, flags); + ret = drm_plane_colorop_init(dev, colorop, plane, funcs, DRM_COLOROP_1D_LUT, flags); if (ret) return ret; @@ -354,11 +361,12 @@ int drm_plane_colorop_curve_1d_lut_init(struct drm_device *dev, struct drm_color EXPORT_SYMBOL(drm_plane_colorop_curve_1d_lut_init); int drm_plane_colorop_ctm_3x4_init(struct drm_device *dev, struct drm_colorop *colorop, - struct drm_plane *plane, uint32_t flags) + struct drm_plane *plane, const struct drm_colorop_funcs *funcs, + uint32_t flags) { int ret; - ret = drm_plane_colorop_init(dev, colorop, plane, DRM_COLOROP_CTM_3X4, flags); + ret = drm_plane_colorop_init(dev, colorop, plane, funcs, DRM_COLOROP_CTM_3X4, flags); if (ret) return ret; @@ -378,16 +386,18 @@ EXPORT_SYMBOL(drm_plane_colorop_ctm_3x4_init); * @dev: DRM device * @colorop: The drm_colorop object to initialize * @plane: The associated drm_plane + * @funcs: control functions for the new colorop * @flags: bitmask of misc, see DRM_COLOROP_FLAG_* defines. 
* @return zero on success, -E value on failure */ int drm_plane_colorop_mult_init(struct drm_device *dev, struct drm_colorop *colorop, - struct drm_plane *plane, uint32_t flags) + struct drm_plane *plane, const struct drm_colorop_funcs *funcs, + uint32_t flags) { struct drm_property *prop; int ret; - ret = drm_plane_colorop_init(dev, colorop, plane, DRM_COLOROP_MULTIPLIER, flags); + ret = drm_plane_colorop_init(dev, colorop, plane, funcs, DRM_COLOROP_MULTIPLIER, flags); if (ret) return ret; @@ -406,6 +416,7 @@ EXPORT_SYMBOL(drm_plane_colorop_mult_init); int drm_plane_colorop_3dlut_init(struct drm_device *dev, struct drm_colorop *colorop, struct drm_plane *plane, + const struct drm_colorop_funcs *funcs, uint32_t lut_size, enum drm_colorop_lut3d_interpolation_type interpolation, uint32_t flags) @@ -413,7 +424,7 @@ int drm_plane_colorop_3dlut_init(struct drm_device *dev, struct drm_colorop *col struct drm_property *prop; int ret; - ret = drm_plane_colorop_init(dev, colorop, plane, DRM_COLOROP_3D_LUT, flags); + ret = drm_plane_colorop_init(dev, colorop, plane, funcs, DRM_COLOROP_3D_LUT, flags); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/display/intel_color_pipeline.c b/drivers/gpu/drm/i915/display/intel_color_pipeline.c index 04af552b3648..d3d73d60727c 100644 --- a/drivers/gpu/drm/i915/display/intel_color_pipeline.c +++ b/drivers/gpu/drm/i915/display/intel_color_pipeline.c @@ -25,7 +25,7 @@ int _intel_color_pipeline_plane_init(struct drm_plane *plane, struct drm_prop_en colorop = intel_colorop_create(INTEL_PLANE_CB_PRE_CSC_LUT); - ret = drm_plane_colorop_curve_1d_lut_init(dev, &colorop->base, plane, + ret = drm_plane_colorop_curve_1d_lut_init(dev, &colorop->base, plane, NULL, PLANE_DEGAMMA_SIZE, DRM_COLOROP_LUT1D_INTERPOLATION_LINEAR, DRM_COLOROP_FLAG_ALLOW_BYPASS); @@ -39,7 +39,7 @@ int _intel_color_pipeline_plane_init(struct drm_plane *plane, struct drm_prop_en prev_op = &colorop->base; colorop = intel_colorop_create(INTEL_PLANE_CB_CSC); - ret = 
drm_plane_colorop_ctm_3x4_init(dev, &colorop->base, plane, + ret = drm_plane_colorop_ctm_3x4_init(dev, &colorop->base, plane, NULL, DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) return ret; @@ -52,7 +52,7 @@ int _intel_color_pipeline_plane_init(struct drm_plane *plane, struct drm_prop_en plane->type == DRM_PLANE_TYPE_PRIMARY) { colorop = intel_colorop_create(INTEL_PLANE_CB_3DLUT); - ret = drm_plane_colorop_3dlut_init(dev, &colorop->base, plane, 17, + ret = drm_plane_colorop_3dlut_init(dev, &colorop->base, plane, NULL, 17, DRM_COLOROP_LUT3D_INTERPOLATION_TETRAHEDRAL, true); if (ret) @@ -64,7 +64,7 @@ int _intel_color_pipeline_plane_init(struct drm_plane *plane, struct drm_prop_en } colorop = intel_colorop_create(INTEL_PLANE_CB_POST_CSC_LUT); - ret = drm_plane_colorop_curve_1d_lut_init(dev, &colorop->base, plane, + ret = drm_plane_colorop_curve_1d_lut_init(dev, &colorop->base, plane, NULL, PLANE_GAMMA_SIZE, DRM_COLOROP_LUT1D_INTERPOLATION_LINEAR, DRM_COLOROP_FLAG_ALLOW_BYPASS); diff --git a/drivers/gpu/drm/vkms/vkms_colorop.c b/drivers/gpu/drm/vkms/vkms_colorop.c index d03a1f2e9c41..9e9dd0494628 100644 --- a/drivers/gpu/drm/vkms/vkms_colorop.c +++ b/drivers/gpu/drm/vkms/vkms_colorop.c @@ -31,7 +31,7 @@ static int vkms_initialize_color_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = drm_plane_colorop_curve_1d_init(dev, ops[i], plane, supported_tfs, + ret = drm_plane_colorop_curve_1d_init(dev, ops[i], plane, NULL, supported_tfs, DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) goto cleanup; @@ -48,7 +48,8 @@ static int vkms_initialize_color_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = drm_plane_colorop_ctm_3x4_init(dev, ops[i], plane, DRM_COLOROP_FLAG_ALLOW_BYPASS); + ret = drm_plane_colorop_ctm_3x4_init(dev, ops[i], plane, NULL, + DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) goto cleanup; @@ -64,7 +65,8 @@ static int vkms_initialize_color_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = 
drm_plane_colorop_ctm_3x4_init(dev, ops[i], plane, DRM_COLOROP_FLAG_ALLOW_BYPASS); + ret = drm_plane_colorop_ctm_3x4_init(dev, ops[i], plane, NULL, + DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) goto cleanup; @@ -80,7 +82,7 @@ static int vkms_initialize_color_pipeline(struct drm_plane *plane, struct drm_pr goto cleanup; } - ret = drm_plane_colorop_curve_1d_init(dev, ops[i], plane, supported_tfs, + ret = drm_plane_colorop_curve_1d_init(dev, ops[i], plane, NULL, supported_tfs, DRM_COLOROP_FLAG_ALLOW_BYPASS); if (ret) goto cleanup; diff --git a/include/drm/drm_colorop.h b/include/drm/drm_colorop.h index 3056f3f02597..bd082854ca74 100644 --- a/include/drm/drm_colorop.h +++ b/include/drm/drm_colorop.h @@ -187,6 +187,19 @@ struct drm_colorop_state { struct drm_atomic_state *state; }; +/** + * struct drm_colorop_funcs - driver colorop control functions + */ +struct drm_colorop_funcs { + /** + * @destroy: + * + * Clean up colorop resources. This is called at driver unload time + * through drm_mode_config_cleanup() + */ + void (*destroy)(struct drm_colorop *colorop); +}; + /** * struct drm_colorop - DRM color operation control structure * @@ -362,6 +375,8 @@ struct drm_colorop { */ struct drm_property *next_property; + /** @funcs: colorop control functions */ + const struct drm_colorop_funcs *funcs; }; #define obj_to_colorop(x) container_of(x, struct drm_colorop, base) @@ -390,17 +405,22 @@ void drm_colorop_pipeline_destroy(struct drm_device *dev); void drm_colorop_cleanup(struct drm_colorop *colorop); int drm_plane_colorop_curve_1d_init(struct drm_device *dev, struct drm_colorop *colorop, - struct drm_plane *plane, u64 supported_tfs, uint32_t flags); + struct drm_plane *plane, const struct drm_colorop_funcs *funcs, + u64 supported_tfs, uint32_t flags); int drm_plane_colorop_curve_1d_lut_init(struct drm_device *dev, struct drm_colorop *colorop, - struct drm_plane *plane, uint32_t lut_size, + struct drm_plane *plane, + const struct drm_colorop_funcs *funcs, + uint32_t lut_size, 
enum drm_colorop_lut1d_interpolation_type interpolation, uint32_t flags); int drm_plane_colorop_ctm_3x4_init(struct drm_device *dev, struct drm_colorop *colorop, - struct drm_plane *plane, uint32_t flags); + struct drm_plane *plane, const struct drm_colorop_funcs *funcs, + uint32_t flags); int drm_plane_colorop_mult_init(struct drm_device *dev, struct drm_colorop *colorop, - struct drm_plane *plane, uint32_t flags); + struct drm_plane *plane, const struct drm_colorop_funcs *funcs, + uint32_t flags); int drm_plane_colorop_3dlut_init(struct drm_device *dev, struct drm_colorop *colorop, - struct drm_plane *plane, + struct drm_plane *plane, const struct drm_colorop_funcs *funcs, uint32_t lut_size, enum drm_colorop_lut3d_interpolation_type interpolation, uint32_t flags); -- cgit v1.2.3 From 95ffa10056b33bf5a90090b02da2edd52e1e281c Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 28 Jan 2026 13:43:45 +0100 Subject: drm/atomic: Make drm_atomic_private_obj_init fallible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we're going to move the drm_private_obj state allocation to a callback, we need to be able to deal with its possible failure. Make drm_private_obj_init return an error code on failure. Suggested-by: Ville Syrjälä Reviewed-by: Thomas Zimmermann Link: https://patch.msgid.link/20260128-drm-private-obj-reset-v4-1-90891fa3d3b0@redhat.com Signed-off-by: Maxime Ripard --- drivers/gpu/drm/drm_atomic.c | 14 +++++++++----- include/drm/drm_atomic.h | 8 ++++---- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 52738b80ddbe..4191a8333fc4 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -927,12 +927,14 @@ static void drm_atomic_plane_print_state(struct drm_printer *p, * * Initialize the private object, which can be embedded into any * driver private object that needs its own atomic state. 
+ * + * RETURNS: + * Zero on success, error code on failure */ -void -drm_atomic_private_obj_init(struct drm_device *dev, - struct drm_private_obj *obj, - struct drm_private_state *state, - const struct drm_private_state_funcs *funcs) +int drm_atomic_private_obj_init(struct drm_device *dev, + struct drm_private_obj *obj, + struct drm_private_state *state, + const struct drm_private_state_funcs *funcs) { memset(obj, 0, sizeof(*obj)); @@ -944,6 +946,8 @@ drm_atomic_private_obj_init(struct drm_device *dev, list_add_tail(&obj->head, &dev->mode_config.privobj_list); state->obj = obj; + + return 0; } EXPORT_SYMBOL(drm_atomic_private_obj_init); diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index 178f8f62c80f..712f5fb977bf 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -723,10 +723,10 @@ struct drm_connector_state * __must_check drm_atomic_get_connector_state(struct drm_atomic_state *state, struct drm_connector *connector); -void drm_atomic_private_obj_init(struct drm_device *dev, - struct drm_private_obj *obj, - struct drm_private_state *state, - const struct drm_private_state_funcs *funcs); +int drm_atomic_private_obj_init(struct drm_device *dev, + struct drm_private_obj *obj, + struct drm_private_state *state, + const struct drm_private_state_funcs *funcs); void drm_atomic_private_obj_fini(struct drm_private_obj *obj); struct drm_private_state * __must_check -- cgit v1.2.3 From 47b5ac7daa46e2bc8e4916d856fdc036ac145bb6 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 28 Jan 2026 13:43:46 +0100 Subject: drm/atomic: Add new atomic_create_state callback to drm_private_obj The drm_private_obj initialization was inconsistent with the rest of the KMS objects. Indeed, it required to pass a preallocated state in drm_private_obj_init(), while all the others objects would have a reset callback that would be called later on to create the state. However, reset really is meant to reset the hardware and software state. 
That it creates an initial state is a side-effect that has been used in all objects but drm_private_obj. This is made more complex since some drm_private_obj, the DisplayPort ones in particular, need to be persistent across a suspend/resume cycle, and such a cycle would call drm_mode_config_reset(). Thus, we need to add a new callback to allocate a pristine state for a given private object. This discussion has also come up during the atomic state readout discussion, so it might be introduced into the other objects later on. Until all drivers are converted to that new allocation pattern, we will only call it if the passed state is NULL. This will be removed eventually. Reviewed-by: Dmitry Baryshkov Reviewed-by: Thomas Zimmermann Link: https://patch.msgid.link/20260128-drm-private-obj-reset-v4-2-90891fa3d3b0@redhat.com Signed-off-by: Maxime Ripard --- drivers/gpu/drm/drm_atomic.c | 18 ++++++++++++++++-- include/drm/drm_atomic.h | 13 +++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 4191a8333fc4..e3029c8f02e5 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -941,11 +941,25 @@ int drm_atomic_private_obj_init(struct drm_device *dev, drm_modeset_lock_init(&obj->lock); obj->dev = dev; - obj->state = state; obj->funcs = funcs; list_add_tail(&obj->head, &dev->mode_config.privobj_list); - state->obj = obj; + /* + * Not all users of drm_atomic_private_obj_init have been + * converted to using &drm_private_state_funcs.atomic_create_state yet. + * For the time being, let's only call atomic_create_state if the passed + * state is NULL. Otherwise, we will fall back to the previous behaviour. 
+ */ + if (!state) { + state = obj->funcs->atomic_create_state(obj); + if (IS_ERR(state)) + return PTR_ERR(state); + + obj->state = state; + } else { + obj->state = state; + state->obj = obj; + } return 0; } diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index 712f5fb977bf..0b1b32bcd2bd 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -261,6 +261,19 @@ struct drm_private_state; * drm_atomic_get_private_obj_state(). */ struct drm_private_state_funcs { + /** + * @atomic_create_state: + * + * Allocates a pristine, initialized, state for the private + * object and returns it. + * + * RETURNS: + * + * A new, pristine, private state instance or an error pointer + * on failure. + */ + struct drm_private_state *(*atomic_create_state)(struct drm_private_obj *obj); + /** * @atomic_duplicate_state: * -- cgit v1.2.3 From e7be39ed171662474d6d5c9a83d790ef7d244bcd Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 28 Jan 2026 13:43:47 +0100 Subject: drm/atomic-helper: Add private_obj atomic_create_state helper Now that we have an atomic_create_state callback for drm_private_objs, we can provide a helper for it. It's somewhat different from the other similar helpers though, because we definitely expect drm_private_obj to be subclassed. It wouldn't make sense for a driver to use it as-is. So we can't provide a straight implementation of the atomic_create_state callback, but rather we provide the parts that will deal with the drm_private_obj initialization, and we will leave the allocation and initialization of the subclass to drivers. 
Reviewed-by: Dmitry Baryshkov Reviewed-by: Thomas Zimmermann Link: https://patch.msgid.link/20260128-drm-private-obj-reset-v4-3-90891fa3d3b0@redhat.com Signed-off-by: Maxime Ripard --- drivers/gpu/drm/drm_atomic_state_helper.c | 22 ++++++++++++++++++++++ include/drm/drm_atomic_state_helper.h | 3 +++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c index cee6d8fc44ad..d21f32f0ad51 100644 --- a/drivers/gpu/drm/drm_atomic_state_helper.c +++ b/drivers/gpu/drm/drm_atomic_state_helper.c @@ -714,6 +714,28 @@ void drm_atomic_helper_connector_destroy_state(struct drm_connector *connector, } EXPORT_SYMBOL(drm_atomic_helper_connector_destroy_state); +/** + * __drm_atomic_helper_private_obj_create_state - initializes private object state + * @obj: private object + * @state: new state to initialize + * + * Initializes the newly allocated @state, usually required when + * initializing the drivers. + * + * @obj is assumed to be zeroed. + * + * This is useful for drivers that use private states. 
+ */ +void __drm_atomic_helper_private_obj_create_state(struct drm_private_obj *obj, + struct drm_private_state *state) +{ + if (state) + state->obj = obj; + + obj->state = state; +} +EXPORT_SYMBOL(__drm_atomic_helper_private_obj_create_state); + /** * __drm_atomic_helper_private_obj_duplicate_state - copy atomic private state * @obj: CRTC object diff --git a/include/drm/drm_atomic_state_helper.h b/include/drm/drm_atomic_state_helper.h index b9740edb2658..900672c6ea90 100644 --- a/include/drm/drm_atomic_state_helper.h +++ b/include/drm/drm_atomic_state_helper.h @@ -84,6 +84,9 @@ void __drm_atomic_helper_connector_destroy_state(struct drm_connector_state *state); void drm_atomic_helper_connector_destroy_state(struct drm_connector *connector, struct drm_connector_state *state); + +void __drm_atomic_helper_private_obj_create_state(struct drm_private_obj *obj, + struct drm_private_state *state); void __drm_atomic_helper_private_obj_duplicate_state(struct drm_private_obj *obj, struct drm_private_state *state); -- cgit v1.2.3 From be07d8f707e41cb694c4a56364978c30683a687d Mon Sep 17 00:00:00 2001 From: Shekhar Chauhan Date: Fri, 6 Feb 2026 15:36:08 -0300 Subject: drm/xe/nvlp: Add NVL-P platform definition Add platform definition along with device IDs for NVL-P. Here is the list of device descriptor fields and associated Bspec references: .dma_mask_size (Bspec 74198) .has_cached_pt (Bspec 71582) .has_display (Bspec 74196) .has_flat_ccs (Bspec 74110) .has_page_reclaim_hw_assist (Bspec 73451) .max_gt_per_tile (Bspec 74196) .va_bits (Bspec 74198) .vm_max_level (Bspec 59507) v2: - Add list of descriptor fields and Bspec references. 
(Matt) Signed-off-by: Shekhar Chauhan Reviewed-by: Matt Roper Link: https://patch.msgid.link/20260206-nvl-p-upstreaming-v3-12-636e1ad32688@intel.com Signed-off-by: Gustavo Sousa --- drivers/gpu/drm/xe/xe_bo.c | 4 ++-- drivers/gpu/drm/xe/xe_pci.c | 15 +++++++++++++++ drivers/gpu/drm/xe/xe_platform_types.h | 1 + include/drm/intel/pciids.h | 12 ++++++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index e9180b01a4e4..cb8a177ec02b 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -512,8 +512,8 @@ static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo, /* * Display scanout is always non-coherent with the CPU cache. * - * For Xe_LPG and beyond, PPGTT PTE lookups are also - * non-coherent and require a CPU:WC mapping. + * For Xe_LPG and beyond up to NVL-P (excluding), PPGTT PTE + * lookups are also non-coherent and require a CPU:WC mapping. */ if ((!bo->cpu_caching && bo->flags & XE_BO_FLAG_SCANOUT) || (!xe->info.has_cached_pt && bo->flags & XE_BO_FLAG_PAGETABLE)) diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 4abd64eccf27..3e1a87dd78e0 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -450,6 +450,20 @@ static const struct xe_device_desc cri_desc = { .vm_max_level = 4, }; +static const struct xe_device_desc nvlp_desc = { + PLATFORM(NOVALAKE_P), + .dma_mask_size = 46, + .has_cached_pt = true, + .has_display = true, + .has_flat_ccs = 1, + .has_page_reclaim_hw_assist = true, + .has_pre_prod_wa = true, + .max_gt_per_tile = 2, + .require_force_probe = true, + .va_bits = 48, + .vm_max_level = 4, +}; + #undef PLATFORM __diag_pop(); @@ -479,6 +493,7 @@ static const struct pci_device_id pciidlist[] = { INTEL_WCL_IDS(INTEL_VGA_DEVICE, &ptl_desc), INTEL_NVLS_IDS(INTEL_VGA_DEVICE, &nvls_desc), INTEL_CRI_IDS(INTEL_PCI_DEVICE, &cri_desc), + INTEL_NVLP_IDS(INTEL_VGA_DEVICE, &nvlp_desc), { 
} }; MODULE_DEVICE_TABLE(pci, pciidlist); diff --git a/drivers/gpu/drm/xe/xe_platform_types.h b/drivers/gpu/drm/xe/xe_platform_types.h index f516dbddfd88..6cff385227ea 100644 --- a/drivers/gpu/drm/xe/xe_platform_types.h +++ b/drivers/gpu/drm/xe/xe_platform_types.h @@ -26,6 +26,7 @@ enum xe_platform { XE_PANTHERLAKE, XE_NOVALAKE_S, XE_CRESCENTISLAND, + XE_NOVALAKE_P, }; enum xe_subplatform { diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h index 52520e684ab1..33b91cb2e684 100644 --- a/include/drm/intel/pciids.h +++ b/include/drm/intel/pciids.h @@ -900,4 +900,16 @@ #define INTEL_CRI_IDS(MACRO__, ...) \ MACRO__(0x674C, ## __VA_ARGS__) +/* NVL-P */ +#define INTEL_NVLP_IDS(MACRO__, ...) \ + MACRO__(0xD750, ## __VA_ARGS__), \ + MACRO__(0xD751, ## __VA_ARGS__), \ + MACRO__(0xD752, ## __VA_ARGS__), \ + MACRO__(0xD753, ## __VA_ARGS__), \ + MACRO__(0xD754, ## __VA_ARGS__), \ + MACRO__(0xD755, ## __VA_ARGS__), \ + MACRO__(0xD756, ## __VA_ARGS__), \ + MACRO__(0xD757, ## __VA_ARGS__), \ + MACRO__(0xD75F, ## __VA_ARGS__) + #endif /* __PCIIDS_H__ */ -- cgit v1.2.3 From a69d1ab971a624c6f112cea61536569d579c3215 Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Tue, 10 Feb 2026 12:56:53 +0100 Subject: mm: Fix a hmm_range_fault() livelock / starvation problem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If hmm_range_fault() fails a folio_trylock() in do_swap_page, trying to acquire the lock of a device-private folio for migration to ram, the function will spin until it succeeds grabbing the lock. However, if the process holding the lock is depending on a work item to be completed, which is scheduled on the same CPU as the spinning hmm_range_fault(), that work item might be starved and we end up in a livelock / starvation situation which is never resolved. 
This can happen, for example if the process holding the device-private folio lock is stuck in migrate_device_unmap()->lru_add_drain_all() since lru_add_drain_all() requires a short work-item to be run on all online cpus to complete. A prerequisite for this to happen is: a) Both zone device and system memory folios are considered in migrate_device_unmap(), so that there is a reason to call lru_add_drain_all() for a system memory folio while a folio lock is held on a zone device folio. b) The zone device folio has an initial mapcount > 1 which causes at least one migration PTE entry insertion to be deferred to try_to_migrate(), which can happen after the call to lru_add_drain_all(). c) No preemption, or voluntary preemption only. This all seems pretty unlikely to happen, but indeed is hit by the "xe_exec_system_allocator" igt test. Resolve this by waiting for the folio to be unlocked if the folio_trylock() fails in do_swap_page(). Rename migration_entry_wait_on_locked() to softleaf_entry_wait_on_locked() and update its documentation to indicate the new use-case. Future code improvements might consider moving the lru_add_drain_all() call in migrate_device_unmap() to be called *after* all pages have migration entries inserted. That would also eliminate b) above. v2: - Instead of a cond_resched() in hmm_range_fault(), eliminate the problem by waiting for the folio to be unlocked in do_swap_page() (Alistair Popple, Andrew Morton) v3: - Add a stub migration_entry_wait_on_locked() for the !CONFIG_MIGRATION case. (Kernel Test Robot) v4: - Rename migrate_entry_wait_on_locked() to softleaf_entry_wait_on_locked() and update docs (Alistair Popple) v5: - Add a WARN_ON_ONCE() for the !CONFIG_MIGRATION version of softleaf_entry_wait_on_locked(). 
- Modify wording around function names in the commit message (Andrew Morton) Suggested-by: Alistair Popple Fixes: 1afaeb8293c9 ("mm/migrate: Trylock device page in do_swap_page") Cc: Ralph Campbell Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Andrew Morton Cc: Matthew Brost Cc: John Hubbard Cc: Alistair Popple Cc: linux-mm@kvack.org Cc: Signed-off-by: Thomas Hellström Cc: # v6.15+ Reviewed-by: John Hubbard #v3 Reviewed-by: Alistair Popple Link: https://patch.msgid.link/20260210115653.92413-1-thomas.hellstrom@linux.intel.com --- include/linux/migrate.h | 10 +++++++++- mm/filemap.c | 15 ++++++++++----- mm/memory.c | 3 ++- mm/migrate.c | 8 ++++---- mm/migrate_device.c | 2 +- 5 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 26ca00c325d9..d5af2b7f577b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) +void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, @@ -97,6 +97,14 @@ static inline int set_movable_ops(const struct movable_operations *ops, enum pag return -ENOSYS; } +static inline void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) + __releases(ptl) +{ + WARN_ON_ONCE(1); + + spin_unlock(ptl); +} + #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/filemap.c b/mm/filemap.c index ebd75684cb0a..d98e4883f13d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1379,14 +1379,16 @@ repeat: #ifdef CONFIG_MIGRATION /** - * migration_entry_wait_on_locked - Wait for a 
migration entry to be removed - @entry: migration swap entry. + * softleaf_entry_wait_on_locked - Wait for a migration entry or + * device_private entry to be removed. + * @entry: migration or device_private swap entry. * @ptl: already locked ptl. This function will drop the lock. * - * Wait for a migration entry referencing the given page to be removed. This is + * Wait for a migration entry referencing the given page, or device_private + * entry referencing a device_private page to be unlocked. This is * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this - * should be called while holding the ptl for the migration entry referencing + * should be called while holding the ptl for @entry referencing * the page. * * Returns after unlocking the ptl. @@ -1394,7 +1396,7 @@ repeat: * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ -void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) +void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; @@ -1428,6 +1430,9 @@ void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) * If a migration entry exists for the page the migration path must hold * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. + * Similarly any path attempting to drop the last reference to a + * device-private page needs to grab the ptl to remove the device-private + * entry. 
*/ spin_unlock(ptl); diff --git a/mm/memory.c b/mm/memory.c index 2a55edc48a65..0ad50df25846 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4681,7 +4681,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) unlock_page(vmf->page); put_page(vmf->page); } else { - pte_unmap_unlock(vmf->pte, vmf->ptl); + pte_unmap(vmf->pte); + softleaf_entry_wait_on_locked(entry, vmf->ptl); } } else if (softleaf_is_hwpoison(entry)) { ret = VM_FAULT_HWPOISON; diff --git a/mm/migrate.c b/mm/migrate.c index 5169f9717f60..75e384b042ef 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -499,7 +499,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, if (!softleaf_is_migration(entry)) goto out; - migration_entry_wait_on_locked(entry, ptl); + softleaf_entry_wait_on_locked(entry, ptl); return; out: spin_unlock(ptl); @@ -531,10 +531,10 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p * If migration entry existed, safe to release vma lock * here because the pgtable page won't be freed without the * pgtable lock released. See comment right above pgtable - * lock release in migration_entry_wait_on_locked(). + * lock release in softleaf_entry_wait_on_locked(). 
*/ hugetlb_vma_unlock_read(vma); - migration_entry_wait_on_locked(entry, ptl); + softleaf_entry_wait_on_locked(entry, ptl); return; } @@ -552,7 +552,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) ptl = pmd_lock(mm, pmd); if (!pmd_is_migration_entry(*pmd)) goto unlock; - migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); + softleaf_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); return; unlock: spin_unlock(ptl); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 23379663b1e1..deab89fd4541 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -176,7 +176,7 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, } if (softleaf_is_migration(entry)) { - migration_entry_wait_on_locked(entry, ptl); + softleaf_entry_wait_on_locked(entry, ptl); spin_unlock(ptl); return -EAGAIN; } -- cgit v1.2.3 From 15e86b3abdc20f62e0ab8fed0030ff39e7320587 Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Thu, 5 Feb 2026 15:13:26 +0530 Subject: drm/{i915, xe}: Extract pcode definitions to common header There are certain register definitions which are commonly shared by i915, xe and display. Extract the same to a common header to avoid duplication. Move GEN6_PCODE_MAILBOX to common pcode header to make intel_cdclk.c free from including i915_reg.h. 
v3: Include pcode header as required, instead in i915_reg.h (Jani) v2: Make the header granular and per feature (Jani) Signed-off-by: Uma Shankar Reviewed-by: Jani Nikula Link: https://patch.msgid.link/20260205094341.1882816-6-uma.shankar@intel.com --- drivers/gpu/drm/i915/display/hsw_ips.c | 1 + drivers/gpu/drm/i915/display/intel_bw.c | 1 + drivers/gpu/drm/i915/display/intel_cdclk.c | 2 +- drivers/gpu/drm/i915/display/intel_display_power.c | 1 + .../drm/i915/display/intel_display_power_well.c | 1 + drivers/gpu/drm/i915/display/intel_dram.c | 1 + drivers/gpu/drm/i915/display/intel_hdcp.c | 1 + drivers/gpu/drm/i915/display/skl_watermark.c | 1 + drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c | 1 + drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c | 2 + drivers/gpu/drm/i915/gt/intel_llc.c | 2 + drivers/gpu/drm/i915/gt/intel_rc6.c | 1 + drivers/gpu/drm/i915/gt/intel_rps.c | 1 + drivers/gpu/drm/i915/gvt/handlers.c | 1 + drivers/gpu/drm/i915/i915_driver.c | 1 + drivers/gpu/drm/i915/i915_hwmon.c | 2 + drivers/gpu/drm/i915/i915_reg.h | 100 ------------------- drivers/gpu/drm/i915/intel_gvt_mmio_table.c | 2 + drivers/gpu/drm/i915/intel_pcode.c | 1 + include/drm/intel/intel_pcode_regs.h | 108 +++++++++++++++++++++ 20 files changed, 130 insertions(+), 101 deletions(-) create mode 100644 include/drm/intel/intel_pcode_regs.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/hsw_ips.c b/drivers/gpu/drm/i915/display/hsw_ips.c index 0caaea2e64e1..8658872ed86f 100644 --- a/drivers/gpu/drm/i915/display/hsw_ips.c +++ b/drivers/gpu/drm/i915/display/hsw_ips.c @@ -6,6 +6,7 @@ #include #include +#include #include "hsw_ips.h" #include "i915_reg.h" diff --git a/drivers/gpu/drm/i915/display/intel_bw.c b/drivers/gpu/drm/i915/display/intel_bw.c index 8d84445c69f1..618da1dfb671 100644 --- a/drivers/gpu/drm/i915/display/intel_bw.c +++ b/drivers/gpu/drm/i915/display/intel_bw.c @@ -5,6 +5,7 @@ #include #include +#include #include "i915_reg.h" #include "intel_bw.h" diff --git 
a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index 9217050a76e0..29d90d612bb2 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -27,9 +27,9 @@ #include #include +#include #include "hsw_ips.h" -#include "i915_reg.h" #include "intel_atomic.h" #include "intel_audio.h" #include "intel_cdclk.h" diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c index 06adf6afbec0..cb9256f72aa9 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power.c +++ b/drivers/gpu/drm/i915/display/intel_display_power.c @@ -7,6 +7,7 @@ #include #include +#include #include "i915_reg.h" #include "intel_backlight_regs.h" diff --git a/drivers/gpu/drm/i915/display/intel_display_power_well.c b/drivers/gpu/drm/i915/display/intel_display_power_well.c index 78f707b00550..45c4313e6900 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power_well.c +++ b/drivers/gpu/drm/i915/display/intel_display_power_well.c @@ -6,6 +6,7 @@ #include #include +#include #include "i915_reg.h" #include "intel_backlight_regs.h" diff --git a/drivers/gpu/drm/i915/display/intel_dram.c b/drivers/gpu/drm/i915/display/intel_dram.c index 3b9879714ea9..61aefe77f90f 100644 --- a/drivers/gpu/drm/i915/display/intel_dram.c +++ b/drivers/gpu/drm/i915/display/intel_dram.c @@ -7,6 +7,7 @@ #include #include +#include #include "i915_reg.h" #include "intel_display_core.h" diff --git a/drivers/gpu/drm/i915/display/intel_hdcp.c b/drivers/gpu/drm/i915/display/intel_hdcp.c index b7479ced7871..c96f51d88186 100644 --- a/drivers/gpu/drm/i915/display/intel_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_hdcp.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "i915_reg.h" #include "intel_connector.h" diff --git a/drivers/gpu/drm/i915/display/skl_watermark.c b/drivers/gpu/drm/i915/display/skl_watermark.c index b41da10f0f85..1455ea068d22 100644 --- 
a/drivers/gpu/drm/i915/display/skl_watermark.c +++ b/drivers/gpu/drm/i915/display/skl_watermark.c @@ -7,6 +7,7 @@ #include #include +#include #include "i915_reg.h" #include "i9xx_wm.h" diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c index 96411f357f5d..1b9cb70fc641 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c @@ -8,6 +8,7 @@ #include #include +#include #include "i915_drv.h" #include "i915_reg.h" diff --git a/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c index 1154cd2b7c34..a48601395dce 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c @@ -7,6 +7,8 @@ #include #include +#include + #include "i915_drv.h" #include "i915_reg.h" #include "i915_sysfs.h" diff --git a/drivers/gpu/drm/i915/gt/intel_llc.c b/drivers/gpu/drm/i915/gt/intel_llc.c index 1d19c073ba2e..bcd707e3d436 100644 --- a/drivers/gpu/drm/i915/gt/intel_llc.c +++ b/drivers/gpu/drm/i915/gt/intel_llc.c @@ -6,6 +6,8 @@ #include #include +#include + #include "i915_drv.h" #include "i915_reg.h" #include "intel_gt.h" diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index 286d49ecc449..942ac1ebecee 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -7,6 +7,7 @@ #include #include +#include #include "display/vlv_clock.h" #include "gem/i915_gem_region.h" diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index 90b7eee78f1f..844f2716a386 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -7,6 +7,7 @@ #include #include +#include #include "display/intel_display_rps.h" #include "display/vlv_clock.h" diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 6f860c320afc..2e9d9d0638ae 100644 --- 
a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -40,6 +40,7 @@ #include #include +#include #include "display/bxt_dpio_phy_regs.h" #include "display/i9xx_plane_regs.h" diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index c01a35ecfa2f..6d8fbf845bc2 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "display/i9xx_display_sr.h" #include "display/intel_bw.h" diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c index 7dfe1784153f..a94f26e3b6bf 100644 --- a/drivers/gpu/drm/i915/i915_hwmon.c +++ b/drivers/gpu/drm/i915/i915_hwmon.c @@ -9,6 +9,8 @@ #include #include +#include + #include "i915_drv.h" #include "i915_hwmon.h" #include "i915_reg.h" diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 26e5504dbc67..bb87af7d3c22 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -957,106 +957,6 @@ #define EDRAM_WAYS_IDX(cap) (((cap) >> 5) & 0x7) #define EDRAM_SETS_IDX(cap) (((cap) >> 8) & 0x3) -#define GEN6_PCODE_MAILBOX _MMIO(0x138124) -#define GEN6_PCODE_READY (1 << 31) -#define GEN6_PCODE_MB_PARAM2 REG_GENMASK(23, 16) -#define GEN6_PCODE_MB_PARAM1 REG_GENMASK(15, 8) -#define GEN6_PCODE_MB_COMMAND REG_GENMASK(7, 0) -#define GEN6_PCODE_ERROR_MASK 0xFF -#define GEN6_PCODE_SUCCESS 0x0 -#define GEN6_PCODE_ILLEGAL_CMD 0x1 -#define GEN6_PCODE_MIN_FREQ_TABLE_GT_RATIO_OUT_OF_RANGE 0x2 -#define GEN6_PCODE_TIMEOUT 0x3 -#define GEN6_PCODE_UNIMPLEMENTED_CMD 0xFF -#define GEN7_PCODE_TIMEOUT 0x2 -#define GEN7_PCODE_ILLEGAL_DATA 0x3 -#define GEN11_PCODE_ILLEGAL_SUBCOMMAND 0x4 -#define GEN11_PCODE_LOCKED 0x6 -#define GEN11_PCODE_REJECTED 0x11 -#define GEN7_PCODE_MIN_FREQ_TABLE_GT_RATIO_OUT_OF_RANGE 0x10 -#define GEN6_PCODE_WRITE_RC6VIDS 0x4 -#define GEN6_PCODE_READ_RC6VIDS 0x5 -#define GEN6_ENCODE_RC6_VID(mv) (((mv) - 245) / 5) 
-#define GEN6_DECODE_RC6_VID(vids) (((vids) * 5) + 245) -#define BDW_PCODE_DISPLAY_FREQ_CHANGE_REQ 0x18 -#define GEN9_PCODE_READ_MEM_LATENCY 0x6 -#define GEN9_MEM_LATENCY_LEVEL_3_7_MASK REG_GENMASK(31, 24) -#define GEN9_MEM_LATENCY_LEVEL_2_6_MASK REG_GENMASK(23, 16) -#define GEN9_MEM_LATENCY_LEVEL_1_5_MASK REG_GENMASK(15, 8) -#define GEN9_MEM_LATENCY_LEVEL_0_4_MASK REG_GENMASK(7, 0) -#define SKL_PCODE_LOAD_HDCP_KEYS 0x5 -#define SKL_PCODE_CDCLK_CONTROL 0x7 -#define SKL_CDCLK_PREPARE_FOR_CHANGE 0x3 -#define SKL_CDCLK_READY_FOR_CHANGE 0x1 -#define GEN6_PCODE_WRITE_MIN_FREQ_TABLE 0x8 -#define GEN6_PCODE_READ_MIN_FREQ_TABLE 0x9 -#define GEN6_READ_OC_PARAMS 0xc -#define ICL_PCODE_MEM_SUBSYSYSTEM_INFO 0xd -#define ICL_PCODE_MEM_SS_READ_GLOBAL_INFO (0x0 << 8) -#define ICL_PCODE_MEM_SS_READ_QGV_POINT_INFO(point) (((point) << 16) | (0x1 << 8)) -#define ADL_PCODE_MEM_SS_READ_PSF_GV_INFO ((0) | (0x2 << 8)) -#define DISPLAY_TO_PCODE_CDCLK_MAX 0x28D -#define DISPLAY_TO_PCODE_VOLTAGE_MASK REG_GENMASK(1, 0) -#define DISPLAY_TO_PCODE_VOLTAGE_MAX DISPLAY_TO_PCODE_VOLTAGE_MASK -#define DISPLAY_TO_PCODE_CDCLK_VALID REG_BIT(27) -#define DISPLAY_TO_PCODE_PIPE_COUNT_VALID REG_BIT(31) -#define DISPLAY_TO_PCODE_CDCLK_MASK REG_GENMASK(25, 16) -#define DISPLAY_TO_PCODE_PIPE_COUNT_MASK REG_GENMASK(30, 28) -#define DISPLAY_TO_PCODE_CDCLK(x) REG_FIELD_PREP(DISPLAY_TO_PCODE_CDCLK_MASK, (x)) -#define DISPLAY_TO_PCODE_PIPE_COUNT(x) REG_FIELD_PREP(DISPLAY_TO_PCODE_PIPE_COUNT_MASK, (x)) -#define DISPLAY_TO_PCODE_VOLTAGE(x) REG_FIELD_PREP(DISPLAY_TO_PCODE_VOLTAGE_MASK, (x)) -#define DISPLAY_TO_PCODE_UPDATE_MASK(cdclk, num_pipes, voltage_level) \ - ((DISPLAY_TO_PCODE_CDCLK(cdclk)) | \ - (DISPLAY_TO_PCODE_PIPE_COUNT(num_pipes)) | \ - (DISPLAY_TO_PCODE_VOLTAGE(voltage_level))) -#define ICL_PCODE_SAGV_DE_MEM_SS_CONFIG 0xe -#define ICL_PCODE_REP_QGV_MASK REG_GENMASK(1, 0) -#define ICL_PCODE_REP_QGV_SAFE REG_FIELD_PREP(ICL_PCODE_REP_QGV_MASK, 0) -#define ICL_PCODE_REP_QGV_POLL 
REG_FIELD_PREP(ICL_PCODE_REP_QGV_MASK, 1) -#define ICL_PCODE_REP_QGV_REJECTED REG_FIELD_PREP(ICL_PCODE_REP_QGV_MASK, 2) -#define ADLS_PCODE_REP_PSF_MASK REG_GENMASK(3, 2) -#define ADLS_PCODE_REP_PSF_SAFE REG_FIELD_PREP(ADLS_PCODE_REP_PSF_MASK, 0) -#define ADLS_PCODE_REP_PSF_POLL REG_FIELD_PREP(ADLS_PCODE_REP_PSF_MASK, 1) -#define ADLS_PCODE_REP_PSF_REJECTED REG_FIELD_PREP(ADLS_PCODE_REP_PSF_MASK, 2) -#define ICL_PCODE_REQ_QGV_PT_MASK REG_GENMASK(7, 0) -#define ICL_PCODE_REQ_QGV_PT(x) REG_FIELD_PREP(ICL_PCODE_REQ_QGV_PT_MASK, (x)) -#define ADLS_PCODE_REQ_PSF_PT_MASK REG_GENMASK(10, 8) -#define ADLS_PCODE_REQ_PSF_PT(x) REG_FIELD_PREP(ADLS_PCODE_REQ_PSF_PT_MASK, (x)) -#define GEN6_PCODE_READ_D_COMP 0x10 -#define GEN6_PCODE_WRITE_D_COMP 0x11 -#define ICL_PCODE_EXIT_TCCOLD 0x12 -#define HSW_PCODE_DE_WRITE_FREQ_REQ 0x17 -#define DISPLAY_IPS_CONTROL 0x19 -#define TGL_PCODE_TCCOLD 0x26 -#define TGL_PCODE_EXIT_TCCOLD_DATA_L_EXIT_FAILED REG_BIT(0) -#define TGL_PCODE_EXIT_TCCOLD_DATA_L_BLOCK_REQ 0 -#define TGL_PCODE_EXIT_TCCOLD_DATA_L_UNBLOCK_REQ REG_BIT(0) - /* See also IPS_CTL */ -#define IPS_PCODE_CONTROL (1 << 30) -#define HSW_PCODE_DYNAMIC_DUTY_CYCLE_CONTROL 0x1A -#define GEN9_PCODE_SAGV_CONTROL 0x21 -#define GEN9_SAGV_DISABLE 0x0 -#define GEN9_SAGV_IS_DISABLED 0x1 -#define GEN9_SAGV_ENABLE 0x3 -#define DG1_PCODE_STATUS 0x7E -#define DG1_UNCORE_GET_INIT_STATUS 0x0 -#define DG1_UNCORE_INIT_STATUS_COMPLETE 0x1 -#define PCODE_POWER_SETUP 0x7C -#define POWER_SETUP_SUBCOMMAND_READ_I1 0x4 -#define POWER_SETUP_SUBCOMMAND_WRITE_I1 0x5 -#define POWER_SETUP_I1_WATTS REG_BIT(31) -#define POWER_SETUP_I1_SHIFT 6 /* 10.6 fixed point format */ -#define POWER_SETUP_I1_DATA_MASK REG_GENMASK(15, 0) -#define POWER_SETUP_SUBCOMMAND_G8_ENABLE 0x6 -#define GEN12_PCODE_READ_SAGV_BLOCK_TIME_US 0x23 -#define XEHP_PCODE_FREQUENCY_CONFIG 0x6e /* pvc */ -/* XEHP_PCODE_FREQUENCY_CONFIG sub-commands (param1) */ -#define PCODE_MBOX_FC_SC_READ_FUSED_P0 0x0 -#define PCODE_MBOX_FC_SC_READ_FUSED_PN 0x1 
-/* PCODE_MBOX_DOMAIN_* - mailbox domain IDs */ -/* XEHP_PCODE_FREQUENCY_CONFIG param2 */ -#define PCODE_MBOX_DOMAIN_NONE 0x0 -#define PCODE_MBOX_DOMAIN_MEDIAFF 0x3 #define GEN6_PCODE_DATA _MMIO(0x138128) #define GEN6_PCODE_FREQ_IA_RATIO_SHIFT 8 #define GEN6_PCODE_FREQ_RING_RATIO_SHIFT 16 diff --git a/drivers/gpu/drm/i915/intel_gvt_mmio_table.c b/drivers/gpu/drm/i915/intel_gvt_mmio_table.c index c0154fd77fc9..8cfe9b56f1d0 100644 --- a/drivers/gpu/drm/i915/intel_gvt_mmio_table.c +++ b/drivers/gpu/drm/i915/intel_gvt_mmio_table.c @@ -3,6 +3,8 @@ * Copyright © 2020 Intel Corporation */ +#include + #include "display/bxt_dpio_phy_regs.h" #include "display/i9xx_plane_regs.h" #include "display/i9xx_wm_regs.h" diff --git a/drivers/gpu/drm/i915/intel_pcode.c b/drivers/gpu/drm/i915/intel_pcode.c index 76c5916b28f4..c07d48fc1b35 100644 --- a/drivers/gpu/drm/i915/intel_pcode.c +++ b/drivers/gpu/drm/i915/intel_pcode.c @@ -5,6 +5,7 @@ #include #include +#include #include "i915_drv.h" #include "i915_reg.h" diff --git a/include/drm/intel/intel_pcode_regs.h b/include/drm/intel/intel_pcode_regs.h new file mode 100644 index 000000000000..db989ee7c488 --- /dev/null +++ b/include/drm/intel/intel_pcode_regs.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef _INTEL_PCODE_REGS_H_ +#define _INTEL_PCODE_REGS_H_ + +#define GEN6_PCODE_MAILBOX _MMIO(0x138124) +#define GEN6_PCODE_READY (1 << 31) +#define GEN6_PCODE_MB_PARAM2 REG_GENMASK(23, 16) +#define GEN6_PCODE_MB_PARAM1 REG_GENMASK(15, 8) +#define GEN6_PCODE_MB_COMMAND REG_GENMASK(7, 0) +#define GEN6_PCODE_ERROR_MASK 0xFF +#define GEN6_PCODE_SUCCESS 0x0 +#define GEN6_PCODE_ILLEGAL_CMD 0x1 +#define GEN6_PCODE_MIN_FREQ_TABLE_GT_RATIO_OUT_OF_RANGE 0x2 +#define GEN6_PCODE_TIMEOUT 0x3 +#define GEN6_PCODE_UNIMPLEMENTED_CMD 0xFF +#define GEN7_PCODE_TIMEOUT 0x2 +#define GEN7_PCODE_ILLEGAL_DATA 0x3 +#define GEN11_PCODE_ILLEGAL_SUBCOMMAND 0x4 +#define GEN11_PCODE_LOCKED 0x6 +#define 
GEN11_PCODE_REJECTED 0x11 +#define GEN7_PCODE_MIN_FREQ_TABLE_GT_RATIO_OUT_OF_RANGE 0x10 +#define GEN6_PCODE_WRITE_RC6VIDS 0x4 +#define GEN6_PCODE_READ_RC6VIDS 0x5 +#define GEN6_ENCODE_RC6_VID(mv) (((mv) - 245) / 5) +#define GEN6_DECODE_RC6_VID(vids) (((vids) * 5) + 245) +#define BDW_PCODE_DISPLAY_FREQ_CHANGE_REQ 0x18 +#define GEN9_PCODE_READ_MEM_LATENCY 0x6 +#define GEN9_MEM_LATENCY_LEVEL_3_7_MASK REG_GENMASK(31, 24) +#define GEN9_MEM_LATENCY_LEVEL_2_6_MASK REG_GENMASK(23, 16) +#define GEN9_MEM_LATENCY_LEVEL_1_5_MASK REG_GENMASK(15, 8) +#define GEN9_MEM_LATENCY_LEVEL_0_4_MASK REG_GENMASK(7, 0) +#define SKL_PCODE_LOAD_HDCP_KEYS 0x5 +#define SKL_PCODE_CDCLK_CONTROL 0x7 +#define SKL_CDCLK_PREPARE_FOR_CHANGE 0x3 +#define SKL_CDCLK_READY_FOR_CHANGE 0x1 +#define GEN6_PCODE_WRITE_MIN_FREQ_TABLE 0x8 +#define GEN6_PCODE_READ_MIN_FREQ_TABLE 0x9 +#define GEN6_READ_OC_PARAMS 0xc +#define ICL_PCODE_MEM_SUBSYSYSTEM_INFO 0xd +#define ICL_PCODE_MEM_SS_READ_GLOBAL_INFO (0x0 << 8) +#define ICL_PCODE_MEM_SS_READ_QGV_POINT_INFO(point) (((point) << 16) | (0x1 << 8)) +#define ADL_PCODE_MEM_SS_READ_PSF_GV_INFO ((0) | (0x2 << 8)) +#define DISPLAY_TO_PCODE_CDCLK_MAX 0x28D +#define DISPLAY_TO_PCODE_VOLTAGE_MASK REG_GENMASK(1, 0) +#define DISPLAY_TO_PCODE_VOLTAGE_MAX DISPLAY_TO_PCODE_VOLTAGE_MASK +#define DISPLAY_TO_PCODE_CDCLK_VALID REG_BIT(27) +#define DISPLAY_TO_PCODE_PIPE_COUNT_VALID REG_BIT(31) +#define DISPLAY_TO_PCODE_CDCLK_MASK REG_GENMASK(25, 16) +#define DISPLAY_TO_PCODE_PIPE_COUNT_MASK REG_GENMASK(30, 28) +#define DISPLAY_TO_PCODE_CDCLK(x) REG_FIELD_PREP(DISPLAY_TO_PCODE_CDCLK_MASK, (x)) +#define DISPLAY_TO_PCODE_PIPE_COUNT(x) REG_FIELD_PREP(DISPLAY_TO_PCODE_PIPE_COUNT_MASK, (x)) +#define DISPLAY_TO_PCODE_VOLTAGE(x) REG_FIELD_PREP(DISPLAY_TO_PCODE_VOLTAGE_MASK, (x)) +#define DISPLAY_TO_PCODE_UPDATE_MASK(cdclk, num_pipes, voltage_level) \ + ((DISPLAY_TO_PCODE_CDCLK(cdclk)) | \ + (DISPLAY_TO_PCODE_PIPE_COUNT(num_pipes)) | \ + (DISPLAY_TO_PCODE_VOLTAGE(voltage_level))) +#define 
ICL_PCODE_SAGV_DE_MEM_SS_CONFIG 0xe +#define ICL_PCODE_REP_QGV_MASK REG_GENMASK(1, 0) +#define ICL_PCODE_REP_QGV_SAFE REG_FIELD_PREP(ICL_PCODE_REP_QGV_MASK, 0) +#define ICL_PCODE_REP_QGV_POLL REG_FIELD_PREP(ICL_PCODE_REP_QGV_MASK, 1) +#define ICL_PCODE_REP_QGV_REJECTED REG_FIELD_PREP(ICL_PCODE_REP_QGV_MASK, 2) +#define ADLS_PCODE_REP_PSF_MASK REG_GENMASK(3, 2) +#define ADLS_PCODE_REP_PSF_SAFE REG_FIELD_PREP(ADLS_PCODE_REP_PSF_MASK, 0) +#define ADLS_PCODE_REP_PSF_POLL REG_FIELD_PREP(ADLS_PCODE_REP_PSF_MASK, 1) +#define ADLS_PCODE_REP_PSF_REJECTED REG_FIELD_PREP(ADLS_PCODE_REP_PSF_MASK, 2) +#define ICL_PCODE_REQ_QGV_PT_MASK REG_GENMASK(7, 0) +#define ICL_PCODE_REQ_QGV_PT(x) REG_FIELD_PREP(ICL_PCODE_REQ_QGV_PT_MASK, (x)) +#define ADLS_PCODE_REQ_PSF_PT_MASK REG_GENMASK(10, 8) +#define ADLS_PCODE_REQ_PSF_PT(x) REG_FIELD_PREP(ADLS_PCODE_REQ_PSF_PT_MASK, (x)) +#define GEN6_PCODE_READ_D_COMP 0x10 +#define GEN6_PCODE_WRITE_D_COMP 0x11 +#define ICL_PCODE_EXIT_TCCOLD 0x12 +#define HSW_PCODE_DE_WRITE_FREQ_REQ 0x17 +#define DISPLAY_IPS_CONTROL 0x19 +#define TGL_PCODE_TCCOLD 0x26 +#define TGL_PCODE_EXIT_TCCOLD_DATA_L_EXIT_FAILED REG_BIT(0) +#define TGL_PCODE_EXIT_TCCOLD_DATA_L_BLOCK_REQ 0 +#define TGL_PCODE_EXIT_TCCOLD_DATA_L_UNBLOCK_REQ REG_BIT(0) +/* See also IPS_CTL */ +#define IPS_PCODE_CONTROL (1 << 30) +#define HSW_PCODE_DYNAMIC_DUTY_CYCLE_CONTROL 0x1A +#define GEN9_PCODE_SAGV_CONTROL 0x21 +#define GEN9_SAGV_DISABLE 0x0 +#define GEN9_SAGV_IS_DISABLED 0x1 +#define GEN9_SAGV_ENABLE 0x3 +#define DG1_PCODE_STATUS 0x7E +#define DG1_UNCORE_GET_INIT_STATUS 0x0 +#define DG1_UNCORE_INIT_STATUS_COMPLETE 0x1 +#define PCODE_POWER_SETUP 0x7C +#define POWER_SETUP_SUBCOMMAND_READ_I1 0x4 +#define POWER_SETUP_SUBCOMMAND_WRITE_I1 0x5 +#define POWER_SETUP_I1_WATTS REG_BIT(31) +#define POWER_SETUP_I1_SHIFT 6 /* 10.6 fixed point format */ +#define POWER_SETUP_I1_DATA_MASK REG_GENMASK(15, 0) +#define POWER_SETUP_SUBCOMMAND_G8_ENABLE 0x6 +#define GEN12_PCODE_READ_SAGV_BLOCK_TIME_US 0x23 +#define 
XEHP_PCODE_FREQUENCY_CONFIG 0x6e /* pvc */ +/* XEHP_PCODE_FREQUENCY_CONFIG sub-commands (param1) */ +#define PCODE_MBOX_FC_SC_READ_FUSED_P0 0x0 +#define PCODE_MBOX_FC_SC_READ_FUSED_PN 0x1 +/* PCODE_MBOX_DOMAIN_* - mailbox domain IDs */ +/* XEHP_PCODE_FREQUENCY_CONFIG param2 */ +#define PCODE_MBOX_DOMAIN_NONE 0x0 +#define PCODE_MBOX_DOMAIN_MEDIAFF 0x3 + +#endif -- cgit v1.2.3 From 85bba73b29f4ae923ad508a4ce8a87c1c1ec518f Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Thu, 5 Feb 2026 15:13:31 +0530 Subject: drm/i915: Remove i915_reg.h from intel_overlay.c Move GEN2_ISR and some interrupt definitions to common header. This removes dependency of i915_reg.h from intel_overlay.c. v3: Rename interrupt header with regs suffix (Jani) v2: Create a separate file for common interrupts (Jani) Reviewed-by: Jani Nikula Signed-off-by: Uma Shankar Link: https://patch.msgid.link/20260205094341.1882816-11-uma.shankar@intel.com --- drivers/gpu/drm/i915/display/intel_display_irq.c | 1 + drivers/gpu/drm/i915/display/intel_display_regs.h | 2 ++ drivers/gpu/drm/i915/display/intel_overlay.c | 2 +- drivers/gpu/drm/i915/gt/intel_gt.c | 1 + drivers/gpu/drm/i915/gt/intel_ring_submission.c | 1 + drivers/gpu/drm/i915/i915_irq.c | 1 + drivers/gpu/drm/i915/i915_reg.h | 37 ------------------- include/drm/intel/intel_gmd_interrupt_regs.h | 43 +++++++++++++++++++++++ 8 files changed, 50 insertions(+), 38 deletions(-) create mode 100644 include/drm/intel/intel_gmd_interrupt_regs.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_display_irq.c b/drivers/gpu/drm/i915/display/intel_display_irq.c index 0a71840041de..432a9c895c39 100644 --- a/drivers/gpu/drm/i915/display/intel_display_irq.c +++ b/drivers/gpu/drm/i915/display/intel_display_irq.c @@ -5,6 +5,7 @@ #include #include +#include #include "i915_reg.h" #include "icl_dsi_regs.h" diff --git a/drivers/gpu/drm/i915/display/intel_display_regs.h b/drivers/gpu/drm/i915/display/intel_display_regs.h index 0ee7295e1d4e..d03f554ecd7e 
100644 --- a/drivers/gpu/drm/i915/display/intel_display_regs.h +++ b/drivers/gpu/drm/i915/display/intel_display_regs.h @@ -94,6 +94,8 @@ #define VLV_ERROR_PAGE_TABLE (1 << 4) #define VLV_ERROR_CLAIM (1 << 0) +#define GEN2_ISR _MMIO(0x20ac) + #define VLV_ERROR_REGS I915_ERROR_REGS(VLV_EMR, VLV_EIR) #define _MBUS_ABOX0_CTL 0x45038 diff --git a/drivers/gpu/drm/i915/display/intel_overlay.c b/drivers/gpu/drm/i915/display/intel_overlay.c index ae2a3527645f..e7838f4d2dac 100644 --- a/drivers/gpu/drm/i915/display/intel_overlay.c +++ b/drivers/gpu/drm/i915/display/intel_overlay.c @@ -28,6 +28,7 @@ #include #include +#include #include "gem/i915_gem_internal.h" #include "gem/i915_gem_object_frontbuffer.h" @@ -37,7 +38,6 @@ #include "gt/intel_ring.h" #include "i915_drv.h" -#include "i915_reg.h" #include "intel_color_regs.h" #include "intel_de.h" #include "intel_display_regs.h" diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index ac527d878820..d76121e117e1 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -5,6 +5,7 @@ #include #include +#include #include "gem/i915_gem_internal.h" #include "gem/i915_gem_lmem.h" diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c index 8314a4b0505e..c1797e49811d 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -4,6 +4,7 @@ */ #include +#include #include "gem/i915_gem_internal.h" diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 3fe978d4ea53..d4d8dd0a4174 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "display/intel_display_irq.h" #include "display/intel_hotplug.h" diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 2be799ffbc2b..1be8426b6a91 100644 --- 
a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -364,7 +364,6 @@ #define GEN2_IER _MMIO(0x20a0) #define GEN2_IIR _MMIO(0x20a4) #define GEN2_IMR _MMIO(0x20a8) -#define GEN2_ISR _MMIO(0x20ac) #define GEN2_IRQ_REGS I915_IRQ_REGS(GEN2_IMR, \ GEN2_IER, \ @@ -521,42 +520,6 @@ /* These are all the "old" interrupts */ #define ILK_BSD_USER_INTERRUPT (1 << 5) -#define I915_PM_INTERRUPT (1 << 31) -#define I915_ISP_INTERRUPT (1 << 22) -#define I915_LPE_PIPE_B_INTERRUPT (1 << 21) -#define I915_LPE_PIPE_A_INTERRUPT (1 << 20) -#define I915_MIPIC_INTERRUPT (1 << 19) -#define I915_MIPIA_INTERRUPT (1 << 18) -#define I915_PIPE_CONTROL_NOTIFY_INTERRUPT (1 << 18) -#define I915_DISPLAY_PORT_INTERRUPT (1 << 17) -#define I915_DISPLAY_PIPE_C_HBLANK_INTERRUPT (1 << 16) -#define I915_MASTER_ERROR_INTERRUPT (1 << 15) -#define I915_DISPLAY_PIPE_B_HBLANK_INTERRUPT (1 << 14) -#define I915_GMCH_THERMAL_SENSOR_EVENT_INTERRUPT (1 << 14) /* p-state */ -#define I915_DISPLAY_PIPE_A_HBLANK_INTERRUPT (1 << 13) -#define I915_HWB_OOM_INTERRUPT (1 << 13) -#define I915_LPE_PIPE_C_INTERRUPT (1 << 12) -#define I915_SYNC_STATUS_INTERRUPT (1 << 12) -#define I915_MISC_INTERRUPT (1 << 11) -#define I915_DISPLAY_PLANE_A_FLIP_PENDING_INTERRUPT (1 << 11) -#define I915_DISPLAY_PIPE_C_VBLANK_INTERRUPT (1 << 10) -#define I915_DISPLAY_PLANE_B_FLIP_PENDING_INTERRUPT (1 << 10) -#define I915_DISPLAY_PIPE_C_EVENT_INTERRUPT (1 << 9) -#define I915_OVERLAY_PLANE_FLIP_PENDING_INTERRUPT (1 << 9) -#define I915_DISPLAY_PIPE_C_DPBM_INTERRUPT (1 << 8) -#define I915_DISPLAY_PLANE_C_FLIP_PENDING_INTERRUPT (1 << 8) -#define I915_DISPLAY_PIPE_A_VBLANK_INTERRUPT (1 << 7) -#define I915_DISPLAY_PIPE_A_EVENT_INTERRUPT (1 << 6) -#define I915_DISPLAY_PIPE_B_VBLANK_INTERRUPT (1 << 5) -#define I915_DISPLAY_PIPE_B_EVENT_INTERRUPT (1 << 4) -#define I915_DISPLAY_PIPE_A_DPBM_INTERRUPT (1 << 3) -#define I915_DISPLAY_PIPE_B_DPBM_INTERRUPT (1 << 2) -#define I915_DEBUG_INTERRUPT (1 << 2) -#define I915_WINVALID_INTERRUPT (1 << 
1) -#define I915_USER_INTERRUPT (1 << 1) -#define I915_ASLE_INTERRUPT (1 << 0) -#define I915_BSD_USER_INTERRUPT (1 << 25) - #define GEN6_BSD_RNCID _MMIO(0x12198) #define GEN7_FF_THREAD_MODE _MMIO(0x20a0) diff --git a/include/drm/intel/intel_gmd_interrupt_regs.h b/include/drm/intel/intel_gmd_interrupt_regs.h new file mode 100644 index 000000000000..dc9d5fc29ff6 --- /dev/null +++ b/include/drm/intel/intel_gmd_interrupt_regs.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef _INTEL_GMD_INTERRUPT_REGS_H_ +#define _INTEL_GMD_INTERRUPT_REGS_H_ + +#define I915_PM_INTERRUPT (1 << 31) +#define I915_ISP_INTERRUPT (1 << 22) +#define I915_LPE_PIPE_B_INTERRUPT (1 << 21) +#define I915_LPE_PIPE_A_INTERRUPT (1 << 20) +#define I915_MIPIC_INTERRUPT (1 << 19) +#define I915_MIPIA_INTERRUPT (1 << 18) +#define I915_PIPE_CONTROL_NOTIFY_INTERRUPT (1 << 18) +#define I915_DISPLAY_PORT_INTERRUPT (1 << 17) +#define I915_DISPLAY_PIPE_C_HBLANK_INTERRUPT (1 << 16) +#define I915_MASTER_ERROR_INTERRUPT (1 << 15) +#define I915_DISPLAY_PIPE_B_HBLANK_INTERRUPT (1 << 14) +#define I915_GMCH_THERMAL_SENSOR_EVENT_INTERRUPT (1 << 14) /* p-state */ +#define I915_DISPLAY_PIPE_A_HBLANK_INTERRUPT (1 << 13) +#define I915_HWB_OOM_INTERRUPT (1 << 13) +#define I915_LPE_PIPE_C_INTERRUPT (1 << 12) +#define I915_SYNC_STATUS_INTERRUPT (1 << 12) +#define I915_MISC_INTERRUPT (1 << 11) +#define I915_DISPLAY_PLANE_A_FLIP_PENDING_INTERRUPT (1 << 11) +#define I915_DISPLAY_PIPE_C_VBLANK_INTERRUPT (1 << 10) +#define I915_DISPLAY_PLANE_B_FLIP_PENDING_INTERRUPT (1 << 10) +#define I915_DISPLAY_PIPE_C_EVENT_INTERRUPT (1 << 9) +#define I915_OVERLAY_PLANE_FLIP_PENDING_INTERRUPT (1 << 9) +#define I915_DISPLAY_PIPE_C_DPBM_INTERRUPT (1 << 8) +#define I915_DISPLAY_PLANE_C_FLIP_PENDING_INTERRUPT (1 << 8) +#define I915_DISPLAY_PIPE_A_VBLANK_INTERRUPT (1 << 7) +#define I915_DISPLAY_PIPE_A_EVENT_INTERRUPT (1 << 6) +#define I915_DISPLAY_PIPE_B_VBLANK_INTERRUPT (1 << 5) +#define 
I915_DISPLAY_PIPE_B_EVENT_INTERRUPT (1 << 4) +#define I915_DISPLAY_PIPE_A_DPBM_INTERRUPT (1 << 3) +#define I915_DISPLAY_PIPE_B_DPBM_INTERRUPT (1 << 2) +#define I915_DEBUG_INTERRUPT (1 << 2) +#define I915_WINVALID_INTERRUPT (1 << 1) +#define I915_USER_INTERRUPT (1 << 1) +#define I915_ASLE_INTERRUPT (1 << 0) +#define I915_BSD_USER_INTERRUPT (1 << 25) + +#endif -- cgit v1.2.3 From a8454813854d93649dd21bc926e7c6f1d0c83e3c Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Thu, 5 Feb 2026 15:13:33 +0530 Subject: drm/i915: Remove i915_reg.h from i9xx_wm.c Move FW_BLC_SELF to common header to make i9xx_wm.c free from i915_reg.h include. Introduce a common intel_gmd_misc_regs.h to define common miscellaneous register definitions across graphics and display. v3: MISC header included as needed, drop from i915_reg (Jani) v2: Introdue a common misc header for GMD Signed-off-by: Uma Shankar Reviewed-by: Jani Nikula Link: https://patch.msgid.link/20260205094341.1882816-13-uma.shankar@intel.com --- drivers/gpu/drm/i915/display/i9xx_wm.c | 2 +- .../gpu/drm/i915/display/intel_display_debugfs.c | 1 + drivers/gpu/drm/i915/display/intel_display_regs.h | 7 ++++++- drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c | 1 + drivers/gpu/drm/i915/gt/intel_ring_submission.c | 1 + drivers/gpu/drm/i915/gt/intel_workarounds.c | 2 ++ drivers/gpu/drm/i915/gvt/cmd_parser.c | 1 + drivers/gpu/drm/i915/gvt/mmio_context.c | 1 + drivers/gpu/drm/i915/i915_debugfs.c | 1 + drivers/gpu/drm/i915/i915_reg.h | 19 ------------------- drivers/gpu/drm/i915/intel_clock_gating.c | 1 + drivers/gpu/drm/i915/intel_gvt_mmio_table.c | 1 + include/drm/intel/intel_gmd_misc_regs.h | 21 +++++++++++++++++++++ 13 files changed, 38 insertions(+), 21 deletions(-) create mode 100644 include/drm/intel/intel_gmd_misc_regs.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/i9xx_wm.c b/drivers/gpu/drm/i915/display/i9xx_wm.c index 39dfceb438ae..24f898efa9dd 100644 --- a/drivers/gpu/drm/i915/display/i9xx_wm.c +++ 
b/drivers/gpu/drm/i915/display/i9xx_wm.c @@ -6,8 +6,8 @@ #include #include +#include -#include "i915_reg.h" #include "i9xx_wm.h" #include "i9xx_wm_regs.h" #include "intel_atomic.h" diff --git a/drivers/gpu/drm/i915/display/intel_display_debugfs.c b/drivers/gpu/drm/i915/display/intel_display_debugfs.c index aba13e8a9051..f041a7102317 100644 --- a/drivers/gpu/drm/i915/display/intel_display_debugfs.c +++ b/drivers/gpu/drm/i915/display/intel_display_debugfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "hsw_ips.h" #include "i915_reg.h" diff --git a/drivers/gpu/drm/i915/display/intel_display_regs.h b/drivers/gpu/drm/i915/display/intel_display_regs.h index 5bc891f6de57..9f241655aa99 100644 --- a/drivers/gpu/drm/i915/display/intel_display_regs.h +++ b/drivers/gpu/drm/i915/display/intel_display_regs.h @@ -3132,6 +3132,11 @@ enum skl_power_gate { #define MTL_TRAS_MASK REG_GENMASK(16, 8) #define MTL_TRDPRE_MASK REG_GENMASK(7, 0) - +#define FW_BLC _MMIO(0x20d8) +#define FW_BLC2 _MMIO(0x20dc) +#define FW_BLC_SELF _MMIO(0x20e0) /* 915+ only */ +#define FW_BLC_SELF_EN_MASK REG_BIT(31) +#define FW_BLC_SELF_FIFO_MASK REG_BIT(16) /* 945 only */ +#define FW_BLC_SELF_EN REG_BIT(15) /* 945 only */ #endif /* __INTEL_DISPLAY_REGS_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c index 5eda98ebc1ae..ee90f5323da7 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c @@ -6,6 +6,7 @@ #include #include +#include #include "display/intel_display.h" #include "i915_drv.h" diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c index c1797e49811d..099453dd9cd5 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -5,6 +5,7 @@ #include #include +#include #include "gem/i915_gem_internal.h" diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c 
b/drivers/gpu/drm/i915/gt/intel_workarounds.c index ece88c612e27..4427812b2438 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -3,6 +3,8 @@ * Copyright © 2014-2018 Intel Corporation */ +#include + #include "i915_drv.h" #include "i915_reg.h" #include "i915_mmio_range.h" diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index bf7c3d3f5f8a..98c35c78a4ed 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -37,6 +37,7 @@ #include #include +#include #include "display/i9xx_plane_regs.h" #include "display/intel_display_regs.h" diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.c b/drivers/gpu/drm/i915/gvt/mmio_context.c index d4e9d485d382..3eb442acdf8d 100644 --- a/drivers/gpu/drm/i915/gvt/mmio_context.c +++ b/drivers/gpu/drm/i915/gvt/mmio_context.c @@ -34,6 +34,7 @@ */ #include +#include #include "gt/intel_context.h" #include "gt/intel_engine_regs.h" diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 42f6b44f0027..4778ba664ec7 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -33,6 +33,7 @@ #include #include +#include #include "gem/i915_gem_context.h" #include "gt/intel_gt.h" diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index b808d1ec5387..2bac216bd2b9 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -393,24 +393,10 @@ #define GEN2_ERROR_REGS I915_ERROR_REGS(EMR, EIR) -#define INSTPM _MMIO(0x20c0) -#define INSTPM_SELF_EN (1 << 12) /* 915GM only */ -#define INSTPM_AGPBUSY_INT_EN (1 << 11) /* gen3: when disabled, pending interrupts - will not assert AGPBUSY# and will only - be delivered when out of C3. 
*/ -#define INSTPM_FORCE_ORDERING (1 << 7) /* GEN6+ */ -#define INSTPM_TLB_INVALIDATE (1 << 9) -#define INSTPM_SYNC_FLUSH (1 << 5) #define MEM_MODE _MMIO(0x20cc) #define MEM_DISPLAY_B_TRICKLE_FEED_DISABLE (1 << 3) /* 830 only */ #define MEM_DISPLAY_A_TRICKLE_FEED_DISABLE (1 << 2) /* 830/845 only */ #define MEM_DISPLAY_TRICKLE_FEED_DISABLE (1 << 2) /* 85x only */ -#define FW_BLC _MMIO(0x20d8) -#define FW_BLC2 _MMIO(0x20dc) -#define FW_BLC_SELF _MMIO(0x20e0) /* 915+ only */ -#define FW_BLC_SELF_EN_MASK REG_BIT(31) -#define FW_BLC_SELF_FIFO_MASK REG_BIT(16) /* 945 only */ -#define FW_BLC_SELF_EN REG_BIT(15) /* 945 only */ #define MM_BURST_LENGTH 0x00700000 #define MM_FIFO_WATERMARK 0x0001F000 #define LM_BURST_LENGTH 0x00000700 @@ -833,11 +819,6 @@ #define KVM_CONFIG_CHANGE_NOTIFICATION_SELECT REG_BIT(14) -#define DISP_ARB_CTL _MMIO(0x45000) -#define DISP_FBC_MEMORY_WAKE REG_BIT(31) -#define DISP_TILE_SURFACE_SWIZZLING REG_BIT(13) -#define DISP_FBC_WM_DIS REG_BIT(15) - #define GEN8_CHICKEN_DCPR_1 _MMIO(0x46430) #define _LATENCY_REPORTING_REMOVED_PIPE_D REG_BIT(31) #define SKL_SELECT_ALTERNATE_DC_EXIT REG_BIT(30) diff --git a/drivers/gpu/drm/i915/intel_clock_gating.c b/drivers/gpu/drm/i915/intel_clock_gating.c index 4e18d5a22112..1ad31435bd3f 100644 --- a/drivers/gpu/drm/i915/intel_clock_gating.c +++ b/drivers/gpu/drm/i915/intel_clock_gating.c @@ -26,6 +26,7 @@ */ #include +#include #include "display/i9xx_plane_regs.h" #include "display/intel_display.h" diff --git a/drivers/gpu/drm/i915/intel_gvt_mmio_table.c b/drivers/gpu/drm/i915/intel_gvt_mmio_table.c index 8cfe9b56f1d0..c8a51e773086 100644 --- a/drivers/gpu/drm/i915/intel_gvt_mmio_table.c +++ b/drivers/gpu/drm/i915/intel_gvt_mmio_table.c @@ -4,6 +4,7 @@ */ #include +#include #include "display/bxt_dpio_phy_regs.h" #include "display/i9xx_plane_regs.h" diff --git a/include/drm/intel/intel_gmd_misc_regs.h b/include/drm/intel/intel_gmd_misc_regs.h new file mode 100644 index 000000000000..763d7711f21c --- /dev/null +++ 
b/include/drm/intel/intel_gmd_misc_regs.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef _INTEL_GMD_MISC_REGS_H_ +#define _INTEL_GMD_MISC_REGS_H_ + +#define DISP_ARB_CTL _MMIO(0x45000) +#define DISP_FBC_MEMORY_WAKE REG_BIT(31) +#define DISP_TILE_SURFACE_SWIZZLING REG_BIT(13) +#define DISP_FBC_WM_DIS REG_BIT(15) + +#define INSTPM _MMIO(0x20c0) +#define INSTPM_SELF_EN (1 << 12) /* 915GM only */ +#define INSTPM_AGPBUSY_INT_EN (1 << 11) /* gen3: when disabled, pending interrupts + will not assert AGPBUSY# and will only + be delivered when out of C3. */ +#define INSTPM_FORCE_ORDERING (1 << 7) /* GEN6+ */ +#define INSTPM_TLB_INVALIDATE (1 << 9) +#define INSTPM_SYNC_FLUSH (1 << 5) + +#endif -- cgit v1.2.3 From 6ef8bf1e2c110ac5a2065b8dc945dffba999db5a Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Thu, 5 Feb 2026 15:13:38 +0530 Subject: drm/i915: Remove i915_reg.h from intel_display_irq.c Move VLV_IRQ_REGS to common header for interrupt to make intel_display_irq.c free from including i915_reg.h. 
v2: Move interrupt to dedicated header (Jani) Signed-off-by: Uma Shankar Reviewed-by: Jani Nikula Link: https://patch.msgid.link/20260205094341.1882816-18-uma.shankar@intel.com --- drivers/gpu/drm/i915/display/intel_display_irq.c | 1 - drivers/gpu/drm/i915/display/intel_display_regs.h | 5 +++ drivers/gpu/drm/i915/gt/intel_gt_irq.c | 2 + drivers/gpu/drm/i915/gt/intel_rc6.c | 1 + drivers/gpu/drm/i915/gvt/handlers.c | 1 + drivers/gpu/drm/i915/gvt/interrupt.c | 1 + drivers/gpu/drm/i915/i915_reg.h | 52 ----------------------- drivers/gpu/drm/i915/intel_clock_gating.c | 1 + drivers/gpu/drm/i915/intel_gvt_mmio_table.c | 2 + drivers/gpu/drm/i915/vlv_suspend.c | 1 + include/drm/intel/intel_gmd_interrupt_regs.h | 49 +++++++++++++++++++++ 11 files changed, 63 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_display_irq.c b/drivers/gpu/drm/i915/display/intel_display_irq.c index 432a9c895c39..bd0eb1f46919 100644 --- a/drivers/gpu/drm/i915/display/intel_display_irq.c +++ b/drivers/gpu/drm/i915/display/intel_display_irq.c @@ -7,7 +7,6 @@ #include #include -#include "i915_reg.h" #include "icl_dsi_regs.h" #include "intel_crtc.h" #include "intel_de.h" diff --git a/drivers/gpu/drm/i915/display/intel_display_regs.h b/drivers/gpu/drm/i915/display/intel_display_regs.h index dcb8cab7b30b..1c77a7de2d6e 100644 --- a/drivers/gpu/drm/i915/display/intel_display_regs.h +++ b/drivers/gpu/drm/i915/display/intel_display_regs.h @@ -1470,6 +1470,11 @@ #define DE_PIPEA_FIFO_UNDERRUN (1 << 0) #define DE_PIPE_FIFO_UNDERRUN(pipe) (1 << (8 * (pipe))) +/* Display Internal Timeout Register */ +#define RM_TIMEOUT _MMIO(0x42060) +#define RM_TIMEOUT_REG_CAPTURE _MMIO(0x420E0) +#define MMIO_TIMEOUT_US(us) ((us) << 0) + #define GEN8_DE_MISC_ISR _MMIO(0x44460) #define GEN8_DE_MISC_IMR _MMIO(0x44464) #define GEN8_DE_MISC_IIR _MMIO(0x44468) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c index 
75e802e10be2..d85c849c0081 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c @@ -5,6 +5,8 @@ #include +#include + #include "i915_drv.h" #include "i915_irq.h" #include "i915_reg.h" diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index 942ac1ebecee..5c316f734c4a 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -8,6 +8,7 @@ #include #include +#include #include "display/vlv_clock.h" #include "gem/i915_gem_region.h" diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 2e9d9d0638ae..4f65ced906da 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "display/bxt_dpio_phy_regs.h" #include "display/i9xx_plane_regs.h" diff --git a/drivers/gpu/drm/i915/gvt/interrupt.c b/drivers/gpu/drm/i915/gvt/interrupt.c index 91d22b1c62e2..f85113218037 100644 --- a/drivers/gpu/drm/i915/gvt/interrupt.c +++ b/drivers/gpu/drm/i915/gvt/interrupt.c @@ -32,6 +32,7 @@ #include #include +#include #include "display/intel_display_regs.h" diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 5cb53a8c451a..7f3d5b7f7abd 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -335,9 +335,6 @@ #define VLV_GU_CTL0 _MMIO(VLV_DISPLAY_BASE + 0x2030) #define VLV_GU_CTL1 _MMIO(VLV_DISPLAY_BASE + 0x2034) -#define SCPD0 _MMIO(0x209c) /* 915+ only */ -#define SCPD_FBC_IGNORE_3D (1 << 6) -#define CSTATE_RENDER_CLOCK_GATE_DISABLE (1 << 5) #define GEN2_IER _MMIO(0x20a0) #define GEN2_IIR _MMIO(0x20a4) #define GEN2_IMR _MMIO(0x20a8) @@ -350,13 +347,6 @@ #define GINT_DIS (1 << 22) #define GCFG_DIS (1 << 8) #define VLV_GUNIT_CLOCK_GATE2 _MMIO(VLV_DISPLAY_BASE + 0x2064) -#define VLV_IIR_RW _MMIO(VLV_DISPLAY_BASE + 0x2084) -#define VLV_IER _MMIO(VLV_DISPLAY_BASE + 0x20a0) -#define VLV_IIR 
_MMIO(VLV_DISPLAY_BASE + 0x20a4) -#define VLV_IMR _MMIO(VLV_DISPLAY_BASE + 0x20a8) -#define VLV_ISR _MMIO(VLV_DISPLAY_BASE + 0x20ac) -#define VLV_PCBR _MMIO(VLV_DISPLAY_BASE + 0x2120) -#define VLV_PCBR_ADDR_SHIFT 12 #define EIR _MMIO(0x20b0) #define EMR _MMIO(0x20b4) @@ -682,11 +672,6 @@ #define PCH_3DCGDIS1 _MMIO(0x46024) # define VFMUNIT_CLOCK_GATE_DISABLE (1 << 11) -/* Display Internal Timeout Register */ -#define RM_TIMEOUT _MMIO(0x42060) -#define RM_TIMEOUT_REG_CAPTURE _MMIO(0x420E0) -#define MMIO_TIMEOUT_US(us) ((us) << 0) - #define VLV_MASTER_IER _MMIO(0x4400c) /* Gunit master IER */ #define MASTER_INTERRUPT_ENABLE (1 << 31) @@ -699,24 +684,6 @@ GTIER, \ GTIIR) -#define GEN8_MASTER_IRQ _MMIO(0x44200) -#define GEN8_MASTER_IRQ_CONTROL (1 << 31) -#define GEN8_PCU_IRQ (1 << 30) -#define GEN8_DE_PCH_IRQ (1 << 23) -#define GEN8_DE_MISC_IRQ (1 << 22) -#define GEN8_DE_PORT_IRQ (1 << 20) -#define GEN8_DE_PIPE_C_IRQ (1 << 18) -#define GEN8_DE_PIPE_B_IRQ (1 << 17) -#define GEN8_DE_PIPE_A_IRQ (1 << 16) -#define GEN8_DE_PIPE_IRQ(pipe) (1 << (16 + (pipe))) -#define GEN8_GT_VECS_IRQ (1 << 6) -#define GEN8_GT_GUC_IRQ (1 << 5) -#define GEN8_GT_PM_IRQ (1 << 4) -#define GEN8_GT_VCS1_IRQ (1 << 3) /* NB: VCS2 in bspec! */ -#define GEN8_GT_VCS0_IRQ (1 << 2) /* NB: VCS1 in bpsec! 
*/ -#define GEN8_GT_BCS_IRQ (1 << 1) -#define GEN8_GT_RCS_IRQ (1 << 0) - #define GEN8_GT_ISR(which) _MMIO(0x44300 + (0x10 * (which))) #define GEN8_GT_IMR(which) _MMIO(0x44304 + (0x10 * (which))) #define GEN8_GT_IIR(which) _MMIO(0x44308 + (0x10 * (which))) @@ -742,25 +709,6 @@ GEN8_PCU_IER, \ GEN8_PCU_IIR) -#define GEN11_GU_MISC_ISR _MMIO(0x444f0) -#define GEN11_GU_MISC_IMR _MMIO(0x444f4) -#define GEN11_GU_MISC_IIR _MMIO(0x444f8) -#define GEN11_GU_MISC_IER _MMIO(0x444fc) -#define GEN11_GU_MISC_GSE (1 << 27) - -#define GEN11_GU_MISC_IRQ_REGS I915_IRQ_REGS(GEN11_GU_MISC_IMR, \ - GEN11_GU_MISC_IER, \ - GEN11_GU_MISC_IIR) - -#define GEN11_GFX_MSTR_IRQ _MMIO(0x190010) -#define GEN11_MASTER_IRQ (1 << 31) -#define GEN11_PCU_IRQ (1 << 30) -#define GEN11_GU_MISC_IRQ (1 << 29) -#define GEN11_DISPLAY_IRQ (1 << 16) -#define GEN11_GT_DW_IRQ(x) (1 << (x)) -#define GEN11_GT_DW1_IRQ (1 << 1) -#define GEN11_GT_DW0_IRQ (1 << 0) - #define DG1_MSTR_TILE_INTR _MMIO(0x190008) #define DG1_MSTR_IRQ REG_BIT(31) #define DG1_MSTR_TILE(t) REG_BIT(t) diff --git a/drivers/gpu/drm/i915/intel_clock_gating.c b/drivers/gpu/drm/i915/intel_clock_gating.c index 1ad31435bd3f..d0400ea2ffc7 100644 --- a/drivers/gpu/drm/i915/intel_clock_gating.c +++ b/drivers/gpu/drm/i915/intel_clock_gating.c @@ -27,6 +27,7 @@ #include #include +#include #include "display/i9xx_plane_regs.h" #include "display/intel_display.h" diff --git a/drivers/gpu/drm/i915/intel_gvt_mmio_table.c b/drivers/gpu/drm/i915/intel_gvt_mmio_table.c index c8a51e773086..ae42818ab6e0 100644 --- a/drivers/gpu/drm/i915/intel_gvt_mmio_table.c +++ b/drivers/gpu/drm/i915/intel_gvt_mmio_table.c @@ -6,6 +6,8 @@ #include #include +#include + #include "display/bxt_dpio_phy_regs.h" #include "display/i9xx_plane_regs.h" #include "display/i9xx_wm_regs.h" diff --git a/drivers/gpu/drm/i915/vlv_suspend.c b/drivers/gpu/drm/i915/vlv_suspend.c index bace7b38329b..1e4343fe5574 100644 --- a/drivers/gpu/drm/i915/vlv_suspend.c +++ b/drivers/gpu/drm/i915/vlv_suspend.c @@ 
-7,6 +7,7 @@ #include #include +#include #include "gt/intel_gt_regs.h" diff --git a/include/drm/intel/intel_gmd_interrupt_regs.h b/include/drm/intel/intel_gmd_interrupt_regs.h index dc9d5fc29ff6..ce66c4151e76 100644 --- a/include/drm/intel/intel_gmd_interrupt_regs.h +++ b/include/drm/intel/intel_gmd_interrupt_regs.h @@ -40,4 +40,53 @@ #define I915_ASLE_INTERRUPT (1 << 0) #define I915_BSD_USER_INTERRUPT (1 << 25) +#define GEN8_MASTER_IRQ _MMIO(0x44200) +#define GEN8_MASTER_IRQ_CONTROL (1 << 31) +#define GEN8_PCU_IRQ (1 << 30) +#define GEN8_DE_PCH_IRQ (1 << 23) +#define GEN8_DE_MISC_IRQ (1 << 22) +#define GEN8_DE_PORT_IRQ (1 << 20) +#define GEN8_DE_PIPE_C_IRQ (1 << 18) +#define GEN8_DE_PIPE_B_IRQ (1 << 17) +#define GEN8_DE_PIPE_A_IRQ (1 << 16) +#define GEN8_DE_PIPE_IRQ(pipe) (1 << (16 + (pipe))) +#define GEN8_GT_VECS_IRQ (1 << 6) +#define GEN8_GT_GUC_IRQ (1 << 5) +#define GEN8_GT_PM_IRQ (1 << 4) +#define GEN8_GT_VCS1_IRQ (1 << 3) /* NB: VCS2 in bspec! */ +#define GEN8_GT_VCS0_IRQ (1 << 2) /* NB: VCS1 in bpsec! 
*/ +#define GEN8_GT_BCS_IRQ (1 << 1) +#define GEN8_GT_RCS_IRQ (1 << 0) + +#define GEN11_GU_MISC_ISR _MMIO(0x444f0) +#define GEN11_GU_MISC_IMR _MMIO(0x444f4) +#define GEN11_GU_MISC_IIR _MMIO(0x444f8) +#define GEN11_GU_MISC_IER _MMIO(0x444fc) +#define GEN11_GU_MISC_GSE (1 << 27) + +#define GEN11_GU_MISC_IRQ_REGS I915_IRQ_REGS(GEN11_GU_MISC_IMR, \ + GEN11_GU_MISC_IER, \ + GEN11_GU_MISC_IIR) + +#define GEN11_GFX_MSTR_IRQ _MMIO(0x190010) +#define GEN11_MASTER_IRQ (1 << 31) +#define GEN11_PCU_IRQ (1 << 30) +#define GEN11_GU_MISC_IRQ (1 << 29) +#define GEN11_DISPLAY_IRQ (1 << 16) +#define GEN11_GT_DW_IRQ(x) (1 << (x)) +#define GEN11_GT_DW1_IRQ (1 << 1) +#define GEN11_GT_DW0_IRQ (1 << 0) + +#define SCPD0 _MMIO(0x209c) /* 915+ only */ +#define SCPD_FBC_IGNORE_3D (1 << 6) +#define CSTATE_RENDER_CLOCK_GATE_DISABLE (1 << 5) + +#define VLV_IIR_RW _MMIO(VLV_DISPLAY_BASE + 0x2084) +#define VLV_IER _MMIO(VLV_DISPLAY_BASE + 0x20a0) +#define VLV_IIR _MMIO(VLV_DISPLAY_BASE + 0x20a4) +#define VLV_IMR _MMIO(VLV_DISPLAY_BASE + 0x20a8) +#define VLV_ISR _MMIO(VLV_DISPLAY_BASE + 0x20ac) +#define VLV_PCBR _MMIO(VLV_DISPLAY_BASE + 0x2120) +#define VLV_PCBR_ADDR_SHIFT 12 + #endif -- cgit v1.2.3 From 022ac075088366b62e130da5e1b200bc93a47191 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 12 Feb 2026 13:34:22 -0800 Subject: bpf: use reg->var_off instead of reg->off for pointers This commit consolidates static and varying pointer offset tracking logic. All offsets are now represented solely using `.var_off` and min/max fields. The reasons are twofold: - This simplifies pointer tracking code, as each relevant function needs to check the `.var_off` field anyway. - It makes it easier to widen pointer registers for the purpose of loop convergence checks, by forgoing the `regsafe()` logic demanding `.off` fields to be identical. The changes are spread across many functions and are hard to group into smaller patches. 
Some of the logical changes include: - Checks in __check_ptr_off_reg() are reordered so that the tnum_is_const() check is done before operating on reg->var_off.value. - check_packet_access() now uses check_mem_region_access() to handle possible 'off' overflow cases. - In check_helper_mem_access() utility functions like check_packet_access() are now called with 'off=0', as these utility functions now account for the complete register offset range. - In check_reg_type() a call to __check_ptr_off_reg() is added before a call to btf_struct_ids_match(). This prevents btf_struct_ids_match() from potentially working on non-constant reg->var_off.value. - regsafe() is relaxed to avoid comparing '.off' field for pointers. As a precaution, the changes are verified in [1] by adding a pass checking that no pointer has non-zero '.off' field on each do_check_insn() iteration. [1] https://github.com/eddyz87/bpf/tree/ptrs-off-migration Notable selftests changes: - `.var_off` value changed because it now combines static and varying offsets. Affected tests: - linked_list/incorrect_node_var_off - linked_list/incorrect_head_var_off2 - verifier_align/packet_variable_offset - Overflowing `smax_value` bound leads to a pointer with big negative or positive offset to be rejected immediately (previously overflowing `rX += const` instruction updated `.off` field avoiding the overflow). Affected tests: - verifier_align/dubious_pointer_arithmetic - verifier_bounds/var_off_insn_off_test1 - Invalid access to packet now reports full offset inside a packet. Affected tests: - verifier_direct_packet_access/test23_x_pkt_ptr_4 - A change in check_mem_region_access() behavior: when register `.smin_value` is negative, it reports "rX min value is negative..." before calling into __check_mem_access() which reports "invalid access to ...". In the tests below, the `.off` field was negative, while `.smin_value` remained positive. This is no longer the case after the changes in this commit. 
Affected tests: - verifier_gotox/jump_table_invalid_mem_acceess_neg - verifier_helper_packet_access/test15_cls_helper_fail_sub - verifier_helper_value_access/imm_out_of_bound_2 - verifier_helper_value_access/reg_out_of_bound_2 - verifier_meta_access/meta_access_test2 - verifier_value_ptr_arith/known_scalar_from_different_maps - lower_oob_arith_test_1 - value_ptr_known_scalar_3 - access_value_ptr_known_scalar - Usage of check_mem_region_access() instead of __check_mem_access() in check_packet_access() changes the reported message from "rX offset is outside ..." to "rX min/max value is outside ...". Affected tests: - verifier_xdp_direct_packet_access/* - In check_func_arg_reg_off() the check for zero offset now operates on `.var_off` field instead of `.off` field. For tests where the pattern looks like `kfunc(reg_with_var_off, ...)`, this changes the reported error: - previously the error "variable ... access ... disallowed" was reported by __check_ptr_off_reg(); - now "R1 must have zero offset ..." is reported by check_func_arg_reg_off() itself. 
Affected tests: - verifier/calls.c "calls: invalid kfunc call: PTR_TO_BTF_ID with variable offset" Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260212-ptrs-off-migration-v2-2-00820e4d3438@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +- kernel/bpf/log.c | 2 + kernel/bpf/verifier.c | 317 ++++++++------------- .../testing/selftests/bpf/prog_tests/linked_list.c | 4 +- .../selftests/bpf/progs/exceptions_assert.c | 2 +- tools/testing/selftests/bpf/progs/iters.c | 6 +- .../selftests/bpf/progs/mem_rdonly_untrusted.c | 2 +- tools/testing/selftests/bpf/progs/verifier_align.c | 40 ++- .../testing/selftests/bpf/progs/verifier_bounds.c | 2 +- .../bpf/progs/verifier_direct_packet_access.c | 4 +- tools/testing/selftests/bpf/progs/verifier_gotox.c | 4 +- .../bpf/progs/verifier_helper_packet_access.c | 2 +- .../bpf/progs/verifier_helper_value_access.c | 4 +- .../testing/selftests/bpf/progs/verifier_int_ptr.c | 2 +- .../selftests/bpf/progs/verifier_meta_access.c | 2 +- .../selftests/bpf/progs/verifier_spill_fill.c | 8 +- .../selftests/bpf/progs/verifier_stack_ptr.c | 4 +- .../selftests/bpf/progs/verifier_value_ptr_arith.c | 10 +- .../bpf/progs/verifier_xdp_direct_packet_access.c | 64 ++--- tools/testing/selftests/bpf/verifier/calls.c | 2 +- 20 files changed, 206 insertions(+), 278 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ef8e45a362d9..a97bdbf3a07b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -38,8 +38,7 @@ struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; /* - * Fixed part of pointer offset, pointer types only. - * Or constant delta between "linked" scalars with the same ID. + * Constant delta between "linked" scalars with the same ID. 
*/ s32 off; union { diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index a0c3b35de2ce..39a731392d65 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -581,6 +581,8 @@ int tnum_strn(char *str, size_t size, struct tnum a) if (a.mask == 0) { if (is_unum_decimal(a.value)) return snprintf(str, size, "%llu", a.value); + if (is_snum_decimal(a.value)) + return snprintf(str, size, "%lld", a.value); else return snprintf(str, size, "%#llx", a.value); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3bf72eacbec2..2c5794dad668 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -654,7 +654,7 @@ static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_s return -EINVAL; } - off = reg->off + reg->var_off.value; + off = reg->var_off.value; if (off % BPF_REG_SIZE) { verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; @@ -2281,11 +2281,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno, struct btf_field_graph_root *ds_head) { - __mark_reg_known_zero(®s[regno]); + __mark_reg_known(®s[regno], ds_head->node_offset); regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[regno].btf = ds_head->btf; regs[regno].btf_id = ds_head->value_btf_id; - regs[regno].off = ds_head->node_offset; } static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) @@ -2316,7 +2315,6 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, */ return reg->type == which && reg->id == 0 && - reg->off == 0 && tnum_equals_const(reg->var_off, 0); } @@ -5302,7 +5300,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, * tracks the effects of the write, considering that each stack slot in the * dynamic range is potentially written to. * - * 'off' includes 'regno->off'. * 'value_regno' can be -1, meaning that an unknown value is being written to * the stack. 
* @@ -5724,7 +5721,6 @@ static int check_stack_read(struct bpf_verifier_env *env, * check_stack_write_var_off. * * 'ptr_regno' is the register used as a pointer into the stack. - * 'off' includes 'ptr_regno->off', but not its variable offset (if any). * 'value_regno' is the register whose value we're writing to the stack. It can * be -1, meaning that we're not writing from a register. * @@ -5761,14 +5757,14 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, u32 cap = bpf_map_flags_to_cap(map); if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { - verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", - map->value_size, off, size); + verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n", + map->value_size, reg->smin_value + off, size); return -EACCES; } if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { - verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", - map->value_size, off, size); + verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n", + map->value_size, reg->smin_value + off, size); return -EACCES; } @@ -5875,24 +5871,24 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, * is only allowed in its original, unmodified form. 
*/ - if (reg->off < 0) { - verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } - if (!fixed_off_ok && reg->off) { - verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); + if (reg->smin_value < 0) { + verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n", + reg_type_str(env, reg->type), regno, reg->var_off.value); return -EACCES; } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable %s access var_off=%s disallowed\n", - reg_type_str(env, reg->type), tn_buf); + if (!fixed_off_ok && reg->var_off.value != 0) { + verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n", + reg_type_str(env, reg->type), regno, reg->var_off.value); return -EACCES; } @@ -5934,14 +5930,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, /* For ref_ptr case, release function check should ensure we get one * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the * normal store of unreferenced kptr, we must ensure var_off is zero. - * Since ref_ptr cannot be accessed directly by BPF insns, checks for - * reg->off and reg->ref_obj_id are not needed here. + * Since ref_ptr cannot be accessed directly by BPF insns, check for + * reg->ref_obj_id is not needed here. */ if (__check_ptr_off_reg(env, reg, regno, true)) return -EACCES; /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and - * we also need to take into account the reg->off. + * we also need to take into account the reg->var_off. 
* * We want to support cases like: * @@ -5952,19 +5948,19 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * * struct foo *v; * v = func(); // PTR_TO_BTF_ID - * val->foo = v; // reg->off is zero, btf and btf_id match type - * val->bar = &v->br; // reg->off is still zero, but we need to retry with + * val->foo = v; // reg->var_off is zero, btf and btf_id match type + * val->bar = &v->br; // reg->var_off is still zero, but we need to retry with * // first member type of struct after comparison fails - * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked + * val->baz = &v->bz; // reg->var_off is non-zero, so struct needs to be walked * // to match type * - * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off + * In the kptr_ref case, check_func_arg_reg_off already ensures reg->var_off * is zero. We must also ensure that btf_struct_ids_match does not walk * the struct to match type against first member of struct, i.e. reject * second case from above. Hence, when type is BPF_KPTR_REF, we set * strict mode to true for type match. */ - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value, kptr_field->kptr.btf, kptr_field->kptr.btf_id, kptr_field->type != BPF_KPTR_UNREF)) goto bad_type; @@ -6273,27 +6269,14 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_reg_state *reg = reg_state(env, regno); int err; - /* We may have added a variable offset to the packet pointer; but any - * reg->range we have comes after that. We are only checking the fixed - * offset. - */ - - /* We don't allow negative numbers, because we aren't tracking enough - * detail to prove they're safe. 
- */ - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); - return -EACCES; + if (reg->range < 0) { + verbose(env, "R%d offset is outside of the packet\n", regno); + return -EINVAL; } - err = reg->range < 0 ? -EINVAL : - __check_mem_access(env, regno, off, size, reg->range, - zero_size_allowed); - if (err) { - verbose(env, "R%d offset is outside of the packet\n", regno); + err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed); + if (err) return err; - } /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, @@ -6305,7 +6288,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, max_t(u32, env->prog->aux->max_pkt_offset, off + reg->umax_value + size - 1); - return err; + return 0; } /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ @@ -6522,14 +6505,14 @@ static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, */ ip_align = 2; - reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off)); + reg_off = tnum_add(reg->var_off, tnum_const(ip_align + off)); if (!tnum_is_aligned(reg_off, size)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "misaligned packet access off %d+%s+%d+%d size %d\n", - ip_align, tn_buf, reg->off, off, size); + "misaligned packet access off %d+%s+%d size %d\n", + ip_align, tn_buf, off, size); return -EACCES; } @@ -6547,13 +6530,13 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env, if (!strict || size == 1) return 0; - reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off)); + reg_off = tnum_add(reg->var_off, tnum_const(off)); if (!tnum_is_aligned(reg_off, size)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", - pointer_desc, 
tn_buf, reg->off, off, size); + verbose(env, "misaligned %saccess off %s+%d size %d\n", + pointer_desc, tn_buf, off, size); return -EACCES; } @@ -6891,7 +6874,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env, regno, buf_info, off, size); return -EACCES; } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -6914,8 +6897,8 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, if (err) return err; - if (off + size > env->prog->aux->max_tp_access) - env->prog->aux->max_tp_access = off + size; + env->prog->aux->max_tp_access = max(reg->var_off.value + off + size, + env->prog->aux->max_tp_access); return 0; } @@ -6933,8 +6916,7 @@ static int check_buffer_access(struct bpf_verifier_env *env, if (err) return err; - if (off + size > *max_access) - *max_access = off + size; + *max_access = max(reg->var_off.value + off + size, *max_access); return 0; } @@ -7327,13 +7309,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, tname); return -EINVAL; } - if (off < 0) { - verbose(env, - "R%d is ptr_%s invalid negative access: off=%d\n", - regno, tname, off); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -7343,6 +7320,15 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + off += reg->var_off.value; + + if (off < 0) { + verbose(env, + "R%d is ptr_%s invalid negative access: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (reg->type & MEM_USER) { verbose(env, "R%d is ptr_%s access user memory: off=%d\n", @@ -7589,8 +7575,8 @@ static int check_stack_access_within_bounds( if (err) { if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid%s stack R%d off=%d size=%d\n", - err_extra, regno, off, access_size); + verbose(env, "invalid%s 
stack R%d off=%lld size=%d\n", + err_extra, regno, min_off, access_size); } else { char tn_buf[48]; @@ -7636,14 +7622,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (size < 0) return size; - /* alignment checks will add in reg->off themselves */ err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); if (err) return err; - /* for access checks, reg->off is just part of off */ - off += reg->off; - if (reg->type == PTR_TO_MAP_KEY) { if (t == BPF_WRITE) { verbose(env, "write to change key R%d not allowed\n", regno); @@ -8122,8 +8104,6 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) * on the access type and privileges, that all elements of the stack are * initialized. * - * 'off' includes 'regno->off', but not its dynamic part (if any). - * * All registers that have been spilled on the stack in the slots within the * read offsets are marked as read. */ @@ -8284,7 +8264,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, reg->off, access_size, + return check_packet_access(env, regno, 0, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { @@ -8292,12 +8272,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, reg_type_str(env, reg->type)); return -EACCES; } - return check_mem_region_access(env, regno, reg->off, access_size, + return check_mem_region_access(env, regno, 0, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, reg->off, access_size, access_type)) + if (check_map_access_type(env, regno, 0, access_size, access_type)) return -EACCES; - return check_map_access(env, regno, reg->off, access_size, + return check_map_access(env, regno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if 
(type_is_rdonly_mem(reg->type)) { @@ -8307,7 +8287,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; } } - return check_mem_region_access(env, regno, reg->off, + return check_mem_region_access(env, regno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: @@ -8322,16 +8302,16 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } else { max_access = &env->prog->aux->max_rdwr_access; } - return check_buffer_access(env, reg, regno, reg->off, + return check_buffer_access(env, reg, regno, 0, access_size, zero_size_allowed, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, - regno, reg->off, access_size, + regno, 0, access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: - return check_ptr_to_btf_access(env, regs, regno, reg->off, + return check_ptr_to_btf_access(env, regs, regno, 0, access_size, BPF_READ, -1); case PTR_TO_CTX: /* in case the function doesn't know how to access the context, @@ -8543,9 +8523,9 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) return -EINVAL; } spin_lock_off = is_res_lock ? 
rec->res_spin_lock_off : rec->spin_lock_off; - if (spin_lock_off != val + reg->off) { + if (spin_lock_off != val) { verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", - val + reg->off, lock_str, spin_lock_off); + val, lock_str, spin_lock_off); return -EINVAL; } if (is_lock) { @@ -8660,9 +8640,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; } - if (field_off != val + reg->off) { + if (field_off != val) { verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n", - val + reg->off, struct_name, field_off); + val, struct_name, field_off); return -EINVAL; } if (map_desc->ptr) { @@ -8730,7 +8710,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } - kptr_off = reg->off + reg->var_off.value; + kptr_off = reg->var_off.value; kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR); if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); @@ -9373,7 +9353,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; - int i, j; + int i, j, err; compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { @@ -9476,8 +9456,12 @@ found: return -EACCES; } - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id, + err = __check_ptr_off_reg(env, reg, regno, true); + if (err) + return err; + + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, + reg->var_off.value, btf_vmlinux, *arg_btf_id, strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", regno, btf_type_name(reg->btf, reg->btf_id), @@ -9555,12 +9539,11 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * because fixed_off_ok is false, but checking here allows us * to give 
the user a better error message. */ - if (reg->off) { + if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) { verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", regno); return -EINVAL; } - return __check_ptr_off_reg(env, reg, regno, false); } switch (type) { @@ -9657,7 +9640,7 @@ static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, if (reg->type == CONST_PTR_TO_DYNPTR) return reg->dynptr.type; - spi = __get_spi(reg->off); + spi = __get_spi(reg->var_off.value); if (spi < 0) { verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); return BPF_DYNPTR_TYPE_INVALID; @@ -9698,13 +9681,13 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EACCES; } - err = check_map_access(env, regno, reg->off, - map->value_size - reg->off, false, + err = check_map_access(env, regno, 0, + map->value_size - reg->var_off.value, false, ACCESS_HELPER); if (err) return err; - map_off = reg->off + reg->var_off.value; + map_off = reg->var_off.value; err = map->ops->map_direct_value_addr(map, &map_addr, map_off); if (err) { verbose(env, "direct value access on string failed\n"); @@ -9741,7 +9724,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, if (!tnum_is_const(key->var_off)) return -EOPNOTSUPP; - stack_off = key->off + key->var_off.value; + stack_off = key->var_off.value; slot = -stack_off - 1; spi = slot / BPF_REG_SIZE; off = slot % BPF_REG_SIZE; @@ -11073,7 +11056,8 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, */ struct btf_field *field; - field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off, + field = reg_find_field_offset(&caller->regs[BPF_REG_1], + caller->regs[BPF_REG_1].var_off.value, BPF_RB_ROOT); if (!field || !field->graph_root.value_btf_id) return -EFAULT; @@ -11449,7 +11433,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, /* fmt being ARG_PTR_TO_CONST_STR guarantees 
that var_off is const * and map_direct_value_addr is set. */ - fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + fmt_map_off = fmt_reg->var_off.value; err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, fmt_map_off); if (err) { @@ -12755,13 +12739,12 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; - WARN_ON_ONCE(is_kfunc_release(meta) && - (reg->off || !tnum_is_const(reg->var_off) || - reg->var_off.value)); + WARN_ON_ONCE(is_kfunc_release(meta) && !tnum_is_const(reg->var_off)); reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); - struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match); + struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->var_off.value, + meta->btf, ref_id, strict_type_match); /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot * actually use it -- it must cast to the underlying type. So we allow * caller to pass in the underlying type. 
@@ -13133,7 +13116,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } rec = reg_btf_record(reg); - head_off = reg->off + reg->var_off.value; + head_off = reg->var_off.value; field = btf_record_find(rec, head_off, head_field_type); if (!field) { verbose(env, "%s not found at offset=%u\n", head_type_name, head_off); @@ -13200,7 +13183,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, return -EINVAL; } - node_off = reg->off + reg->var_off.value; + node_off = reg->var_off.value; field = reg_find_field_offset(reg, node_off, node_field_type); if (!field) { verbose(env, "%s not found at offset=%u\n", node_type_name, node_off); @@ -14228,7 +14211,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; - insn_aux->insert_off = regs[BPF_REG_2].off; + insn_aux->insert_off = regs[BPF_REG_2].var_off.value; insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { @@ -14459,11 +14442,13 @@ static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, enum bpf_reg_type type) { + bool known = tnum_is_const(reg->var_off); + s64 val = reg->var_off.value; s64 smin = reg->smin_value; - if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { - verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str(env, type), reg->off); + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { + verbose(env, "%s pointer offset %lld is not allowed\n", + reg_type_str(env, type), val); return false; } @@ -14497,13 +14482,11 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, * currently prohibited for unprivileged. 
*/ max = MAX_BPF_STACK + mask_to_left; - ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off); + ptr_limit = -ptr_reg->var_off.value; break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - ptr_limit = (mask_to_left ? - ptr_reg->smin_value : - ptr_reg->umax_value) + ptr_reg->off; + ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value; break; default: return REASON_TYPE; @@ -14734,9 +14717,6 @@ static int sanitize_err(struct bpf_verifier_env *env, * Variable offset is prohibited for unprivileged mode for simplicity since it * requires corresponding support in Spectre masking for stack ALU. See also * retrieve_ptr_limit(). - * - * - * 'off' includes 'reg->off'. */ static int check_stack_access_for_ptr_arithmetic( struct bpf_verifier_env *env, @@ -14777,11 +14757,11 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, switch (dst_reg->type) { case PTR_TO_STACK: if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg, - dst_reg->off + dst_reg->var_off.value)) + dst_reg->var_off.value)) return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) { + if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -14905,23 +14885,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: - /* We can take a fixed offset as long as it doesn't overflow - * the s32 'off' field - */ - if (known && (ptr_reg->off + smin_val == - (s64)(s32)(ptr_reg->off + smin_val))) { - /* pointer += K. Accumulate it into fixed offset */ - dst_reg->smin_value = smin_ptr; - dst_reg->smax_value = smax_ptr; - dst_reg->umin_value = umin_ptr; - dst_reg->umax_value = umax_ptr; - dst_reg->var_off = ptr_reg->var_off; - dst_reg->off = ptr_reg->off + smin_val; - dst_reg->raw = ptr_reg->raw; - break; - } - /* A new variable offset is created. 
Note that off_reg->off - * == 0, since it's a scalar. + /* * dst_reg gets the pointer type and since some positive * integer value was added to the pointer, give it a new 'id' * if it's a PTR_TO_PACKET. @@ -14940,9 +14904,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = U64_MAX; } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); - dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; - if (reg_is_pkt_pointer(ptr_reg)) { + if (!known && reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); @@ -14964,19 +14927,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst); return -EACCES; } - if (known && (ptr_reg->off - smin_val == - (s64)(s32)(ptr_reg->off - smin_val))) { - /* pointer -= K. Subtract it from fixed offset */ - dst_reg->smin_value = smin_ptr; - dst_reg->smax_value = smax_ptr; - dst_reg->umin_value = umin_ptr; - dst_reg->umax_value = umax_ptr; - dst_reg->var_off = ptr_reg->var_off; - dst_reg->id = ptr_reg->id; - dst_reg->off = ptr_reg->off - smin_val; - dst_reg->raw = ptr_reg->raw; - break; - } /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good. 
*/ @@ -14996,9 +14946,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = umax_ptr - umin_val; } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); - dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; - if (reg_is_pkt_pointer(ptr_reg)) { + if (!known && reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -16542,19 +16491,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *reg; int new_range; - if (dst_reg->off < 0 || - (dst_reg->off == 0 && range_right_open)) + if (dst_reg->umax_value == 0 && range_right_open) /* This doesn't give us any range */ return; - if (dst_reg->umax_value > MAX_PACKET_OFF || - dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF) + if (dst_reg->umax_value > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. */ return; - new_range = dst_reg->off; + new_range = dst_reg->umax_value; if (range_right_open) new_range++; @@ -16603,7 +16550,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, /* If our ids match, then we must have the same max_value. And we * don't care about the other reg's fixed offset, since if it's too big * the range won't allow anything. - * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. + * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16. */ bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == type && reg->id == dst_reg->id) @@ -17129,29 +17076,24 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, { if (type_may_be_null(reg->type) && reg->id == id && (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) { - /* Old offset (both fixed and variable parts) should have been - * known-zero, because we don't allow pointer arithmetic on - * pointers that might be NULL. 
If we see this happening, don't - * convert the register. + /* Old offset should have been known-zero, because we don't + * allow pointer arithmetic on pointers that might be NULL. + * If we see this happening, don't convert the register. * * But in some cases, some helpers that return local kptrs - * advance offset for the returned pointer. In those cases, it - * is fine to expect to see reg->off. + * advance offset for the returned pointer. In those cases, + * it is fine to expect to see reg->var_off. */ - if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0))) - return; if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) && - WARN_ON_ONCE(reg->off)) + WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0))) return; - if (is_null) { - reg->type = SCALAR_VALUE; /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ - reg->id = 0; - reg->ref_obj_id = 0; + __mark_reg_known_zero(reg); + reg->type = SCALAR_VALUE; return; } @@ -17731,22 +17673,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { if (map->map_type == BPF_MAP_TYPE_ARENA) { __mark_reg_unknown(env, dst_reg); + dst_reg->map_ptr = map; return 0; } + __mark_reg_known(dst_reg, aux->map_off); dst_reg->type = PTR_TO_MAP_VALUE; - dst_reg->off = aux->map_off; + dst_reg->map_ptr = map; WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY && map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; + dst_reg->map_ptr = map; } else { verifier_bug(env, "unexpected src reg value for ldimm64"); return -EFAULT; @@ -19852,11 
+19796,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, */ if (rold->range > rcur->range) return false; - /* If the offsets don't match, we can't trust our alignment; - * nor can we be sure that we won't fall out of range. - */ - if (rold->off != rcur->off) - return false; /* id relations must be preserved */ if (!check_ids(rold->id, rcur->id, idmap)) return false; @@ -19872,8 +19811,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; case PTR_TO_INSN: return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && - rold->off == rcur->off && range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); + range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); default: return regs_exact(rold, rcur, idmap); } @@ -20486,7 +20424,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * so we can assume valid iter and reg state, * no need for extra (re-)validations */ - spi = __get_spi(iter_reg->off + iter_reg->var_off.value); + spi = __get_spi(iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { loop = true; @@ -20892,19 +20830,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env, u32 *pmin_index, u32 *pmax_index) { struct bpf_reg_state *reg = reg_state(env, regno); - u64 min_index, max_index; + u64 min_index = reg->umin_value; + u64 max_index = reg->umax_value; const u32 size = 8; - if (check_add_overflow(reg->umin_value, reg->off, &min_index) || - (min_index > (u64) U32_MAX * size)) { - verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n", - regno, reg->umin_value, reg->off); + if (min_index > (u64) U32_MAX * size) { + verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value); return -ERANGE; } - if (check_add_overflow(reg->umax_value, reg->off, &max_index) || - (max_index > (u64) U32_MAX * 
size)) { - verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n", - regno, reg->umax_value, reg->off); + if (max_index > (u64) U32_MAX * size) { + verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value); return -ERANGE; } diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index 14c5a7ef0e87..6f25b5f39a79 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -87,12 +87,12 @@ static struct { { "incorrect_value_type", "operation on bpf_list_head expects arg#1 bpf_list_node at offset=48 in struct foo, " "but arg is at offset=0 in struct bar" }, - { "incorrect_node_var_off", "variable ptr_ access var_off=(0x0; 0xffffffff) disallowed" }, + { "incorrect_node_var_off", "variable ptr_ access var_off=(0x0; 0x1ffffffff) disallowed" }, { "incorrect_node_off1", "bpf_list_node not found at offset=49" }, { "incorrect_node_off2", "arg#1 offset=0, but expected bpf_list_node at offset=48 in struct foo" }, { "no_head_type", "bpf_list_head not found at offset=0" }, { "incorrect_head_var_off1", "R1 doesn't have constant offset" }, - { "incorrect_head_var_off2", "variable ptr_ access var_off=(0x0; 0xffffffff) disallowed" }, + { "incorrect_head_var_off2", "variable ptr_ access var_off=(0x0; 0x1ffffffff) disallowed" }, { "incorrect_head_off1", "bpf_list_head not found at offset=25" }, { "incorrect_head_off2", "bpf_list_head not found at offset=1" }, { "pop_front_off", "off 48 doesn't point to 'struct bpf_spin_lock' that is at 40" }, diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index a01c2736890f..ed00dd551ffb 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -114,7 +114,7 @@ int check_assert_single_range_u64(struct __sk_buff *ctx) SEC("?tc") 
__log_level(2) __failure -__msg(": R1=pkt(off=64,r=64) R2=pkt_end() R6=pkt(r=64) R10=fp0") +__msg(": R1=pkt(r=64,imm=64) R2=pkt_end() R6=pkt(r=64) R10=fp0") int check_assert_generic(struct __sk_buff *ctx) { u8 *data_end = (void *)(long)ctx->data_end; diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 7f27b517d5d5..86b74e3579d9 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1651,7 +1651,7 @@ int clean_live_states(const void *ctx) SEC("?raw_tp") __flag(BPF_F_TEST_STATE_FREQ) -__failure __msg("misaligned stack access off 0+-31+0 size 8") +__failure __msg("misaligned stack access off -31+0 size 8") __naked int absent_mark_in_the_middle_state(void) { /* This is equivalent to C program below. @@ -1726,7 +1726,7 @@ static int noop(void) SEC("?raw_tp") __flag(BPF_F_TEST_STATE_FREQ) -__failure __msg("misaligned stack access off 0+-31+0 size 8") +__failure __msg("misaligned stack access off -31+0 size 8") __naked int absent_mark_in_the_middle_state2(void) { /* This is equivalent to C program below. 
@@ -1802,7 +1802,7 @@ __naked int absent_mark_in_the_middle_state2(void) SEC("?raw_tp") __flag(BPF_F_TEST_STATE_FREQ) -__failure __msg("misaligned stack access off 0+-31+0 size 8") +__failure __msg("misaligned stack access off -31+0 size 8") __naked int absent_mark_in_the_middle_state3(void) { /* diff --git a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c index 3b984b6ae7c0..5b4453747c23 100644 --- a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c +++ b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c @@ -8,7 +8,7 @@ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r8 = *(u64 *)(r7 +0) ; R7=ptr_nameidata(off={{[0-9]+}}) R8=rdonly_untrusted_mem(sz=0)") +__msg("r8 = *(u64 *)(r7 +0) ; R7=ptr_nameidata(imm={{[0-9]+}}) R8=rdonly_untrusted_mem(sz=0)") __msg("r9 = *(u8 *)(r8 +0) ; R8=rdonly_untrusted_mem(sz=0) R9=scalar") int btf_id_to_ptr_mem(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/verifier_align.c b/tools/testing/selftests/bpf/progs/verifier_align.c index 90362d61f1fe..24553ce62881 100644 --- a/tools/testing/selftests/bpf/progs/verifier_align.c +++ b/tools/testing/selftests/bpf/progs/verifier_align.c @@ -131,7 +131,7 @@ LBL ":" \ SEC("tc") __success __log_level(2) __flag(BPF_F_ANY_ALIGNMENT) -__msg("6: R0=pkt(off=8,r=8)") +__msg("6: R0=pkt(r=8,imm=8)") __msg("6: {{.*}} R3={{[^)]*}}var_off=(0x0; 0xff)") __msg("7: {{.*}} R3={{[^)]*}}var_off=(0x0; 0x1fe)") __msg("8: {{.*}} R3={{[^)]*}}var_off=(0x0; 0x3fc)") @@ -203,10 +203,10 @@ __naked void unknown_mul(void) SEC("tc") __success __log_level(2) __msg("2: {{.*}} R5=pkt(r=0)") -__msg("4: {{.*}} R5=pkt(off=14,r=0)") -__msg("5: {{.*}} R4=pkt(off=14,r=0)") +__msg("4: {{.*}} R5=pkt(r=0,imm=14)") +__msg("5: {{.*}} R4=pkt(r=0,imm=14)") __msg("9: {{.*}} R2=pkt(r=18)") -__msg("10: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xff){{.*}} R5=pkt(off=14,r=18)") +__msg("10: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xff){{.*}} 
R5=pkt(r=18,imm=14)") __msg("13: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xffff)") __msg("14: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xffff)") __naked void packet_const_offset(void) @@ -247,14 +247,14 @@ __msg("7: {{.*}} R6={{[^)]*}}var_off=(0x0; 0x3fc)") /* Offset is added to packet pointer R5, resulting in * known fixed offset, and variable offset from R6. */ -__msg("11: {{.*}} R5=pkt(id=1,off=14,") +__msg("11: {{.*}} R5=pkt(id=1,{{[^)]*}},var_off=(0x2; 0x7fc)") /* At the time the word size load is performed from R5, * it's total offset is NET_IP_ALIGN + reg->off (0) + * reg->aux_off (14) which is 16. Then the variable * offset is considered using reg->aux_off_align which * is 4 and meets the load's requirements. */ -__msg("15: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x3fc){{.*}} R5={{[^)]*}}var_off=(0x0; 0x3fc)") +__msg("15: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x2; 0x7fc)") /* Variable offset is added to R5 packet pointer, * resulting in auxiliary alignment of 4. To avoid BPF * verifier's precision backtracking logging @@ -266,37 +266,37 @@ __msg("18: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x3fc){{.*}} R5={{[^)]*}}var_off=(0x /* Constant offset is added to R5, resulting in * reg->off of 14. */ -__msg("19: {{.*}} R5=pkt(id=2,off=14,") +__msg("19: {{.*}} R5=pkt(id=2,{{[^)]*}}var_off=(0x2; 0x7fc)") /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off * (14) which is 16. Then the variable offset is 4-byte * aligned, so the total offset is 4-byte aligned and * meets the load's requirements. */ -__msg("24: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x3fc){{.*}} R5={{[^)]*}}var_off=(0x0; 0x3fc)") +__msg("24: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x2; 0x7fc)") /* Constant offset is added to R5 packet pointer, * resulting in reg->off value of 14. 
*/ -__msg("26: {{.*}} R5=pkt(off=14,r=8)") +__msg("26: {{.*}} R5=pkt(r=8,imm=14)") /* Variable offset is added to R5, resulting in a * variable offset of (4n). See comment for insn #18 * for R4 = R5 trick. */ -__msg("28: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x3fc){{.*}} R5={{[^)]*}}var_off=(0x0; 0x3fc)") +__msg("28: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x2; 0x7fc)") /* Constant is added to R5 again, setting reg->off to 18. */ -__msg("29: {{.*}} R5=pkt(id=3,off=18,") +__msg("29: {{.*}} R5=pkt(id=3,{{[^)]*}}var_off=(0x2; 0x7fc)") /* And once more we add a variable; resulting {{[^)]*}}var_off * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ -__msg("31: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x0; 0x7fc)") +__msg("31: {{.*}} R4={{[^)]*}}var_off=(0x2; 0xffc){{.*}} R5={{[^)]*}}var_off=(0x2; 0xffc)") /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. Then the variable offset is (4n), so * the total offset is 4-byte aligned and meets the * load's requirements. */ -__msg("35: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x0; 0x7fc)") +__msg("35: {{.*}} R4={{[^)]*}}var_off=(0x2; 0xffc){{.*}} R5={{[^)]*}}var_off=(0x2; 0xffc)") __naked void packet_variable_offset(void) { asm volatile (" \ @@ -430,16 +430,10 @@ __msg("6: {{.*}} R5={{[^)]*}}var_off=(0x2; 0xfffffffffffffffc)") /* Checked s>=0 */ __msg("9: {{.*}} R5={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc)") /* packet pointer + nonnegative (4n+2) */ -__msg("11: {{.*}} R6={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc)") -__msg("12: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc)") -/* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. - * We checked the bounds, but it might have been able - * to overflow if the packet pointer started in the - * upper half of the address space. 
- * So we did not get a 'range' on R6, and the access - * attempt will fail. - */ -__msg("15: {{.*}} R6={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc)") +__msg("11: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc){{.*}} R6={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc)") +__msg("12: (07) r4 += 4") +/* packet smax bound overflow */ +__msg("pkt pointer offset -9223372036854775808 is not allowed") __naked void dubious_pointer_arithmetic(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 560531404bce..d195eaa67d75 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -202,7 +202,7 @@ l0_%=: /* exit */ \ SEC("tc") __description("bounds check based on reg_off + var_off + insn_off. test1") -__failure __msg("value_size=8 off=1073741825") +__failure __msg("map_value pointer offset 1073741822 is not allowed") __naked void var_off_insn_off_test1(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c index 911caa8fd1b7..4ee3b7a708f7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c @@ -412,7 +412,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("direct packet access: test17 (pruning, alignment)") -__failure __msg("misaligned packet access off 2+0+15+-4 size 4") +__failure __msg("misaligned packet access off 2+15+-4 size 4") __flag(BPF_F_STRICT_ALIGNMENT) __naked void packet_access_test17_pruning_alignment(void) { @@ -569,7 +569,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("direct packet access: test23 (x += pkt_ptr, 4)") -__failure __msg("invalid access to packet, off=0 size=8, R5(id=3,off=0,r=0)") +__failure __msg("invalid access to packet, off=31 size=8, R5(id=3,off=31,r=0)") 
__flag(BPF_F_ANY_ALIGNMENT) __naked void test23_x_pkt_ptr_4(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_gotox.c b/tools/testing/selftests/bpf/progs/verifier_gotox.c index 607dad058ca1..548dce00f5fb 100644 --- a/tools/testing/selftests/bpf/progs/verifier_gotox.c +++ b/tools/testing/selftests/bpf/progs/verifier_gotox.c @@ -131,7 +131,7 @@ DEFINE_INVALID_SIZE_PROG(u16, __failure __msg("Invalid read of 2 bytes from insn DEFINE_INVALID_SIZE_PROG(u8, __failure __msg("Invalid read of 1 bytes from insn_array")) SEC("socket") -__failure __msg("misaligned value access off 0+1+0 size 8") +__failure __msg("misaligned value access off 1+0 size 8") __naked void jump_table_misaligned_access(void) { asm volatile (" \ @@ -187,7 +187,7 @@ jt0_%=: \ } SEC("socket") -__failure __msg("invalid access to map value, value_size=16 off=-24 size=8") +__failure __msg("R0 min value is negative") __naked void jump_table_invalid_mem_acceess_neg(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_helper_packet_access.c index 74f5f9cd153d..71cee3f58324 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_packet_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_packet_access.c @@ -360,7 +360,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("helper access to packet: test15, cls helper fail sub") -__failure __msg("invalid access to packet") +__failure __msg("R1 min value is negative") __naked void test15_cls_helper_fail_sub(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c index 886498b5e6f3..6d2a38597c34 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c @@ -1100,7 +1100,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("map helper access to 
adjusted map (via const imm): out-of-bound 2") -__failure __msg("invalid access to map value, value_size=16 off=-4 size=8") +__failure __msg("R2 min value is negative") __naked void imm_out_of_bound_2(void) { asm volatile (" \ @@ -1176,7 +1176,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("map helper access to adjusted map (via const reg): out-of-bound 2") -__failure __msg("invalid access to map value, value_size=16 off=-4 size=8") +__failure __msg("R2 min value is negative") __naked void reg_out_of_bound_2(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c index 59e34d558654..6627f44faf4b 100644 --- a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c @@ -65,7 +65,7 @@ __naked void ptr_to_long_half_uninitialized(void) SEC("cgroup/sysctl") __description("arg pointer to long misaligned") -__failure __msg("misaligned stack access off 0+-20+0 size 8") +__failure __msg("misaligned stack access off -20+0 size 8") __naked void arg_ptr_to_long_misaligned(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_meta_access.c b/tools/testing/selftests/bpf/progs/verifier_meta_access.c index d81722fb5f19..62235f032ffe 100644 --- a/tools/testing/selftests/bpf/progs/verifier_meta_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_meta_access.c @@ -27,7 +27,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("meta access, test2") -__failure __msg("invalid access to packet, off=-8") +__failure __msg("R0 min value is negative") __naked void meta_access_test2(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 7a13dbd794b2..893d3bb024a0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -656,7 +656,7 @@ 
__msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (7a) *(u64 *)(r10 -8) = 1") -__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,imm=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ __msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2=1 R10=fp0 fp-16=1") __msg("13: (0f) r1 += r2") @@ -673,7 +673,7 @@ __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,imm=1) R2=1") __naked void stack_load_preserves_const_precision(void) { asm volatile ( @@ -732,7 +732,7 @@ __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (62) *(u32 *)(r10 -8) = 1") -__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,imm=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ __msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2=1 R10=fp0 fp-16=????1") __msg("13: (0f) r1 += r2") @@ -748,7 +748,7 @@ __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (63) *(u32 *)(r10 -16) = 
r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,imm=1) R2=1") __naked void stack_load_preserves_const_precision_subreg(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c b/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c index 24aabc6083fd..8e8cf8232255 100644 --- a/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c @@ -37,7 +37,7 @@ __naked void ptr_to_stack_store_load(void) SEC("socket") __description("PTR_TO_STACK store/load - bad alignment on off") -__failure __msg("misaligned stack access off 0+-8+2 size 8") +__failure __msg("misaligned stack access off -8+2 size 8") __failure_unpriv __naked void load_bad_alignment_on_off(void) { @@ -53,7 +53,7 @@ __naked void load_bad_alignment_on_off(void) SEC("socket") __description("PTR_TO_STACK store/load - bad alignment on reg") -__failure __msg("misaligned stack access off 0+-10+8 size 8") +__failure __msg("misaligned stack access off -10+8 size 8") __failure_unpriv __naked void load_bad_alignment_on_reg(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c index af7938ce56cb..b3b701b44550 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c @@ -346,7 +346,7 @@ l2_%=: r0 = 1; \ SEC("socket") __description("map access: value_ptr -= known scalar from different maps") __success __failure_unpriv -__msg_unpriv("R0 min value is outside of the allowed memory range") +__msg_unpriv("R0 min value is negative") __retval(1) __naked void known_scalar_from_different_maps(void) { @@ -683,9 +683,7 @@ l0_%=: r0 = 1; \ SEC("socket") __description("map access: value_ptr -= known scalar, lower oob arith, test 
1") -__failure __msg("R0 min value is outside of the allowed memory range") -__failure_unpriv -__msg_unpriv("R0 pointer arithmetic of map value goes out of range") +__failure __msg("R0 min value is negative") __naked void lower_oob_arith_test_1(void) { asm volatile (" \ @@ -840,7 +838,7 @@ l0_%=: r0 = 1; \ SEC("socket") __description("map access: value_ptr += known scalar, 3") -__failure __msg("invalid access to map value") +__failure __msg("R0 min value is negative") __failure_unpriv __naked void value_ptr_known_scalar_3(void) { @@ -1207,7 +1205,7 @@ l0_%=: r0 = 1; \ SEC("socket") __description("map access: value_ptr -= known scalar") -__failure __msg("R0 min value is outside of the allowed memory range") +__failure __msg("R0 min value is negative") __failure_unpriv __naked void access_value_ptr_known_scalar(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_xdp_direct_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_xdp_direct_packet_access.c index df2dfd1b15d1..0b86d95a4133 100644 --- a/tools/testing/selftests/bpf/progs/verifier_xdp_direct_packet_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_xdp_direct_packet_access.c @@ -69,7 +69,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' > pkt_end, bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_end_bad_access_1_1(void) { @@ -131,7 +131,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' > pkt_end, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_1(void) { @@ -173,7 +173,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end > pkt_data', corner case -1, bad access") -__failure __msg("R1 offset is outside 
of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_2(void) { @@ -279,7 +279,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' < pkt_end, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_3(void) { @@ -384,7 +384,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end < pkt_data', bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_1_1(void) { @@ -446,7 +446,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end < pkt_data', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_4(void) { @@ -487,7 +487,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' >= pkt_end, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_5(void) { @@ -590,7 +590,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end >= pkt_data', bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_1_2(void) { @@ -654,7 +654,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end >= pkt_data', corner case -1, bad access") -__failure __msg("R1 offset is outside of 
the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_6(void) { @@ -697,7 +697,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' <= pkt_end, bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_end_bad_access_1_2(void) { @@ -761,7 +761,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' <= pkt_end, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_7(void) { @@ -803,7 +803,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end <= pkt_data', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_8(void) { @@ -905,7 +905,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' > pkt_data, bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_1_3(void) { @@ -926,7 +926,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' > pkt_data, bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_2_5(void) { @@ -967,7 +967,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' > pkt_data, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") 
+__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_9(void) { @@ -1009,7 +1009,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data > pkt_meta', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_10(void) { @@ -1031,7 +1031,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data > pkt_meta', bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_2_1(void) { @@ -1115,7 +1115,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' < pkt_data, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_11(void) { @@ -1137,7 +1137,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' < pkt_data, bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_2_6(void) { @@ -1220,7 +1220,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data < pkt_meta', bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_1_1(void) { @@ -1241,7 +1241,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data < pkt_meta', bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure 
__msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_2_2(void) { @@ -1282,7 +1282,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data < pkt_meta', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_12(void) { @@ -1323,7 +1323,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' >= pkt_data, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_13(void) { @@ -1344,7 +1344,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' >= pkt_data, bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_2_7(void) { @@ -1426,7 +1426,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data >= pkt_meta', bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_1_2(void) { @@ -1448,7 +1448,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data >= pkt_meta', bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_2_3(void) { @@ -1490,7 +1490,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data >= pkt_meta', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure 
__msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_14(void) { @@ -1533,7 +1533,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' <= pkt_data, bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_1_4(void) { @@ -1555,7 +1555,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' <= pkt_data, bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_2_8(void) { @@ -1597,7 +1597,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' <= pkt_data, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_15(void) { @@ -1639,7 +1639,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data <= pkt_meta', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_16(void) { @@ -1660,7 +1660,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data <= pkt_meta', bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_2_4(void) { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 9ca83dce100d..86887130a0ef 100644 --- 
a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -220,7 +220,7 @@ }, .result_unpriv = REJECT, .result = REJECT, - .errstr = "variable trusted_ptr_ access var_off=(0x0; 0x7) disallowed", + .errstr = "R1 must have zero offset when passed to release func or trusted arg to kfunc", }, { "calls: invalid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID", -- cgit v1.2.3 From 3d91c618aca403a7f7d2064272f528a97b849475 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 12 Feb 2026 13:34:24 -0800 Subject: bpf: rename bpf_reg_state->off to bpf_reg_state->delta This field is now used only for linked scalar registers tracking. Rename it to 'delta' to better describe its purpose: constant delta between "linked" scalars with the same ID. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260212-ptrs-off-migration-v2-4-00820e4d3438@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +++--- kernel/bpf/log.c | 8 ++++---- kernel/bpf/verifier.c | 18 +++++++++--------- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index a97bdbf3a07b..c1e30096ea7b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -40,7 +40,7 @@ struct bpf_reg_state { /* * Constant delta between "linked" scalars with the same ID. */ - s32 off; + s32 delta; union { /* valid when type == PTR_TO_PACKET */ int range; @@ -145,9 +145,9 @@ struct bpf_reg_state { * Upper bit of ID is used to remember relationship between "linked" * registers. 
Example: * r1 = r2; both will have r1->id == r2->id == N - * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10 + * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->delta == 10 * r3 = r2; both will have r3->id == r2->id == N - * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->off == 10 + * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->delta == 10 */ #define BPF_ADD_CONST64 (1U << 31) #define BPF_ADD_CONST32 (1U << 30) diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 39a731392d65..37d72b052192 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -694,7 +694,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (state->frameno != reg->frameno) verbose(env, "[%d]", reg->frameno); if (tnum_is_const(reg->var_off)) { - verbose_snum(env, reg->var_off.value + reg->off); + verbose_snum(env, reg->var_off.value + reg->delta); return; } } @@ -704,7 +704,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (reg->id) verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); if (reg->id & BPF_ADD_CONST) - verbose(env, "%+d", reg->off); + verbose(env, "%+d", reg->delta); if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (type_is_non_owning_ref(reg->type)) @@ -716,9 +716,9 @@ static void print_reg_state(struct bpf_verifier_env *env, reg->map_ptr->key_size, reg->map_ptr->value_size); } - if (t != SCALAR_VALUE && reg->off) { + if (t != SCALAR_VALUE && reg->delta) { verbose_a("off="); - verbose_snum(env, reg->off); + verbose_snum(env, reg->delta); } if (type_is_pkt_pointer(t)) { verbose_a("r="); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2c5794dad668..0162f946032f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5095,7 +5095,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, * Cleared it, since multiple rX += const are not supported. 
*/ src_reg->id = 0; - src_reg->off = 0; + src_reg->delta = 0; } if (!src_reg->id && !tnum_is_const(src_reg->var_off)) @@ -16219,14 +16219,14 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * we cannot accumulate another val into rx->off. */ clear_id: - dst_reg->off = 0; + dst_reg->delta = 0; dst_reg->id = 0; } else { if (alu32) dst_reg->id |= BPF_ADD_CONST32; else dst_reg->id |= BPF_ADD_CONST64; - dst_reg->off = off; + dst_reg->delta = off; } } else { /* @@ -17305,18 +17305,18 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || - reg->off == known_reg->off) { + reg->delta == known_reg->delta) { s32 saved_subreg_def = reg->subreg_def; copy_register_state(reg, known_reg); reg->subreg_def = saved_subreg_def; } else { s32 saved_subreg_def = reg->subreg_def; - s32 saved_off = reg->off; + s32 saved_off = reg->delta; u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; - __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off); + __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta); /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); @@ -17324,7 +17324,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. 
*/ - reg->off = saved_off; + reg->delta = saved_off; reg->id = saved_id; reg->subreg_def = saved_subreg_def; @@ -19629,7 +19629,7 @@ static void clear_singular_ids(struct bpf_verifier_env *env, continue; if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) { reg->id = 0; - reg->off = 0; + reg->delta = 0; } })); } @@ -19766,7 +19766,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return false; /* Both have offset linkage: offsets must match */ - if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off) + if ((rold->id & BPF_ADD_CONST) && rold->delta != rcur->delta) return false; if (!check_scalar_ids(rold->id, rcur->id, idmap)) -- cgit v1.2.3 From 95162db0208aee122d10ac1342fe97a1721cd258 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Feb 2026 14:46:01 +0100 Subject: drm/pagemap: pass pagemap_addr by reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Passing a structure by value into a function is sometimes problematic, for a number of reasons. One of these is a warning from the 32-bit arm compiler: drivers/gpu/drm/drm_gpusvm.c: In function '__drm_gpusvm_unmap_pages': drivers/gpu/drm/drm_gpusvm.c:1152:33: note: parameter passing for argument of type 'struct drm_pagemap_addr' changed in GCC 9.1 1152 | dpagemap->ops->device_unmap(dpagemap, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1153 | dev, *addr); | ~~~~~~~~~~~ This particular problem is harmless since we are not mixing compiler versions inside of the kernel. However, passing this by reference avoids the warning along with providing slightly better calling conventions as it avoids an extra copy on the stack. 
Fixes: 75af93b3f5d0 ("drm/pagemap, drm/xe: Support destination migration over interconnect") Fixes: 2df55d9e66a2 ("drm/xe: Support pcie p2p dma as a fast interconnect") Signed-off-by: Arnd Bergmann Reviewed-by: Thomas Hellström Signed-off-by: Thomas Hellström Link: https://patch.msgid.link/20260216134644.1025365-1-arnd@kernel.org Acked-by: Maarten Lankhorst --- drivers/gpu/drm/drm_gpusvm.c | 2 +- drivers/gpu/drm/drm_pagemap.c | 2 +- drivers/gpu/drm/xe/xe_svm.c | 8 ++++---- include/drm/drm_pagemap.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c index c25f50cad6fe..81626b00b755 100644 --- a/drivers/gpu/drm/drm_gpusvm.c +++ b/drivers/gpu/drm/drm_gpusvm.c @@ -1150,7 +1150,7 @@ static void __drm_gpusvm_unmap_pages(struct drm_gpusvm *gpusvm, addr->dir); else if (dpagemap && dpagemap->ops->device_unmap) dpagemap->ops->device_unmap(dpagemap, - dev, *addr); + dev, addr); i += 1 << addr->order; } diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index aa43a8475100..5f28f035bb1f 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -318,7 +318,7 @@ static void drm_pagemap_migrate_unmap_pages(struct device *dev, struct drm_pagemap_zdd *zdd = page->zone_device_data; struct drm_pagemap *dpagemap = zdd->dpagemap; - dpagemap->ops->device_unmap(dpagemap, dev, pagemap_addr[i]); + dpagemap->ops->device_unmap(dpagemap, dev, &pagemap_addr[i]); } else { dma_unmap_page(dev, pagemap_addr[i].addr, PAGE_SIZE << pagemap_addr[i].order, dir); diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index 213f0334518a..78f4b2c60670 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -1676,13 +1676,13 @@ xe_drm_pagemap_device_map(struct drm_pagemap *dpagemap, static void xe_drm_pagemap_device_unmap(struct drm_pagemap *dpagemap, struct device *dev, - struct drm_pagemap_addr addr) + const struct 
drm_pagemap_addr *addr) { - if (addr.proto != XE_INTERCONNECT_P2P) + if (addr->proto != XE_INTERCONNECT_P2P) return; - dma_unmap_resource(dev, addr.addr, PAGE_SIZE << addr.order, - addr.dir, DMA_ATTR_SKIP_CPU_SYNC); + dma_unmap_resource(dev, addr->addr, PAGE_SIZE << addr->order, + addr->dir, DMA_ATTR_SKIP_CPU_SYNC); } static void xe_pagemap_destroy_work(struct work_struct *work) diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h index 2baf0861f78f..c848f578e3da 100644 --- a/include/drm/drm_pagemap.h +++ b/include/drm/drm_pagemap.h @@ -95,7 +95,7 @@ struct drm_pagemap_ops { */ void (*device_unmap)(struct drm_pagemap *dpagemap, struct device *dev, - struct drm_pagemap_addr addr); + const struct drm_pagemap_addr *addr); /** * @populate_mm: Populate part of the mm with @dpagemap memory, -- cgit v1.2.3 From a235d3bcd28ba2d472f5b4cb6b259baeab75bafd Mon Sep 17 00:00:00 2001 From: Kundan Kumar Date: Fri, 13 Feb 2026 11:16:31 +0530 Subject: writeback: prep helpers for dirty-limit and writeback accounting Add helper APIs needed by filesystems to avoid poking into writeback internals. 
Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Reviewed-by: Andreas Gruenbacher Suggested-by: Christoph Hellwig Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta Link: https://patch.msgid.link/20260213054634.79785-2-kundan.kumar@samsung.com Signed-off-by: Christian Brauner --- include/linux/backing-dev.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0c8342747cab..5b7d12b40d5e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -136,6 +136,19 @@ static inline bool mapping_can_writeback(struct address_space *mapping) return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } +/* Must not be used by file systems that support cgroup writeback */ +static inline int bdi_wb_dirty_exceeded(struct backing_dev_info *bdi) +{ + return bdi->wb.dirty_exceeded; +} + +/* Must not be used by file systems that support cgroup writeback */ +static inline void bdi_wb_stat_mod(struct inode *inode, enum wb_stat_item item, + s64 amount) +{ + wb_stat_mod(&inode_to_bdi(inode)->wb, item, amount); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, -- cgit v1.2.3 From 1f662195dbc07a66241cb5fe483036e5d07fb642 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 30 Jan 2026 14:59:19 +0900 Subject: fs: add generic FS_IOC_SHUTDOWN definitions Currently, several filesystems (e.g., xfs, ext4, btrfs) implement a "shutdown" or "going down" ioctl to force a filesystem shutdown. While they often use the same underlying numeric value, the definition is duplicated across filesystem headers or private definitions. Add generic definitions for FS_IOC_SHUTDOWN in uapi/linux/fs.h. This allows new filesystems (like ntfs) to implement this feature using a standard VFS definition and paves the way for existing filesystems to unify their definitions later. 
The flag names are standardized as FS_SHUTDOWN_* to be consistent with the ioctl name, replacing the historical GOING_DOWN naming convention. Reviewed-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Namjae Jeon --- include/uapi/linux/fs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 70b2b661f42c..13f71202845e 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -657,4 +657,16 @@ struct procmap_query { __u64 build_id_addr; /* in */ }; +/* + * Shutdown the filesystem. + */ +#define FS_IOC_SHUTDOWN _IOR('X', 125, __u32) + +/* + * Flags for FS_IOC_SHUTDOWN + */ +#define FS_SHUTDOWN_FLAGS_DEFAULT 0x0 +#define FS_SHUTDOWN_FLAGS_LOGFLUSH 0x1 /* flush log but not data*/ +#define FS_SHUTDOWN_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + #endif /* _UAPI_LINUX_FS_H */ -- cgit v1.2.3 From 0d799df5b147e08828887fe7299efd7a9e0eea40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Feb 2026 07:50:01 +0100 Subject: fs: mark bool_names static The bool_names array is only used in fs_parser.c so mark it static. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260219065014.3550402-2-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs_parser.c | 3 +-- include/linux/fs_parser.h | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/fs/fs_parser.c b/fs/fs_parser.c index c092a9f79e32..46993e31137d 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -13,7 +13,7 @@ #include #include "internal.h" -const struct constant_table bool_names[] = { +static const struct constant_table bool_names[] = { { "0", false }, { "1", true }, { "false", false }, @@ -22,7 +22,6 @@ const struct constant_table bool_names[] = { { "yes", true }, { }, }; -EXPORT_SYMBOL(bool_names); static const struct constant_table * __lookup_constant(const struct constant_table *tbl, const char *name) diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 5e8a3b546033..ac8253cca2bc 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -84,8 +84,6 @@ extern int fs_lookup_param(struct fs_context *fc, extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); -extern const struct constant_table bool_names[]; - #ifdef CONFIG_VALIDATE_FS_PARSER extern bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc); -- cgit v1.2.3 From 8823db29744fceda9f94e674f74deea446c620b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Feb 2026 07:50:02 +0100 Subject: fs: remove fsparam_blob / fs_param_is_blob These are not used anywhere even after the fs_context conversion is finished, so remove them. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260219065014.3550402-3-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- Documentation/filesystems/mount_api.rst | 2 -- fs/fs_parser.c | 9 --------- include/linux/fs_parser.h | 3 +-- 3 files changed, 1 insertion(+), 13 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst index a064234fed5b..b4a0f23914a6 100644 --- a/Documentation/filesystems/mount_api.rst +++ b/Documentation/filesystems/mount_api.rst @@ -647,7 +647,6 @@ The members are as follows: fs_param_is_u64 64-bit unsigned int result->uint_64 fs_param_is_enum Enum value name result->uint_32 fs_param_is_string Arbitrary string param->string - fs_param_is_blob Binary blob param->blob fs_param_is_blockdev Blockdev path * Needs lookup fs_param_is_path Path * Needs lookup fs_param_is_fd File descriptor result->int_32 @@ -681,7 +680,6 @@ The members are as follows: fsparam_u64() fs_param_is_u64 fsparam_enum() fs_param_is_enum fsparam_string() fs_param_is_string - fsparam_blob() fs_param_is_blob fsparam_bdev() fs_param_is_blockdev fsparam_path() fs_param_is_path fsparam_fd() fs_param_is_fd diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 46993e31137d..79e8fe9176fa 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -277,15 +277,6 @@ int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p, } EXPORT_SYMBOL(fs_param_is_string); -int fs_param_is_blob(struct p_log *log, const struct fs_parameter_spec *p, - struct fs_parameter *param, struct fs_parse_result *result) -{ - if (param->type != fs_value_is_blob) - return fs_param_bad_value(log, param); - return 0; -} -EXPORT_SYMBOL(fs_param_is_blob); - int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index ac8253cca2bc..961562b101c5 100644 --- 
a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -27,7 +27,7 @@ typedef int fs_param_type(struct p_log *, * The type of parameter expected. */ fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64, - fs_param_is_enum, fs_param_is_string, fs_param_is_blob, fs_param_is_blockdev, + fs_param_is_enum, fs_param_is_string, fs_param_is_blockdev, fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid, fs_param_is_file_or_string; @@ -125,7 +125,6 @@ static inline bool fs_validate_description(const char *name, #define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array) #define fsparam_string(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, 0, NULL) -#define fsparam_blob(NAME, OPT) __fsparam(fs_param_is_blob, NAME, OPT, 0, NULL) #define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL) #define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL) #define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL) -- cgit v1.2.3 From d2f2f7cf8e898f7b80fe031a263df9c7de94b0b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Feb 2026 07:50:03 +0100 Subject: fs: remove fsparam_path / fs_param_is_path These are not used anywhere even after the fs_context conversion is finished, so remove them. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260219065014.3550402-4-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- Documentation/filesystems/mount_api.rst | 2 -- fs/fs_parser.c | 7 ------- include/linux/fs_parser.h | 3 +-- 3 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst index b4a0f23914a6..e8b94357b4df 100644 --- a/Documentation/filesystems/mount_api.rst +++ b/Documentation/filesystems/mount_api.rst @@ -648,7 +648,6 @@ The members are as follows: fs_param_is_enum Enum value name result->uint_32 fs_param_is_string Arbitrary string param->string fs_param_is_blockdev Blockdev path * Needs lookup - fs_param_is_path Path * Needs lookup fs_param_is_fd File descriptor result->int_32 fs_param_is_uid User ID (u32) result->uid fs_param_is_gid Group ID (u32) result->gid @@ -681,7 +680,6 @@ The members are as follows: fsparam_enum() fs_param_is_enum fsparam_string() fs_param_is_string fsparam_bdev() fs_param_is_blockdev - fsparam_path() fs_param_is_path fsparam_fd() fs_param_is_fd fsparam_uid() fs_param_is_uid fsparam_gid() fs_param_is_gid diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 79e8fe9176fa..b4cc4cce518a 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -361,13 +361,6 @@ int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p, } EXPORT_SYMBOL(fs_param_is_blockdev); -int fs_param_is_path(struct p_log *log, const struct fs_parameter_spec *p, - struct fs_parameter *param, struct fs_parse_result *result) -{ - return 0; -} -EXPORT_SYMBOL(fs_param_is_path); - #ifdef CONFIG_VALIDATE_FS_PARSER /** * fs_validate_description - Validate a parameter specification array diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 961562b101c5..98b83708f92b 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -28,7 +28,7 @@ typedef int fs_param_type(struct p_log 
*, */ fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64, fs_param_is_enum, fs_param_is_string, fs_param_is_blockdev, - fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid, + fs_param_is_fd, fs_param_is_uid, fs_param_is_gid, fs_param_is_file_or_string; /* @@ -126,7 +126,6 @@ static inline bool fs_validate_description(const char *name, #define fsparam_string(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, 0, NULL) #define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL) -#define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL) #define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL) #define fsparam_file_or_string(NAME, OPT) \ __fsparam(fs_param_is_file_or_string, NAME, OPT, 0, NULL) -- cgit v1.2.3 From 95cef38e70250234a254e6228eb7342b6deaaffa Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:18 +0100 Subject: firmware: google: Export coreboot table entries Move types for coreboot table entries to . Allows drivers in other subsystems to use these structures. 
Signed-off-by: Thomas Zimmermann Acked-by: Tzung-Bi Shih Acked-by: Julius Werner Link: https://patch.msgid.link/20260217155836.96267-9-tzimmermann@suse.de --- MAINTAINERS | 1 + drivers/firmware/google/coreboot_table.c | 10 ++++ drivers/firmware/google/coreboot_table.h | 60 +---------------------- drivers/firmware/google/framebuffer-coreboot.c | 2 - include/linux/coreboot.h | 66 ++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 61 deletions(-) create mode 100644 include/linux/coreboot.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 4a2d5e8f0f63..d0dfcfd15e59 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10753,6 +10753,7 @@ L: chrome-platform@lists.linux.dev S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git F: drivers/firmware/google/ +F: include/linux/coreboot.h GOOGLE TENSOR SoC SUPPORT M: Peter Griffin diff --git a/drivers/firmware/google/coreboot_table.c b/drivers/firmware/google/coreboot_table.c index a031d6fe6bc5..c769631ea15d 100644 --- a/drivers/firmware/google/coreboot_table.c +++ b/drivers/firmware/google/coreboot_table.c @@ -22,6 +22,16 @@ #include "coreboot_table.h" +/* Coreboot table header structure */ +struct coreboot_table_header { + char signature[4]; + u32 header_bytes; + u32 header_checksum; + u32 table_bytes; + u32 table_checksum; + u32 table_entries; +}; + #define CB_DEV(d) container_of(d, struct coreboot_device, dev) #define CB_DRV(d) container_of_const(d, struct coreboot_driver, drv) diff --git a/drivers/firmware/google/coreboot_table.h b/drivers/firmware/google/coreboot_table.h index 17e9e5c3f6e1..616ca3903e5c 100644 --- a/drivers/firmware/google/coreboot_table.h +++ b/drivers/firmware/google/coreboot_table.h @@ -12,67 +12,9 @@ #ifndef __COREBOOT_TABLE_H #define __COREBOOT_TABLE_H +#include #include -struct coreboot_device_id; - -/* Coreboot table header structure */ -struct coreboot_table_header { - char signature[4]; - u32 header_bytes; - u32 header_checksum; - u32 
table_bytes; - u32 table_checksum; - u32 table_entries; -}; - -/* List of coreboot entry structures that is used */ -/* Generic */ -struct coreboot_table_entry { - u32 tag; - u32 size; -}; - -/* Points to a CBMEM entry */ -struct lb_cbmem_ref { - u32 tag; - u32 size; - - u64 cbmem_addr; -}; - -#define LB_TAG_CBMEM_ENTRY 0x31 - -/* Corresponds to LB_TAG_CBMEM_ENTRY */ -struct lb_cbmem_entry { - u32 tag; - u32 size; - - u64 address; - u32 entry_size; - u32 id; -}; - -/* Describes framebuffer setup by coreboot */ -struct lb_framebuffer { - u32 tag; - u32 size; - - u64 physical_address; - u32 x_resolution; - u32 y_resolution; - u32 bytes_per_line; - u8 bits_per_pixel; - u8 red_mask_pos; - u8 red_mask_size; - u8 green_mask_pos; - u8 green_mask_size; - u8 blue_mask_pos; - u8 blue_mask_size; - u8 reserved_mask_pos; - u8 reserved_mask_size; -}; - /* A device, additionally with information from coreboot. */ struct coreboot_device { struct device dev; diff --git a/drivers/firmware/google/framebuffer-coreboot.c b/drivers/firmware/google/framebuffer-coreboot.c index 81aa522edb1e..fab3f28655d3 100644 --- a/drivers/firmware/google/framebuffer-coreboot.c +++ b/drivers/firmware/google/framebuffer-coreboot.c @@ -21,8 +21,6 @@ #include "coreboot_table.h" -#define CB_TAG_FRAMEBUFFER 0x12 - #if defined(CONFIG_PCI) static bool framebuffer_pci_dev_is_enabled(struct pci_dev *pdev) { diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h new file mode 100644 index 000000000000..48705b439c6e --- /dev/null +++ b/include/linux/coreboot.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * coreboot.h + * + * Coreboot device and driver interfaces. + * + * Copyright 2014 Gerd Hoffmann + * Copyright 2017 Google Inc. 
+ * Copyright 2017 Samuel Holland + */ + +#ifndef _LINUX_COREBOOT_H +#define _LINUX_COREBOOT_H + +#include + +/* List of coreboot entry structures that is used */ + +#define CB_TAG_FRAMEBUFFER 0x12 +#define LB_TAG_CBMEM_ENTRY 0x31 + +/* Generic */ +struct coreboot_table_entry { + u32 tag; + u32 size; +}; + +/* Points to a CBMEM entry */ +struct lb_cbmem_ref { + u32 tag; + u32 size; + + u64 cbmem_addr; +}; + +/* Corresponds to LB_TAG_CBMEM_ENTRY */ +struct lb_cbmem_entry { + u32 tag; + u32 size; + + u64 address; + u32 entry_size; + u32 id; +}; + +/* Describes framebuffer setup by coreboot */ +struct lb_framebuffer { + u32 tag; + u32 size; + + u64 physical_address; + u32 x_resolution; + u32 y_resolution; + u32 bytes_per_line; + u8 bits_per_pixel; + u8 red_mask_pos; + u8 red_mask_size; + u8 green_mask_pos; + u8 green_mask_size; + u8 blue_mask_pos; + u8 blue_mask_size; + u8 reserved_mask_pos; + u8 reserved_mask_size; +}; + +#endif /* _LINUX_COREBOOT_H */ -- cgit v1.2.3 From 27fc52b5505a3acca96b884a4bf1345344e5a566 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:19 +0100 Subject: firmware: google: Pack structures for coreboot table entries Pack the fields in the coreboot table entries. These entries are part of the coreboot ABI, so they don't follow regular calling conventions. Fields of type u64 are aligned to boundaries of 4 bytes instead of 8. [1] So far this has not been a problem. In the future, padding bytes should be added where explicit alignment is required. 
Signed-off-by: Thomas Zimmermann Link: https://github.com/coreboot/coreboot/blob/main/payloads/libpayload/include/coreboot_tables.h#L96 # [1] Suggested-by: Julius Werner Acked-by: Julius Werner Acked-by: Tzung-Bi Shih Link: https://patch.msgid.link/20260217155836.96267-10-tzimmermann@suse.de --- include/linux/coreboot.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h index 48705b439c6e..5746b99a070d 100644 --- a/include/linux/coreboot.h +++ b/include/linux/coreboot.h @@ -12,8 +12,11 @@ #ifndef _LINUX_COREBOOT_H #define _LINUX_COREBOOT_H +#include #include +typedef __aligned(4) u64 cb_u64; + /* List of coreboot entry structures that is used */ #define CB_TAG_FRAMEBUFFER 0x12 @@ -30,7 +33,7 @@ struct lb_cbmem_ref { u32 tag; u32 size; - u64 cbmem_addr; + cb_u64 cbmem_addr; }; /* Corresponds to LB_TAG_CBMEM_ENTRY */ @@ -38,7 +41,7 @@ struct lb_cbmem_entry { u32 tag; u32 size; - u64 address; + cb_u64 address; u32 entry_size; u32 id; }; @@ -48,7 +51,7 @@ struct lb_framebuffer { u32 tag; u32 size; - u64 physical_address; + cb_u64 physical_address; u32 x_resolution; u32 y_resolution; u32 bytes_per_line; -- cgit v1.2.3 From a29a1f0ec8d69ee917a9d4c84b844df0decff0ef Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:21 +0100 Subject: drm/sysfb: corebootdrm: Add DRM driver for coreboot framebuffers Add corebootdrm, a DRM driver for coreboot framebuffers. The driver supports a pre-initialized framebuffer with various packed RGB formats. The driver code is fairly small and uses the same logic as the other sysfb drivers. Most of the implementation comes from existing sysfb helpers. Until now, coreboot relied on simpledrm or simplefb for boot-up graphics output. Initialize the platform device for corebootdrm in the same place in framebuffer_probe(). With a later commit, the simple-framebuffer should be removed. 
v4: - sort include statements (Tzung-Bi) v3: - comment on _HAS_LFB semantics (Tzung-Bi) - fix typo in commit description (Tzung-Bi) - comment on simple-framebuffer being obsolete for coreboot v2: - reimplement as platform driver - limit resources and mappings to known framebuffer memory; no page alignment - create corebootdrm device from coreboot framebuffer code Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Acked-by: Julius Werner Acked-by: Tzung-Bi Shih # coreboot Link: https://patch.msgid.link/20260217155836.96267-12-tzimmermann@suse.de --- drivers/firmware/google/Kconfig | 3 +- drivers/firmware/google/framebuffer-coreboot.c | 22 +- drivers/gpu/drm/sysfb/Kconfig | 16 + drivers/gpu/drm/sysfb/Makefile | 1 + drivers/gpu/drm/sysfb/corebootdrm.c | 412 +++++++++++++++++++++++++ include/linux/coreboot.h | 8 + 6 files changed, 458 insertions(+), 4 deletions(-) create mode 100644 drivers/gpu/drm/sysfb/corebootdrm.c (limited to 'include') diff --git a/drivers/firmware/google/Kconfig b/drivers/firmware/google/Kconfig index 3ab3e089328b..b78c644fa253 100644 --- a/drivers/firmware/google/Kconfig +++ b/drivers/firmware/google/Kconfig @@ -63,7 +63,8 @@ config GOOGLE_FRAMEBUFFER_COREBOOT help This option enables the kernel to search for a framebuffer in the coreboot table. If found, it is registered with a platform - device of type simple-framebuffer. + device of type coreboot-framebuffer. Using the old device of + type simple-framebuffer is deprecated. 
config GOOGLE_MEMCONSOLE_COREBOOT tristate "Firmware Memory Console" diff --git a/drivers/firmware/google/framebuffer-coreboot.c b/drivers/firmware/google/framebuffer-coreboot.c index fab3f28655d3..2c63a9bd0dcb 100644 --- a/drivers/firmware/google/framebuffer-coreboot.c +++ b/drivers/firmware/google/framebuffer-coreboot.c @@ -76,22 +76,23 @@ static struct device *framebuffer_parent_dev(struct resource *res) return NULL; } -static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; - static int framebuffer_probe(struct coreboot_device *dev) { - int i; struct lb_framebuffer *fb = &dev->framebuffer; struct device *parent; struct platform_device *pdev; struct resource res; int ret; +#if !IS_ENABLED(CONFIG_DRM_COREBOOTDRM) struct simplefb_platform_data pdata = { .width = fb->x_resolution, .height = fb->y_resolution, .stride = fb->bytes_per_line, .format = NULL, }; + int i; + static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; +#endif /* * On coreboot systems, the advertised LB_TAG_FRAMEBUFFER entry @@ -118,6 +119,20 @@ static int framebuffer_probe(struct coreboot_device *dev) if (IS_ERR(parent)) return PTR_ERR(parent); +#if IS_ENABLED(CONFIG_DRM_COREBOOTDRM) + pdev = platform_device_register_resndata(parent, "coreboot-framebuffer", 0, + &res, 1, fb, fb->size); + if (IS_ERR(pdev)) { + pr_warn("coreboot: could not register framebuffer\n"); + ret = PTR_ERR(pdev); + goto out_put_device_parent; + } +#else + /* + * FIXME: Coreboot systems should use a driver that binds to + * coreboot-framebuffer devices. Remove support for + * simple-framebuffer at some point. 
+ */ for (i = 0; i < ARRAY_SIZE(formats); ++i) { if (fb->bits_per_pixel == formats[i].bits_per_pixel && fb->red_mask_pos == formats[i].red.offset && @@ -142,6 +157,7 @@ static int framebuffer_probe(struct coreboot_device *dev) pr_warn("coreboot: could not register framebuffer\n"); goto out_put_device_parent; } +#endif ret = 0; diff --git a/drivers/gpu/drm/sysfb/Kconfig b/drivers/gpu/drm/sysfb/Kconfig index 9c9884c7efc6..2559ead6cf1f 100644 --- a/drivers/gpu/drm/sysfb/Kconfig +++ b/drivers/gpu/drm/sysfb/Kconfig @@ -7,6 +7,22 @@ config DRM_SYSFB_HELPER tristate depends on DRM +config DRM_COREBOOTDRM + tristate "Coreboot framebuffer driver" + depends on DRM && MMU + depends on GOOGLE_FRAMEBUFFER_COREBOOT + select APERTURE_HELPERS + select DRM_CLIENT_SELECTION + select DRM_GEM_SHMEM_HELPER + select DRM_KMS_HELPER + select DRM_SYSFB_HELPER + help + DRM driver for coreboot-provided framebuffers. + + This driver assumes that the display hardware has been initialized + by coreboot firmware before the kernel boots. Scanout buffer, size, + and display format must be provided via coreboot framebuffer device. 
+ config DRM_EFIDRM tristate "EFI framebuffer driver" depends on DRM && MMU && EFI && (!SYSFB_SIMPLEFB || COMPILE_TEST) diff --git a/drivers/gpu/drm/sysfb/Makefile b/drivers/gpu/drm/sysfb/Makefile index a156c496413d..85c9087ab03d 100644 --- a/drivers/gpu/drm/sysfb/Makefile +++ b/drivers/gpu/drm/sysfb/Makefile @@ -6,6 +6,7 @@ drm_sysfb_helper-y := \ drm_sysfb_helper-$(CONFIG_SCREEN_INFO) += drm_sysfb_screen_info.o obj-$(CONFIG_DRM_SYSFB_HELPER) += drm_sysfb_helper.o +obj-$(CONFIG_DRM_COREBOOTDRM) += corebootdrm.o obj-$(CONFIG_DRM_EFIDRM) += efidrm.o obj-$(CONFIG_DRM_OFDRM) += ofdrm.o obj-$(CONFIG_DRM_SIMPLEDRM) += simpledrm.o diff --git a/drivers/gpu/drm/sysfb/corebootdrm.c b/drivers/gpu/drm/sysfb/corebootdrm.c new file mode 100644 index 000000000000..745318580a5d --- /dev/null +++ b/drivers/gpu/drm/sysfb/corebootdrm.c @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "drm_sysfb_helper.h" + +#define DRIVER_NAME "corebootdrm" +#define DRIVER_DESC "DRM driver for Coreboot framebuffers" +#define DRIVER_MAJOR 1 +#define DRIVER_MINOR 0 + +static const struct drm_format_info * +corebootdrm_get_format_fb(struct drm_device *dev, const struct lb_framebuffer *fb) +{ + static const struct drm_sysfb_format formats[] = { + { PIXEL_FORMAT_XRGB1555, DRM_FORMAT_XRGB1555, }, + { PIXEL_FORMAT_RGB565, DRM_FORMAT_RGB565, }, + { PIXEL_FORMAT_RGB888, DRM_FORMAT_RGB888, }, + { PIXEL_FORMAT_XRGB8888, DRM_FORMAT_XRGB8888, }, + { PIXEL_FORMAT_XBGR8888, DRM_FORMAT_XBGR8888, }, + { PIXEL_FORMAT_XRGB2101010, DRM_FORMAT_XRGB2101010, }, + }; + const struct pixel_format pixel = { + .bits_per_pixel = fb->bits_per_pixel, + .indexed = false, + .alpha = { + .offset = 0, + .length = 0, + }, + .red = { + .offset = fb->red_mask_pos, + .length = fb->red_mask_size, + }, + .green = { + 
.offset = fb->green_mask_pos, + .length = fb->green_mask_size, + }, + .blue = { + .offset = fb->blue_mask_pos, + .length = fb->blue_mask_size, + }, + }; + + return drm_sysfb_get_format(dev, formats, ARRAY_SIZE(formats), &pixel); +} + +static int corebootdrm_get_width_fb(struct drm_device *dev, const struct lb_framebuffer *fb) +{ + return drm_sysfb_get_validated_int0(dev, "width", fb->x_resolution, INT_MAX); +} + +static int corebootdrm_get_height_fb(struct drm_device *dev, const struct lb_framebuffer *fb) +{ + return drm_sysfb_get_validated_int0(dev, "height", fb->y_resolution, INT_MAX); +} + +static int corebootdrm_get_pitch_fb(struct drm_device *dev, const struct drm_format_info *format, + unsigned int width, const struct lb_framebuffer *fb) +{ + u64 bytes_per_line = fb->bytes_per_line; + + if (!bytes_per_line) + bytes_per_line = drm_format_info_min_pitch(format, 0, width); + + return drm_sysfb_get_validated_int0(dev, "pitch", bytes_per_line, INT_MAX); +} + +static resource_size_t corebootdrm_get_size_fb(struct drm_device *dev, unsigned int height, + unsigned int pitch, + const struct lb_framebuffer *fb) +{ + resource_size_t size; + + if (check_mul_overflow(height, pitch, &size)) + return 0; + + return size; +} + +static phys_addr_t corebootdrm_get_address_fb(struct drm_device *dev, resource_size_t size, + const struct lb_framebuffer *fb) +{ + if (size > PHYS_ADDR_MAX) + return 0; + if (!fb->physical_address) + return 0; + if (fb->physical_address > (PHYS_ADDR_MAX - size)) + return 0; + + return fb->physical_address; +} + +/* + * Simple Framebuffer device + */ + +struct corebootdrm_device { + struct drm_sysfb_device sysfb; + + /* modesetting */ + u32 formats[DRM_SYSFB_PLANE_NFORMATS(1)]; + struct drm_plane primary_plane; + struct drm_crtc crtc; + struct drm_encoder encoder; + struct drm_connector connector; +}; + +/* + * Modesetting + */ + +static const u64 corebootdrm_primary_plane_format_modifiers[] = { + DRM_SYSFB_PLANE_FORMAT_MODIFIERS, +}; + +static const 
struct drm_plane_helper_funcs corebootdrm_primary_plane_helper_funcs = { + DRM_SYSFB_PLANE_HELPER_FUNCS, +}; + +static const struct drm_plane_funcs corebootdrm_primary_plane_funcs = { + DRM_SYSFB_PLANE_FUNCS, + .destroy = drm_plane_cleanup, +}; + +static const struct drm_crtc_helper_funcs corebootdrm_crtc_helper_funcs = { + DRM_SYSFB_CRTC_HELPER_FUNCS, +}; + +static const struct drm_crtc_funcs corebootdrm_crtc_funcs = { + DRM_SYSFB_CRTC_FUNCS, + .destroy = drm_crtc_cleanup, +}; + +static const struct drm_encoder_funcs corebootdrm_encoder_funcs = { + .destroy = drm_encoder_cleanup, +}; + +static const struct drm_connector_helper_funcs corebootdrm_connector_helper_funcs = { + DRM_SYSFB_CONNECTOR_HELPER_FUNCS, +}; + +static const struct drm_connector_funcs corebootdrm_connector_funcs = { + DRM_SYSFB_CONNECTOR_FUNCS, + .destroy = drm_connector_cleanup, +}; + +static const struct drm_mode_config_funcs corebootdrm_mode_config_funcs = { + DRM_SYSFB_MODE_CONFIG_FUNCS, +}; + +static int corebootdrm_mode_config_init(struct corebootdrm_device *cdev) +{ + struct drm_sysfb_device *sysfb = &cdev->sysfb; + struct drm_device *dev = &sysfb->dev; + const struct drm_format_info *format = sysfb->fb_format; + unsigned int width = sysfb->fb_mode.hdisplay; + unsigned int height = sysfb->fb_mode.vdisplay; + struct drm_plane *primary_plane; + struct drm_crtc *crtc; + struct drm_encoder *encoder; + struct drm_connector *connector; + size_t nformats; + int ret; + + ret = drmm_mode_config_init(dev); + if (ret) + return ret; + + dev->mode_config.min_width = width; + dev->mode_config.max_width = max_t(unsigned int, width, DRM_SHADOW_PLANE_MAX_WIDTH); + dev->mode_config.min_height = height; + dev->mode_config.max_height = max_t(unsigned int, height, DRM_SHADOW_PLANE_MAX_HEIGHT); + dev->mode_config.funcs = &corebootdrm_mode_config_funcs; + dev->mode_config.preferred_depth = format->depth; + + /* Primary plane */ + + nformats = drm_sysfb_build_fourcc_list(dev, &format->format, 1, + cdev->formats, 
ARRAY_SIZE(cdev->formats)); + + primary_plane = &cdev->primary_plane; + ret = drm_universal_plane_init(dev, primary_plane, 0, &corebootdrm_primary_plane_funcs, + cdev->formats, nformats, + corebootdrm_primary_plane_format_modifiers, + DRM_PLANE_TYPE_PRIMARY, NULL); + if (ret) + return ret; + drm_plane_helper_add(primary_plane, &corebootdrm_primary_plane_helper_funcs); + drm_plane_enable_fb_damage_clips(primary_plane); + + /* CRTC */ + + crtc = &cdev->crtc; + ret = drm_crtc_init_with_planes(dev, crtc, primary_plane, NULL, + &corebootdrm_crtc_funcs, NULL); + if (ret) + return ret; + drm_crtc_helper_add(crtc, &corebootdrm_crtc_helper_funcs); + + /* Encoder */ + + encoder = &cdev->encoder; + ret = drm_encoder_init(dev, encoder, &corebootdrm_encoder_funcs, + DRM_MODE_ENCODER_NONE, NULL); + if (ret) + return ret; + encoder->possible_crtcs = drm_crtc_mask(crtc); + + /* Connector */ + + connector = &cdev->connector; + ret = drm_connector_init(dev, connector, &corebootdrm_connector_funcs, + DRM_MODE_CONNECTOR_Unknown); + if (ret) + return ret; + drm_connector_helper_add(connector, &corebootdrm_connector_helper_funcs); + drm_connector_set_panel_orientation_with_quirk(connector, + DRM_MODE_PANEL_ORIENTATION_UNKNOWN, + width, height); + + ret = drm_connector_attach_encoder(connector, encoder); + if (ret) + return ret; + + return 0; +} + +/* + * DRM driver + */ + +DEFINE_DRM_GEM_FOPS(corebootdrm_fops); + +static struct drm_driver corebootdrm_drm_driver = { + DRM_GEM_SHMEM_DRIVER_OPS, + DRM_FBDEV_SHMEM_DRIVER_OPS, + .name = DRIVER_NAME, + .desc = DRIVER_DESC, + .major = DRIVER_MAJOR, + .minor = DRIVER_MINOR, + .driver_features = DRIVER_ATOMIC | DRIVER_GEM | DRIVER_MODESET, + .fops = &corebootdrm_fops, +}; + +/* + * Coreboot driver + */ + +static int corebootdrm_probe(struct platform_device *pdev) +{ + const struct lb_framebuffer *fb = dev_get_platdata(&pdev->dev); + struct corebootdrm_device *cdev; + struct drm_sysfb_device *sysfb; + struct drm_device *dev; + const struct 
drm_format_info *format; + int width, height, pitch; + resource_size_t size; + phys_addr_t address; + struct resource *res, *mem = NULL; + struct resource aperture; + void __iomem *screen_base; + int ret; + + cdev = devm_drm_dev_alloc(&pdev->dev, &corebootdrm_drm_driver, + struct corebootdrm_device, sysfb.dev); + if (IS_ERR(cdev)) + return PTR_ERR(cdev); + platform_set_drvdata(pdev, cdev); + + sysfb = &cdev->sysfb; + dev = &sysfb->dev; + + if (!fb) { + drm_err(dev, "coreboot framebuffer not found\n"); + return -EINVAL; + } else if (!LB_FRAMEBUFFER_HAS_LFB(fb)) { + drm_err(dev, "coreboot framebuffer entry too small\n"); + return -EINVAL; + } + + /* + * Hardware settings + */ + + format = corebootdrm_get_format_fb(dev, fb); + if (!format) + return -EINVAL; + width = corebootdrm_get_width_fb(dev, fb); + if (width < 0) + return width; + height = corebootdrm_get_height_fb(dev, fb); + if (height < 0) + return height; + pitch = corebootdrm_get_pitch_fb(dev, format, width, fb); + if (pitch < 0) + return pitch; + size = corebootdrm_get_size_fb(dev, height, pitch, fb); + if (!size) + return -EINVAL; + address = corebootdrm_get_address_fb(dev, size, fb); + if (!address) + return -EINVAL; + + sysfb->fb_mode = drm_sysfb_mode(width, height, 0, 0); + sysfb->fb_format = format; + sysfb->fb_pitch = pitch; + + drm_dbg(dev, "display mode={" DRM_MODE_FMT "}\n", DRM_MODE_ARG(&sysfb->fb_mode)); + drm_dbg(dev, "framebuffer format=%p4cc, size=%dx%d, pitch=%d byte\n", + &format->format, width, height, pitch); + + /* + * Memory management + */ + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + drm_err(dev, "memory resource not found\n"); + return -EINVAL; + } + + mem = devm_request_mem_region(&pdev->dev, res->start, resource_size(res), + dev->driver->name); + if (!mem) { + drm_warn(dev, "could not acquire memory resource at %pr\n", res); + /* + * We cannot make this fatal. Sometimes this comes from magic + * spaces our resource handlers simply don't know about. 
Use + * the memory resource as-is and try to map that instead. + */ + mem = res; + } + + drm_dbg(dev, "using memory resource at %pr\n", mem); + + aperture = DEFINE_RES_MEM(address, size); + if (!resource_contains(mem, &aperture)) { + drm_err(dev, "framebuffer aperture at invalid memory range %pr\n", &aperture); + return -EINVAL; + } + + ret = devm_aperture_acquire_for_platform_device(pdev, address, size); + if (ret) { + drm_err(dev, "could not acquire framebuffer aperture: %d\n", ret); + return ret; + } + + screen_base = devm_ioremap_wc(&pdev->dev, address, size); + if (!screen_base) + return -ENOMEM; + + iosys_map_set_vaddr_iomem(&sysfb->fb_addr, screen_base); + + /* + * DRM mode setting and registration + */ + + ret = corebootdrm_mode_config_init(cdev); + if (ret) + return ret; + + drm_mode_config_reset(dev); + + ret = drm_dev_register(dev, 0); + if (ret) + return ret; + + drm_client_setup(dev, sysfb->fb_format); + + return 0; +} + +static void corebootdrm_remove(struct platform_device *pdev) +{ + struct corebootdrm_device *cdev = platform_get_drvdata(pdev); + struct drm_device *dev = &cdev->sysfb.dev; + + drm_dev_unplug(dev); +} + +static struct platform_driver corebootdrm_platform_driver = { + .driver = { + .name = "coreboot-framebuffer", + }, + .probe = corebootdrm_probe, + .remove = corebootdrm_remove, +}; + +module_platform_driver(corebootdrm_platform_driver); + +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_LICENSE("GPL"); diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h index 5746b99a070d..885da106fee3 100644 --- a/include/linux/coreboot.h +++ b/include/linux/coreboot.h @@ -13,6 +13,7 @@ #define _LINUX_COREBOOT_H #include +#include #include typedef __aligned(4) u64 cb_u64; @@ -66,4 +67,11 @@ struct lb_framebuffer { u8 reserved_mask_size; }; +/* + * True if the coreboot-provided data is large enough to hold information + * on the linear framebuffer. False otherwise. 
+ */ +#define LB_FRAMEBUFFER_HAS_LFB(__fb) \ + ((__fb)->size >= offsetofend(struct lb_framebuffer, reserved_mask_size)) + #endif /* _LINUX_COREBOOT_H */ -- cgit v1.2.3 From 058fc04b8587ad07a86dfa8f99d8d99db0a55443 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:22 +0100 Subject: drm/sysfb: corebootdrm: Support panel orientation Add fields and constants for coreboot framebuffer orientation. Set corebootdrm's DRM connector state from the values. Not all firmware provides orientation, so make it optional. Systems without, continue to use unknown orientation. v3: - comment on _HAS_ORIENTATION semantics (Tzung-Bi) Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Acked-by: Julius Werner Acked-by: Tzung-Bi Shih # coreboot Link: https://patch.msgid.link/20260217155836.96267-13-tzimmermann@suse.de --- drivers/gpu/drm/sysfb/corebootdrm.c | 30 ++++++++++++++++++++++++++---- include/linux/coreboot.h | 13 +++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/sysfb/corebootdrm.c b/drivers/gpu/drm/sysfb/corebootdrm.c index 745318580a5d..5dc6f3c76f7b 100644 --- a/drivers/gpu/drm/sysfb/corebootdrm.c +++ b/drivers/gpu/drm/sysfb/corebootdrm.c @@ -110,6 +110,26 @@ static phys_addr_t corebootdrm_get_address_fb(struct drm_device *dev, resource_s return fb->physical_address; } +static enum drm_panel_orientation corebootdrm_get_orientation_fb(struct drm_device *dev, + const struct lb_framebuffer *fb) +{ + if (!LB_FRAMEBUFFER_HAS_ORIENTATION(fb)) + return DRM_MODE_PANEL_ORIENTATION_UNKNOWN; + + switch (fb->orientation) { + case LB_FRAMEBUFFER_ORIENTATION_NORMAL: + return DRM_MODE_PANEL_ORIENTATION_NORMAL; + case LB_FRAMEBUFFER_ORIENTATION_BOTTOM_UP: + return DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP; + case LB_FRAMEBUFFER_ORIENTATION_LEFT_UP: + return DRM_MODE_PANEL_ORIENTATION_LEFT_UP; + case LB_FRAMEBUFFER_ORIENTATION_RIGHT_UP: + return DRM_MODE_PANEL_ORIENTATION_RIGHT_UP; + } + + 
return DRM_MODE_PANEL_ORIENTATION_UNKNOWN; +} + /* * Simple Framebuffer device */ @@ -168,7 +188,8 @@ static const struct drm_mode_config_funcs corebootdrm_mode_config_funcs = { DRM_SYSFB_MODE_CONFIG_FUNCS, }; -static int corebootdrm_mode_config_init(struct corebootdrm_device *cdev) +static int corebootdrm_mode_config_init(struct corebootdrm_device *cdev, + enum drm_panel_orientation orientation) { struct drm_sysfb_device *sysfb = &cdev->sysfb; struct drm_device *dev = &sysfb->dev; @@ -234,8 +255,7 @@ static int corebootdrm_mode_config_init(struct corebootdrm_device *cdev) if (ret) return ret; drm_connector_helper_add(connector, &corebootdrm_connector_helper_funcs); - drm_connector_set_panel_orientation_with_quirk(connector, - DRM_MODE_PANEL_ORIENTATION_UNKNOWN, + drm_connector_set_panel_orientation_with_quirk(connector, orientation, width, height); ret = drm_connector_attach_encoder(connector, encoder); @@ -276,6 +296,7 @@ static int corebootdrm_probe(struct platform_device *pdev) int width, height, pitch; resource_size_t size; phys_addr_t address; + enum drm_panel_orientation orientation; struct resource *res, *mem = NULL; struct resource aperture; void __iomem *screen_base; @@ -320,6 +341,7 @@ static int corebootdrm_probe(struct platform_device *pdev) address = corebootdrm_get_address_fb(dev, size, fb); if (!address) return -EINVAL; + orientation = corebootdrm_get_orientation_fb(dev, fb); sysfb->fb_mode = drm_sysfb_mode(width, height, 0, 0); sysfb->fb_format = format; @@ -375,7 +397,7 @@ static int corebootdrm_probe(struct platform_device *pdev) * DRM mode setting and registration */ - ret = corebootdrm_mode_config_init(cdev); + ret = corebootdrm_mode_config_init(cdev, orientation); if (ret) return ret; diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h index 885da106fee3..5d40ca7a1d89 100644 --- a/include/linux/coreboot.h +++ b/include/linux/coreboot.h @@ -47,6 +47,11 @@ struct lb_cbmem_entry { u32 id; }; +#define LB_FRAMEBUFFER_ORIENTATION_NORMAL 
0 +#define LB_FRAMEBUFFER_ORIENTATION_BOTTOM_UP 1 +#define LB_FRAMEBUFFER_ORIENTATION_LEFT_UP 2 +#define LB_FRAMEBUFFER_ORIENTATION_RIGHT_UP 3 + /* Describes framebuffer setup by coreboot */ struct lb_framebuffer { u32 tag; @@ -65,6 +70,7 @@ struct lb_framebuffer { u8 blue_mask_size; u8 reserved_mask_pos; u8 reserved_mask_size; + u8 orientation; }; /* @@ -74,4 +80,11 @@ struct lb_framebuffer { #define LB_FRAMEBUFFER_HAS_LFB(__fb) \ ((__fb)->size >= offsetofend(struct lb_framebuffer, reserved_mask_size)) +/* + * True if the coreboot-provided data is large enough to hold information + * on the display orientation. False otherwise. + */ +#define LB_FRAMEBUFFER_HAS_ORIENTATION(__fb) \ + ((__fb)->size >= offsetofend(struct lb_framebuffer, orientation)) + #endif /* _LINUX_COREBOOT_H */ -- cgit v1.2.3 From 16843e6638b743dd0376a1fc0845f2fd34daff98 Mon Sep 17 00:00:00 2001 From: Satyanarayana K V P Date: Fri, 20 Feb 2026 05:55:21 +0000 Subject: drm/sa: Split drm_suballoc_new() into SA alloc and init helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_suballoc_new() currently both allocates the SA object using kmalloc() and searches for a suitable hole in the sub-allocator for the requested size. If SA allocation is done by holding sub-allocator mutex, this design can lead to reclaim safety issues. By splitting the kmalloc() step outside of the critical section, we allow the memory allocation to use GFP_KERNEL (reclaim-safe) while ensuring that the initialization step that holds reclaim-tainted locks (sub-allocator mutex) operates in a reclaim-unsafe context with pre-allocated memory. This separation prevents potential deadlocks where memory reclaim could attempt to acquire locks that are already held during the sub-allocator operations. 
Signed-off-by: Satyanarayana K V P Suggested-by: Matthew Brost Cc: Thomas Hellström Cc: Michal Wajdeczko Cc: Matthew Auld Cc: Christian König Cc: dri-devel@lists.freedesktop.org Cc: Maarten Lankhorst Reviewed-by: Christian König Reviewed-by: Thomas Hellström Reviewed-by: Matthew Brost Acked-by: Maarten Lankhorst Signed-off-by: Matthew Brost Link: https://patch.msgid.link/20260220055519.2485681-6-satyanarayana.k.v.p@intel.com --- drivers/gpu/drm/drm_suballoc.c | 106 +++++++++++++++++++++++++++++++++-------- include/drm/drm_suballoc.h | 6 +++ 2 files changed, 92 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_suballoc.c b/drivers/gpu/drm/drm_suballoc.c index 879ea33dbbc4..dc9bef3c0419 100644 --- a/drivers/gpu/drm/drm_suballoc.c +++ b/drivers/gpu/drm/drm_suballoc.c @@ -293,45 +293,66 @@ static bool drm_suballoc_next_hole(struct drm_suballoc_manager *sa_manager, } /** - * drm_suballoc_new() - Make a suballocation. + * drm_suballoc_alloc() - Allocate uninitialized suballoc object. + * @gfp: gfp flags used for memory allocation. + * + * Allocate memory for an uninitialized suballoc object. Intended usage is + * allocate memory for suballoc object outside of a reclaim tainted context + * and then be initialized at a later time in a reclaim tainted context. + * + * @drm_suballoc_free() should be used to release the memory if returned + * suballoc object is in uninitialized state. + * + * Return: a new uninitialized suballoc object, or an ERR_PTR(-ENOMEM). + */ +struct drm_suballoc *drm_suballoc_alloc(gfp_t gfp) +{ + struct drm_suballoc *sa; + + sa = kmalloc(sizeof(*sa), gfp); + if (!sa) + return ERR_PTR(-ENOMEM); + + sa->manager = NULL; + + return sa; +} +EXPORT_SYMBOL(drm_suballoc_alloc); + +/** + * drm_suballoc_insert() - Initialize a suballocation and insert a hole. * @sa_manager: pointer to the sa_manager + * @sa: The struct drm_suballoc. * @size: number of bytes we want to suballocate. 
- * @gfp: gfp flags used for memory allocation. Typically GFP_KERNEL but - * the argument is provided for suballocations from reclaim context or - * where the caller wants to avoid pipelining rather than wait for - * reclaim. * @intr: Whether to perform waits interruptible. This should typically * always be true, unless the caller needs to propagate a * non-interruptible context from above layers. * @align: Alignment. Must not exceed the default manager alignment. * If @align is zero, then the manager alignment is used. * - * Try to make a suballocation of size @size, which will be rounded - * up to the alignment specified in specified in drm_suballoc_manager_init(). + * Try to make a suballocation on a pre-allocated suballoc object of size @size, + * which will be rounded up to the alignment specified in + * drm_suballoc_manager_init(). * - * Return: a new suballocated bo, or an ERR_PTR. + * Return: zero on success, errno on failure. */ -struct drm_suballoc * -drm_suballoc_new(struct drm_suballoc_manager *sa_manager, size_t size, - gfp_t gfp, bool intr, size_t align) +int drm_suballoc_insert(struct drm_suballoc_manager *sa_manager, + struct drm_suballoc *sa, size_t size, + bool intr, size_t align) { struct dma_fence *fences[DRM_SUBALLOC_MAX_QUEUES]; unsigned int tries[DRM_SUBALLOC_MAX_QUEUES]; unsigned int count; int i, r; - struct drm_suballoc *sa; if (WARN_ON_ONCE(align > sa_manager->align)) - return ERR_PTR(-EINVAL); + return -EINVAL; if (WARN_ON_ONCE(size > sa_manager->size || !size)) - return ERR_PTR(-EINVAL); + return -EINVAL; if (!align) align = sa_manager->align; - sa = kmalloc(sizeof(*sa), gfp); - if (!sa) - return ERR_PTR(-ENOMEM); sa->manager = sa_manager; sa->fence = NULL; INIT_LIST_HEAD(&sa->olist); @@ -348,7 +369,7 @@ drm_suballoc_new(struct drm_suballoc_manager *sa_manager, size_t size, if (drm_suballoc_try_alloc(sa_manager, sa, size, align)) { spin_unlock(&sa_manager->wq.lock); - return sa; + return 0; } /* see if we can skip over some 
allocations */ @@ -385,8 +406,48 @@ drm_suballoc_new(struct drm_suballoc_manager *sa_manager, size_t size, } while (!r); spin_unlock(&sa_manager->wq.lock); - kfree(sa); - return ERR_PTR(r); + sa->manager = NULL; + return r; +} +EXPORT_SYMBOL(drm_suballoc_insert); + +/** + * drm_suballoc_new() - Make a suballocation. + * @sa_manager: pointer to the sa_manager + * @size: number of bytes we want to suballocate. + * @gfp: gfp flags used for memory allocation. Typically GFP_KERNEL but + * the argument is provided for suballocations from reclaim context or + * where the caller wants to avoid pipelining rather than wait for + * reclaim. + * @intr: Whether to perform waits interruptible. This should typically + * always be true, unless the caller needs to propagate a + * non-interruptible context from above layers. + * @align: Alignment. Must not exceed the default manager alignment. + * If @align is zero, then the manager alignment is used. + * + * Try to make a suballocation of size @size, which will be rounded + * up to the alignment specified in drm_suballoc_manager_init(). + * + * Return: a new suballocated bo, or an ERR_PTR. 
+ */ +struct drm_suballoc * +drm_suballoc_new(struct drm_suballoc_manager *sa_manager, size_t size, + gfp_t gfp, bool intr, size_t align) +{ + struct drm_suballoc *sa; + int err; + + sa = drm_suballoc_alloc(gfp); + if (IS_ERR(sa)) + return sa; + + err = drm_suballoc_insert(sa_manager, sa, size, intr, align); + if (err) { + drm_suballoc_free(sa, NULL); + return ERR_PTR(err); + } + + return sa; } EXPORT_SYMBOL(drm_suballoc_new); @@ -405,6 +466,11 @@ void drm_suballoc_free(struct drm_suballoc *suballoc, if (!suballoc) return; + if (!suballoc->manager) { + kfree(suballoc); + return; + } + sa_manager = suballoc->manager; spin_lock(&sa_manager->wq.lock); diff --git a/include/drm/drm_suballoc.h b/include/drm/drm_suballoc.h index 7ba72a81a808..29befdda35d2 100644 --- a/include/drm/drm_suballoc.h +++ b/include/drm/drm_suballoc.h @@ -53,6 +53,12 @@ void drm_suballoc_manager_init(struct drm_suballoc_manager *sa_manager, void drm_suballoc_manager_fini(struct drm_suballoc_manager *sa_manager); +struct drm_suballoc *drm_suballoc_alloc(gfp_t gfp); + +int drm_suballoc_insert(struct drm_suballoc_manager *sa_manager, + struct drm_suballoc *sa, size_t size, bool intr, + size_t align); + struct drm_suballoc * drm_suballoc_new(struct drm_suballoc_manager *sa_manager, size_t size, gfp_t gfp, bool intr, size_t align); -- cgit v1.2.3 From 77ae37018a2705f5abe8cc428e3496651258901d Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Fri, 6 Feb 2026 09:04:12 +0800 Subject: drm/bridge: synopsys: dw-dp: Set pixel mode by platform data In the implementation and integration of the SoC, the DW DisplayPort hardware block can be configured to work in single, dual, quad pixel mode on different platforms, so make the pixel mode set by plat_data to support the upcoming rk3576 variant. 
Signed-off-by: Andy Yan Reviewed-by: Sebastian Reichel Tested-by: Sebastian Reichel Signed-off-by: Heiko Stuebner Link: https://patch.msgid.link/20260206010421.443605-3-andyshrk@163.com --- drivers/gpu/drm/bridge/synopsys/dw-dp.c | 8 +------- drivers/gpu/drm/rockchip/dw_dp-rockchip.c | 19 +++++++++++++++---- include/drm/bridge/dw_dp.h | 7 +++++++ 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/bridge/synopsys/dw-dp.c b/drivers/gpu/drm/bridge/synopsys/dw-dp.c index 432342452484..ccc0d7c85645 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-dp.c +++ b/drivers/gpu/drm/bridge/synopsys/dw-dp.c @@ -352,12 +352,6 @@ enum { DW_DP_YCBCR420_16BIT, }; -enum { - DW_DP_MP_SINGLE_PIXEL, - DW_DP_MP_DUAL_PIXEL, - DW_DP_MP_QUAD_PIXEL, -}; - enum { DW_DP_SDP_VERTICAL_INTERVAL = BIT(0), DW_DP_SDP_HORIZONTAL_INTERVAL = BIT(1), @@ -1984,7 +1978,7 @@ struct dw_dp *dw_dp_bind(struct device *dev, struct drm_encoder *encoder, return ERR_CAST(dp); dp->dev = dev; - dp->pixel_mode = DW_DP_MP_QUAD_PIXEL; + dp->pixel_mode = plat_data->pixel_mode; dp->plat_data.max_link_rate = plat_data->max_link_rate; bridge = &dp->bridge; diff --git a/drivers/gpu/drm/rockchip/dw_dp-rockchip.c b/drivers/gpu/drm/rockchip/dw_dp-rockchip.c index 25ab4e46301e..89d614d53596 100644 --- a/drivers/gpu/drm/rockchip/dw_dp-rockchip.c +++ b/drivers/gpu/drm/rockchip/dw_dp-rockchip.c @@ -75,7 +75,7 @@ static const struct drm_encoder_helper_funcs dw_dp_encoder_helper_funcs = { static int dw_dp_rockchip_bind(struct device *dev, struct device *master, void *data) { struct platform_device *pdev = to_platform_device(dev); - struct dw_dp_plat_data plat_data; + const struct dw_dp_plat_data *plat_data; struct drm_device *drm_dev = data; struct rockchip_dw_dp *dp; struct drm_encoder *encoder; @@ -89,7 +89,10 @@ static int dw_dp_rockchip_bind(struct device *dev, struct device *master, void * dp->dev = dev; platform_set_drvdata(pdev, dp); - plat_data.max_link_rate = 810000; + 
plat_data = of_device_get_match_data(dev); + if (!plat_data) + return -ENODEV; + encoder = &dp->encoder.encoder; encoder->possible_crtcs = drm_of_find_possible_crtcs(drm_dev, dev->of_node); rockchip_drm_encoder_set_crtc_endpoint_id(&dp->encoder, dev->of_node, 0, 0); @@ -99,7 +102,7 @@ static int dw_dp_rockchip_bind(struct device *dev, struct device *master, void * return ret; drm_encoder_helper_add(encoder, &dw_dp_encoder_helper_funcs); - dp->base = dw_dp_bind(dev, encoder, &plat_data); + dp->base = dw_dp_bind(dev, encoder, plat_data); if (IS_ERR(dp->base)) { ret = PTR_ERR(dp->base); return ret; @@ -134,8 +137,16 @@ static void dw_dp_remove(struct platform_device *pdev) component_del(dp->dev, &dw_dp_rockchip_component_ops); } +static const struct dw_dp_plat_data rk3588_dp_plat_data = { + .max_link_rate = 810000, + .pixel_mode = DW_DP_MP_QUAD_PIXEL, +}; + static const struct of_device_id dw_dp_of_match[] = { - { .compatible = "rockchip,rk3588-dp", }, + { + .compatible = "rockchip,rk3588-dp", + .data = &rk3588_dp_plat_data, + }, {} }; MODULE_DEVICE_TABLE(of, dw_dp_of_match); diff --git a/include/drm/bridge/dw_dp.h b/include/drm/bridge/dw_dp.h index d05df49fd884..25363541e69d 100644 --- a/include/drm/bridge/dw_dp.h +++ b/include/drm/bridge/dw_dp.h @@ -11,8 +11,15 @@ struct drm_encoder; struct dw_dp; +enum { + DW_DP_MP_SINGLE_PIXEL, + DW_DP_MP_DUAL_PIXEL, + DW_DP_MP_QUAD_PIXEL, +}; + struct dw_dp_plat_data { u32 max_link_rate; + u8 pixel_mode; }; struct dw_dp *dw_dp_bind(struct device *dev, struct drm_encoder *encoder, -- cgit v1.2.3 From dc652a33cf08ecd7c9935bf9168a1a27c9a246f0 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Fri, 12 Dec 2025 08:41:42 +0900 Subject: clk: remove round_rate() clk ops The round_rate() clk ops is deprecated, and all in tree drivers have been converted, so let's go ahead and remove any references to the round_rate() clk ops. 
Signed-off-by: Brian Masney --- Documentation/driver-api/clk.rst | 9 +-------- drivers/clk/clk.c | 39 ++++++++++++++------------------------- include/linux/clk-provider.h | 18 ++++++------------ 3 files changed, 21 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/clk.rst b/Documentation/driver-api/clk.rst index 93bab5336dfd..c6aca8186a78 100644 --- a/Documentation/driver-api/clk.rst +++ b/Documentation/driver-api/clk.rst @@ -77,9 +77,6 @@ the operations defined in clk-provider.h:: void (*disable_unused)(struct clk_hw *hw); unsigned long (*recalc_rate)(struct clk_hw *hw, unsigned long parent_rate); - long (*round_rate)(struct clk_hw *hw, - unsigned long rate, - unsigned long *parent_rate); int (*determine_rate)(struct clk_hw *hw, struct clk_rate_request *req); int (*set_parent)(struct clk_hw *hw, u8 index); @@ -220,9 +217,7 @@ optional or must be evaluated on a case-by-case basis. +----------------+------+-------------+---------------+-------------+------+ |.recalc_rate | | y | | | | +----------------+------+-------------+---------------+-------------+------+ - |.round_rate | | y [1]_ | | | | - +----------------+------+-------------+---------------+-------------+------+ - |.determine_rate | | y [1]_ | | | | + |.determine_rate | | y | | | | +----------------+------+-------------+---------------+-------------+------+ |.set_rate | | y | | | | +----------------+------+-------------+---------------+-------------+------+ @@ -238,8 +233,6 @@ optional or must be evaluated on a case-by-case basis. |.init | | | | | | +----------------+------+-------------+---------------+-------------+------+ -.. [1] either one of round_rate or determine_rate is required. - Finally, register your clock at run-time with a hardware-specific registration function. 
This function simply populates struct clk_foo's data and then passes the common struct clk parameters to the framework diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 47093cda9df3..fd418dc988b1 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1560,8 +1560,6 @@ late_initcall_sync(clk_disable_unused); static int clk_core_determine_round_nolock(struct clk_core *core, struct clk_rate_request *req) { - long rate; - lockdep_assert_held(&prepare_lock); if (!core) @@ -1591,13 +1589,6 @@ static int clk_core_determine_round_nolock(struct clk_core *core, req->rate = core->rate; } else if (core->ops->determine_rate) { return core->ops->determine_rate(core->hw, req); - } else if (core->ops->round_rate) { - rate = core->ops->round_rate(core->hw, req->rate, - &req->best_parent_rate); - if (rate < 0) - return rate; - - req->rate = rate; } else { return -EINVAL; } @@ -1682,7 +1673,7 @@ EXPORT_SYMBOL_GPL(clk_hw_forward_rate_request); static bool clk_core_can_round(struct clk_core * const core) { - return core->ops->determine_rate || core->ops->round_rate; + return core->ops->determine_rate; } static int clk_core_round_rate_nolock(struct clk_core *core, @@ -1750,11 +1741,11 @@ EXPORT_SYMBOL_GPL(__clk_determine_rate); * use. * * Context: prepare_lock must be held. - * For clk providers to call from within clk_ops such as .round_rate, + * For clk providers to call from within clk_ops such as * .determine_rate. * - * Return: returns rounded rate of hw clk if clk supports round_rate operation - * else returns the parent rate. + * Return: returns rounded rate of hw clk if clk supports determine_rate + * operation; else returns the parent rate. */ unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate) { @@ -2569,12 +2560,13 @@ err: * * Setting the CLK_SET_RATE_PARENT flag allows the rate change operation to * propagate up to clk's parent; whether or not this happens depends on the - * outcome of clk's .round_rate implementation. 
If *parent_rate is unchanged - * after calling .round_rate then upstream parent propagation is ignored. If - * *parent_rate comes back with a new rate for clk's parent then we propagate - * up to clk's parent and set its rate. Upward propagation will continue - * until either a clk does not support the CLK_SET_RATE_PARENT flag or - * .round_rate stops requesting changes to clk's parent_rate. + * outcome of clk's .determine_rate implementation. If req->best_parent_rate + * is unchanged after calling .determine_rate then upstream parent propagation + * is ignored. If req->best_parent_rate comes back with a new rate for clk's + * parent then we propagate up to clk's parent and set its rate. Upward + * propagation will continue until either a clk does not support the + * CLK_SET_RATE_PARENT flag or .determine_rate stops requesting changes to + * clk's parent_rate. * * Rate changes are accomplished via tree traversal that also recalculates the * rates for the clocks and fires off POST_RATE_CHANGE notifiers. @@ -2703,8 +2695,6 @@ static int clk_set_rate_range_nolock(struct clk *clk, * FIXME: * There is a catch. It may fail for the usual reason (clock * broken, clock protected, etc) but also because: - * - round_rate() was not favorable and fell on the wrong - * side of the boundary * - the determine_rate() callback does not really check for * this corner case when determining the rate */ @@ -3915,10 +3905,9 @@ static int __clk_core_init(struct clk_core *core) } /* check that clk_ops are sane. 
See Documentation/driver-api/clk.rst */ - if (core->ops->set_rate && - !((core->ops->round_rate || core->ops->determine_rate) && - core->ops->recalc_rate)) { - pr_err("%s: %s must implement .round_rate or .determine_rate in addition to .recalc_rate\n", + if (core->ops->set_rate && + !(core->ops->determine_rate && core->ops->recalc_rate)) { + pr_err("%s: %s must implement .determine_rate in addition to .recalc_rate\n", __func__, core->name); ret = -EINVAL; goto out; } diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 630705a47129..1cda2c78dffa 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -136,10 +136,6 @@ struct clk_duty { * 0. Returns the calculated rate. Optional, but recommended - if * this op is not set then clock rate will be initialized to 0. * - * @round_rate: Given a target rate as input, returns the closest rate actually - * supported by the clock. The parent rate is an input/output - * parameter. - * * @determine_rate: Given a target rate as input, returns the closest rate * actually supported by the clock, and optionally the parent clock * that should be used to provide the clock rate. @@ -163,13 +159,13 @@ struct clk_duty { * * @set_rate: Change the rate of this clock. The requested rate is specified * by the second argument, which should typically be the return - * of .round_rate call. The third argument gives the parent rate - * which is likely helpful for most .set_rate implementation. + * of .determine_rate call. The third argument gives the parent + * rate which is likely helpful for most .set_rate implementation. * Returns 0 on success, -EERROR otherwise. * * @set_rate_and_parent: Change the rate and the parent of this clock. The * requested rate is specified by the second argument, which - * should typically be the return of .round_rate call. The + * should typically be the return of clk_round_rate() call.
The * third argument gives the parent rate which is likely helpful * for most .set_rate_and_parent implementation. The fourth * argument gives the parent index. This callback is optional (and @@ -244,8 +240,6 @@ struct clk_ops { void (*restore_context)(struct clk_hw *hw); unsigned long (*recalc_rate)(struct clk_hw *hw, unsigned long parent_rate); - long (*round_rate)(struct clk_hw *hw, unsigned long rate, - unsigned long *parent_rate); int (*determine_rate)(struct clk_hw *hw, struct clk_rate_request *req); int (*set_parent)(struct clk_hw *hw, u8 index); @@ -679,7 +673,7 @@ struct clk_div_table { * @lock: register lock * * Clock with an adjustable divider affecting its output frequency. Implements - * .recalc_rate, .set_rate and .round_rate + * .recalc_rate, .set_rate and .determine_rate * * @flags: * CLK_DIVIDER_ONE_BASED - by default the divisor is the value read from the @@ -1126,7 +1120,7 @@ void of_fixed_factor_clk_setup(struct device_node *node); * * Clock with a fixed multiplier and divider. The output frequency is the * parent clock rate divided by div and multiplied by mult. - * Implements .recalc_rate, .set_rate, .round_rate and .recalc_accuracy + * Implements .recalc_rate, .set_rate, .determine_rate and .recalc_accuracy * * Flags: * * CLK_FIXED_FACTOR_FIXED_ACCURACY - Use the value in @acc instead of the @@ -1254,7 +1248,7 @@ void clk_hw_unregister_fractional_divider(struct clk_hw *hw); * @lock: register lock * * Clock with an adjustable multiplier affecting its output frequency. 
- * Implements .recalc_rate, .set_rate and .round_rate + * Implements .recalc_rate, .set_rate and .determine_rate * * @flags: * CLK_MULTIPLIER_ZERO_BYPASS - By default, the multiplier is the value read -- cgit v1.2.3 From 4b5231d608d00749a2346a3dd11bd6d05c0662e3 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Thu, 8 Jan 2026 16:16:44 -0500 Subject: clk: divider: remove divider_ro_round_rate_parent() There are no remaining users of divider_ro_round_rate_parent(), so let's go ahead and remove it. Signed-off-by: Brian Masney --- drivers/clk/clk-divider.c | 22 ---------------------- include/linux/clk-provider.h | 15 --------------- 2 files changed, 37 deletions(-) (limited to 'include') diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index 45e7ebde4a8b..26610dd976ec 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -409,28 +409,6 @@ long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, } EXPORT_SYMBOL_GPL(divider_round_rate_parent); -long divider_ro_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, - unsigned long rate, unsigned long *prate, - const struct clk_div_table *table, u8 width, - unsigned long flags, unsigned int val) -{ - struct clk_rate_request req; - int ret; - - clk_hw_init_rate_request(hw, &req, rate); - req.best_parent_rate = *prate; - req.best_parent_hw = parent; - - ret = divider_ro_determine_rate(hw, &req, table, width, flags, val); - if (ret) - return ret; - - *prate = req.best_parent_rate; - - return req.rate; -} -EXPORT_SYMBOL_GPL(divider_ro_round_rate_parent); - static int clk_divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req) { diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1cda2c78dffa..0d31077749fb 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -737,10 +737,6 @@ long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, unsigned long rate, unsigned long *prate, const 
struct clk_div_table *table, u8 width, unsigned long flags); -long divider_ro_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, - unsigned long rate, unsigned long *prate, - const struct clk_div_table *table, u8 width, - unsigned long flags, unsigned int val); int divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req, const struct clk_div_table *table, u8 width, unsigned long flags); @@ -1440,17 +1436,6 @@ static inline long divider_round_rate(struct clk_hw *hw, unsigned long rate, rate, prate, table, width, flags); } -static inline long divider_ro_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags, - unsigned int val) -{ - return divider_ro_round_rate_parent(hw, clk_hw_get_parent(hw), - rate, prate, table, width, flags, - val); -} - /* * FIXME clock api without lock protection */ -- cgit v1.2.3 From d4851759742c1322f498021dab882d322fc34a1d Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Thu, 8 Jan 2026 16:16:45 -0500 Subject: clk: divider: remove divider_round_rate() and divider_round_rate_parent() There are no remaining users of divider_round_rate() and divider_round_rate_parent(), so let's go ahead and remove them. 
Signed-off-by: Brian Masney --- drivers/clk/clk-divider.c | 22 ---------------------- include/linux/clk-provider.h | 13 ------------- 2 files changed, 35 deletions(-) (limited to 'include') diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index 26610dd976ec..b3b485d23ea8 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -387,28 +387,6 @@ int divider_ro_determine_rate(struct clk_hw *hw, struct clk_rate_request *req, } EXPORT_SYMBOL_GPL(divider_ro_determine_rate); -long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, - unsigned long rate, unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags) -{ - struct clk_rate_request req; - int ret; - - clk_hw_init_rate_request(hw, &req, rate); - req.best_parent_rate = *prate; - req.best_parent_hw = parent; - - ret = divider_determine_rate(hw, &req, table, width, flags); - if (ret) - return ret; - - *prate = req.best_parent_rate; - - return req.rate; -} -EXPORT_SYMBOL_GPL(divider_round_rate_parent); - static int clk_divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req) { diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 0d31077749fb..4d21602d7dbd 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -733,10 +733,6 @@ extern const struct clk_ops clk_divider_ro_ops; unsigned long divider_recalc_rate(struct clk_hw *hw, unsigned long parent_rate, unsigned int val, const struct clk_div_table *table, unsigned long flags, unsigned long width); -long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, - unsigned long rate, unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags); int divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req, const struct clk_div_table *table, u8 width, unsigned long flags); @@ -1427,15 +1423,6 @@ static inline void __clk_hw_set_clk(struct clk_hw *dst, struct clk_hw *src) 
dst->core = src->core; } -static inline long divider_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags) -{ - return divider_round_rate_parent(hw, clk_hw_get_parent(hw), - rate, prate, table, width, flags); -} - /* * FIXME clock api without lock protection */ -- cgit v1.2.3 From 07c774dd64ba0c605dbf844132122e3edbdbea93 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 19 Feb 2026 04:53:52 +0000 Subject: ASoC: soc-compress: use function to clear symmetric params Current soc-compress.c clears symmetric_rate, but it clears rate only, not clear other symmetric_channels/sample_bits. static int soc_compr_clean(...) { ... if (!snd_soc_dai_active(cpu_dai)) => cpu_dai->symmetric_rate = 0; if (!snd_soc_dai_active(codec_dai)) => codec_dai->symmetric_rate = 0; ... }; This feature was added when v3.7 kernel [1], and there was only symmetric_rate, no symmetric_channels/sample_bits in that timing. symmetric_channels/sample_bits were added in v3.14 [2], but I guess it didn't notice that soc-compress.c is updating symmetric_xxx. We are clearing symmetry_xxx by soc_pcm_set_dai_params(), but is soc-pcm.c local function. Makes it global function and clear symmetry_xxx by it. 
[1] commit 1245b7005de02 ("ASoC: add compress stream support") [2] commit 3635bf09a89cf ("ASoC: soc-pcm: add symmetry for channels and sample bits") Fixes: 3635bf09a89c ("ASoC: soc-pcm: add symmetry for channels and sample bits") Cc: Nicolin Chen Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87ms15e3kv.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc.h | 3 +++ sound/soc/soc-compress.c | 4 ++-- sound/soc/soc-pcm.c | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 7d8376c8e1be..1e0b7cd8d956 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1412,6 +1412,9 @@ struct snd_soc_dai *snd_soc_find_dai( struct snd_soc_dai *snd_soc_find_dai_with_mutex( const struct snd_soc_dai_link_component *dlc); +void soc_pcm_set_dai_params(struct snd_soc_dai *dai, + struct snd_pcm_hw_params *params); + #include static inline diff --git a/sound/soc/soc-compress.c b/sound/soc/soc-compress.c index 7b81dffc6a93..b8402802ae78 100644 --- a/sound/soc/soc-compress.c +++ b/sound/soc/soc-compress.c @@ -69,10 +69,10 @@ static int soc_compr_clean(struct snd_compr_stream *cstream, int rollback) snd_soc_dai_digital_mute(codec_dai, 1, stream); if (!snd_soc_dai_active(cpu_dai)) - cpu_dai->symmetric_rate = 0; + soc_pcm_set_dai_params(cpu_dai, NULL); if (!snd_soc_dai_active(codec_dai)) - codec_dai->symmetric_rate = 0; + soc_pcm_set_dai_params(codec_dai, NULL); snd_soc_link_compr_shutdown(cstream, rollback); diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index afa9fad4457f..9b12eedb77c3 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -423,8 +423,8 @@ void dpcm_dapm_stream_event(struct snd_soc_pcm_runtime *fe, int dir, int event) snd_soc_dapm_stream_event(fe, dir, event); } -static void soc_pcm_set_dai_params(struct snd_soc_dai *dai, - struct snd_pcm_hw_params *params) +void soc_pcm_set_dai_params(struct snd_soc_dai *dai, + struct 
snd_pcm_hw_params *params) { if (params) { dai->symmetric_rate = params_rate(params); -- cgit v1.2.3 From d075cef4af6327a5de4bee7bf77591e3201e54f4 Mon Sep 17 00:00:00 2001 From: Stefano Radaelli Date: Fri, 13 Feb 2026 16:03:55 +0100 Subject: ASoC: simple-card-utils: add sysclk ordering support When simple-audio-card programs sysclk for CPU and codec DAIs during hw_params, the ordering of these calls may matter on some platforms. Some CPU DAIs finalize or adjust the MCLK rate as part of their set_sysclk() callback (for example by calling clk_set_rate()). If the codec sysclk is configured before the CPU DAI applies the final MCLK rate, the codec may configure its internal clocking based on a non-final MCLK value. Such situations can arise depending on the clock provider/consumer relationship between the CPU DAI and the codec. Introduce an explicit sysclk ordering enum in simple-card-utils and use it to control the order of snd_soc_dai_set_sysclk() calls in the mclk-fs handling path. The default behaviour remains unchanged (codec-first) to avoid regressions. 
Signed-off-by: Stefano Radaelli Acked-by: Kuninori Morimoto Link: https://patch.msgid.link/20260213150355.442609-1-stefano.r@variscite.com Signed-off-by: Mark Brown --- include/sound/simple_card_utils.h | 6 +++++ sound/soc/generic/simple-card-utils.c | 41 +++++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h index 69a9c9c4d0e9..915e6ae5f68d 100644 --- a/include/sound/simple_card_utils.h +++ b/include/sound/simple_card_utils.h @@ -54,6 +54,11 @@ struct prop_nums { int platforms; }; +enum simple_util_sysclk_order { + SIMPLE_SYSCLK_ORDER_CODEC_FIRST = 0, + SIMPLE_SYSCLK_ORDER_CPU_FIRST, +}; + struct simple_util_priv { struct snd_soc_card snd_card; struct simple_dai_props { @@ -63,6 +68,7 @@ struct simple_util_priv { struct snd_soc_codec_conf *codec_conf; struct prop_nums num; unsigned int mclk_fs; + enum simple_util_sysclk_order sysclk_order; } *dai_props; struct simple_util_jack hp_jack; struct simple_util_jack mic_jack; diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c index bdc02e85b089..fdd8b76f2914 100644 --- a/sound/soc/generic/simple-card-utils.c +++ b/sound/soc/generic/simple-card-utils.c @@ -468,6 +468,7 @@ int simple_util_hw_params(struct snd_pcm_substream *substream, struct snd_soc_dai *sdai; struct simple_util_priv *priv = snd_soc_card_get_drvdata(rtd->card); struct simple_dai_props *props = runtime_simple_priv_to_props(priv, rtd); + enum simple_util_sysclk_order order = props->sysclk_order; unsigned int mclk, mclk_fs = 0; int i, ret; @@ -501,18 +502,36 @@ int simple_util_hw_params(struct snd_pcm_substream *substream, goto end; } - for_each_rtd_codec_dais(rtd, i, sdai) { - pdai = simple_props_to_dai_codec(props, i); - ret = snd_soc_dai_set_sysclk(sdai, 0, mclk, pdai->clk_direction); - if (ret && ret != -ENOTSUPP) - goto end; - } + if (order == SIMPLE_SYSCLK_ORDER_CPU_FIRST) { + 
/* CPU first */ + for_each_rtd_cpu_dais(rtd, i, sdai) { + pdai = simple_props_to_dai_cpu(props, i); + ret = snd_soc_dai_set_sysclk(sdai, 0, mclk, pdai->clk_direction); + if (ret && ret != -ENOTSUPP) + goto end; + } - for_each_rtd_cpu_dais(rtd, i, sdai) { - pdai = simple_props_to_dai_cpu(props, i); - ret = snd_soc_dai_set_sysclk(sdai, 0, mclk, pdai->clk_direction); - if (ret && ret != -ENOTSUPP) - goto end; + for_each_rtd_codec_dais(rtd, i, sdai) { + pdai = simple_props_to_dai_codec(props, i); + ret = snd_soc_dai_set_sysclk(sdai, 0, mclk, pdai->clk_direction); + if (ret && ret != -ENOTSUPP) + goto end; + } + } else { + /* default: codec first */ + for_each_rtd_codec_dais(rtd, i, sdai) { + pdai = simple_props_to_dai_codec(props, i); + ret = snd_soc_dai_set_sysclk(sdai, 0, mclk, pdai->clk_direction); + if (ret && ret != -ENOTSUPP) + goto end; + } + + for_each_rtd_cpu_dais(rtd, i, sdai) { + pdai = simple_props_to_dai_cpu(props, i); + ret = snd_soc_dai_set_sysclk(sdai, 0, mclk, pdai->clk_direction); + if (ret && ret != -ENOTSUPP) + goto end; + } } } -- cgit v1.2.3 From 5cab6d386bd30c3bb4efceb05b25842a6f144693 Mon Sep 17 00:00:00 2001 From: Sanjay Yadav Date: Thu, 12 Feb 2026 14:55:29 +0530 Subject: drm/buddy: Add kernel-doc for allocator structures and flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add missing kernel-doc for GPU buddy allocator flags, gpu_buddy_block, and gpu_buddy. The documentation covers block header fields, allocator roots, free trees, and allocation flags such as RANGE, TOPDOWN, CONTIGUOUS, CLEAR, and TRIM_DISABLE. Private members are marked with kernel-doc private markers and documented with regular comments. No functional changes. 
v2: - Corrected GPU_BUDDY_CLEAR_TREE and GPU_BUDDY_DIRTY_TREE index values (Arun) - Rebased after DRM buddy allocator moved to drivers/gpu/ - Updated commit message v3: - Document reserved bits 8:6 in header layout (Arun) - Fix checkpatch warning Cc: Christian König Cc: Arunpravin Paneer Selvam Suggested-by: Matthew Auld Signed-off-by: Sanjay Yadav Reviewed-by: Arunpravin Paneer Selvam Signed-off-by: Arunpravin Paneer Selvam Link: https://patch.msgid.link/20260212092527.718455-5-sanjay.kumar.yadav@intel.com --- include/linux/gpu_buddy.h | 123 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 103 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index 07ac65db6d2e..bf2a42256536 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -12,11 +12,58 @@ #include #include +/** + * GPU_BUDDY_RANGE_ALLOCATION - Allocate within a specific address range + * + * When set, allocation is restricted to the range [start, end) specified + * in gpu_buddy_alloc_blocks(). Without this flag, start/end are ignored + * and allocation can use any free space. + */ #define GPU_BUDDY_RANGE_ALLOCATION BIT(0) + +/** + * GPU_BUDDY_TOPDOWN_ALLOCATION - Allocate from top of address space + * + * Allocate starting from high addresses and working down. Useful for + * separating different allocation types (e.g., kernel vs userspace) + * to reduce fragmentation. + */ #define GPU_BUDDY_TOPDOWN_ALLOCATION BIT(1) + +/** + * GPU_BUDDY_CONTIGUOUS_ALLOCATION - Require physically contiguous blocks + * + * The allocation must be satisfied with a single contiguous block. + * If the requested size cannot be allocated contiguously, the + * allocation fails with -ENOSPC. + */ #define GPU_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) + +/** + * GPU_BUDDY_CLEAR_ALLOCATION - Prefer pre-cleared (zeroed) memory + * + * Attempt to allocate from the clear tree first. 
If insufficient clear + * memory is available, falls back to dirty memory. Useful when the + * caller needs zeroed memory and wants to avoid GPU clear operations. + */ #define GPU_BUDDY_CLEAR_ALLOCATION BIT(3) + +/** + * GPU_BUDDY_CLEARED - Mark returned blocks as cleared + * + * Used with gpu_buddy_free_list() to indicate that the memory being + * freed has been cleared (zeroed). The blocks will be placed in the + * clear tree for future GPU_BUDDY_CLEAR_ALLOCATION requests. + */ #define GPU_BUDDY_CLEARED BIT(4) + +/** + * GPU_BUDDY_TRIM_DISABLE - Disable automatic block trimming + * + * By default, if an allocation is smaller than the allocated block, + * excess memory is trimmed and returned to the free pool. This flag + * disables trimming, keeping the full power-of-two block size. + */ #define GPU_BUDDY_TRIM_DISABLE BIT(5) enum gpu_buddy_free_tree { @@ -28,7 +75,28 @@ enum gpu_buddy_free_tree { #define for_each_free_tree(tree) \ for ((tree) = 0; (tree) < GPU_BUDDY_MAX_FREE_TREES; (tree)++) +/** + * struct gpu_buddy_block - Block within a buddy allocator + * + * Each block in the buddy allocator is represented by this structure. + * Blocks are organized in a binary tree where each parent block can be + * split into two children (left and right buddies). The allocator manages + * blocks at various orders (power-of-2 sizes) from chunk_size up to the + * largest contiguous region. 
+ * + * @private: Private data owned by the allocator user (e.g., driver-specific data) + * @link: List node for user ownership while block is allocated + */ struct gpu_buddy_block { +/* private: */ + /* + * Header bit layout: + * - Bits 63:12: block offset within the address space + * - Bits 11:10: state (ALLOCATED, FREE, or SPLIT) + * - Bit 9: clear bit (1 if memory is zeroed) + * - Bits 8:6: reserved + * - Bits 5:0: order (log2 of size relative to chunk_size) + */ #define GPU_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) #define GPU_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) #define GPU_BUDDY_ALLOCATED (1 << 10) @@ -43,7 +111,7 @@ struct gpu_buddy_block { struct gpu_buddy_block *left; struct gpu_buddy_block *right; struct gpu_buddy_block *parent; - +/* public: */ void *private; /* owned by creator */ /* @@ -53,43 +121,58 @@ struct gpu_buddy_block { * gpu_buddy_free* ownership is given back to the mm. */ union { +/* private: */ struct rb_node rb; +/* public: */ struct list_head link; }; - +/* private: */ struct list_head tmp_link; }; /* Order-zero must be at least SZ_4K */ #define GPU_BUDDY_MAX_ORDER (63 - 12) -/* - * Binary Buddy System. +/** + * struct gpu_buddy - GPU binary buddy allocator + * + * The buddy allocator provides efficient power-of-two memory allocation + * with fast allocation and free operations. It is commonly used for GPU + * memory management where allocations can be split into power-of-two + * block sizes. * - * Locking should be handled by the user, a simple mutex around - * gpu_buddy_alloc* and gpu_buddy_free* should suffice. + * Locking should be handled by the user; a simple mutex around + * gpu_buddy_alloc_blocks() and gpu_buddy_free_block()/gpu_buddy_free_list() + * should suffice. + * + * @n_roots: Number of root blocks in the roots array. + * @max_order: Maximum block order (log2 of largest block size / chunk_size). + * @chunk_size: Minimum allocation granularity in bytes. Must be at least SZ_4K. 
+ * @size: Total size of the address space managed by this allocator in bytes. + * @avail: Total free space currently available for allocation in bytes. + * @clear_avail: Free space available in the clear tree (zeroed memory) in bytes. + * This is a subset of @avail. */ struct gpu_buddy { - /* Maintain a free list for each order. */ - struct rb_root **free_trees; - +/* private: */ /* - * Maintain explicit binary tree(s) to track the allocation of the - * address space. This gives us a simple way of finding a buddy block - * and performing the potentially recursive merge step when freeing a - * block. Nodes are either allocated or free, in which case they will - * also exist on the respective free list. + * Array of red-black trees for free block management. + * Indexed as free_trees[clear/dirty][order] where: + * - Index 0 (GPU_BUDDY_CLEAR_TREE): blocks with zeroed content + * - Index 1 (GPU_BUDDY_DIRTY_TREE): blocks with unknown content + * Each tree holds free blocks of the corresponding order. */ - struct gpu_buddy_block **roots; - + struct rb_root **free_trees; /* - * Anything from here is public, and remains static for the lifetime of - * the mm. Everything above is considered do-not-touch. + * Array of root blocks representing the top-level blocks of the + * binary tree(s). Multiple roots exist when the total size is not + * a power of two, with each root being the largest power-of-two + * that fits in the remaining space. 
*/ + struct gpu_buddy_block **roots; +/* public: */ unsigned int n_roots; unsigned int max_order; - - /* Must be at least SZ_4K */ u64 chunk_size; u64 size; u64 avail; -- cgit v1.2.3 From df8c7892e06efa5df2aa780a338f33a4f666370b Mon Sep 17 00:00:00 2001 From: Sanjay Yadav Date: Thu, 12 Feb 2026 14:55:30 +0530 Subject: drm/buddy: Move internal helpers to buddy.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move gpu_buddy_block_state(), gpu_buddy_block_is_allocated(), and gpu_buddy_block_is_split() from gpu_buddy.h to gpu_buddy.c as static functions since they have no external callers. Remove gpu_get_buddy() as it was an unused exported wrapper around the internal __get_buddy(). No functional changes. v2: - Rebased after DRM buddy allocator moved to drivers/gpu/ - Keep gpu_buddy_block_is_free() in header since it's now used by drm_buddy.c - Updated commit message Cc: Christian König Cc: Arunpravin Paneer Selvam Suggested-by: Matthew Auld Signed-off-by: Sanjay Yadav Reviewed-by: Arunpravin Paneer Selvam Signed-off-by: Arunpravin Paneer Selvam Link: https://patch.msgid.link/20260212092527.718455-6-sanjay.kumar.yadav@intel.com --- drivers/gpu/buddy.c | 35 ++++++++++++++++++----------------- include/linux/gpu_buddy.h | 25 ++----------------------- 2 files changed, 20 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/drivers/gpu/buddy.c b/drivers/gpu/buddy.c index 603c59a2013a..b27761246d4b 100644 --- a/drivers/gpu/buddy.c +++ b/drivers/gpu/buddy.c @@ -14,6 +14,24 @@ static struct kmem_cache *slab_blocks; +static unsigned int +gpu_buddy_block_state(struct gpu_buddy_block *block) +{ + return block->header & GPU_BUDDY_HEADER_STATE; +} + +static bool +gpu_buddy_block_is_allocated(struct gpu_buddy_block *block) +{ + return gpu_buddy_block_state(block) == GPU_BUDDY_ALLOCATED; +} + +static bool +gpu_buddy_block_is_split(struct gpu_buddy_block *block) +{ + return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT; +} + 
static struct gpu_buddy_block *gpu_block_alloc(struct gpu_buddy *mm, struct gpu_buddy_block *parent, unsigned int order, @@ -449,23 +467,6 @@ static int split_block(struct gpu_buddy *mm, return 0; } -/** - * gpu_get_buddy - get buddy address - * - * @block: GPU buddy block - * - * Returns the corresponding buddy block for @block, or NULL - * if this is a root block and can't be merged further. - * Requires some kind of locking to protect against - * any concurrent allocate and free operations. - */ -struct gpu_buddy_block * -gpu_get_buddy(struct gpu_buddy_block *block) -{ - return __get_buddy(block); -} -EXPORT_SYMBOL(gpu_get_buddy); - /** * gpu_buddy_reset_clear - reset blocks clear state * diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index bf2a42256536..f1fb6eff604a 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -191,16 +191,10 @@ gpu_buddy_block_order(struct gpu_buddy_block *block) return block->header & GPU_BUDDY_HEADER_ORDER; } -static inline unsigned int -gpu_buddy_block_state(struct gpu_buddy_block *block) -{ - return block->header & GPU_BUDDY_HEADER_STATE; -} - static inline bool -gpu_buddy_block_is_allocated(struct gpu_buddy_block *block) +gpu_buddy_block_is_free(struct gpu_buddy_block *block) { - return gpu_buddy_block_state(block) == GPU_BUDDY_ALLOCATED; + return (block->header & GPU_BUDDY_HEADER_STATE) == GPU_BUDDY_FREE; } static inline bool @@ -209,18 +203,6 @@ gpu_buddy_block_is_clear(struct gpu_buddy_block *block) return block->header & GPU_BUDDY_HEADER_CLEAR; } -static inline bool -gpu_buddy_block_is_free(struct gpu_buddy_block *block) -{ - return gpu_buddy_block_state(block) == GPU_BUDDY_FREE; -} - -static inline bool -gpu_buddy_block_is_split(struct gpu_buddy_block *block) -{ - return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT; -} - static inline u64 gpu_buddy_block_size(struct gpu_buddy *mm, struct gpu_buddy_block *block) @@ -232,9 +214,6 @@ int gpu_buddy_init(struct gpu_buddy *mm, u64 size, 
u64 chunk_size); void gpu_buddy_fini(struct gpu_buddy *mm); -struct gpu_buddy_block * -gpu_get_buddy(struct gpu_buddy_block *block); - int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, u64 size, u64 min_page_size, -- cgit v1.2.3 From 8167d7f674648cfd428ed49773522f9df5c4fdfd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 15 Feb 2026 21:44:18 -0800 Subject: soundwire: sdw.h: repair names and format of kernel-doc comments Fix all kernel-doc warnings in sdw.h: Warning: include/linux/soundwire/sdw.h:538 cannot understand function prototype: 'enum sdw_reg_bank' Warning: include/linux/soundwire/sdw.h:779 struct member 'port_num' not described in 'sdw_transport_params' Warning: include/linux/soundwire/sdw.h:792 struct member 'port_num' not described in 'sdw_enable_ch' Warning: include/linux/soundwire/sdw.h:892 cannot understand function prototype: 'struct sdw_port_config' Warning: include/linux/soundwire/sdw.h:906 cannot understand function prototype: 'struct sdw_stream_config' Warning: include/linux/soundwire/sdw.h:925 cannot understand function prototype: 'enum sdw_stream_state' Warning: include/linux/soundwire/sdw.h:942 cannot understand function prototype: 'struct sdw_stream_params' Warning: include/linux/soundwire/sdw.h:960 cannot understand function prototype: 'struct sdw_stream_runtime' Warning: include/linux/soundwire/sdw.h:1047 struct member 'bpt_stream_refcount' not described in 'sdw_bus' Signed-off-by: Randy Dunlap Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20260216054418.2766846-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index f462717acf20..6147eb1fb210 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -532,7 +532,7 @@ struct sdw_slave_intr_status { }; /** - * sdw_reg_bank - 
SoundWire register banks + * enum sdw_reg_bank - SoundWire register banks * @SDW_BANK0: Soundwire register bank 0 * @SDW_BANK1: Soundwire register bank 1 */ @@ -751,7 +751,7 @@ struct sdw_port_params { * struct sdw_transport_params: Data Port Transport Parameters * * @blk_grp_ctrl_valid: Port implements block group control - * @num: Port number + * @port_num: Port number * @blk_grp_ctrl: Block group control value * @sample_interval: Sample interval * @offset1: Blockoffset of the payload data @@ -782,7 +782,7 @@ struct sdw_transport_params { /** * struct sdw_enable_ch: Enable/disable Data Port channel * - * @num: Port number + * @port_num: Port number * @ch_mask: Active channel mask * @enable: Enable (true) /disable (false) channel */ @@ -885,7 +885,7 @@ void sdw_bus_master_delete(struct sdw_bus *bus); void sdw_show_ping_status(struct sdw_bus *bus, bool sync_delay); /** - * sdw_port_config: Master or Slave Port configuration + * struct sdw_port_config: Master or Slave Port configuration * * @num: Port number * @ch_mask: channels mask for port @@ -896,7 +896,7 @@ struct sdw_port_config { }; /** - * sdw_stream_config: Master or Slave stream configuration + * struct sdw_stream_config: Master or Slave stream configuration * * @frame_rate: Audio frame rate of the stream, in Hz * @ch_count: Channel count of the stream @@ -913,7 +913,7 @@ struct sdw_stream_config { }; /** - * sdw_stream_state: Stream states + * enum sdw_stream_state: Stream states * * @SDW_STREAM_ALLOCATED: New stream allocated. 
* @SDW_STREAM_CONFIGURED: Stream configured @@ -934,7 +934,7 @@ enum sdw_stream_state { }; /** - * sdw_stream_params: Stream parameters + * struct sdw_stream_params: Stream parameters * * @rate: Sampling frequency, in Hz * @ch_count: Number of channels @@ -947,7 +947,7 @@ struct sdw_stream_params { }; /** - * sdw_stream_runtime: Runtime stream parameters + * struct sdw_stream_runtime: Runtime stream parameters * * @name: SoundWire stream name * @params: Stream parameters @@ -983,7 +983,7 @@ struct sdw_stream_runtime { * @defer_msg: Defer message * @params: Current bus parameters * @stream_refcount: number of streams currently using this bus - * @btp_stream_refcount: number of BTP streams currently using this bus (should + * @bpt_stream_refcount: number of BTP streams currently using this bus (should * be zero or one, multiple streams per link is not supported). * @bpt_stream: pointer stored to handle BTP streams. * @ops: Master callback ops -- cgit v1.2.3 From 88440208c6074e639a7ccc038c6a7ed4b6f8bb99 Mon Sep 17 00:00:00 2001 From: Tomas Melin Date: Tue, 10 Feb 2026 10:53:34 +0000 Subject: iio: industrialio-backend: support backend capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all backends support the full set of capabilities provided by the industrialio-backend framework. Capability bits can be used in frontends and backends for checking for a certain feature set, or if using related functions can be expected to fail. Capability bits should be set by a compatible backend and provided when registering the backend. 
Reviewed-by: Nuno Sá Reviewed-by: David Lechner Signed-off-by: Tomas Melin Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-backend.c | 16 ++++++++++++++++ include/linux/iio/backend.h | 24 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include') diff --git a/drivers/iio/industrialio-backend.c b/drivers/iio/industrialio-backend.c index 447b694d6d5f..1afd00763da9 100644 --- a/drivers/iio/industrialio-backend.c +++ b/drivers/iio/industrialio-backend.c @@ -56,6 +56,7 @@ struct iio_backend { void *priv; const char *name; unsigned int cached_reg_addr; + u32 caps; /* * This index is relative to the frontend. Meaning that for * frontends with multiple backends, this will be the index of this @@ -774,6 +775,20 @@ int iio_backend_extend_chan_spec(struct iio_backend *back, } EXPORT_SYMBOL_NS_GPL(iio_backend_extend_chan_spec, "IIO_BACKEND"); +/** + * iio_backend_has_caps - Check if backend has specific capabilities + * @back: Backend device + * @caps: Capabilities to check + * + * RETURNS: + * True if backend has all the requested capabilities, false otherwise. 
+ */ +bool iio_backend_has_caps(struct iio_backend *back, u32 caps) +{ + return (back->caps & caps) == caps; +} +EXPORT_SYMBOL_NS_GPL(iio_backend_has_caps, "IIO_BACKEND"); + static void iio_backend_release(void *arg) { struct iio_backend *back = arg; @@ -1114,6 +1129,7 @@ int devm_iio_backend_register(struct device *dev, back->ops = info->ops; back->name = info->name; + back->caps = info->caps; back->owner = dev->driver->owner; back->dev = dev; back->priv = priv; diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 7f815f3fed6a..4d15c2a9802c 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -84,6 +84,27 @@ enum iio_backend_filter_type { IIO_BACKEND_FILTER_TYPE_MAX }; +/** + * enum iio_backend_capabilities - Backend capabilities + * Backend capabilities can be used by frontends to check if a given + * functionality is supported by the backend. This is useful for frontend + * devices which are expected to work with alternative backend + * implementations. Capabilities are loosely coupled with operations, + * meaning that a capability requires certain operations to be implemented + * by the backend. A capability might be mapped to a single operation or + * multiple operations. + * + * @IIO_BACKEND_CAP_CALIBRATION: Backend supports digital interface + * calibration. Calibration procedure is device specific. + * @IIO_BACKEND_CAP_BUFFER: Support for IIO buffer interface. + * @IIO_BACKEND_CAP_ENABLE: Backend can be explicitly enabled/disabled. + */ +enum iio_backend_capabilities { + IIO_BACKEND_CAP_CALIBRATION = BIT(0), + IIO_BACKEND_CAP_BUFFER = BIT(1), + IIO_BACKEND_CAP_ENABLE = BIT(2), +}; + /** * struct iio_backend_ops - operations structure for an iio_backend * @enable: Enable backend. @@ -179,10 +200,12 @@ struct iio_backend_ops { * struct iio_backend_info - info structure for an iio_backend * @name: Backend name. * @ops: Backend operations. + * @caps: Backend capabilities. 
(bitmask of enum iio_backend_capabilities). */ struct iio_backend_info { const char *name; const struct iio_backend_ops *ops; + u32 caps; }; int iio_backend_chan_enable(struct iio_backend *back, unsigned int chan); @@ -235,6 +258,7 @@ int iio_backend_read_raw(struct iio_backend *back, long mask); int iio_backend_extend_chan_spec(struct iio_backend *back, struct iio_chan_spec *chan); +bool iio_backend_has_caps(struct iio_backend *back, u32 caps); void *iio_backend_get_priv(const struct iio_backend *conv); struct iio_backend *devm_iio_backend_get(struct device *dev, const char *name); struct iio_backend *devm_iio_backend_fwnode_get(struct device *dev, -- cgit v1.2.3 From 8b65eb52d93e4e496bd26e6867152344554eb39e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Feb 2026 11:15:10 -0800 Subject: locking/mutex: Rename mutex_init_lockep() Typo, this wants to be _lockdep(). Fixes: 51d7a054521d ("locking/mutex: Redo __mutex_init() to reduce generated code size") Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260217191512.1180151-2-dave@stgolabs.net --- include/linux/mutex.h | 4 ++-- kernel/locking/mutex.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index ecaa0440f6ec..8126da959088 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -87,12 +87,12 @@ do { \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) #ifdef CONFIG_DEBUG_LOCK_ALLOC -void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key); +void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key); static inline void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { - mutex_init_lockep(lock, name, key); + mutex_init_lockdep(lock, name, key); } #else extern void mutex_init_generic(struct mutex *lock); diff --git a/kernel/locking/mutex.c 
b/kernel/locking/mutex.c index 2a1d165b3167..c867f6c15530 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -171,7 +171,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) #else /* !CONFIG_DEBUG_LOCK_ALLOC */ -void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key) +void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key) { __mutex_init_generic(lock); @@ -181,7 +181,7 @@ void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_k debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); } -EXPORT_SYMBOL(mutex_init_lockep); +EXPORT_SYMBOL(mutex_init_lockdep); #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) -- cgit v1.2.3 From babcde3be8c9148aa60a14b17831e8f249854963 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Feb 2026 11:15:11 -0800 Subject: locking/mutex: Fix wrong comment for CONFIG_DEBUG_LOCK_ALLOC ... that endif block should be CONFIG_DEBUG_LOCK_ALLOC, not CONFIG_LOCKDEP. 
Fixes: 51d7a054521d ("locking/mutex: Redo __mutex_init() to reduce generated code size") Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260217191512.1180151-3-dave@stgolabs.net --- include/linux/mutex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 8126da959088..f57d2a97da57 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -146,7 +146,7 @@ static inline void __mutex_init(struct mutex *lock, const char *name, { mutex_rt_init_generic(lock); } -#endif /* !CONFIG_LOCKDEP */ +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES -- cgit v1.2.3 From 50214dc4382055352fb1d7b9779550dabf5059e5 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Feb 2026 11:15:12 -0800 Subject: locking/mutex: Add killable flavor to guard definitions The mutex guard family defines _try and _intr variants but is missing the killable one. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260217191512.1180151-4-dave@stgolabs.net --- include/linux/mutex.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index f57d2a97da57..2f648ee204e7 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -253,6 +253,7 @@ extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) __cond_a DEFINE_LOCK_GUARD_1(mutex, struct mutex, mutex_lock(_T->lock), mutex_unlock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(mutex, _try, mutex_trylock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(mutex, _intr, mutex_lock_interruptible(_T->lock), _RET == 0) +DEFINE_LOCK_GUARD_1_COND(mutex, _kill, mutex_lock_killable(_T->lock), _RET == 0) DEFINE_LOCK_GUARD_1(mutex_init, struct mutex, mutex_init(_T->lock), /* */) DECLARE_LOCK_GUARD_1_ATTRS(mutex, __acquires(_T), __releases(*(struct mutex **)_T)) @@ -261,6 +262,8 @@ DECLARE_LOCK_GUARD_1_ATTRS(mutex_try, __acquires(_T), __releases(*(struct mutex #define class_mutex_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_try, _T) DECLARE_LOCK_GUARD_1_ATTRS(mutex_intr, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_intr_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_intr, _T) +DECLARE_LOCK_GUARD_1_ATTRS(mutex_kill, __acquires(_T), __releases(*(struct mutex **)_T)) +#define class_mutex_kill_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_kill, _T) DECLARE_LOCK_GUARD_1_ATTRS(mutex_init, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_init_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_init, _T) -- cgit v1.2.3 From d28cb72e07b21acb90204be201966b9e92eca75a Mon Sep 17 00:00:00 2001 From: Karel Balej Date: Sat, 13 Dec 2025 09:48:12 +0100 Subject: dt-bindings: power: define ID for Marvell PXA1908 audio domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define an identifier for the SoC's 
audio power island so that it can be referenced through device tree. Acked-by: Krzysztof Kozlowski Reviewed-by: Duje Mihanović Signed-off-by: Karel Balej Signed-off-by: Ulf Hansson --- include/dt-bindings/power/marvell,pxa1908-power.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/power/marvell,pxa1908-power.h b/include/dt-bindings/power/marvell,pxa1908-power.h index 19b088351af1..173f47e0e69d 100644 --- a/include/dt-bindings/power/marvell,pxa1908-power.h +++ b/include/dt-bindings/power/marvell,pxa1908-power.h @@ -13,5 +13,6 @@ #define PXA1908_POWER_DOMAIN_GPU2D 2 #define PXA1908_POWER_DOMAIN_DSI 3 #define PXA1908_POWER_DOMAIN_ISP 4 +#define PXA1908_POWER_DOMAIN_AUDIO 5 #endif -- cgit v1.2.3 From 3cdac25e80a23ca107189b55deaadf0a49bab44b Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 13 Jan 2026 12:00:09 +0100 Subject: dt-bindings: power: mt7622-power: Add MT7622_POWER_DOMAIN_AUDIO Add the missing power domain for the Audio IPs in this SoC. Signed-off-by: AngeloGioacchino Del Regno Acked-by: Rob Herring (Arm) Signed-off-by: Ulf Hansson --- include/dt-bindings/power/mt7622-power.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/power/mt7622-power.h b/include/dt-bindings/power/mt7622-power.h index ffad81ad3d46..ec244989eeba 100644 --- a/include/dt-bindings/power/mt7622-power.h +++ b/include/dt-bindings/power/mt7622-power.h @@ -10,5 +10,6 @@ #define MT7622_POWER_DOMAIN_HIF0 1 #define MT7622_POWER_DOMAIN_HIF1 2 #define MT7622_POWER_DOMAIN_WB 3 +#define MT7622_POWER_DOMAIN_AUDIO 4 #endif /* _DT_BINDINGS_POWER_MT7622_POWER_H */ -- cgit v1.2.3 From 82d58440cd72dd4bf719e50eddfb9c4ef50f2deb Mon Sep 17 00:00:00 2001 From: Irving-CH Lin Date: Mon, 2 Feb 2026 14:48:13 +0800 Subject: dt-bindings: power: Add MediaTek MT8189 power domain Add dt schema and IDs for the power domain of MediaTek MT8189 SoC. 
The MT8189 power domain IP provide power domains control function for subsys (eg. MFG, audio, venc/vdec ...). Signed-off-by: Irving-CH Lin Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Ulf Hansson --- .../bindings/power/mediatek,power-controller.yaml | 1 + include/dt-bindings/power/mediatek,mt8189-power.h | 38 ++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 include/dt-bindings/power/mediatek,mt8189-power.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml index 9507b342a7ee..07f046277f8a 100644 --- a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml +++ b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml @@ -31,6 +31,7 @@ properties: - mediatek,mt8183-power-controller - mediatek,mt8186-power-controller - mediatek,mt8188-power-controller + - mediatek,mt8189-power-controller - mediatek,mt8192-power-controller - mediatek,mt8195-power-controller - mediatek,mt8196-hwv-hfrp-power-controller diff --git a/include/dt-bindings/power/mediatek,mt8189-power.h b/include/dt-bindings/power/mediatek,mt8189-power.h new file mode 100644 index 000000000000..70a8c2113457 --- /dev/null +++ b/include/dt-bindings/power/mediatek,mt8189-power.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2025 MediaTek Inc. 
+ * Author: Qiqi Wang + */ + +#ifndef _DT_BINDINGS_POWER_MT8189_POWER_H +#define _DT_BINDINGS_POWER_MT8189_POWER_H + +/* SPM */ +#define MT8189_POWER_DOMAIN_CONN 0 +#define MT8189_POWER_DOMAIN_AUDIO 1 +#define MT8189_POWER_DOMAIN_ADSP_TOP_DORMANT 2 +#define MT8189_POWER_DOMAIN_ADSP_INFRA 3 +#define MT8189_POWER_DOMAIN_ADSP_AO 4 +#define MT8189_POWER_DOMAIN_MM_INFRA 5 +#define MT8189_POWER_DOMAIN_ISP_IMG1 6 +#define MT8189_POWER_DOMAIN_ISP_IMG2 7 +#define MT8189_POWER_DOMAIN_ISP_IPE 8 +#define MT8189_POWER_DOMAIN_VDE0 9 +#define MT8189_POWER_DOMAIN_VEN0 10 +#define MT8189_POWER_DOMAIN_CAM_MAIN 11 +#define MT8189_POWER_DOMAIN_CAM_SUBA 12 +#define MT8189_POWER_DOMAIN_CAM_SUBB 13 +#define MT8189_POWER_DOMAIN_MDP0 14 +#define MT8189_POWER_DOMAIN_DISP 15 +#define MT8189_POWER_DOMAIN_DP_TX 16 +#define MT8189_POWER_DOMAIN_CSI_RX 17 +#define MT8189_POWER_DOMAIN_SSUSB 18 +#define MT8189_POWER_DOMAIN_MFG0 19 +#define MT8189_POWER_DOMAIN_MFG1 20 +#define MT8189_POWER_DOMAIN_MFG2 21 +#define MT8189_POWER_DOMAIN_MFG3 22 +#define MT8189_POWER_DOMAIN_EDP_TX_DORMANT 23 +#define MT8189_POWER_DOMAIN_PCIE 24 +#define MT8189_POWER_DOMAIN_PCIE_PHY 25 + +#endif /* _DT_BINDINGS_POWER_MT8189_POWER_H */ -- cgit v1.2.3 From 263ff314cc5602599d481b0912a381555fcbad28 Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Fri, 28 Nov 2025 07:20:11 +0200 Subject: mmc: core: Add quirk for incorrect manufacturing date Some eMMC vendors need to report manufacturing dates beyond 2025 but are reluctant to update the EXT_CSD revision from 8 to 9. Changing the Updating the EXT_CSD revision may involve additional testing or qualification steps with customers. To ease this transition and avoid a full re-qualification process, a workaround is needed. This patch introduces a temporary quirk that re-purposes the year codes corresponding to 2010, 2011, and 2012 to represent the years 2026, 2027, and 2028, respectively. This solution is only valid for this three-year period. 
After 2028, vendors must update their firmware to set EXT_CSD_REV=9 to continue reporting the correct manufacturing date in compliance with the JEDEC standard. The `MMC_QUIRK_BROKEN_MDT` is introduced and enabled for all Sandisk devices to handle this behavior. Signed-off-by: Avri Altman Signed-off-by: Ulf Hansson --- drivers/mmc/core/card.h | 6 ++++++ drivers/mmc/core/mmc.c | 5 +++++ drivers/mmc/core/quirks.h | 3 +++ include/linux/mmc/card.h | 1 + 4 files changed, 15 insertions(+) (limited to 'include') diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h index 1200951bab08..a9619dd45270 100644 --- a/drivers/mmc/core/card.h +++ b/drivers/mmc/core/card.h @@ -89,6 +89,7 @@ struct mmc_fixup { #define CID_MANFID_MICRON 0x13 #define CID_MANFID_SAMSUNG 0x15 #define CID_MANFID_APACER 0x27 +#define CID_MANFID_SANDISK_MMC 0x45 #define CID_MANFID_SWISSBIT 0x5D #define CID_MANFID_KINGSTON 0x70 #define CID_MANFID_HYNIX 0x90 @@ -305,4 +306,9 @@ static inline int mmc_card_no_uhs_ddr50_tuning(const struct mmc_card *c) return c->quirks & MMC_QUIRK_NO_UHS_DDR50_TUNING; } +static inline int mmc_card_broken_mdt(const struct mmc_card *c) +{ + return c->quirks & MMC_QUIRK_BROKEN_MDT; +} + #endif diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index f744dd501842..8846550a8892 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -676,6 +676,11 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) /* Adjust production date as per JEDEC JESD84-B51B September 2025 */ if (card->cid.year < 2023) card->cid.year += 16; + } else { + /* Handle vendors with broken MDT reporting */ + if (mmc_card_broken_mdt(card) && card->cid.year >= 2010 && + card->cid.year <= 2012) + card->cid.year += 16; } } diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h index c417ed34c057..f5e8a0f6d11b 100644 --- a/drivers/mmc/core/quirks.h +++ b/drivers/mmc/core/quirks.h @@ -170,6 +170,9 @@ static const struct mmc_fixup __maybe_unused 
mmc_ext_csd_fixups[] = { MMC_FIXUP_EXT_CSD_REV(CID_NAME_ANY, CID_MANFID_NUMONYX, 0x014e, add_quirk, MMC_QUIRK_BROKEN_HPI, 6), + MMC_FIXUP(CID_NAME_ANY, CID_MANFID_SANDISK_MMC, CID_OEMID_ANY, add_quirk_mmc, + MMC_QUIRK_BROKEN_MDT), + END_FIXUP }; diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index e9e964c20e53..4722dd7e46ce 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -329,6 +329,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_CACHE_FLUSH (1<<16) /* Don't flush cache until the write has occurred */ #define MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY (1<<17) /* Disable broken SD poweroff notify support */ #define MMC_QUIRK_NO_UHS_DDR50_TUNING (1<<18) /* Disable DDR50 tuning */ +#define MMC_QUIRK_BROKEN_MDT (1<<19) /* Wrong manufacturing year */ bool written_flag; /* Indicates eMMC has been written since power on */ bool reenable_cmdq; /* Re-enable Command Queue */ -- cgit v1.2.3 From c0b68bc25efeaca8a1ef3a0cfab5fbf5f81d002d Mon Sep 17 00:00:00 2001 From: Jeff Chen Date: Tue, 13 Jan 2026 11:15:17 +0800 Subject: mmc: sdio: add NXP vendor and IW61x device IDs Add NXP's SDIO vendor ID (0x0471) and IW61x device ID (0x0205) to sdio_ids.h for future support of NXP Wi-Fi chips over SDIO. 
Signed-off-by: Jeff Chen Signed-off-by: Ulf Hansson --- include/linux/mmc/sdio_ids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 673cbdf43453..39ac2b612e4a 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -116,6 +116,9 @@ #define SDIO_VENDOR_ID_MICROCHIP_WILC 0x0296 #define SDIO_DEVICE_ID_MICROCHIP_WILC1000 0x5347 +#define SDIO_VENDOR_ID_NXP 0x0471 +#define SDIO_DEVICE_ID_NXP_IW61X 0x0205 + #define SDIO_VENDOR_ID_REALTEK 0x024c #define SDIO_DEVICE_ID_REALTEK_RTW8723BS 0xb723 #define SDIO_DEVICE_ID_REALTEK_RTW8821BS 0xb821 -- cgit v1.2.3 From d6bf2e64dec87322f2b11565ddb59c0e967f96e3 Mon Sep 17 00:00:00 2001 From: Luke Wang Date: Wed, 4 Feb 2026 11:40:03 +0800 Subject: mmc: core: Optimize time for secure erase/trim for some Kingston eMMCs Kingston eMMC IY2964 and IB2932 takes a fixed ~2 seconds for each secure erase/trim operation regardless of size - that is, a single secure erase/trim operation of 1MB takes the same time as 1GB. With default calculated 3.5MB max discard size, secure erase 1GB requires ~300 separate operations taking ~10 minutes total. Add a card quirk, MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME, to set maximum secure erase size for those devices. This allows 1GB secure erase to complete in a single operation, reducing time from 10 minutes to just 2 seconds. 
Signed-off-by: Luke Wang Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/card.h | 5 +++++ drivers/mmc/core/queue.c | 9 +++++++-- drivers/mmc/core/quirks.h | 9 +++++++++ include/linux/mmc/card.h | 1 + 4 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h index a9619dd45270..a7c364d0030a 100644 --- a/drivers/mmc/core/card.h +++ b/drivers/mmc/core/card.h @@ -311,4 +311,9 @@ static inline int mmc_card_broken_mdt(const struct mmc_card *c) return c->quirks & MMC_QUIRK_BROKEN_MDT; } +static inline int mmc_card_fixed_secure_erase_trim_time(const struct mmc_card *c) +{ + return c->quirks & MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME; +} + #endif diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 13000fc57e2e..39fcb662c43f 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -184,8 +184,13 @@ static void mmc_queue_setup_discard(struct mmc_card *card, return; lim->max_hw_discard_sectors = max_discard; - if (mmc_card_can_secure_erase_trim(card)) - lim->max_secure_erase_sectors = max_discard; + if (mmc_card_can_secure_erase_trim(card)) { + if (mmc_card_fixed_secure_erase_trim_time(card)) + lim->max_secure_erase_sectors = UINT_MAX >> card->erase_shift; + else + lim->max_secure_erase_sectors = max_discard; + } + if (mmc_card_can_trim(card) && card->erased_byte == 0) lim->max_write_zeroes_sectors = max_discard; diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h index f5e8a0f6d11b..6f727b4a60a5 100644 --- a/drivers/mmc/core/quirks.h +++ b/drivers/mmc/core/quirks.h @@ -153,6 +153,15 @@ static const struct mmc_fixup __maybe_unused mmc_blk_fixups[] = { MMC_FIXUP("M62704", CID_MANFID_KINGSTON, 0x0100, add_quirk_mmc, MMC_QUIRK_TRIM_BROKEN), + /* + * On Some Kingston eMMCs, secure erase/trim time is independent + * of erase size, fixed at approximately 2 seconds. 
+ */ + MMC_FIXUP("IY2964", CID_MANFID_KINGSTON, 0x0100, add_quirk_mmc, + MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME), + MMC_FIXUP("IB2932", CID_MANFID_KINGSTON, 0x0100, add_quirk_mmc, + MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME), + END_FIXUP }; diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 4722dd7e46ce..9dc4750296af 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -330,6 +330,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY (1<<17) /* Disable broken SD poweroff notify support */ #define MMC_QUIRK_NO_UHS_DDR50_TUNING (1<<18) /* Disable DDR50 tuning */ #define MMC_QUIRK_BROKEN_MDT (1<<19) /* Wrong manufacturing year */ +#define MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME (1<<20) /* Secure erase/trim time is fixed regardless of size */ bool written_flag; /* Indicates eMMC has been written since power on */ bool reenable_cmdq; /* Re-enable Command Queue */ -- cgit v1.2.3 From 94d709be8c0dc875dfc9ebb64d3b8093d0790c15 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:31:57 +0100 Subject: xattr: add rcu_head and rhash_head to struct simple_xattr In preparation for converting simple_xattrs from rbtree to rhashtable, add rhash_head and rcu_head members to struct simple_xattr. The rhashtable implementation will use rhash_head for hash table linkage and RCU-based lockless reads, requiring that replaced or removed xattr entries be freed via call_rcu() rather than immediately. Add simple_xattr_free_rcu() which schedules RCU-deferred freeing of an xattr entry. This will be used by callers of simple_xattr_set() once they switch to the rhashtable-based xattr store. No functional changes. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-1-c2efa4f74cb7@kernel.org Acked-by: Darrick J. 
Wong Signed-off-by: Christian Brauner --- fs/xattr.c | 23 +++++++++++++++++++++++ include/linux/xattr.h | 4 ++++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/fs/xattr.c b/fs/xattr.c index 3e49e612e1ba..9cbb1917bcb2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1197,6 +1197,29 @@ void simple_xattr_free(struct simple_xattr *xattr) kvfree(xattr); } +static void simple_xattr_rcu_free(struct rcu_head *head) +{ + struct simple_xattr *xattr; + + xattr = container_of(head, struct simple_xattr, rcu); + simple_xattr_free(xattr); +} + +/** + * simple_xattr_free_rcu - free an xattr object after an RCU grace period + * @xattr: the xattr object + * + * Schedule RCU-deferred freeing of an xattr entry. This is used by + * rhashtable-based callers of simple_xattr_set() that replace or remove + * an existing entry while concurrent RCU readers may still be accessing + * it. + */ +void simple_xattr_free_rcu(struct simple_xattr *xattr) +{ + if (xattr) + call_rcu(&xattr->rcu, simple_xattr_rcu_free); +} + /** * simple_xattr_alloc - allocate new xattr object * @value: value of the xattr object diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 296b5ee5c979..fdbd2095414a 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -112,6 +113,8 @@ struct simple_xattrs { struct simple_xattr { struct rb_node rb_node; + struct rhash_head hash_node; + struct rcu_head rcu; char *name; size_t size; char value[] __counted_by(size); @@ -122,6 +125,7 @@ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); +void simple_xattr_free_rcu(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); struct simple_xattr 
*simple_xattr_set(struct simple_xattrs *xattrs, -- cgit v1.2.3 From b32c4a213698ab351b44da2fd1b2a5976c7fa033 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:31:58 +0100 Subject: xattr: add rhashtable-based simple_xattr infrastructure Add rhashtable support to the simple_xattr subsystem while keeping the existing rbtree code fully functional. This allows consumers to be migrated one at a time without breaking any intermediate build. struct simple_xattrs gains a dispatch flag and a union holding either the rbtree (rb_root + rwlock) or rhashtable state: struct simple_xattrs { bool use_rhashtable; union { struct { struct rb_root rb_root; rwlock_t lock; }; struct rhashtable ht; }; }; simple_xattrs_init() continues to set up the rbtree path for existing embedded-struct callers. Add simple_xattrs_alloc() which dynamically allocates a simple_xattrs and initializes the rhashtable path. This is the entry point for consumers switching to pointer-based lazy allocation. The five core functions (get, set, list, add, free) dispatch based on the use_rhashtable flag. Existing callers continue to use the rbtree path unchanged. As each consumer is converted it will switch to simple_xattrs_alloc() and the rhashtable path. Once all consumers are converted a follow-up patch will remove the rbtree code. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-2-c2efa4f74cb7@kernel.org Acked-by: Darrick J. 
Wong Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/xattr.c | 439 ++++++++++++++++++++++++++++++++++++++------------ include/linux/xattr.h | 25 ++- mm/shmem.c | 2 +- 3 files changed, 357 insertions(+), 109 deletions(-) (limited to 'include') diff --git a/fs/xattr.c b/fs/xattr.c index 9cbb1917bcb2..1d98ea459b7b 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -1228,22 +1229,25 @@ void simple_xattr_free_rcu(struct simple_xattr *xattr) * Allocate a new xattr object and initialize respective members. The caller is * responsible for handling the name of the xattr. * - * Return: On success a new xattr object is returned. On failure NULL is - * returned. + * Return: New xattr object on success, NULL if @value is NULL, ERR_PTR on + * failure. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { struct simple_xattr *new_xattr; size_t len; + if (!value) + return NULL; + /* wrap around? */ len = sizeof(*new_xattr) + size; if (len < sizeof(*new_xattr)) - return NULL; + return ERR_PTR(-ENOMEM); new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) - return NULL; + return ERR_PTR(-ENOMEM); new_xattr->size = size; memcpy(new_xattr->value, value, size); @@ -1287,6 +1291,33 @@ static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, return rbtree_simple_xattr_cmp(xattr->name, node); } +static u32 simple_xattr_hashfn(const void *data, u32 len, u32 seed) +{ + const char *name = data; + return jhash(name, strlen(name), seed); +} + +static u32 simple_xattr_obj_hashfn(const void *obj, u32 len, u32 seed) +{ + const struct simple_xattr *xattr = obj; + return jhash(xattr->name, strlen(xattr->name), seed); +} + +static int simple_xattr_obj_cmpfn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct simple_xattr *xattr = obj; + return strcmp(xattr->name, arg->key); +} + +static const struct rhashtable_params simple_xattr_params = { + .head_offset = 
offsetof(struct simple_xattr, hash_node), + .hashfn = simple_xattr_hashfn, + .obj_hashfn = simple_xattr_obj_hashfn, + .obj_cmpfn = simple_xattr_obj_cmpfn, + .automatic_shrinking = true, +}; + /** * simple_xattr_get - get an xattr object * @xattrs: the header of the xattr object @@ -1306,22 +1337,41 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { struct simple_xattr *xattr = NULL; - struct rb_node *rbp; int ret = -ENODATA; - read_lock(&xattrs->lock); - rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp); - if (rbp) { - xattr = rb_entry(rbp, struct simple_xattr, rb_node); - ret = xattr->size; - if (buffer) { - if (size < xattr->size) - ret = -ERANGE; - else - memcpy(buffer, xattr->value, xattr->size); + if (xattrs->use_rhashtable) { + guard(rcu)(); + xattr = rhashtable_lookup(&xattrs->ht, name, + simple_xattr_params); + if (xattr) { + ret = xattr->size; + if (buffer) { + if (size < xattr->size) + ret = -ERANGE; + else + memcpy(buffer, xattr->value, + xattr->size); + } + } + } else { + struct rb_node *rbp; + + read_lock(&xattrs->lock); + rbp = rb_find(name, &xattrs->rb_root, + rbtree_simple_xattr_cmp); + if (rbp) { + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + ret = xattr->size; + if (buffer) { + if (size < xattr->size) + ret = -ERANGE; + else + memcpy(buffer, xattr->value, + xattr->size); + } } + read_unlock(&xattrs->lock); } - read_unlock(&xattrs->lock); return ret; } @@ -1355,78 +1405,134 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { - struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; - struct rb_node *parent = NULL, **rbp; - int err = 0, ret; + struct simple_xattr *old_xattr = NULL; + int err = 0; - /* value == NULL means remove */ - if (value) { - new_xattr = simple_xattr_alloc(value, size); - if (!new_xattr) - return ERR_PTR(-ENOMEM); + CLASS(simple_xattr, new_xattr)(value, size); + if 
(IS_ERR(new_xattr)) + return new_xattr; + if (new_xattr) { new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); - if (!new_xattr->name) { - simple_xattr_free(new_xattr); + if (!new_xattr->name) return ERR_PTR(-ENOMEM); - } } - write_lock(&xattrs->lock); - rbp = &xattrs->rb_root.rb_node; - while (*rbp) { - parent = *rbp; - ret = rbtree_simple_xattr_cmp(name, *rbp); - if (ret < 0) - rbp = &(*rbp)->rb_left; - else if (ret > 0) - rbp = &(*rbp)->rb_right; - else - old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); - if (old_xattr) - break; - } + if (xattrs->use_rhashtable) { + /* + * Lookup is safe without RCU here since writes are + * serialized by the caller. + */ + old_xattr = rhashtable_lookup_fast(&xattrs->ht, name, + simple_xattr_params); + + if (old_xattr) { + /* Fail if XATTR_CREATE is requested and the xattr exists. */ + if (flags & XATTR_CREATE) + return ERR_PTR(-EEXIST); + + if (new_xattr) { + err = rhashtable_replace_fast(&xattrs->ht, + &old_xattr->hash_node, + &new_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); + } else { + err = rhashtable_remove_fast(&xattrs->ht, + &old_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); + } + } else { + /* Fail if XATTR_REPLACE is requested but no xattr is found. */ + if (flags & XATTR_REPLACE) + return ERR_PTR(-ENODATA); + + /* + * If XATTR_CREATE or no flags are specified together + * with a new value simply insert it. + */ + if (new_xattr) { + err = rhashtable_insert_fast(&xattrs->ht, + &new_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); + } - if (old_xattr) { - /* Fail if XATTR_CREATE is requested and the xattr exists. */ - if (flags & XATTR_CREATE) { - err = -EEXIST; - goto out_unlock; + /* + * If XATTR_CREATE or no flags are specified and + * neither an old or new xattr exist then we don't + * need to do anything. 
+ */ } - - if (new_xattr) - rb_replace_node(&old_xattr->rb_node, - &new_xattr->rb_node, &xattrs->rb_root); - else - rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { - /* Fail if XATTR_REPLACE is requested but no xattr is found. */ - if (flags & XATTR_REPLACE) { - err = -ENODATA; - goto out_unlock; - } + struct rb_node *parent = NULL, **rbp; + int ret; - /* - * If XATTR_CREATE or no flags are specified together with a - * new value simply insert it. - */ - if (new_xattr) { - rb_link_node(&new_xattr->rb_node, parent, rbp); - rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root); + write_lock(&xattrs->lock); + rbp = &xattrs->rb_root.rb_node; + while (*rbp) { + parent = *rbp; + ret = rbtree_simple_xattr_cmp(name, *rbp); + if (ret < 0) + rbp = &(*rbp)->rb_left; + else if (ret > 0) + rbp = &(*rbp)->rb_right; + else + old_xattr = rb_entry(*rbp, struct simple_xattr, + rb_node); + if (old_xattr) + break; } - /* - * If XATTR_CREATE or no flags are specified and neither an - * old or new xattr exist then we don't need to do anything. - */ - } + if (old_xattr) { + /* Fail if XATTR_CREATE is requested and the xattr exists. */ + if (flags & XATTR_CREATE) { + err = -EEXIST; + goto out_unlock; + } + + if (new_xattr) + rb_replace_node(&old_xattr->rb_node, + &new_xattr->rb_node, + &xattrs->rb_root); + else + rb_erase(&old_xattr->rb_node, + &xattrs->rb_root); + } else { + /* Fail if XATTR_REPLACE is requested but no xattr is found. */ + if (flags & XATTR_REPLACE) { + err = -ENODATA; + goto out_unlock; + } + + /* + * If XATTR_CREATE or no flags are specified together + * with a new value simply insert it. + */ + if (new_xattr) { + rb_link_node(&new_xattr->rb_node, parent, rbp); + rb_insert_color(&new_xattr->rb_node, + &xattrs->rb_root); + } + + /* + * If XATTR_CREATE or no flags are specified and + * neither an old or new xattr exist then we don't + * need to do anything. 
+ */ + } out_unlock: - write_unlock(&xattrs->lock); - if (!err) - return old_xattr; - simple_xattr_free(new_xattr); - return ERR_PTR(err); + write_unlock(&xattrs->lock); + if (err) + return ERR_PTR(err); + } + retain_and_null_ptr(new_xattr); + return old_xattr; } static bool xattr_is_trusted(const char *name) @@ -1467,7 +1573,6 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; - struct rb_node *rbp; ssize_t remaining_size = size; int err = 0; @@ -1487,23 +1592,62 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, remaining_size -= err; err = 0; - read_lock(&xattrs->lock); - for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { - xattr = rb_entry(rbp, struct simple_xattr, rb_node); + if (!xattrs) + return size - remaining_size; - /* skip "trusted." attributes for unprivileged callers */ - if (!trusted && xattr_is_trusted(xattr->name)) - continue; + if (xattrs->use_rhashtable) { + struct rhashtable_iter iter; - /* skip MAC labels; these are provided by LSM above */ - if (xattr_is_maclabel(xattr->name)) - continue; + rhashtable_walk_enter(&xattrs->ht, &iter); + rhashtable_walk_start(&iter); - err = xattr_list_one(&buffer, &remaining_size, xattr->name); - if (err) - break; + while ((xattr = rhashtable_walk_next(&iter)) != NULL) { + if (IS_ERR(xattr)) { + if (PTR_ERR(xattr) == -EAGAIN) + continue; + err = PTR_ERR(xattr); + break; + } + + /* skip "trusted." 
attributes for unprivileged callers */ + if (!trusted && xattr_is_trusted(xattr->name)) + continue; + + /* skip MAC labels; these are provided by LSM above */ + if (xattr_is_maclabel(xattr->name)) + continue; + + err = xattr_list_one(&buffer, &remaining_size, + xattr->name); + if (err) + break; + } + + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + } else { + struct rb_node *rbp; + + read_lock(&xattrs->lock); + for (rbp = rb_first(&xattrs->rb_root); rbp; + rbp = rb_next(rbp)) { + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + + /* skip "trusted." attributes for unprivileged callers */ + if (!trusted && xattr_is_trusted(xattr->name)) + continue; + + /* skip MAC labels; these are provided by LSM above */ + if (xattr_is_maclabel(xattr->name)) + continue; + + err = xattr_list_one(&buffer, &remaining_size, + xattr->name); + if (err) + break; + } + read_unlock(&xattrs->lock); } - read_unlock(&xattrs->lock); return err ? err : size - remaining_size; } @@ -1536,9 +1680,16 @@ static bool rbtree_simple_xattr_less(struct rb_node *new_node, void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { - write_lock(&xattrs->lock); - rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less); - write_unlock(&xattrs->lock); + if (xattrs->use_rhashtable) { + WARN_ON(rhashtable_insert_fast(&xattrs->ht, + &new_xattr->hash_node, + simple_xattr_params)); + } else { + write_lock(&xattrs->lock); + rb_add(&new_xattr->rb_node, &xattrs->rb_root, + rbtree_simple_xattr_less); + write_unlock(&xattrs->lock); + } } /** @@ -1549,10 +1700,80 @@ void simple_xattr_add(struct simple_xattrs *xattrs, */ void simple_xattrs_init(struct simple_xattrs *xattrs) { + xattrs->use_rhashtable = false; xattrs->rb_root = RB_ROOT; rwlock_init(&xattrs->lock); } +/** + * simple_xattrs_alloc - allocate and initialize a new xattr header + * + * Dynamically allocate a simple_xattrs header and initialize the + * underlying rhashtable. 
This is intended for consumers that want + * rhashtable-based xattr storage. + * + * Return: On success a new simple_xattrs is returned. On failure an + * ERR_PTR is returned. + */ +struct simple_xattrs *simple_xattrs_alloc(void) +{ + struct simple_xattrs *xattrs __free(kfree) = NULL; + + xattrs = kzalloc(sizeof(*xattrs), GFP_KERNEL); + if (!xattrs) + return ERR_PTR(-ENOMEM); + + xattrs->use_rhashtable = true; + if (rhashtable_init(&xattrs->ht, &simple_xattr_params)) + return ERR_PTR(-ENOMEM); + + return no_free_ptr(xattrs); +} + +/** + * simple_xattrs_lazy_alloc - get or allocate xattrs for a set operation + * @xattrsp: pointer to the xattrs pointer (may point to NULL) + * @value: value being set (NULL means remove) + * @flags: xattr set flags + * + * For lazily-allocated xattrs on the write path. If no xattrs exist yet + * and this is a remove operation, returns the appropriate result without + * allocating. Otherwise ensures xattrs is allocated and published with + * store-release semantics. + * + * Return: On success a valid pointer to the xattrs is returned. On + * failure or early-exit an ERR_PTR or NULL is returned. Callers should + * check with IS_ERR_OR_NULL() and propagate with PTR_ERR() which + * correctly returns 0 for the NULL no-op case. + */ +struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, + const void *value, int flags) +{ + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(*xattrsp); + if (xattrs) + return xattrs; + + if (!value) + return (flags & XATTR_REPLACE) ? 
ERR_PTR(-ENODATA) : NULL; + + xattrs = simple_xattrs_alloc(); + if (!IS_ERR(xattrs)) + smp_store_release(xattrsp, xattrs); + return xattrs; +} + +static void simple_xattr_ht_free(void *ptr, void *arg) +{ + struct simple_xattr *xattr = ptr; + size_t *freed_space = arg; + + if (freed_space) + *freed_space += simple_xattr_space(xattr->name, xattr->size); + simple_xattr_free(xattr); +} + /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy @@ -1563,22 +1784,28 @@ void simple_xattrs_init(struct simple_xattrs *xattrs) */ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { - struct rb_node *rbp; - if (freed_space) *freed_space = 0; - rbp = rb_first(&xattrs->rb_root); - while (rbp) { - struct simple_xattr *xattr; - struct rb_node *rbp_next; - - rbp_next = rb_next(rbp); - xattr = rb_entry(rbp, struct simple_xattr, rb_node); - rb_erase(&xattr->rb_node, &xattrs->rb_root); - if (freed_space) - *freed_space += simple_xattr_space(xattr->name, - xattr->size); - simple_xattr_free(xattr); - rbp = rbp_next; + + if (xattrs->use_rhashtable) { + rhashtable_free_and_destroy(&xattrs->ht, + simple_xattr_ht_free, freed_space); + } else { + struct rb_node *rbp; + + rbp = rb_first(&xattrs->rb_root); + while (rbp) { + struct simple_xattr *xattr; + struct rb_node *rbp_next; + + rbp_next = rb_next(rbp); + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + rb_erase(&xattr->rb_node, &xattrs->rb_root); + if (freed_space) + *freed_space += simple_xattr_space(xattr->name, + xattr->size); + simple_xattr_free(xattr); + rbp = rbp_next; + } } } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index fdbd2095414a..832a44358661 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -107,8 +107,14 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) } struct simple_xattrs { - struct rb_root rb_root; - rwlock_t lock; + bool use_rhashtable; + union { + struct { + struct rb_root rb_root; + rwlock_t 
lock; + }; + struct rhashtable ht; + }; }; struct simple_xattr { @@ -121,6 +127,9 @@ struct simple_xattr { }; void simple_xattrs_init(struct simple_xattrs *xattrs); +struct simple_xattrs *simple_xattrs_alloc(void); +struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, + const void *value, int flags); void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); @@ -137,4 +146,16 @@ void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); +DEFINE_CLASS(simple_xattr, + struct simple_xattr *, + if (!IS_ERR_OR_NULL(_T)) simple_xattr_free(_T), + simple_xattr_alloc(value, size), + const void *value, size_t size) + +DEFINE_CLASS(simple_xattrs, + struct simple_xattrs *, + if (!IS_ERR_OR_NULL(_T)) { simple_xattrs_free(_T, NULL); kfree(_T); }, + simple_xattrs_alloc(), + void) + #endif /* _LINUX_XATTR_H */ diff --git a/mm/shmem.c b/mm/shmem.c index b40f3cd48961..35c2f8748668 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4278,7 +4278,7 @@ static int shmem_initxattrs(struct inode *inode, for (xattr = xattr_array; xattr->name != NULL; xattr++) { new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); - if (!new_xattr) + if (IS_ERR(new_xattr)) break; len = strlen(xattr->name) + 1; -- cgit v1.2.3 From 52b364fed6e1578e551fee20c76fecb3fc0e10ed Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:31:59 +0100 Subject: shmem: adapt to rhashtable-based simple_xattrs with lazy allocation Adapt tmpfs/shmem to use the rhashtable-based xattr path and switch from an embedded struct to pointer-based lazy allocation. Change shmem_inode_info.xattrs from embedded 'struct simple_xattrs' to a pointer 'struct simple_xattrs *', initialized to NULL. 
This avoids the rhashtable overhead for every tmpfs inode, which helps when a lot of inodes exist. The xattr store is allocated on first use: - shmem_initxattrs(): Allocates via simple_xattrs_alloc() when security modules set initial xattrs during inode creation. - shmem_xattr_handler_set(): Allocates on first setxattr, with a short-circuit for removal when no xattrs are stored yet. All read paths (shmem_xattr_handler_get, shmem_listxattr) check for NULL xattrs pointer and return -ENODATA or 0 respectively. Replaced xattr entries are freed via simple_xattr_free_rcu() to allow concurrent RCU readers to finish. shmem_evict_inode() conditionally frees the xattr store only when allocated. Also change simple_xattr_add() from void to int to propagate rhashtable insertion failures. shmem_initxattrs() is the only caller. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-3-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/xattr.c | 26 +++++++++++++------------- include/linux/shmem_fs.h | 2 +- include/linux/xattr.h | 4 ++-- mm/shmem.c | 44 +++++++++++++++++++++++++++++++------------- 4 files changed, 47 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/fs/xattr.c b/fs/xattr.c index 1d98ea459b7b..eb45ae0fd17f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1677,19 +1677,19 @@ static bool rbtree_simple_xattr_less(struct rb_node *new_node, * of matching xattrs is wanted. Should only be called during inode * initialization when a few distinct initial xattrs are supposed to be set. 
*/ -void simple_xattr_add(struct simple_xattrs *xattrs, - struct simple_xattr *new_xattr) -{ - if (xattrs->use_rhashtable) { - WARN_ON(rhashtable_insert_fast(&xattrs->ht, - &new_xattr->hash_node, - simple_xattr_params)); - } else { - write_lock(&xattrs->lock); - rb_add(&new_xattr->rb_node, &xattrs->rb_root, - rbtree_simple_xattr_less); - write_unlock(&xattrs->lock); - } +int simple_xattr_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr) +{ + if (xattrs->use_rhashtable) + return rhashtable_insert_fast(&xattrs->ht, + &new_xattr->hash_node, + simple_xattr_params); + + write_lock(&xattrs->lock); + rb_add(&new_xattr->rb_node, &xattrs->rb_root, + rbtree_simple_xattr_less); + write_unlock(&xattrs->lock); + return 0; } /** diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a8273b32e041..f6a2d3402d76 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -48,7 +48,7 @@ struct shmem_inode_info { }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ - struct simple_xattrs xattrs; /* list of xattrs */ + struct simple_xattrs *xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 832a44358661..6e619e185e90 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -142,8 +142,8 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); -void simple_xattr_add(struct simple_xattrs *xattrs, - struct simple_xattr *new_xattr); +int simple_xattr_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); DEFINE_CLASS(simple_xattr, diff --git 
a/mm/shmem.c b/mm/shmem.c index 35c2f8748668..0b0e577e880a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1425,7 +1425,10 @@ static void shmem_evict_inode(struct inode *inode) } } - simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); + if (info->xattrs) { + simple_xattrs_free(info->xattrs, sbinfo->max_inodes ? &freed : NULL); + kfree(info->xattrs); + } shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); clear_inode(inode); @@ -3101,7 +3104,6 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); - simple_xattrs_init(&info->xattrs); cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); @@ -4255,10 +4257,13 @@ static int shmem_initxattrs(struct inode *inode, struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; - struct simple_xattr *new_xattr; size_t ispace = 0; size_t len; + CLASS(simple_xattrs, xattrs)(); + if (IS_ERR(xattrs)) + return PTR_ERR(xattrs); + if (sbinfo->max_inodes) { for (xattr = xattr_array; xattr->name != NULL; xattr++) { ispace += simple_xattr_space(xattr->name, @@ -4277,24 +4282,24 @@ static int shmem_initxattrs(struct inode *inode, } for (xattr = xattr_array; xattr->name != NULL; xattr++) { - new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); + CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len); if (IS_ERR(new_xattr)) break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL_ACCOUNT); - if (!new_xattr->name) { - kvfree(new_xattr); + if (!new_xattr->name) break; - } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); - simple_xattr_add(&info->xattrs, new_xattr); + if (simple_xattr_add(xattrs, new_xattr)) + break; + 
retain_and_null_ptr(new_xattr); } if (xattr->name != NULL) { @@ -4303,10 +4308,10 @@ static int shmem_initxattrs(struct inode *inode, sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } - simple_xattrs_free(&info->xattrs, NULL); return -ENOMEM; } + smp_store_release(&info->xattrs, no_free_ptr(xattrs)); return 0; } @@ -4315,9 +4320,14 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, const char *name, void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(inode); + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(info->xattrs); + if (!xattrs) + return -ENODATA; name = xattr_full_name(handler, name); - return simple_xattr_get(&info->xattrs, name, buffer, size); + return simple_xattr_get(xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, @@ -4328,10 +4338,16 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct simple_xattrs *xattrs; struct simple_xattr *old_xattr; size_t ispace = 0; name = xattr_full_name(handler, name); + + xattrs = simple_xattrs_lazy_alloc(&info->xattrs, value, flags); + if (IS_ERR_OR_NULL(xattrs)) + return PTR_ERR(xattrs); + if (value && sbinfo->max_inodes) { ispace = simple_xattr_space(name, size); raw_spin_lock(&sbinfo->stat_lock); @@ -4344,13 +4360,13 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, return -ENOSPC; } - old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(xattrs, name, value, size, flags); if (!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) ispace = simple_xattr_space(old_xattr->name, old_xattr->size); - simple_xattr_free(old_xattr); + simple_xattr_free_rcu(old_xattr); old_xattr = NULL; inode_set_ctime_current(inode); inode_inc_iversion(inode); @@ -4391,7 +4407,9 @@ static const struct 
xattr_handler * const shmem_xattr_handlers[] = { static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); + + return simple_xattr_list(d_inode(dentry), READ_ONCE(info->xattrs), + buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ -- cgit v1.2.3 From f4cc3ab824d6772a48ca9d9c74ac623b3309985d Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 7 Oct 2025 14:06:05 +0200 Subject: dma-buf: protected fence ops by RCU v8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fence ops of a dma_fence currently need to life as long as the dma_fence is alive. This means that the module which originally issued a dma_fence can't unload unless all fences are freed up. As first step to solve this issue protect the fence ops by RCU. While it is counter intuitive to protect a constant function pointer table by RCU it allows modules to wait for an RCU grace period before they unload, to make sure that nobody is executing their functions any more. This patch has not much functional change, but only adds the RCU handling for the static checker to test. v2: make one the now duplicated lockdep warnings a comment instead. v3: Add more documentation to ->wait and ->release callback. 
v4: fix typo in documentation v5: rebased on drm-tip v6: improve code comments v7: improve commit message and code comments v8: fix sparse rcu warnings Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-2-christian.koenig@amd.com --- drivers/dma-buf/dma-fence.c | 71 +++++++++++++++++++++++---------- drivers/gpu/drm/drm_crtc.c | 2 +- drivers/gpu/drm/scheduler/sched_fence.c | 4 +- include/linux/dma-fence.h | 33 ++++++++++++--- 4 files changed, 80 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 7e8db99186c2..076e6e6c75be 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -522,6 +522,7 @@ EXPORT_SYMBOL(dma_fence_signal); signed long dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout) { + const struct dma_fence_ops *ops; signed long ret; if (WARN_ON(timeout < 0)) @@ -533,15 +534,22 @@ dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout) dma_fence_enable_sw_signaling(fence); - if (trace_dma_fence_wait_start_enabled()) { - rcu_read_lock(); - trace_dma_fence_wait_start(fence); + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + trace_dma_fence_wait_start(fence); + if (ops->wait) { + /* + * Implementing the wait ops is deprecated and not supported for + * issuers of fences who need their lifetime to be independent + * of their module after they signal, so it is ok to use the + * ops outside the RCU protected section. 
+ */ + rcu_read_unlock(); + ret = ops->wait(fence, intr, timeout); + } else { rcu_read_unlock(); - } - if (fence->ops->wait) - ret = fence->ops->wait(fence, intr, timeout); - else ret = dma_fence_default_wait(fence, intr, timeout); + } if (trace_dma_fence_wait_end_enabled()) { rcu_read_lock(); trace_dma_fence_wait_end(fence); @@ -562,6 +570,7 @@ void dma_fence_release(struct kref *kref) { struct dma_fence *fence = container_of(kref, struct dma_fence, refcount); + const struct dma_fence_ops *ops; rcu_read_lock(); trace_dma_fence_destroy(fence); @@ -593,12 +602,12 @@ void dma_fence_release(struct kref *kref) spin_unlock_irqrestore(fence->lock, flags); } - rcu_read_unlock(); - - if (fence->ops->release) - fence->ops->release(fence); + ops = rcu_dereference(fence->ops); + if (ops->release) + ops->release(fence); else dma_fence_free(fence); + rcu_read_unlock(); } EXPORT_SYMBOL(dma_fence_release); @@ -617,6 +626,7 @@ EXPORT_SYMBOL(dma_fence_free); static bool __dma_fence_enable_signaling(struct dma_fence *fence) { + const struct dma_fence_ops *ops; bool was_set; lockdep_assert_held(fence->lock); @@ -627,14 +637,18 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence) if (dma_fence_test_signaled_flag(fence)) return false; - if (!was_set && fence->ops->enable_signaling) { + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + if (!was_set && ops->enable_signaling) { trace_dma_fence_enable_signal(fence); - if (!fence->ops->enable_signaling(fence)) { + if (!ops->enable_signaling(fence)) { + rcu_read_unlock(); dma_fence_signal_locked(fence); return false; } } + rcu_read_unlock(); return true; } @@ -1007,8 +1021,13 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout); */ void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline) { - if (fence->ops->set_deadline && !dma_fence_is_signaled(fence)) - fence->ops->set_deadline(fence, deadline); + const struct dma_fence_ops *ops; + + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + if (ops->set_deadline 
&& !dma_fence_is_signaled(fence)) + ops->set_deadline(fence, deadline); + rcu_read_unlock(); } EXPORT_SYMBOL(dma_fence_set_deadline); @@ -1049,7 +1068,13 @@ __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, BUG_ON(!ops || !ops->get_driver_name || !ops->get_timeline_name); kref_init(&fence->refcount); - fence->ops = ops; + /* + * While it is counter intuitive to protect a constant function pointer + * table by RCU it allows modules to wait for an RCU grace period + * before they unload, to make sure that nobody is executing their + * functions any more. + */ + RCU_INIT_POINTER(fence->ops, ops); INIT_LIST_HEAD(&fence->cb_list); fence->lock = lock; fence->context = context; @@ -1129,11 +1154,12 @@ EXPORT_SYMBOL(dma_fence_init64); */ const char __rcu *dma_fence_driver_name(struct dma_fence *fence) { - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), - "RCU protection is required for safe access to returned string"); + const struct dma_fence_ops *ops; + /* RCU protection is required for safe access to returned string */ + ops = rcu_dereference(fence->ops); if (!dma_fence_test_signaled_flag(fence)) - return (const char __rcu *)fence->ops->get_driver_name(fence); + return (const char __rcu *)ops->get_driver_name(fence); else return (const char __rcu *)"detached-driver"; } @@ -1161,11 +1187,12 @@ EXPORT_SYMBOL(dma_fence_driver_name); */ const char __rcu *dma_fence_timeline_name(struct dma_fence *fence) { - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), - "RCU protection is required for safe access to returned string"); + const struct dma_fence_ops *ops; + /* RCU protection is required for safe access to returned string */ + ops = rcu_dereference(fence->ops); if (!dma_fence_test_signaled_flag(fence)) - return (const char __rcu *)fence->ops->get_driver_name(fence); + return (const char __rcu *)ops->get_driver_name(fence); else return (const char __rcu *)"signaled-timeline"; } diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index 
90684f30a048..960fdc1cc6ba 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -158,7 +158,7 @@ static const struct dma_fence_ops drm_crtc_fence_ops; static struct drm_crtc *fence_to_crtc(struct dma_fence *fence) { - BUG_ON(fence->ops != &drm_crtc_fence_ops); + BUG_ON(rcu_access_pointer(fence->ops) != &drm_crtc_fence_ops); return container_of(fence->lock, struct drm_crtc, fence_lock); } diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index 9391d6f0dc01..a27786cb86fb 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -195,10 +195,10 @@ static const struct dma_fence_ops drm_sched_fence_ops_finished = { struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f) { - if (f->ops == &drm_sched_fence_ops_scheduled) + if (rcu_access_pointer(f->ops) == &drm_sched_fence_ops_scheduled) return container_of(f, struct drm_sched_fence, scheduled); - if (f->ops == &drm_sched_fence_ops_finished) + if (rcu_access_pointer(f->ops) == &drm_sched_fence_ops_finished) return container_of(f, struct drm_sched_fence, finished); return NULL; diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 9c4d25289239..fa3cfe3e98ac 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -67,7 +67,7 @@ struct seq_file; */ struct dma_fence { spinlock_t *lock; - const struct dma_fence_ops *ops; + const struct dma_fence_ops __rcu *ops; /* * We clear the callback list on kref_put so that by the time we * release the fence it is unused. No one should be adding to the @@ -220,6 +220,10 @@ struct dma_fence_ops { * timed out. Can also return other error values on custom implementations, * which should be treated as if the fence is signaled. For example a hardware * lockup could be reported like that. 
+ * + * Implementing this callback prevents the fence from detaching after + * signaling and so it is necessary for the module providing the + * dma_fence_ops to stay loaded as long as the dma_fence exists. */ signed long (*wait)(struct dma_fence *fence, bool intr, signed long timeout); @@ -231,6 +235,13 @@ struct dma_fence_ops { * Can be called from irq context. This callback is optional. If it is * NULL, then dma_fence_free() is instead called as the default * implementation. + * + * Implementing this callback prevents the fence from detaching after + * signaling and so it is necessary for the module providing the + * dma_fence_ops to stay loaded as long as the dma_fence exists. + * + * If the callback is implemented the memory backing the dma_fence + * object must be freed RCU safe. */ void (*release)(struct dma_fence *fence); @@ -454,13 +465,19 @@ dma_fence_test_signaled_flag(struct dma_fence *fence) static inline bool dma_fence_is_signaled_locked(struct dma_fence *fence) { + const struct dma_fence_ops *ops; + if (dma_fence_test_signaled_flag(fence)) return true; - if (fence->ops->signaled && fence->ops->signaled(fence)) { + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + if (ops->signaled && ops->signaled(fence)) { + rcu_read_unlock(); dma_fence_signal_locked(fence); return true; } + rcu_read_unlock(); return false; } @@ -484,13 +501,19 @@ dma_fence_is_signaled_locked(struct dma_fence *fence) static inline bool dma_fence_is_signaled(struct dma_fence *fence) { + const struct dma_fence_ops *ops; + if (dma_fence_test_signaled_flag(fence)) return true; - if (fence->ops->signaled && fence->ops->signaled(fence)) { + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + if (ops->signaled && ops->signaled(fence)) { + rcu_read_unlock(); dma_fence_signal(fence); return true; } + rcu_read_unlock(); return false; } @@ -695,7 +718,7 @@ extern const struct dma_fence_ops dma_fence_chain_ops; */ static inline bool dma_fence_is_array(struct dma_fence *fence) { - 
return fence->ops == &dma_fence_array_ops; + return rcu_access_pointer(fence->ops) == &dma_fence_array_ops; } /** @@ -706,7 +729,7 @@ static inline bool dma_fence_is_array(struct dma_fence *fence) */ static inline bool dma_fence_is_chain(struct dma_fence *fence) { - return fence->ops == &dma_fence_chain_ops; + return rcu_access_pointer(fence->ops) == &dma_fence_chain_ops; } /** -- cgit v1.2.3 From 541c8f2468b933acc5d129e84bd264923675a66e Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 8 Oct 2025 18:12:46 +0200 Subject: dma-buf: detach fence ops on signal v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When neither a release nor a wait backend ops is specified it is possible to let the dma_fence live on independently of the module who issued it. This makes it possible to unload drivers and only wait for all their fences to signal. v2: fix typo in comment v3: fix sparse rcu warnings Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Philipp Stanner Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-3-christian.koenig@amd.com --- drivers/dma-buf/dma-fence.c | 18 ++++++++++++++---- include/linux/dma-fence.h | 4 ++-- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 076e6e6c75be..3279d82ffa98 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -362,6 +362,7 @@ void __dma_fence_might_wait(void) void dma_fence_signal_timestamp_locked(struct dma_fence *fence, ktime_t timestamp) { + const struct dma_fence_ops *ops; struct dma_fence_cb *cur, *tmp; struct list_head cb_list; @@ -371,6 +372,15 @@ void dma_fence_signal_timestamp_locked(struct dma_fence *fence, &fence->flags))) return; + /* + * When neither a release nor a wait operation is specified set the ops + * pointer to NULL to allow the fence structure to become independent + * from who 
originally issued it. + */ + ops = rcu_dereference_protected(fence->ops, true); + if (!ops->release && !ops->wait) + RCU_INIT_POINTER(fence->ops, NULL); + /* Stash the cb_list before replacing it with the timestamp */ list_replace(&fence->cb_list, &cb_list); @@ -537,7 +547,7 @@ dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout) rcu_read_lock(); ops = rcu_dereference(fence->ops); trace_dma_fence_wait_start(fence); - if (ops->wait) { + if (ops && ops->wait) { /* * Implementing the wait ops is deprecated and not supported for * issuers of fences who need their lifetime to be independent @@ -603,7 +613,7 @@ void dma_fence_release(struct kref *kref) } ops = rcu_dereference(fence->ops); - if (ops->release) + if (ops && ops->release) ops->release(fence); else dma_fence_free(fence); @@ -639,7 +649,7 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence) rcu_read_lock(); ops = rcu_dereference(fence->ops); - if (!was_set && ops->enable_signaling) { + if (!was_set && ops && ops->enable_signaling) { trace_dma_fence_enable_signal(fence); if (!ops->enable_signaling(fence)) { @@ -1025,7 +1035,7 @@ void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline) rcu_read_lock(); ops = rcu_dereference(fence->ops); - if (ops->set_deadline && !dma_fence_is_signaled(fence)) + if (ops && ops->set_deadline && !dma_fence_is_signaled(fence)) ops->set_deadline(fence, deadline); rcu_read_unlock(); } diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index fa3cfe3e98ac..9ff2c4a09cdc 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -472,7 +472,7 @@ dma_fence_is_signaled_locked(struct dma_fence *fence) rcu_read_lock(); ops = rcu_dereference(fence->ops); - if (ops->signaled && ops->signaled(fence)) { + if (ops && ops->signaled && ops->signaled(fence)) { rcu_read_unlock(); dma_fence_signal_locked(fence); return true; @@ -508,7 +508,7 @@ dma_fence_is_signaled(struct dma_fence *fence) rcu_read_lock(); ops = 
rcu_dereference(fence->ops); - if (ops->signaled && ops->signaled(fence)) { + if (ops && ops->signaled && ops->signaled(fence)) { rcu_read_unlock(); dma_fence_signal(fence); return true; -- cgit v1.2.3 From 3e5067931b5df667f5350fafe4410554e228e53e Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 9 Oct 2025 10:40:06 +0200 Subject: dma-buf: abstract fence locking v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add dma_fence_lock_irqsafe() and dma_fence_unlock_irqrestore() wrappers and mechanically apply them everywhere. Just a pre-requisite cleanup for a follow up patch. v2: add some missing i915 bits, add abstraction for lockdep assertion as well v3: one more suggestion by Tvrtko Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20260219160822.1529-4-christian.koenig@amd.com --- drivers/dma-buf/dma-fence.c | 48 +++++++++++++---------------- drivers/dma-buf/st-dma-fence.c | 6 ++-- drivers/dma-buf/sw_sync.c | 14 ++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 2 +- drivers/gpu/drm/i915/i915_active.c | 19 +++++++----- drivers/gpu/drm/nouveau/nouveau_drm.c | 5 +-- drivers/gpu/drm/scheduler/sched_fence.c | 6 ++-- drivers/gpu/drm/xe/xe_sched_job.c | 4 +-- include/linux/dma-fence.h | 38 +++++++++++++++++++++++ 12 files changed, 96 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 3279d82ffa98..698260c49f52 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -366,7 +366,7 @@ void dma_fence_signal_timestamp_locked(struct dma_fence *fence, struct dma_fence_cb *cur, *tmp; struct list_head cb_list; - lockdep_assert_held(fence->lock); + dma_fence_assert_held(fence); if (unlikely(test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 
&fence->flags))) @@ -414,9 +414,9 @@ void dma_fence_signal_timestamp(struct dma_fence *fence, ktime_t timestamp) if (WARN_ON(!fence)) return; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); dma_fence_signal_timestamp_locked(fence, timestamp); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); } EXPORT_SYMBOL(dma_fence_signal_timestamp); @@ -475,9 +475,9 @@ bool dma_fence_check_and_signal(struct dma_fence *fence) unsigned long flags; bool ret; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); ret = dma_fence_check_and_signal_locked(fence); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return ret; } @@ -503,9 +503,9 @@ void dma_fence_signal(struct dma_fence *fence) tmp = dma_fence_begin_signalling(); - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); dma_fence_signal_timestamp_locked(fence, ktime_get()); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); dma_fence_end_signalling(tmp); } @@ -606,10 +606,10 @@ void dma_fence_release(struct kref *kref) * don't leave chains dangling. We set the error flag first * so that the callbacks know this signal is due to an error. 
*/ - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); fence->error = -EDEADLK; dma_fence_signal_locked(fence); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); } ops = rcu_dereference(fence->ops); @@ -639,7 +639,7 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence) const struct dma_fence_ops *ops; bool was_set; - lockdep_assert_held(fence->lock); + dma_fence_assert_held(fence); was_set = test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &fence->flags); @@ -675,9 +675,9 @@ void dma_fence_enable_sw_signaling(struct dma_fence *fence) { unsigned long flags; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); __dma_fence_enable_signaling(fence); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); } EXPORT_SYMBOL(dma_fence_enable_sw_signaling); @@ -717,8 +717,7 @@ int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb, return -ENOENT; } - spin_lock_irqsave(fence->lock, flags); - + dma_fence_lock_irqsave(fence, flags); if (__dma_fence_enable_signaling(fence)) { cb->func = func; list_add_tail(&cb->node, &fence->cb_list); @@ -726,8 +725,7 @@ int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb, INIT_LIST_HEAD(&cb->node); ret = -ENOENT; } - - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return ret; } @@ -750,9 +748,9 @@ int dma_fence_get_status(struct dma_fence *fence) unsigned long flags; int status; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); status = dma_fence_get_status_locked(fence); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return status; } @@ -782,13 +780,11 @@ dma_fence_remove_callback(struct dma_fence *fence, struct dma_fence_cb *cb) unsigned long flags; bool ret; - spin_lock_irqsave(fence->lock, flags); - + dma_fence_lock_irqsave(fence, 
flags); ret = !list_empty(&cb->node); if (ret) list_del_init(&cb->node); - - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return ret; } @@ -827,7 +823,7 @@ dma_fence_default_wait(struct dma_fence *fence, bool intr, signed long timeout) unsigned long flags; signed long ret = timeout ? timeout : 1; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (dma_fence_test_signaled_flag(fence)) goto out; @@ -851,11 +847,11 @@ dma_fence_default_wait(struct dma_fence *fence, bool intr, signed long timeout) __set_current_state(TASK_INTERRUPTIBLE); else __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); ret = schedule_timeout(ret); - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (ret > 0 && intr && signal_pending(current)) ret = -ERESTARTSYS; } @@ -865,7 +861,7 @@ dma_fence_default_wait(struct dma_fence *fence, bool intr, signed long timeout) __set_current_state(TASK_RUNNING); out: - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return ret; } EXPORT_SYMBOL(dma_fence_default_wait); diff --git a/drivers/dma-buf/st-dma-fence.c b/drivers/dma-buf/st-dma-fence.c index 73ed6fd48a13..5d0d9abc6e21 100644 --- a/drivers/dma-buf/st-dma-fence.c +++ b/drivers/dma-buf/st-dma-fence.c @@ -410,8 +410,10 @@ struct race_thread { static void __wait_for_callbacks(struct dma_fence *f) { - spin_lock_irq(f->lock); - spin_unlock_irq(f->lock); + unsigned long flags; + + dma_fence_lock_irqsave(f, flags); + dma_fence_unlock_irqrestore(f, flags); } static int thread_signal_callback(void *arg) diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c index 963a72324d16..8df20b0218a9 100644 --- a/drivers/dma-buf/sw_sync.c +++ b/drivers/dma-buf/sw_sync.c @@ -156,12 +156,12 @@ static void timeline_fence_release(struct dma_fence *fence) struct sync_timeline *parent = 
dma_fence_parent(fence); unsigned long flags; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (!list_empty(&pt->link)) { list_del(&pt->link); rb_erase(&pt->node, &parent->pt_tree); } - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); sync_timeline_put(parent); dma_fence_free(fence); @@ -179,7 +179,7 @@ static void timeline_fence_set_deadline(struct dma_fence *fence, ktime_t deadlin struct sync_pt *pt = dma_fence_to_sync_pt(fence); unsigned long flags; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (test_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags)) { if (ktime_before(deadline, pt->deadline)) pt->deadline = deadline; @@ -187,7 +187,7 @@ static void timeline_fence_set_deadline(struct dma_fence *fence, ktime_t deadlin pt->deadline = deadline; __set_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags); } - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); } static const struct dma_fence_ops timeline_fence_ops = { @@ -431,13 +431,13 @@ static int sw_sync_ioctl_get_deadline(struct sync_timeline *obj, unsigned long a goto put_fence; } - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (!test_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags)) { ret = -ENOENT; goto unlock; } data.deadline_ns = ktime_to_ns(pt->deadline); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); dma_fence_put(fence); @@ -450,7 +450,7 @@ static int sw_sync_ioctl_get_deadline(struct sync_timeline *obj, unsigned long a return 0; unlock: - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); put_fence: dma_fence_put(fence); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 4638a686a84e..7c047f5a1549 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ 
-479,10 +479,10 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence) return false; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (!dma_fence_is_signaled_locked(fence)) dma_fence_set_error(fence, -ENODATA); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); while (!dma_fence_is_signaled(fence) && ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index f2beb980e3c3..8b095087feb4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2785,8 +2785,8 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) dma_fence_put(vm->last_unlocked); dma_fence_wait(vm->last_tlb_flush, false); /* Make sure that all fence callbacks have completed */ - spin_lock_irqsave(vm->last_tlb_flush->lock, flags); - spin_unlock_irqrestore(vm->last_tlb_flush->lock, flags); + dma_fence_lock_irqsave(vm->last_tlb_flush, flags); + dma_fence_unlock_irqrestore(vm->last_tlb_flush, flags); dma_fence_put(vm->last_tlb_flush); list_for_each_entry_safe(mapping, tmp, &vm->freed, list) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 806d62ed61ef..a914ceec90aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -639,7 +639,7 @@ static inline uint64_t amdgpu_vm_tlb_seq(struct amdgpu_vm *vm) * sure that the dma_fence structure isn't freed up. 
*/ rcu_read_lock(); - lock = vm->last_tlb_flush->lock; + lock = dma_fence_spinlock(vm->last_tlb_flush); rcu_read_unlock(); spin_lock_irqsave(lock, flags); diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index a2b413982ce6..c10ac0ab3bfa 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -148,7 +148,7 @@ __dma_fence_signal__notify(struct dma_fence *fence, { struct dma_fence_cb *cur, *tmp; - lockdep_assert_held(fence->lock); + dma_fence_assert_held(fence); list_for_each_entry_safe(cur, tmp, list, node) { INIT_LIST_HEAD(&cur->node); diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index 25c46d7b1ea7..cd44cbfb53b5 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -1045,9 +1045,10 @@ __i915_active_fence_set(struct i915_active_fence *active, * nesting rules for the fence->lock; the inner lock is always the * older lock. 
*/ - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (prev) - spin_lock_nested(prev->lock, SINGLE_DEPTH_NESTING); + spin_lock_nested(dma_fence_spinlock(prev), + SINGLE_DEPTH_NESTING); /* * A does the cmpxchg first, and so it sees C or NULL, as before, or @@ -1061,17 +1062,18 @@ __i915_active_fence_set(struct i915_active_fence *active, */ while (cmpxchg(__active_fence_slot(active), prev, fence) != prev) { if (prev) { - spin_unlock(prev->lock); + spin_unlock(dma_fence_spinlock(prev)); dma_fence_put(prev); } - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); prev = i915_active_fence_get(active); GEM_BUG_ON(prev == fence); - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (prev) - spin_lock_nested(prev->lock, SINGLE_DEPTH_NESTING); + spin_lock_nested(dma_fence_spinlock(prev), + SINGLE_DEPTH_NESTING); } /* @@ -1088,10 +1090,11 @@ __i915_active_fence_set(struct i915_active_fence *active, */ if (prev) { __list_del_entry(&active->cb.node); - spin_unlock(prev->lock); /* serialise with prev->cb_list */ + /* serialise with prev->cb_list */ + spin_unlock(dma_fence_spinlock(prev)); } list_add_tail(&active->cb.node, &fence->cb_list); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return prev; } diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index cb22237ac17d..17c114645d9f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -156,12 +156,13 @@ nouveau_name(struct drm_device *dev) static inline bool nouveau_cli_work_ready(struct dma_fence *fence) { + unsigned long flags; bool ret = true; - spin_lock_irq(fence->lock); + dma_fence_lock_irqsave(fence, flags); if (!dma_fence_is_signaled_locked(fence)) ret = false; - spin_unlock_irq(fence->lock); + dma_fence_unlock_irqrestore(fence, flags); if (ret == true) dma_fence_put(fence); diff --git 
a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index a27786cb86fb..096fe28aa9c9 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -156,19 +156,19 @@ static void drm_sched_fence_set_deadline_finished(struct dma_fence *f, struct dma_fence *parent; unsigned long flags; - spin_lock_irqsave(&fence->lock, flags); + dma_fence_lock_irqsave(f, flags); /* If we already have an earlier deadline, keep it: */ if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &f->flags) && ktime_before(fence->deadline, deadline)) { - spin_unlock_irqrestore(&fence->lock, flags); + dma_fence_unlock_irqrestore(f, flags); return; } fence->deadline = deadline; set_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &f->flags); - spin_unlock_irqrestore(&fence->lock, flags); + dma_fence_unlock_irqrestore(f, flags); /* * smp_load_aquire() to ensure that if we are racing another diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c index 3927666fe556..ae5b38b2a884 100644 --- a/drivers/gpu/drm/xe/xe_sched_job.c +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -190,11 +190,11 @@ static bool xe_fence_set_error(struct dma_fence *fence, int error) unsigned long irq_flags; bool signaled; - spin_lock_irqsave(fence->lock, irq_flags); + dma_fence_lock_irqsave(fence, irq_flags); signaled = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags); if (!signaled) dma_fence_set_error(fence, error); - spin_unlock_irqrestore(fence->lock, irq_flags); + dma_fence_unlock_irqrestore(fence, irq_flags); return signaled; } diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 9ff2c4a09cdc..85d6eac9fa85 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -377,6 +377,44 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep) } while (1); } +/** + * dma_fence_spinlock - return pointer to the spinlock protecting the fence + * @fence: the fence to get the lock from + * + * Return the 
pointer to the extern lock. + */ +static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) +{ + return fence->lock; +} + +/** + * dma_fence_lock_irqsave - irqsave lock the fence + * @fence: the fence to lock + * @flags: where to store the CPU flags. + * + * Lock the fence, preventing it from changing to the signaled state. + */ +#define dma_fence_lock_irqsave(fence, flags) \ + spin_lock_irqsave(fence->lock, flags) + +/** + * dma_fence_unlock_irqrestore - unlock the fence and irqrestore + * @fence: the fence to unlock + * @flags the CPU flags to restore + * + * Unlock the fence, allowing it to change it's state to signaled again. + */ +#define dma_fence_unlock_irqrestore(fence, flags) \ + spin_unlock_irqrestore(fence->lock, flags) + +/** + * dma_fence_assert_held - lockdep assertion that fence is locked + * @fence: the fence which should be locked + */ +#define dma_fence_assert_held(fence) \ + lockdep_assert_held(dma_fence_spinlock(fence)); + #ifdef CONFIG_LOCKDEP bool dma_fence_begin_signalling(void); void dma_fence_end_signalling(bool cookie); -- cgit v1.2.3 From 1f32f310a13c9fb67a9993ab67f596b3f960206f Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 9 Oct 2025 10:40:06 +0200 Subject: dma-buf: inline spinlock for fence protection v5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement per-fence spinlocks, allowing implementations to not give an external spinlock to protect the fence internal state. Instead a spinlock embedded into the fence structure itself is used in this case. Shared spinlocks have the problem that implementations need to guarantee that the lock lives at least as long all fences referencing them. Using a per-fence spinlock allows completely decoupling spinlock producer and consumer life times, simplifying the handling in most use cases. 
v2: improve naming, coverage and function documentation v3: fix one additional locking in the selftests v4: separate out some changes to make the patch smaller, fix one amdgpu crash found by CI systems v5: improve comments Signed-off-by: Christian König Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-5-christian.koenig@amd.com --- drivers/dma-buf/dma-fence.c | 21 ++++++++++++++++----- drivers/dma-buf/sync_debug.h | 2 +- drivers/gpu/drm/drm_crtc.c | 2 +- drivers/gpu/drm/drm_writeback.c | 2 +- drivers/gpu/drm/nouveau/nouveau_fence.c | 3 ++- drivers/gpu/drm/qxl/qxl_release.c | 3 ++- drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 3 ++- drivers/gpu/drm/xe/xe_hw_fence.c | 3 ++- include/linux/dma-fence.h | 19 +++++++++++++------ 9 files changed, 40 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 698260c49f52..4ad863d2a52c 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -343,7 +343,6 @@ void __dma_fence_might_wait(void) } #endif - /** * dma_fence_signal_timestamp_locked - signal completion of a fence * @fence: the fence to signal @@ -1070,7 +1069,6 @@ static void __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, spinlock_t *lock, u64 context, u64 seqno, unsigned long flags) { - BUG_ON(!lock); BUG_ON(!ops || !ops->get_driver_name || !ops->get_timeline_name); kref_init(&fence->refcount); @@ -1082,10 +1080,15 @@ __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, */ RCU_INIT_POINTER(fence->ops, ops); INIT_LIST_HEAD(&fence->cb_list); - fence->lock = lock; fence->context = context; fence->seqno = seqno; fence->flags = flags | BIT(DMA_FENCE_FLAG_INITIALIZED_BIT); + if (lock) { + fence->extern_lock = lock; + } else { + spin_lock_init(&fence->inline_lock); + fence->flags |= BIT(DMA_FENCE_FLAG_INLINE_LOCK_BIT); + } fence->error = 0; trace_dma_fence_init(fence); @@ -1095,7 +1098,7 @@ 
__dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, * dma_fence_init - Initialize a custom fence. * @fence: the fence to initialize * @ops: the dma_fence_ops for operations on this fence - * @lock: the irqsafe spinlock to use for locking this fence + * @lock: optional irqsafe spinlock to use for locking this fence * @context: the execution context this fence is run on * @seqno: a linear increasing sequence number for this context * @@ -1105,6 +1108,10 @@ __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, * * context and seqno are used for easy comparison between fences, allowing * to check which fence is later by simply using dma_fence_later(). + * + * It is strongly discouraged to provide an external lock because this couples + * lock and fence life time. This is only allowed for legacy use cases when + * multiple fences need to be prevented from signaling out of order. */ void dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, @@ -1118,7 +1125,7 @@ EXPORT_SYMBOL(dma_fence_init); * dma_fence_init64 - Initialize a custom fence with 64-bit seqno support. * @fence: the fence to initialize * @ops: the dma_fence_ops for operations on this fence - * @lock: the irqsafe spinlock to use for locking this fence + * @lock: optional irqsafe spinlock to use for locking this fence * @context: the execution context this fence is run on * @seqno: a linear increasing sequence number for this context * @@ -1128,6 +1135,10 @@ EXPORT_SYMBOL(dma_fence_init); * * Context and seqno are used for easy comparison between fences, allowing * to check which fence is later by simply using dma_fence_later(). + * + * It is strongly discouraged to provide an external lock because this couples + * lock and fence life time. This is only allowed for legacy use cases when + * multiple fences need to be prevented from signaling out of order. 
*/ void dma_fence_init64(struct dma_fence *fence, const struct dma_fence_ops *ops, diff --git a/drivers/dma-buf/sync_debug.h b/drivers/dma-buf/sync_debug.h index 02af347293d0..c49324505b20 100644 --- a/drivers/dma-buf/sync_debug.h +++ b/drivers/dma-buf/sync_debug.h @@ -47,7 +47,7 @@ struct sync_timeline { static inline struct sync_timeline *dma_fence_parent(struct dma_fence *fence) { - return container_of(fence->lock, struct sync_timeline, lock); + return container_of(fence->extern_lock, struct sync_timeline, lock); } /** diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index 960fdc1cc6ba..8d6f721c2c9a 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -159,7 +159,7 @@ static const struct dma_fence_ops drm_crtc_fence_ops; static struct drm_crtc *fence_to_crtc(struct dma_fence *fence) { BUG_ON(rcu_access_pointer(fence->ops) != &drm_crtc_fence_ops); - return container_of(fence->lock, struct drm_crtc, fence_lock); + return container_of(fence->extern_lock, struct drm_crtc, fence_lock); } static const char *drm_crtc_fence_get_driver_name(struct dma_fence *fence) diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c index 09362cf4f22f..4da5d6094721 100644 --- a/drivers/gpu/drm/drm_writeback.c +++ b/drivers/gpu/drm/drm_writeback.c @@ -81,7 +81,7 @@ * From userspace, this property will always read as zero. 
*/ -#define fence_to_wb_connector(x) container_of(x->lock, \ +#define fence_to_wb_connector(x) container_of(x->extern_lock, \ struct drm_writeback_connector, \ fence_lock) diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c index 903d326927ca..edbe9e08ba0f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.c +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c @@ -41,7 +41,8 @@ static const struct dma_fence_ops nouveau_fence_ops_legacy; static inline struct nouveau_fence_chan * nouveau_fctx(struct nouveau_fence *fence) { - return container_of(fence->base.lock, struct nouveau_fence_chan, lock); + return container_of(fence->base.extern_lock, struct nouveau_fence_chan, + lock); } static bool diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index 720d6d57151c..06979d0e8a9f 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -62,7 +62,8 @@ static long qxl_fence_wait(struct dma_fence *fence, bool intr, struct qxl_device *qdev; unsigned long cur, end = jiffies + timeout; - qdev = container_of(fence->lock, struct qxl_device, release_lock); + qdev = container_of(fence->extern_lock, struct qxl_device, + release_lock); if (!wait_event_timeout(qdev->release_event, (dma_fence_is_signaled(fence) || diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c index 3469e2c9e706..4ef84ff9b638 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c @@ -47,7 +47,8 @@ struct vmw_event_fence_action { static struct vmw_fence_manager * fman_from_fence(struct vmw_fence_obj *fence) { - return container_of(fence->base.lock, struct vmw_fence_manager, lock); + return container_of(fence->base.extern_lock, struct vmw_fence_manager, + lock); } static void vmw_fence_obj_destroy(struct dma_fence *f) diff --git a/drivers/gpu/drm/xe/xe_hw_fence.c b/drivers/gpu/drm/xe/xe_hw_fence.c index ae8ed15b64c5..14720623ad00 100644 --- 
a/drivers/gpu/drm/xe/xe_hw_fence.c +++ b/drivers/gpu/drm/xe/xe_hw_fence.c @@ -124,7 +124,8 @@ static struct xe_hw_fence *to_xe_hw_fence(struct dma_fence *fence); static struct xe_hw_fence_irq *xe_hw_fence_irq(struct xe_hw_fence *fence) { - return container_of(fence->dma.lock, struct xe_hw_fence_irq, lock); + return container_of(fence->dma.extern_lock, struct xe_hw_fence_irq, + lock); } static const char *xe_hw_fence_get_driver_name(struct dma_fence *dma_fence) diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 85d6eac9fa85..3dc93f068bf6 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -34,7 +34,8 @@ struct seq_file; * @ops: dma_fence_ops associated with this fence * @rcu: used for releasing fence with kfree_rcu * @cb_list: list of all callbacks to call - * @lock: spin_lock_irqsave used for locking + * @extern_lock: external spin_lock_irqsave used for locking (deprecated) + * @inline_lock: alternative internal spin_lock_irqsave used for locking * @context: execution context this fence belongs to, returned by * dma_fence_context_alloc() * @seqno: the sequence number of this fence inside the execution context, @@ -49,6 +50,7 @@ struct seq_file; * of the time. * * DMA_FENCE_FLAG_INITIALIZED_BIT - fence was initialized + * DMA_FENCE_FLAG_INLINE_LOCK_BIT - use inline spinlock instead of external one * DMA_FENCE_FLAG_SIGNALED_BIT - fence is already signaled * DMA_FENCE_FLAG_TIMESTAMP_BIT - timestamp recorded for fence signaling * DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT - enable_signaling might have been called @@ -66,7 +68,10 @@ struct seq_file; * been completed, or never called at all. 
*/ struct dma_fence { - spinlock_t *lock; + union { + spinlock_t *extern_lock; + spinlock_t inline_lock; + }; const struct dma_fence_ops __rcu *ops; /* * We clear the callback list on kref_put so that by the time we @@ -100,6 +105,7 @@ struct dma_fence { enum dma_fence_flag_bits { DMA_FENCE_FLAG_INITIALIZED_BIT, + DMA_FENCE_FLAG_INLINE_LOCK_BIT, DMA_FENCE_FLAG_SEQNO64_BIT, DMA_FENCE_FLAG_SIGNALED_BIT, DMA_FENCE_FLAG_TIMESTAMP_BIT, @@ -381,11 +387,12 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep) * dma_fence_spinlock - return pointer to the spinlock protecting the fence * @fence: the fence to get the lock from * - * Return the pointer to the extern lock. + * Return either the pointer to the embedded or the external spin lock. */ static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) { - return fence->lock; + return test_bit(DMA_FENCE_FLAG_INLINE_LOCK_BIT, &fence->flags) ? + &fence->inline_lock : fence->extern_lock; } /** @@ -396,7 +403,7 @@ static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) * Lock the fence, preventing it from changing to the signaled state. */ #define dma_fence_lock_irqsave(fence, flags) \ - spin_lock_irqsave(fence->lock, flags) + spin_lock_irqsave(dma_fence_spinlock(fence), flags) /** * dma_fence_unlock_irqrestore - unlock the fence and irqrestore @@ -406,7 +413,7 @@ static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) * Unlock the fence, allowing it to change it's state to signaled again. 
*/ #define dma_fence_unlock_irqrestore(fence, flags) \ - spin_unlock_irqrestore(fence->lock, flags) + spin_unlock_irqrestore(dma_fence_spinlock(fence), flags) /** * dma_fence_assert_held - lockdep assertion that fence is locked -- cgit v1.2.3 From 5943243914b9fed8e26edcb9d45421721a5e3576 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 9 Oct 2025 16:18:53 +0200 Subject: dma-buf: use inline lock for the dma-fence-array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the inline lock is now the recommended way for dma_fence implementations. So use this approach for the framework's internal fences as well. Also saves about 4 bytes for the external spinlock. Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Philipp Stanner Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-8-christian.koenig@amd.com --- drivers/dma-buf/dma-fence-array.c | 5 ++--- include/linux/dma-fence-array.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c index 37e2c6179d77..cd970eceaefb 100644 --- a/drivers/dma-buf/dma-fence-array.c +++ b/drivers/dma-buf/dma-fence-array.c @@ -204,9 +204,8 @@ void dma_fence_array_init(struct dma_fence_array *array, array->num_fences = num_fences; - spin_lock_init(&array->lock); - dma_fence_init(&array->base, &dma_fence_array_ops, &array->lock, - context, seqno); + dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context, + seqno); init_irq_work(&array->work, irq_dma_fence_array_work); atomic_set(&array->num_pending, signal_on_any ? 
1 : num_fences); diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h index 079b3dec0a16..370b3d2bba37 100644 --- a/include/linux/dma-fence-array.h +++ b/include/linux/dma-fence-array.h @@ -38,7 +38,6 @@ struct dma_fence_array_cb { struct dma_fence_array { struct dma_fence base; - spinlock_t lock; unsigned num_fences; atomic_t num_pending; struct dma_fence **fences; -- cgit v1.2.3 From a408c0ca0c411ca1ead995bdae3112a806c87556 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 9 Oct 2025 16:32:33 +0200 Subject: dma-buf: use inline lock for the dma-fence-chain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the inline lock is now the recommended way for dma_fence implementations. So use this approach for the framework's internal fences as well. Also saves about 4 bytes for the external spinlock. Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Philipp Stanner Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-9-christian.koenig@amd.com --- drivers/dma-buf/dma-fence-chain.c | 3 +-- include/linux/dma-fence-chain.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-fence-chain.c b/drivers/dma-buf/dma-fence-chain.c index a8a90acf4f34..a707792b6025 100644 --- a/drivers/dma-buf/dma-fence-chain.c +++ b/drivers/dma-buf/dma-fence-chain.c @@ -245,7 +245,6 @@ void dma_fence_chain_init(struct dma_fence_chain *chain, struct dma_fence_chain *prev_chain = to_dma_fence_chain(prev); uint64_t context; - spin_lock_init(&chain->lock); rcu_assign_pointer(chain->prev, prev); chain->fence = fence; chain->prev_seqno = 0; @@ -261,7 +260,7 @@ void dma_fence_chain_init(struct dma_fence_chain *chain, seqno = max(prev->seqno, seqno); } - dma_fence_init64(&chain->base, &dma_fence_chain_ops, &chain->lock, + dma_fence_init64(&chain->base, &dma_fence_chain_ops, NULL, context, seqno); /* diff --git 
a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h index 5cd3ba53b4a1..df3beadf1515 100644 --- a/include/linux/dma-fence-chain.h +++ b/include/linux/dma-fence-chain.h @@ -46,7 +46,6 @@ struct dma_fence_chain { */ struct irq_work work; }; - spinlock_t lock; }; -- cgit v1.2.3 From 4aff230cf28b5f68a62fcd79de341c58245ea8e2 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Tue, 27 Jan 2026 12:45:49 +0530 Subject: dt-bindings: clock: qcom: document the Glymur GPU Clock Controller Glymur SoC has Qualcomm GX(graphics) clock controller and also the Graphics clock controller. The GX graphics clock controller helps in the recovery of the Graphics subsystem. Add bindings documentation for the Glymur Graphics Clock and Graphics power domain Controller for Glymur SoC. Signed-off-by: Taniya Das Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20260127-glymur_gpucc-v1-1-547334c81ba2@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- .../bindings/clock/qcom,kaanapali-gxclkctl.yaml | 1 + .../bindings/clock/qcom,sm8450-gpucc.yaml | 4 +- include/dt-bindings/clock/qcom,glymur-gpucc.h | 51 ++++++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 include/dt-bindings/clock/qcom,glymur-gpucc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,kaanapali-gxclkctl.yaml b/Documentation/devicetree/bindings/clock/qcom,kaanapali-gxclkctl.yaml index 5490a975f3db..55bf3f811017 100644 --- a/Documentation/devicetree/bindings/clock/qcom,kaanapali-gxclkctl.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,kaanapali-gxclkctl.yaml @@ -20,6 +20,7 @@ description: | properties: compatible: enum: + - qcom,glymur-gxclkctl - qcom,kaanapali-gxclkctl power-domains: diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml index 6feaa32569f9..5993804c91fa 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml +++ 
b/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml @@ -13,7 +13,8 @@ description: | Qualcomm graphics clock control module provides the clocks, resets and power domains on Qualcomm SoCs. - See also:: + See also: + include/dt-bindings/clock/qcom,glymur-gpucc.h include/dt-bindings/clock/qcom,kaanapali-gpucc.h include/dt-bindings/clock/qcom,milos-gpucc.h include/dt-bindings/clock/qcom,sar2130p-gpucc.h @@ -27,6 +28,7 @@ description: | properties: compatible: enum: + - qcom,glymur-gpucc - qcom,kaanapali-gpucc - qcom,milos-gpucc - qcom,sar2130p-gpucc diff --git a/include/dt-bindings/clock/qcom,glymur-gpucc.h b/include/dt-bindings/clock/qcom,glymur-gpucc.h new file mode 100644 index 000000000000..35f5abb848fd --- /dev/null +++ b/include/dt-bindings/clock/qcom,glymur-gpucc.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2025, Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_GPU_CC_GLYMUR_H +#define _DT_BINDINGS_CLK_QCOM_GPU_CC_GLYMUR_H + +/* GPU_CC clocks */ +#define GPU_CC_AHB_CLK 0 +#define GPU_CC_CB_CLK 1 +#define GPU_CC_CX_ACCU_SHIFT_CLK 2 +#define GPU_CC_CX_FF_CLK 3 +#define GPU_CC_CX_GMU_CLK 4 +#define GPU_CC_CXO_AON_CLK 5 +#define GPU_CC_CXO_CLK 6 +#define GPU_CC_DEMET_CLK 7 +#define GPU_CC_DPM_CLK 8 +#define GPU_CC_FF_CLK_SRC 9 +#define GPU_CC_FREQ_MEASURE_CLK 10 +#define GPU_CC_GMU_CLK_SRC 11 +#define GPU_CC_GPU_SMMU_VOTE_CLK 12 +#define GPU_CC_GX_ACCU_SHIFT_CLK 13 +#define GPU_CC_GX_ACD_AHB_FF_CLK 14 +#define GPU_CC_GX_AHB_FF_CLK 15 +#define GPU_CC_GX_GMU_CLK 16 +#define GPU_CC_GX_RCG_AHB_FF_CLK 17 +#define GPU_CC_HUB_AON_CLK 18 +#define GPU_CC_HUB_CLK_SRC 19 +#define GPU_CC_HUB_CX_INT_CLK 20 +#define GPU_CC_HUB_DIV_CLK_SRC 21 +#define GPU_CC_MEMNOC_GFX_CLK 22 +#define GPU_CC_PLL0 23 +#define GPU_CC_PLL0_OUT_EVEN 24 +#define GPU_CC_RSCC_HUB_AON_CLK 25 +#define GPU_CC_RSCC_XO_AON_CLK 26 +#define GPU_CC_SLEEP_CLK 27 + +/* GPU_CC power domains */ +#define 
GPU_CC_CX_GDSC 0 + +/* GPU_CC resets */ +#define GPU_CC_CB_BCR 0 +#define GPU_CC_CX_BCR 1 +#define GPU_CC_FAST_HUB_BCR 2 +#define GPU_CC_FF_BCR 3 +#define GPU_CC_GMU_BCR 4 +#define GPU_CC_GX_BCR 5 +#define GPU_CC_XO_BCR 6 + +#endif -- cgit v1.2.3 From 7c3260327fc874b7c89d7bb230cd569d2e78aff7 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Mon, 2 Feb 2026 16:26:50 +0530 Subject: dt-bindings: clock: qcom: Add GCC video axi reset clock for Glymur The global clock controller video axi reset clocks are required by the video SW driver to assert and deassert the clock resets. Signed-off-by: Taniya Das Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20260202-glymur_videocc-v2-1-8f7d8b4d8edd@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,glymur-gcc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,glymur-gcc.h b/include/dt-bindings/clock/qcom,glymur-gcc.h index 10c12b8c51c3..6907653c7992 100644 --- a/include/dt-bindings/clock/qcom,glymur-gcc.h +++ b/include/dt-bindings/clock/qcom,glymur-gcc.h @@ -574,5 +574,6 @@ #define GCC_VIDEO_AXI0_CLK_ARES 89 #define GCC_VIDEO_AXI1_CLK_ARES 90 #define GCC_VIDEO_BCR 91 +#define GCC_VIDEO_AXI0C_CLK_ARES 92 #endif -- cgit v1.2.3 From ed9ca829614735ab0de0c97af9239bd20a618de1 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Mon, 2 Feb 2026 16:26:51 +0530 Subject: dt-bindings: clock: qcom: Add video clock controller on Glymur SoC Add compatible string for Glymur video clock controller and the bindings for Glymur Qualcomm SoC. 
Signed-off-by: Taniya Das Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20260202-glymur_videocc-v2-2-8f7d8b4d8edd@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- .../bindings/clock/qcom,sm8450-videocc.yaml | 3 ++ include/dt-bindings/clock/qcom,glymur-videocc.h | 45 ++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,glymur-videocc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8450-videocc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8450-videocc.yaml index e6beebd6a36e..7bbf120d928c 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm8450-videocc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm8450-videocc.yaml @@ -15,6 +15,7 @@ description: | domains on SM8450. See also: + include/dt-bindings/clock/qcom,glymur-videocc.h include/dt-bindings/clock/qcom,kaanapali-videocc.h include/dt-bindings/clock/qcom,sm8450-videocc.h include/dt-bindings/clock/qcom,sm8650-videocc.h @@ -23,6 +24,7 @@ description: | properties: compatible: enum: + - qcom,glymur-videocc - qcom,kaanapali-videocc - qcom,sm8450-videocc - qcom,sm8475-videocc @@ -63,6 +65,7 @@ allOf: compatible: contains: enum: + - qcom,glymur-videocc - qcom,kaanapali-videocc - qcom,sm8450-videocc - qcom,sm8550-videocc diff --git a/include/dt-bindings/clock/qcom,glymur-videocc.h b/include/dt-bindings/clock/qcom,glymur-videocc.h new file mode 100644 index 000000000000..98c0debef8fa --- /dev/null +++ b/include/dt-bindings/clock/qcom,glymur-videocc.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+ */ + +#ifndef _DT_BINDINGS_CLK_QCOM_VIDEO_CC_GLYMUR_H +#define _DT_BINDINGS_CLK_QCOM_VIDEO_CC_GLYMUR_H + +/* VIDEO_CC clocks */ +#define VIDEO_CC_AHB_CLK 0 +#define VIDEO_CC_AHB_CLK_SRC 1 +#define VIDEO_CC_MVS0_CLK 2 +#define VIDEO_CC_MVS0_CLK_SRC 3 +#define VIDEO_CC_MVS0_DIV_CLK_SRC 4 +#define VIDEO_CC_MVS0_FREERUN_CLK 5 +#define VIDEO_CC_MVS0_SHIFT_CLK 6 +#define VIDEO_CC_MVS0C_CLK 7 +#define VIDEO_CC_MVS0C_DIV2_DIV_CLK_SRC 8 +#define VIDEO_CC_MVS0C_FREERUN_CLK 9 +#define VIDEO_CC_MVS0C_SHIFT_CLK 10 +#define VIDEO_CC_MVS1_CLK 11 +#define VIDEO_CC_MVS1_DIV_CLK_SRC 12 +#define VIDEO_CC_MVS1_FREERUN_CLK 13 +#define VIDEO_CC_MVS1_SHIFT_CLK 14 +#define VIDEO_CC_PLL0 15 +#define VIDEO_CC_SLEEP_CLK 16 +#define VIDEO_CC_SLEEP_CLK_SRC 17 +#define VIDEO_CC_XO_CLK 18 +#define VIDEO_CC_XO_CLK_SRC 19 + +/* VIDEO_CC power domains */ +#define VIDEO_CC_MVS0_GDSC 0 +#define VIDEO_CC_MVS0C_GDSC 1 +#define VIDEO_CC_MVS1_GDSC 2 + +/* VIDEO_CC resets */ +#define VIDEO_CC_INTERFACE_BCR 0 +#define VIDEO_CC_MVS0_BCR 1 +#define VIDEO_CC_MVS0C_BCR 2 +#define VIDEO_CC_MVS0C_FREERUN_CLK_ARES 3 +#define VIDEO_CC_MVS0_FREERUN_CLK_ARES 4 +#define VIDEO_CC_MVS1_FREERUN_CLK_ARES 5 +#define VIDEO_CC_XO_CLK_ARES 6 +#define VIDEO_CC_MVS1_BCR 7 +#endif -- cgit v1.2.3 From 9fe89f022c05d99c052d6bc088b82d4ff83bf463 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Jan 2026 16:17:48 +0100 Subject: sched/fair: More complex proportional newidle balance It turns out that a few workloads (easyWave, fio) have a fairly low success rate on newidle balance, but still benefit greatly from having it anyway. Luckily these workloads have a fairly low newidle rate, so the cost of doing the newidle is relatively low, even if unsuccessful. Add a simple rate based part to the newidle ratio compute, such that low rate newidle will still have a high newidle ratio. This cures the easyWave and fio workloads while not affecting the schbench numbers either (which have a very high newidle rate). 
Reported-by: Mario Roy Reported-by: "Mohamed Abuelfotoh, Hazem" Signed-off-by: Peter Zijlstra (Intel) Tested-by: Mario Roy Tested-by: "Mohamed Abuelfotoh, Hazem" Link: https://patch.msgid.link/20260127151748.GA1079264@noisy.programming.kicks-ass.net --- include/linux/sched/topology.h | 1 + kernel/sched/fair.c | 27 +++++++++++++++++++++++++-- kernel/sched/features.h | 1 + kernel/sched/topology.c | 3 +++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 45c0022b91ce..a1e1032426dc 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -95,6 +95,7 @@ struct sched_domain { unsigned int newidle_call; unsigned int newidle_success; unsigned int newidle_ratio; + u64 newidle_stamp; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf948db905ed..66afa0ac7396 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12289,7 +12289,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su sd->newidle_success += success; if (sd->newidle_call >= 1024) { - sd->newidle_ratio = sd->newidle_success; + u64 now = sched_clock(); + s64 delta = now - sd->newidle_stamp; + sd->newidle_stamp = now; + int ratio = 0; + + if (delta < 0) + delta = 0; + + if (sched_feat(NI_RATE)) { + /* + * ratio delta freq + * + * 1024 - 4 s - 128 Hz + * 512 - 2 s - 256 Hz + * 256 - 1 s - 512 Hz + * 128 - .5 s - 1024 Hz + * 64 - .25 s - 2048 Hz + */ + ratio = delta >> 22; + } + + ratio += sd->newidle_success; + + sd->newidle_ratio = min(1024, ratio); sd->newidle_call /= 2; sd->newidle_success /= 2; } @@ -12996,7 +13019,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (sd->flags & SD_BALANCE_NEWIDLE) { unsigned int weight = 1; - if (sched_feat(NI_RANDOM)) { + if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) { /* * Throw a 1k sided dice; 
and only run * newidle_balance according to the success diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 136a6584be79..37d5928fa6dd 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -126,3 +126,4 @@ SCHED_FEAT(LATENCY_WARN, false) * Do newidle balancing proportional to its success rate using randomization. */ SCHED_FEAT(NI_RANDOM, true) +SCHED_FEAT(NI_RATE, true) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 32dcddaead82..061f8c85f555 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -4,6 +4,7 @@ */ #include +#include #include #include "sched.h" @@ -1642,6 +1643,7 @@ sd_init(struct sched_domain_topology_level *tl, struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; + u64 now = sched_clock(); sd_weight = cpumask_weight(tl->mask(tl, cpu)); @@ -1679,6 +1681,7 @@ sd_init(struct sched_domain_topology_level *tl, .newidle_call = 512, .newidle_success = 256, .newidle_ratio = 512, + .newidle_stamp = now, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, -- cgit v1.2.3 From be6d4c9e9d714ebbf358be41332726a0f94b9ffa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 31 Jan 2026 07:34:16 +0200 Subject: dma-buf: Add dma_buf_attach_revocable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some exporters need a flow to synchronously revoke access to the DMA-buf by importers. Once revoke is completed the importer is not permitted to touch the memory otherwise they may get IOMMU faults, AERs, or worse. 
DMA-buf today defines a revoke flow, for both pinned and dynamic importers, which is broadly: dma_resv_lock(dmabuf->resv, NULL); // Prevent new mappings from being established priv->revoked = true; // Tell all importers to eventually unmap dma_buf_invalidate_mappings(dmabuf); // Wait for any inprogress fences on the old mapping dma_resv_wait_timeout(dmabuf->resv, DMA_RESV_USAGE_BOOKKEEP, false, MAX_SCHEDULE_TIMEOUT); dma_resv_unlock(dmabuf->resv, NULL); // Wait for all importers to complete unmap wait_for_completion(&priv->unmapped_comp); This works well, and an importer that continues to access the DMA-buf after unmapping it is very buggy. However, the final wait for unmap is effectively unbounded. Several importers do not support invalidate_mappings() at all and won't unmap until userspace triggers it. This unbounded wait is not suitable for exporters like VFIO and RDMA that need to issue revoke as part of their normal operations. Add dma_buf_attach_revocable() to allow exporters to determine the difference between importers that can complete the above in bounded time, and those that can't. It can be called inside the exporter's attach op to reject incompatible importers. Document these details about how dma_buf_invalidate_mappings() works and what the required sequence is to achieve a full revocation. 
Signed-off-by: Leon Romanovsky Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260131-dmabuf-revoke-v7-6-463d956bd527@nvidia.com --- drivers/dma-buf/dma-buf.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++- include/linux/dma-buf.h | 9 +++------ 2 files changed, 50 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 3b32f15fbc18..a202a308c079 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1318,13 +1318,59 @@ void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach, } EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment_unlocked, "DMA_BUF"); +/** + * dma_buf_attach_revocable - check if a DMA-buf importer implements + * revoke semantics. + * @attach: the DMA-buf attachment to check + * + * Returns true if the DMA-buf importer can support the revoke sequence + * explained in dma_buf_invalidate_mappings() within bounded time. Meaning the + * importer implements invalidate_mappings() and ensures that unmap is called as + * a result. + */ +bool dma_buf_attach_revocable(struct dma_buf_attachment *attach) +{ + return attach->importer_ops && + attach->importer_ops->invalidate_mappings; +} +EXPORT_SYMBOL_NS_GPL(dma_buf_attach_revocable, "DMA_BUF"); + /** * dma_buf_invalidate_mappings - notify attachments that DMA-buf is moving * * @dmabuf: [in] buffer which is moving * * Informs all attachments that they need to destroy and recreate all their - * mappings. + * mappings. If the attachment is dynamic then the dynamic importer is expected + * to invalidate any caches it has of the mapping result and perform a new + * mapping request before allowing HW to do any further DMA. + * + * If the attachment is pinned then this informs the pinned importer that the + * underlying mapping is no longer available. 
Pinned importers may take this is + * as a permanent revocation and never establish new mappings so exporters + * should not trigger it lightly. + * + * Upon return importers may continue to access the DMA-buf memory. The caller + * must do two additional waits to ensure that the memory is no longer being + * accessed: + * 1) Until dma_resv_wait_timeout() retires fences the importer is allowed to + * fully access the memory. + * 2) Until the importer calls unmap it is allowed to speculatively + * read-and-discard the memory. It must not write to the memory. + * + * A caller wishing to use dma_buf_invalidate_mappings() to fully stop access to + * the DMA-buf must wait for both. Dynamic callers can often use just the first. + * + * All importers providing a invalidate_mappings() op must ensure that unmap is + * called within bounded time after the op. + * + * Pinned importers that do not support a invalidate_mappings() op will + * eventually perform unmap when they are done with the buffer, which may be an + * ubounded time from calling this function. dma_buf_attach_revocable() can be + * used to prevent such importers from attaching. + * + * Importers are free to request a new mapping in parallel as this function + * returns. */ void dma_buf_invalidate_mappings(struct dma_buf *dmabuf) { diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index e744b8f9bfad..166933b82e27 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -456,12 +456,8 @@ struct dma_buf_attach_ops { * called with this lock held as well. This makes sure that no mapping * is created concurrently with an ongoing move operation. * - * Mappings stay valid and are not directly affected by this callback. - * But the DMA-buf can now be in a different physical location, so all - * mappings should be destroyed and re-created as soon as possible. - * - * New mappings can be created after this callback returns, and will - * point to the new location of the DMA-buf. 
+ * See the kdoc for dma_buf_invalidate_mappings() for details on the + * required behavior. */ void (*invalidate_mappings)(struct dma_buf_attachment *attach); }; @@ -579,6 +575,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *, void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *, enum dma_data_direction); void dma_buf_invalidate_mappings(struct dma_buf *dma_buf); +bool dma_buf_attach_revocable(struct dma_buf_attachment *attach); int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); int dma_buf_end_cpu_access(struct dma_buf *dma_buf, -- cgit v1.2.3 From ebf1ccff79c43f860cbd2f9d6cfab9a462d0cb2d Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 18 Feb 2026 09:32:17 +0100 Subject: sched_ext: Fix ops.dequeue() semantics Currently, ops.dequeue() is only invoked when the sched_ext core knows that a task resides in BPF-managed data structures, which causes it to miss scheduling property change events. In addition, ops.dequeue() callbacks are completely skipped when tasks are dispatched to non-local DSQs from ops.select_cpu(). As a result, BPF schedulers cannot reliably track task state. Fix this by guaranteeing that each task entering the BPF scheduler's custody triggers exactly one ops.dequeue() call when it leaves that custody, whether the exit is due to a dispatch (regular or via a core scheduling pick) or to a scheduling property change (e.g. sched_setaffinity(), sched_setscheduler(), set_user_nice(), NUMA balancing, etc.). BPF scheduler custody concept: a task is considered to be in the BPF scheduler's custody when the scheduler is responsible for managing its lifecycle. This includes tasks dispatched to user-created DSQs or stored in the BPF scheduler's internal data structures from ops.enqueue(). Custody ends when the task is dispatched to a terminal DSQ (such as the local DSQ or %SCX_DSQ_GLOBAL), selected by core scheduling, or removed due to a property change. 
Tasks directly dispatched to terminal DSQs bypass the BPF scheduler entirely and are never in its custody. Terminal DSQs include: - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON): per-CPU queues where tasks go directly to execution. - Global DSQ (%SCX_DSQ_GLOBAL): the built-in fallback queue where the BPF scheduler is considered "done" with the task. As a result, ops.dequeue() is not invoked for tasks directly dispatched to terminal DSQs. To identify dequeues triggered by scheduling property changes, introduce the new ops.dequeue() flag %SCX_DEQ_SCHED_CHANGE: when this flag is set, the dequeue was caused by a scheduling property change. New ops.dequeue() semantics: - ops.dequeue() is invoked exactly once when the task leaves the BPF scheduler's custody, in one of the following cases: a) regular dispatch: a task dispatched to a user DSQ or stored in internal BPF data structures is moved to a terminal DSQ (ops.dequeue() called without any special flags set), b) core scheduling dispatch: core-sched picks task before dispatch (ops.dequeue() called with %SCX_DEQ_CORE_SCHED_EXEC flag set), c) property change: task properties modified before dispatch, (ops.dequeue() called with %SCX_DEQ_SCHED_CHANGE flag set). This allows BPF schedulers to: - reliably track task ownership and lifecycle, - maintain accurate accounting of managed tasks, - update internal state when tasks change properties. 
Cc: Tejun Heo Cc: Emil Tsalapatis Cc: Kuba Piecuch Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- Documentation/scheduler/sched-ext.rst | 78 +++++++++++++++-- include/linux/sched/ext.h | 1 + kernel/sched/ext.c | 110 ++++++++++++++++++++++-- kernel/sched/ext_internal.h | 7 ++ tools/sched_ext/include/scx/enum_defs.autogen.h | 1 + tools/sched_ext/include/scx/enums.autogen.bpf.h | 2 + tools/sched_ext/include/scx/enums.autogen.h | 1 + 7 files changed, 184 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 9e2882d937b4..7cb77fd2e4d7 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -228,16 +228,23 @@ The following briefly shows how a waking task is scheduled and executed. scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, using ``ops.select_cpu()`` judiciously can be simpler and more efficient. - A task can be immediately inserted into a DSQ from ``ops.select_cpu()`` - by calling ``scx_bpf_dsq_insert()``. If the task is inserted into - ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be inserted into the - local DSQ of whichever CPU is returned from ``ops.select_cpu()``. - Additionally, inserting directly from ``ops.select_cpu()`` will cause the - ``ops.enqueue()`` callback to be skipped. - Note that the scheduler core will ignore an invalid CPU selection, for example, if it's outside the allowed cpumask of the task. + A task can be immediately inserted into a DSQ from ``ops.select_cpu()`` + by calling ``scx_bpf_dsq_insert()`` or ``scx_bpf_dsq_insert_vtime()``. + + If the task is inserted into ``SCX_DSQ_LOCAL`` from + ``ops.select_cpu()``, it will be added to the local DSQ of whichever CPU + is returned from ``ops.select_cpu()``. Additionally, inserting directly + from ``ops.select_cpu()`` will cause the ``ops.enqueue()`` callback to + be skipped. 
+ + Any other attempt to store a task in BPF-internal data structures from + ``ops.select_cpu()`` does not prevent ``ops.enqueue()`` from being + invoked. This is discouraged, as it can introduce racy behavior or + inconsistent state. + 2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the task was inserted directly from ``ops.select_cpu()``). ``ops.enqueue()`` can make one of the following decisions: @@ -251,6 +258,61 @@ The following briefly shows how a waking task is scheduled and executed. * Queue the task on the BPF side. + **Task State Tracking and ops.dequeue() Semantics** + + A task is in the "BPF scheduler's custody" when the BPF scheduler is + responsible for managing its lifecycle. A task enters custody when it is + dispatched to a user DSQ or stored in the BPF scheduler's internal data + structures. Custody is entered only from ``ops.enqueue()`` for those + operations. The only exception is dispatching to a user DSQ from + ``ops.select_cpu()``: although the task is not yet technically in BPF + scheduler custody at that point, the dispatch has the same semantic + effect as dispatching from ``ops.enqueue()`` for custody-related + purposes. + + Once ``ops.enqueue()`` is called, the task may or may not enter custody + depending on what the scheduler does: + + * **Directly dispatched to terminal DSQs** (``SCX_DSQ_LOCAL``, + ``SCX_DSQ_LOCAL_ON | cpu``, or ``SCX_DSQ_GLOBAL``): the BPF scheduler + is done with the task - it either goes straight to a CPU's local run + queue or to the global DSQ as a fallback. The task never enters (or + exits) BPF custody, and ``ops.dequeue()`` will not be called. + + * **Dispatch to user-created DSQs** (custom DSQs): the task enters the + BPF scheduler's custody. When the task later leaves BPF custody + (dispatched to a terminal DSQ, picked by core-sched, or dequeued for + sleep/property changes), ``ops.dequeue()`` will be called exactly + once. 
+ + * **Stored in BPF data structures** (e.g., internal BPF queues): the + task is in BPF custody. ``ops.dequeue()`` will be called when it + leaves (e.g., when ``ops.dispatch()`` moves it to a terminal DSQ, or + on property change / sleep). + + When a task leaves BPF scheduler custody, ``ops.dequeue()`` is invoked. + The dequeue can happen for different reasons, distinguished by flags: + + 1. **Regular dispatch**: when a task in BPF custody is dispatched to a + terminal DSQ from ``ops.dispatch()`` (leaving BPF custody for + execution), ``ops.dequeue()`` is triggered without any special flags. + + 2. **Core scheduling pick**: when ``CONFIG_SCHED_CORE`` is enabled and + core scheduling picks a task for execution while it's still in BPF + custody, ``ops.dequeue()`` is called with the + ``SCX_DEQ_CORE_SCHED_EXEC`` flag. + + 3. **Scheduling property change**: when a task property changes (via + operations like ``sched_setaffinity()``, ``sched_setscheduler()``, + priority changes, CPU migrations, etc.) while the task is still in + BPF custody, ``ops.dequeue()`` is called with the + ``SCX_DEQ_SCHED_CHANGE`` flag set in ``deq_flags``. + + **Important**: Once a task has left BPF custody (e.g., after being + dispatched to a terminal DSQ), property changes will not trigger + ``ops.dequeue()``, since the task is no longer managed by the BPF + scheduler. + 3. When a CPU is ready to schedule, it first looks at its local DSQ. If empty, it then looks at the global DSQ. 
If there still isn't a task to run, ``ops.dispatch()`` is invoked which can use the following two @@ -318,6 +380,8 @@ by a sched_ext scheduler: /* Any usable CPU becomes available */ ops.dispatch(); /* Task is moved to a local DSQ */ + + ops.dequeue(); /* Exiting BPF scheduler */ } ops.running(); /* Task starts running on its assigned CPU */ while (task->scx.slice > 0 && task is runnable) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index bcb962d5ee7d..4601e5ecb43c 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -84,6 +84,7 @@ struct scx_dispatch_q { /* scx_entity.flags */ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 044bb2168dd0..d5e688b9acc0 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -986,12 +986,45 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } +/* + * Return true if @p is moving due to an internal SCX migration, false + * otherwise. + */ +static inline bool task_scx_migrating(struct task_struct *p) +{ + /* + * We only need to check sticky_cpu: it is set to the destination + * CPU in move_remote_task_to_local_dsq() before deactivate_task() + * and cleared when the task is enqueued on the destination, so it + * is only non-negative during an internal SCX migration. + */ + return p->scx.sticky_cpu >= 0; +} + +/* + * Call ops.dequeue() if the task is in BPF custody and not migrating. + * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked. 
+ */ +static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, + struct task_struct *p, u64 deq_flags) +{ + if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p)) + return; + + if (SCX_HAS_OP(sch, dequeue)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, p, deq_flags); + + p->scx.flags &= ~SCX_TASK_IN_CUSTODY; +} + static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); bool preempt = false; + call_task_dequeue(scx_root, rq, p, 0); + /* * If @rq is in balance, the CPU is already vacant and looking for the * next task to run. No need to preempt or trigger resched after moving @@ -1115,17 +1148,34 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; p->scx.ddsp_enq_flags = 0; + /* + * Update custody and call ops.dequeue() before clearing ops_state: + * once ops_state is cleared, waiters in ops_dequeue() can proceed + * and dequeue_task_scx() will RMW p->scx.flags. If we clear + * ops_state first, both sides would modify p->scx.flags + * concurrently in a non-atomic way. + */ + if (is_local) { + local_dsq_post_enq(dsq, p, enq_flags); + } else { + /* + * Task on global/bypass DSQ: leave custody, task on + * non-terminal DSQ: enter custody. + */ + if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS) + call_task_dequeue(sch, rq, p, 0); + else + p->scx.flags |= SCX_TASK_IN_CUSTODY; + + raw_spin_unlock(&dsq->lock); + } + /* * We're transitioning out of QUEUEING or DISPATCHING. store_release to * match waiters' load_acquire. 
*/ if (enq_flags & SCX_ENQ_CLEAR_OPSS) atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); - - if (is_local) - local_dsq_post_enq(dsq, p, enq_flags); - else - raw_spin_unlock(&dsq->lock); } static void task_unlink_from_dsq(struct task_struct *p, @@ -1405,6 +1455,12 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; + /* + * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY + * so ops.dequeue() is called when it leaves custody. + */ + p->scx.flags |= SCX_TASK_IN_CUSTODY; + /* * If not directly dispatched, QUEUEING isn't clear yet and dispatch or * dequeue may be waiting. The store_release matches their load_acquire. @@ -1522,6 +1578,14 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) { struct scx_sched *sch = scx_root; unsigned long opss; + u64 op_deq_flags = deq_flags; + + /* + * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property + * change (not sleep or core-sched pick). + */ + if (!(op_deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC))) + op_deq_flags |= SCX_DEQ_SCHED_CHANGE; /* dequeue is always temporary, don't reset runnable_at */ clr_task_runnable(p, false); @@ -1539,10 +1603,8 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) */ BUG(); case SCX_OPSS_QUEUED: - if (SCX_HAS_OP(sch, dequeue)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, - p, deq_flags); - + /* A queued task must always be in BPF scheduler's custody */ + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY)); if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) break; @@ -1565,6 +1627,22 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); break; } + + /* + * Call ops.dequeue() if the task is still in BPF custody. 
+ * + * The code that clears ops_state to %SCX_OPSS_NONE does not always + * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when + * we're moving a task that was in %SCX_OPSS_DISPATCHING to a + * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE + * so that a concurrent dequeue can proceed, but we clear + * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the + * task. So we can see NONE + IN_CUSTODY here and we must handle + * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see + * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until + * it is enqueued on the destination. + */ + call_task_dequeue(sch, rq, p, op_deq_flags); } static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) @@ -2935,6 +3013,13 @@ static void scx_enable_task(struct task_struct *p) lockdep_assert_rq_held(rq); + /* + * Verify the task is not in BPF scheduler's custody. If flag + * transitions are consistent, the flag should always be clear + * here. + */ + WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); + /* * Set the weight before calling ops.enable() so that the scheduler * doesn't see a stale value if they inspect the task struct. @@ -2966,6 +3051,13 @@ static void scx_disable_task(struct task_struct *p) if (SCX_HAS_OP(sch, disable)) SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); + + /* + * Verify the task is not in BPF scheduler's custody. If flag + * transitions are consistent, the flag should always be clear + * here. + */ + WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); } static void scx_exit_task(struct task_struct *p) diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 386c677e4c9a..befa9a5d6e53 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -982,6 +982,13 @@ enum scx_deq_flags { * it hasn't been dispatched yet. Dequeue from the BPF side. 
*/ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, + + /* + * The task is being dequeued due to a property change (e.g., + * sched_setaffinity(), sched_setscheduler(), set_user_nice(), + * etc.). + */ + SCX_DEQ_SCHED_CHANGE = 1LLU << 33, }; enum scx_pick_idle_cpu_flags { diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h index c2c33df9292c..dcc945304760 100644 --- a/tools/sched_ext/include/scx/enum_defs.autogen.h +++ b/tools/sched_ext/include/scx/enum_defs.autogen.h @@ -21,6 +21,7 @@ #define HAVE_SCX_CPU_PREEMPT_UNKNOWN #define HAVE_SCX_DEQ_SLEEP #define HAVE_SCX_DEQ_CORE_SCHED_EXEC +#define HAVE_SCX_DEQ_SCHED_CHANGE #define HAVE_SCX_DSQ_FLAG_BUILTIN #define HAVE_SCX_DSQ_FLAG_LOCAL_ON #define HAVE_SCX_DSQ_INVALID diff --git a/tools/sched_ext/include/scx/enums.autogen.bpf.h b/tools/sched_ext/include/scx/enums.autogen.bpf.h index 2f8002bcc19a..5da50f937684 100644 --- a/tools/sched_ext/include/scx/enums.autogen.bpf.h +++ b/tools/sched_ext/include/scx/enums.autogen.bpf.h @@ -127,3 +127,5 @@ const volatile u64 __SCX_ENQ_CLEAR_OPSS __weak; const volatile u64 __SCX_ENQ_DSQ_PRIQ __weak; #define SCX_ENQ_DSQ_PRIQ __SCX_ENQ_DSQ_PRIQ +const volatile u64 __SCX_DEQ_SCHED_CHANGE __weak; +#define SCX_DEQ_SCHED_CHANGE __SCX_DEQ_SCHED_CHANGE diff --git a/tools/sched_ext/include/scx/enums.autogen.h b/tools/sched_ext/include/scx/enums.autogen.h index fedec938584b..fc9a7a4d9dea 100644 --- a/tools/sched_ext/include/scx/enums.autogen.h +++ b/tools/sched_ext/include/scx/enums.autogen.h @@ -46,4 +46,5 @@ SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_LAST); \ SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_CLEAR_OPSS); \ SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_DSQ_PRIQ); \ + SCX_ENUM_SET(skel, scx_deq_flags, SCX_DEQ_SCHED_CHANGE); \ } while (0) -- cgit v1.2.3 From baff45179e90276a14acb9dffce17ff517708453 Mon Sep 17 00:00:00 2001 From: Jishnu Prakash Date: Fri, 30 Jan 2026 17:24:20 +0530 Subject: iio: adc: Add support for QCOM PMIC5 Gen3 ADC The ADC 
architecture on PMIC5 Gen3 is similar to that on PMIC5 Gen2, with all SW communication to ADC going through PMK8550 which communicates with other PMICs through PBS. One major difference is that the register interface used here is that of an SDAM (Shared Direct Access Memory) peripheral present on PMK8550. There may be more than one SDAM used for ADC5 Gen3 and each has eight channels, which may be used for either immediate reads (same functionality as previous PMIC5 and PMIC5 Gen2 ADC peripherals) or recurring measurements (same as ADC_TM functionality). By convention, we reserve the first channel of the first SDAM for all immediate reads and use the remaining channels across all SDAMs for ADC_TM monitoring functionality. Add support for PMIC5 Gen3 ADC driver for immediate read functionality. ADC_TM is implemented as an auxiliary thermal driver under this ADC driver. Signed-off-by: Jishnu Prakash Signed-off-by: Jonathan Cameron --- drivers/iio/adc/Kconfig | 26 + drivers/iio/adc/Makefile | 1 + drivers/iio/adc/qcom-spmi-adc5-gen3.c | 860 ++++++++++++++++++++++++++ include/linux/iio/adc/qcom-adc5-gen3-common.h | 211 +++++++ 4 files changed, 1098 insertions(+) create mode 100644 drivers/iio/adc/qcom-spmi-adc5-gen3.c create mode 100644 include/linux/iio/adc/qcom-adc5-gen3-common.h (limited to 'include') diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index 60038ae8dfc4..1f5915dd192d 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -1366,6 +1366,32 @@ config QCOM_SPMI_ADC5 To compile this driver as a module, choose M here: the module will be called qcom-spmi-adc5. +config QCOM_SPMI_ADC5_GEN3 + tristate "Qualcomm Technologies Inc. SPMI PMIC5 GEN3 ADC" + depends on SPMI && THERMAL + select REGMAP_SPMI + select QCOM_VADC_COMMON + select AUXILIARY_BUS + help + IIO Voltage PMIC5 Gen3 ADC driver for Qualcomm Technologies Inc. + + The driver supports reading multiple channels. The ADC is a 16-bit + sigma-delta ADC. 
The hardware supports calibrated results for + conversion requests and clients include reading phone power supply + voltage, on board system thermistors connected to the PMIC ADC, + PMIC die temperature, charger temperature, battery current, USB + voltage input and voltage signals connected to supported PMIC GPIO + pins. The hardware supports internal pull-up for thermistors and can + choose between a 30k, 100k or 400k ohm pull up using the ADC channels. + + In addition, the same driver supports ADC thermal monitoring devices + too. They appear as thermal zones with multiple trip points. A thermal + client sets threshold temperature for both warm and cool trips and + gets updated when a threshold is reached. + + To compile this driver as a module, choose M here: the module will + be called qcom-spmi-adc5-gen3. + config RCAR_GYRO_ADC tristate "Renesas R-Car GyroADC driver" depends on ARCH_RCAR_GEN2 || COMPILE_TEST diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile index c76550415ff1..097357d146ba 100644 --- a/drivers/iio/adc/Makefile +++ b/drivers/iio/adc/Makefile @@ -116,6 +116,7 @@ obj-$(CONFIG_PAC1934) += pac1934.o obj-$(CONFIG_PALMAS_GPADC) += palmas_gpadc.o obj-$(CONFIG_QCOM_PM8XXX_XOADC) += qcom-pm8xxx-xoadc.o obj-$(CONFIG_QCOM_SPMI_ADC5) += qcom-spmi-adc5.o +obj-$(CONFIG_QCOM_SPMI_ADC5_GEN3) += qcom-spmi-adc5-gen3.o obj-$(CONFIG_QCOM_SPMI_IADC) += qcom-spmi-iadc.o obj-$(CONFIG_QCOM_SPMI_RRADC) += qcom-spmi-rradc.o obj-$(CONFIG_QCOM_SPMI_VADC) += qcom-spmi-vadc.o diff --git a/drivers/iio/adc/qcom-spmi-adc5-gen3.c b/drivers/iio/adc/qcom-spmi-adc5-gen3.c new file mode 100644 index 000000000000..f8168a14b907 --- /dev/null +++ b/drivers/iio/adc/qcom-spmi-adc5-gen3.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ADC5_GEN3_VADC_SDAM 0x0 + +struct adc5_chip; + +/** + * struct adc5_channel_prop - ADC channel structure + * @common_props: structure with ADC channel properties (common to TM usage). + * @adc_tm: indicates TM type if the channel is used for TM measurements. + * @chip: pointer to top-level ADC device structure. + */ +struct adc5_channel_prop { + struct adc5_channel_common_prop common_props; + int adc_tm; + struct adc5_chip *chip; +}; + +/** + * struct adc5_chip - ADC private structure. + * @dev: SPMI ADC5 Gen3 device. + * @dev_data: Top-level ADC device data. + * @nchannels: number of ADC channels. + * @chan_props: array of ADC channel properties. + * @iio_chans: array of IIO channels specification. + * @complete: ADC result notification after interrupt is received. + * @lock: ADC lock for access to the peripheral, to prevent concurrent + * requests from multiple clients. + * @data: software configuration data. + * @n_tm_channels: number of ADC channels used for TM measurements. + * @handler: TM callback to be called for threshold violation interrupt + * on first SDAM. + * @tm_aux: pointer to auxiliary TM device. 
+ */ +struct adc5_chip { + struct device *dev; + struct adc5_device_data dev_data; + unsigned int nchannels; + struct adc5_channel_prop *chan_props; + struct iio_chan_spec *iio_chans; + struct completion complete; + struct mutex lock; + const struct adc5_data *data; + unsigned int n_tm_channels; + void (*handler)(struct auxiliary_device *tm_aux); + struct auxiliary_device *tm_aux; +}; + +int adc5_gen3_read(struct adc5_device_data *adc, unsigned int sdam_index, + u16 offset, u8 *data, int len) +{ + return regmap_bulk_read(adc->regmap, + adc->base[sdam_index].base_addr + offset, + data, len); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_read, "QCOM_SPMI_ADC5_GEN3"); + +int adc5_gen3_write(struct adc5_device_data *adc, unsigned int sdam_index, + u16 offset, u8 *data, int len) +{ + return regmap_bulk_write(adc->regmap, + adc->base[sdam_index].base_addr + offset, + data, len); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_write, "QCOM_SPMI_ADC5_GEN3"); + +static int adc5_gen3_read_voltage_data(struct adc5_chip *adc, u16 *data) +{ + u8 rslt[2]; + int ret; + + ret = adc5_gen3_read(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_CH_DATA0(0), rslt, sizeof(rslt)); + if (ret) + return ret; + + *data = get_unaligned_le16(rslt); + + if (*data == ADC5_USR_DATA_CHECK) { + dev_err(adc->dev, "Invalid data:%#x\n", *data); + return -EINVAL; + } + + dev_dbg(adc->dev, "voltage raw code:%#x\n", *data); + + return 0; +} + +void adc5_gen3_update_dig_param(struct adc5_channel_common_prop *prop, u8 *data) +{ + /* Update calibration select and decimation ratio select */ + *data &= ~(ADC5_GEN3_DIG_PARAM_CAL_SEL_MASK | ADC5_GEN3_DIG_PARAM_DEC_RATIO_SEL_MASK); + *data |= FIELD_PREP(ADC5_GEN3_DIG_PARAM_CAL_SEL_MASK, prop->cal_method); + *data |= FIELD_PREP(ADC5_GEN3_DIG_PARAM_DEC_RATIO_SEL_MASK, prop->decimation); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_update_dig_param, "QCOM_SPMI_ADC5_GEN3"); + +#define ADC5_GEN3_READ_CONFIG_REGS 7 + +static int adc5_gen3_configure(struct adc5_chip *adc, + struct 
adc5_channel_common_prop *prop) +{ + u8 buf[ADC5_GEN3_READ_CONFIG_REGS]; + u8 conv_req = 0; + int ret; + + ret = adc5_gen3_read(&adc->dev_data, ADC5_GEN3_VADC_SDAM, ADC5_GEN3_SID, + buf, sizeof(buf)); + if (ret) + return ret; + + /* Write SID */ + buf[0] = FIELD_PREP(ADC5_GEN3_SID_MASK, prop->sid); + + /* + * Use channel 0 by default for immediate conversion and to indicate + * there is an actual conversion request + */ + buf[1] = ADC5_GEN3_CHAN_CONV_REQ | 0; + + buf[2] = ADC5_GEN3_TIME_IMMEDIATE; + + /* Digital param selection */ + adc5_gen3_update_dig_param(prop, &buf[3]); + + /* Update fast average sample value */ + buf[4] = FIELD_PREP(ADC5_GEN3_FAST_AVG_CTL_SAMPLES_MASK, + prop->avg_samples) | ADC5_GEN3_FAST_AVG_CTL_EN; + + /* Select ADC channel */ + buf[5] = prop->channel; + + /* Select HW settle delay for channel */ + buf[6] = FIELD_PREP(ADC5_GEN3_HW_SETTLE_DELAY_MASK, + prop->hw_settle_time_us); + + reinit_completion(&adc->complete); + + ret = adc5_gen3_write(&adc->dev_data, ADC5_GEN3_VADC_SDAM, ADC5_GEN3_SID, + buf, sizeof(buf)); + if (ret) + return ret; + + conv_req = ADC5_GEN3_CONV_REQ_REQ; + return adc5_gen3_write(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_CONV_REQ, &conv_req, sizeof(conv_req)); +} + +/* + * Worst case delay from PBS in readying handshake bit can be up to 15ms, when + * PBS is busy running other simultaneous transactions, while in the best case, + * it is already ready at this point. Assigning polling delay and retry count + * accordingly. 
+ */ + +#define ADC5_GEN3_HS_DELAY_US 100 +#define ADC5_GEN3_HS_RETRY_COUNT 150 + +int adc5_gen3_poll_wait_hs(struct adc5_device_data *adc, + unsigned int sdam_index) +{ + u8 conv_req = ADC5_GEN3_CONV_REQ_REQ; + int ret, count; + u8 status = 0; + + for (count = 0; count < ADC5_GEN3_HS_RETRY_COUNT; count++) { + ret = adc5_gen3_read(adc, sdam_index, ADC5_GEN3_HS, &status, sizeof(status)); + if (ret) + return ret; + + if (status == ADC5_GEN3_HS_READY) { + ret = adc5_gen3_read(adc, sdam_index, ADC5_GEN3_CONV_REQ, + &conv_req, sizeof(conv_req)); + if (ret) + return ret; + + if (!conv_req) + return 0; + } + + fsleep(ADC5_GEN3_HS_DELAY_US); + } + + pr_err("Setting HS ready bit timed out, sdam_index:%d, status:%#x\n", + sdam_index, status); + return -ETIMEDOUT; +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_poll_wait_hs, "QCOM_SPMI_ADC5_GEN3"); + +int adc5_gen3_status_clear(struct adc5_device_data *adc, + int sdam_index, u16 offset, u8 *val, int len) +{ + u8 value; + int ret; + + ret = adc5_gen3_write(adc, sdam_index, offset, val, len); + if (ret) + return ret; + + /* To indicate conversion request is only to clear a status */ + value = 0; + ret = adc5_gen3_write(adc, sdam_index, ADC5_GEN3_PERPH_CH, &value, + sizeof(value)); + if (ret) + return ret; + + value = ADC5_GEN3_CONV_REQ_REQ; + return adc5_gen3_write(adc, sdam_index, ADC5_GEN3_CONV_REQ, &value, + sizeof(value)); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_status_clear, "QCOM_SPMI_ADC5_GEN3"); + +/* + * Worst case delay from PBS for conversion time can be up to 500ms, when PBS + * has timed out twice, once for the initial attempt and once for a retry of + * the same transaction. 
+ */ + +#define ADC5_GEN3_CONV_TIMEOUT_MS 501 + +static int adc5_gen3_do_conversion(struct adc5_chip *adc, + struct adc5_channel_common_prop *prop, + u16 *data_volt) +{ + unsigned long rc; + int ret; + u8 val; + + guard(mutex)(&adc->lock); + ret = adc5_gen3_poll_wait_hs(&adc->dev_data, ADC5_GEN3_VADC_SDAM); + if (ret) + return ret; + + ret = adc5_gen3_configure(adc, prop); + if (ret) { + dev_err(adc->dev, "ADC configure failed with %d\n", ret); + return ret; + } + + /* No support for polling mode at present */ + rc = wait_for_completion_timeout(&adc->complete, + msecs_to_jiffies(ADC5_GEN3_CONV_TIMEOUT_MS)); + if (!rc) { + dev_err(adc->dev, "Reading ADC channel %s timed out\n", + prop->label); + return -ETIMEDOUT; + } + + ret = adc5_gen3_read_voltage_data(adc, data_volt); + if (ret) + return ret; + + val = BIT(0); + return adc5_gen3_status_clear(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_EOC_CLR, &val, 1); +} + +static irqreturn_t adc5_gen3_isr(int irq, void *dev_id) +{ + struct adc5_chip *adc = dev_id; + struct device *dev = adc->dev; + struct auxiliary_device *adev; + u8 status, eoc_status, val; + u8 tm_status[2]; + int ret; + + ret = adc5_gen3_read(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_STATUS1, &status, sizeof(status)); + if (ret) { + dev_err(dev, "adc read status1 failed with %d\n", ret); + return IRQ_HANDLED; + } + + ret = adc5_gen3_read(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_EOC_STS, &eoc_status, sizeof(eoc_status)); + if (ret) { + dev_err(dev, "adc read eoc status failed with %d\n", ret); + return IRQ_HANDLED; + } + + if (status & ADC5_GEN3_STATUS1_CONV_FAULT) { + dev_err_ratelimited(dev, + "Unexpected conversion fault, status:%#x, eoc_status:%#x\n", + status, eoc_status); + val = ADC5_GEN3_CONV_ERR_CLR_REQ; + adc5_gen3_status_clear(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_CONV_ERR_CLR, &val, 1); + return IRQ_HANDLED; + } + + /* CHAN0 is the preconfigured channel for immediate conversion */ + if (eoc_status & 
ADC5_GEN3_EOC_CHAN_0) + complete(&adc->complete); + + ret = adc5_gen3_read(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_TM_HIGH_STS, tm_status, sizeof(tm_status)); + if (ret) { + dev_err(dev, "adc read TM status failed with %d\n", ret); + return IRQ_HANDLED; + } + + dev_dbg(dev, "Interrupt status:%#x, EOC status:%#x, high:%#x, low:%#x\n", + status, eoc_status, tm_status[0], tm_status[1]); + + if (tm_status[0] || tm_status[1]) { + adev = adc->tm_aux; + if (!adev || !adev->dev.driver) { + dev_err(dev, "adc_tm auxiliary device not initialized\n"); + return IRQ_HANDLED; + } + + adc->handler(adev); + } + + return IRQ_HANDLED; +} + +static int adc5_gen3_fwnode_xlate(struct iio_dev *indio_dev, + const struct fwnode_reference_args *iiospec) +{ + struct adc5_chip *adc = iio_priv(indio_dev); + int i, v_channel; + + for (i = 0; i < adc->nchannels; i++) { + v_channel = ADC5_GEN3_V_CHAN(adc->chan_props[i].common_props); + if (v_channel == iiospec->args[0]) + return i; + } + + return -ENOENT; +} + +static int adc5_gen3_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, int *val, + int *val2, long mask) +{ + struct adc5_chip *adc = iio_priv(indio_dev); + struct adc5_channel_common_prop *prop; + u16 adc_code_volt; + int ret; + + prop = &adc->chan_props[chan->address].common_props; + + switch (mask) { + case IIO_CHAN_INFO_PROCESSED: + ret = adc5_gen3_do_conversion(adc, prop, &adc_code_volt); + if (ret) + return ret; + + ret = qcom_adc5_hw_scale(prop->scale_fn_type, prop->prescale, + adc->data, adc_code_volt, val); + if (ret) + return ret; + + return IIO_VAL_INT; + default: + return -EINVAL; + } +} + +static int adc5_gen3_read_label(struct iio_dev *indio_dev, + const struct iio_chan_spec *chan, char *label) +{ + struct adc5_chip *adc = iio_priv(indio_dev); + struct adc5_channel_prop *prop; + + prop = &adc->chan_props[chan->address]; + return sprintf(label, "%s\n", prop->common_props.label); +} + +static const struct iio_info adc5_gen3_info = { + .read_raw = 
adc5_gen3_read_raw, + .read_label = adc5_gen3_read_label, + .fwnode_xlate = adc5_gen3_fwnode_xlate, +}; + +struct adc5_channels { + unsigned int prescale_index; + enum iio_chan_type type; + long info_mask; + enum vadc_scale_fn_type scale_fn_type; +}; + +/* In these definitions, _pre refers to an index into adc5_prescale_ratios. */ +#define ADC5_CHAN(_type, _mask, _pre, _scale) \ + { \ + .prescale_index = _pre, \ + .type = _type, \ + .info_mask = _mask, \ + .scale_fn_type = _scale, \ + }, \ + +#define ADC5_CHAN_TEMP(_pre, _scale) \ + ADC5_CHAN(IIO_TEMP, BIT(IIO_CHAN_INFO_PROCESSED), _pre, _scale) \ + +#define ADC5_CHAN_VOLT(_pre, _scale) \ + ADC5_CHAN(IIO_VOLTAGE, BIT(IIO_CHAN_INFO_PROCESSED), _pre, _scale) \ + +#define ADC5_CHAN_CUR(_pre, _scale) \ + ADC5_CHAN(IIO_CURRENT, BIT(IIO_CHAN_INFO_PROCESSED), _pre, _scale) \ + +static const struct adc5_channels adc5_gen3_chans_pmic[ADC5_MAX_CHANNEL] = { + [ADC5_GEN3_REF_GND] = ADC5_CHAN_VOLT(0, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_1P25VREF] = ADC5_CHAN_VOLT(0, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_VPH_PWR] = ADC5_CHAN_VOLT(1, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_VBAT_SNS_QBG] = ADC5_CHAN_VOLT(1, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_USB_SNS_V_16] = ADC5_CHAN_TEMP(8, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_VIN_DIV16_MUX] = ADC5_CHAN_TEMP(8, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_DIE_TEMP] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_PMIC_THERM_PM7) + [ADC5_GEN3_AMUX1_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX2_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX3_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX4_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX5_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX6_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX1_GPIO_100K_PU] = ADC5_CHAN_TEMP(0, + 
SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX2_GPIO_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX3_GPIO_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX4_GPIO_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) +}; + +static int adc5_gen3_get_fw_channel_data(struct adc5_chip *adc, + struct adc5_channel_prop *prop, + struct fwnode_handle *fwnode) +{ + const char *name = fwnode_get_name(fwnode); + const struct adc5_data *data = adc->data; + struct device *dev = adc->dev; + const char *channel_name; + u32 chan, value, sid; + u32 varr[2]; + int ret; + + ret = fwnode_property_read_u32(fwnode, "reg", &chan); + if (ret < 0) + return dev_err_probe(dev, ret, "invalid channel number %s\n", + name); + + /* + * Value read from "reg" is virtual channel number + * virtual channel number = sid << 8 | channel number + */ + sid = FIELD_GET(ADC5_GEN3_VIRTUAL_SID_MASK, chan); + chan = FIELD_GET(ADC5_GEN3_CHANNEL_MASK, chan); + + if (chan > ADC5_MAX_CHANNEL) + return dev_err_probe(dev, -EINVAL, + "%s invalid channel number %d\n", + name, chan); + + prop->common_props.channel = chan; + prop->common_props.sid = sid; + + if (!adc->data->adc_chans[chan].info_mask) + return dev_err_probe(dev, -EINVAL, "Channel %#x not supported\n", chan); + + channel_name = name; + fwnode_property_read_string(fwnode, "label", &channel_name); + prop->common_props.label = channel_name; + + value = data->decimation[ADC5_DECIMATION_DEFAULT]; + fwnode_property_read_u32(fwnode, "qcom,decimation", &value); + ret = qcom_adc5_decimation_from_dt(value, data->decimation); + if (ret < 0) + return dev_err_probe(dev, ret, "%#x invalid decimation %d\n", + chan, value); + prop->common_props.decimation = ret; + + prop->common_props.prescale = adc->data->adc_chans[chan].prescale_index; + ret = fwnode_property_read_u32_array(fwnode, "qcom,pre-scaling", varr, 2); + if (!ret) { + ret = qcom_adc5_prescaling_from_dt(varr[0], varr[1]); + if 
(ret < 0) + return dev_err_probe(dev, ret, + "%#x invalid pre-scaling <%d %d>\n", + chan, varr[0], varr[1]); + prop->common_props.prescale = ret; + } + + value = data->hw_settle_1[VADC_DEF_HW_SETTLE_TIME]; + fwnode_property_read_u32(fwnode, "qcom,hw-settle-time", &value); + ret = qcom_adc5_hw_settle_time_from_dt(value, data->hw_settle_1); + if (ret < 0) + return dev_err_probe(dev, ret, + "%#x invalid hw-settle-time %d us\n", + chan, value); + prop->common_props.hw_settle_time_us = ret; + + value = BIT(VADC_DEF_AVG_SAMPLES); + fwnode_property_read_u32(fwnode, "qcom,avg-samples", &value); + ret = qcom_adc5_avg_samples_from_dt(value); + if (ret < 0) + return dev_err_probe(dev, ret, "%#x invalid avg-samples %d\n", + chan, value); + prop->common_props.avg_samples = ret; + + if (fwnode_property_read_bool(fwnode, "qcom,ratiometric")) + prop->common_props.cal_method = ADC5_RATIOMETRIC_CAL; + else + prop->common_props.cal_method = ADC5_ABSOLUTE_CAL; + + prop->adc_tm = fwnode_property_read_bool(fwnode, "qcom,adc-tm"); + if (prop->adc_tm) { + adc->n_tm_channels++; + if (adc->n_tm_channels > (adc->dev_data.num_sdams * 8 - 1)) + return dev_err_probe(dev, -EINVAL, + "Number of TM nodes %u greater than channels supported:%u\n", + adc->n_tm_channels, + adc->dev_data.num_sdams * 8 - 1); + } + + return 0; +} + +static const struct adc5_data adc5_gen3_data_pmic = { + .full_scale_code_volt = 0x70e4, + .adc_chans = adc5_gen3_chans_pmic, + .info = &adc5_gen3_info, + .decimation = (unsigned int [ADC5_DECIMATION_SAMPLES_MAX]) + { 85, 340, 1360 }, + .hw_settle_1 = (unsigned int [VADC_HW_SETTLE_SAMPLES_MAX]) + { 15, 100, 200, 300, + 400, 500, 600, 700, + 1000, 2000, 4000, 8000, + 16000, 32000, 64000, 128000 }, +}; + +static const struct of_device_id adc5_match_table[] = { + { + .compatible = "qcom,spmi-adc5-gen3", + .data = &adc5_gen3_data_pmic, + }, + { } +}; +MODULE_DEVICE_TABLE(of, adc5_match_table); + +static int adc5_get_fw_data(struct adc5_chip *adc) +{ + const struct adc5_channels 
*adc_chan; + struct adc5_channel_prop *chan_props; + struct iio_chan_spec *iio_chan; + struct device *dev = adc->dev; + unsigned int index = 0; + int ret; + + adc->nchannels = device_get_child_node_count(dev); + if (!adc->nchannels) + return dev_err_probe(dev, -EINVAL, "No ADC channels found\n"); + + adc->iio_chans = devm_kcalloc(dev, adc->nchannels, + sizeof(*adc->iio_chans), GFP_KERNEL); + if (!adc->iio_chans) + return -ENOMEM; + + adc->chan_props = devm_kcalloc(dev, adc->nchannels, + sizeof(*adc->chan_props), GFP_KERNEL); + if (!adc->chan_props) + return -ENOMEM; + + chan_props = adc->chan_props; + adc->n_tm_channels = 0; + iio_chan = adc->iio_chans; + adc->data = device_get_match_data(dev); + + device_for_each_child_node_scoped(dev, child) { + ret = adc5_gen3_get_fw_channel_data(adc, chan_props, child); + if (ret) + return ret; + + chan_props->chip = adc; + adc_chan = &adc->data->adc_chans[chan_props->common_props.channel]; + chan_props->common_props.scale_fn_type = adc_chan->scale_fn_type; + + iio_chan->channel = ADC5_GEN3_V_CHAN(chan_props->common_props); + iio_chan->info_mask_separate = adc_chan->info_mask; + iio_chan->type = adc_chan->type; + iio_chan->address = index; + iio_chan->indexed = 1; + iio_chan++; + chan_props++; + index++; + } + + return 0; +} + +static void adc5_gen3_uninit_aux(void *data) +{ + auxiliary_device_uninit(data); +} + +static void adc5_gen3_delete_aux(void *data) +{ + auxiliary_device_delete(data); +} + +static void adc5_gen3_aux_device_release(struct device *dev) {} + +static int adc5_gen3_add_aux_tm_device(struct adc5_chip *adc) +{ + struct tm5_aux_dev_wrapper *aux_device; + int i, ret, i_tm = 0; + + aux_device = devm_kzalloc(adc->dev, sizeof(*aux_device), GFP_KERNEL); + if (!aux_device) + return -ENOMEM; + + aux_device->aux_dev.name = "adc5_tm_gen3"; + aux_device->aux_dev.dev.parent = adc->dev; + aux_device->aux_dev.dev.release = adc5_gen3_aux_device_release; + + aux_device->tm_props = devm_kcalloc(adc->dev, adc->n_tm_channels, + 
sizeof(*aux_device->tm_props), + GFP_KERNEL); + if (!aux_device->tm_props) + return -ENOMEM; + + aux_device->dev_data = &adc->dev_data; + + for (i = 0; i < adc->nchannels; i++) { + if (!adc->chan_props[i].adc_tm) + continue; + aux_device->tm_props[i_tm] = adc->chan_props[i].common_props; + i_tm++; + } + + device_set_of_node_from_dev(&aux_device->aux_dev.dev, adc->dev); + + aux_device->n_tm_channels = adc->n_tm_channels; + + ret = auxiliary_device_init(&aux_device->aux_dev); + if (ret) + return ret; + + ret = devm_add_action_or_reset(adc->dev, adc5_gen3_uninit_aux, + &aux_device->aux_dev); + if (ret) + return ret; + + ret = auxiliary_device_add(&aux_device->aux_dev); + if (ret) + return ret; + ret = devm_add_action_or_reset(adc->dev, adc5_gen3_delete_aux, + &aux_device->aux_dev); + if (ret) + return ret; + + adc->tm_aux = &aux_device->aux_dev; + + return 0; +} + +void adc5_gen3_mutex_lock(struct device *dev) + __acquires(&adc->lock) +{ + struct iio_dev *indio_dev = dev_get_drvdata(dev->parent); + struct adc5_chip *adc = iio_priv(indio_dev); + + mutex_lock(&adc->lock); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_mutex_lock, "QCOM_SPMI_ADC5_GEN3"); + +void adc5_gen3_mutex_unlock(struct device *dev) + __releases(&adc->lock) +{ + struct iio_dev *indio_dev = dev_get_drvdata(dev->parent); + struct adc5_chip *adc = iio_priv(indio_dev); + + mutex_unlock(&adc->lock); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_mutex_unlock, "QCOM_SPMI_ADC5_GEN3"); + +int adc5_gen3_get_scaled_reading(struct device *dev, + struct adc5_channel_common_prop *common_props, + int *val) +{ + struct iio_dev *indio_dev = dev_get_drvdata(dev->parent); + struct adc5_chip *adc = iio_priv(indio_dev); + u16 adc_code_volt; + int ret; + + ret = adc5_gen3_do_conversion(adc, common_props, &adc_code_volt); + if (ret) + return ret; + + return qcom_adc5_hw_scale(common_props->scale_fn_type, + common_props->prescale, + adc->data, adc_code_volt, val); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_get_scaled_reading, "QCOM_SPMI_ADC5_GEN3"); + 
+int adc5_gen3_therm_code_to_temp(struct device *dev, + struct adc5_channel_common_prop *common_props, + u16 code, int *val) +{ + struct iio_dev *indio_dev = dev_get_drvdata(dev->parent); + struct adc5_chip *adc = iio_priv(indio_dev); + + return qcom_adc5_hw_scale(common_props->scale_fn_type, + common_props->prescale, + adc->data, code, val); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_therm_code_to_temp, "QCOM_SPMI_ADC5_GEN3"); + +void adc5_gen3_register_tm_event_notifier(struct device *dev, + void (*handler)(struct auxiliary_device *)) +{ + struct iio_dev *indio_dev = dev_get_drvdata(dev->parent); + struct adc5_chip *adc = iio_priv(indio_dev); + + adc->handler = handler; +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_register_tm_event_notifier, "QCOM_SPMI_ADC5_GEN3"); + +static int adc5_gen3_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct iio_dev *indio_dev; + struct adc5_chip *adc; + struct regmap *regmap; + int ret, i; + u32 *reg; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return -ENODEV; + + indio_dev = devm_iio_device_alloc(dev, sizeof(*adc)); + if (!indio_dev) + return -ENOMEM; + + adc = iio_priv(indio_dev); + adc->dev_data.regmap = regmap; + adc->dev = dev; + + ret = device_property_count_u32(dev, "reg"); + if (ret < 0) + return ret; + + adc->dev_data.num_sdams = ret; + + reg = devm_kcalloc(dev, adc->dev_data.num_sdams, sizeof(u32), + GFP_KERNEL); + if (!reg) + return -ENOMEM; + + ret = device_property_read_u32_array(dev, "reg", reg, + adc->dev_data.num_sdams); + if (ret) + return dev_err_probe(dev, ret, + "Failed to read reg property\n"); + + adc->dev_data.base = devm_kcalloc(dev, adc->dev_data.num_sdams, + sizeof(*adc->dev_data.base), + GFP_KERNEL); + if (!adc->dev_data.base) + return -ENOMEM; + + platform_set_drvdata(pdev, indio_dev); + init_completion(&adc->complete); + ret = devm_mutex_init(dev, &adc->lock); + if (ret) + return ret; + + for (i = 0; i < adc->dev_data.num_sdams; i++) { + adc->dev_data.base[i].base_addr 
= reg[i]; + + ret = platform_get_irq(pdev, i); + if (ret < 0) + return dev_err_probe(dev, ret, + "Getting IRQ %d failed\n", i); + + adc->dev_data.base[i].irq = ret; + + adc->dev_data.base[i].irq_name = devm_kasprintf(dev, GFP_KERNEL, + "sdam%d", i); + if (!adc->dev_data.base[i].irq_name) + return -ENOMEM; + } + + ret = devm_request_irq(dev, adc->dev_data.base[ADC5_GEN3_VADC_SDAM].irq, + adc5_gen3_isr, 0, + adc->dev_data.base[ADC5_GEN3_VADC_SDAM].irq_name, + adc); + if (ret) + return dev_err_probe(dev, ret, + "Failed to request SDAM%d irq\n", + ADC5_GEN3_VADC_SDAM); + + ret = adc5_get_fw_data(adc); + if (ret) + return ret; + + if (adc->n_tm_channels > 0) { + ret = adc5_gen3_add_aux_tm_device(adc); + if (ret) + dev_err_probe(dev, ret, + "Failed to add auxiliary TM device\n"); + } + + indio_dev->name = "spmi-adc5-gen3"; + indio_dev->modes = INDIO_DIRECT_MODE; + indio_dev->info = &adc5_gen3_info; + indio_dev->channels = adc->iio_chans; + indio_dev->num_channels = adc->nchannels; + + return devm_iio_device_register(dev, indio_dev); +} + +static struct platform_driver adc5_gen3_driver = { + .driver = { + .name = "qcom-spmi-adc5-gen3", + .of_match_table = adc5_match_table, + }, + .probe = adc5_gen3_probe, +}; +module_platform_driver(adc5_gen3_driver); + +MODULE_DESCRIPTION("Qualcomm Technologies Inc. PMIC5 Gen3 ADC driver"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS("QCOM_SPMI_ADC5_GEN3"); diff --git a/include/linux/iio/adc/qcom-adc5-gen3-common.h b/include/linux/iio/adc/qcom-adc5-gen3-common.h new file mode 100644 index 000000000000..6303eaa6640b --- /dev/null +++ b/include/linux/iio/adc/qcom-adc5-gen3-common.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + * + * Code used in the main and auxiliary Qualcomm PMIC voltage ADCs + * of type ADC5 Gen3. 
+ */ + +#ifndef QCOM_ADC5_GEN3_COMMON_H +#define QCOM_ADC5_GEN3_COMMON_H + +#include +#include +#include +#include +#include +#include +#include + +#define ADC5_GEN3_HS 0x45 +#define ADC5_GEN3_HS_BUSY BIT(7) +#define ADC5_GEN3_HS_READY BIT(0) + +#define ADC5_GEN3_STATUS1 0x46 +#define ADC5_GEN3_STATUS1_CONV_FAULT BIT(7) +#define ADC5_GEN3_STATUS1_THR_CROSS BIT(6) +#define ADC5_GEN3_STATUS1_EOC BIT(0) + +#define ADC5_GEN3_TM_EN_STS 0x47 +#define ADC5_GEN3_TM_HIGH_STS 0x48 +#define ADC5_GEN3_TM_LOW_STS 0x49 + +#define ADC5_GEN3_EOC_STS 0x4a +#define ADC5_GEN3_EOC_CHAN_0 BIT(0) + +#define ADC5_GEN3_EOC_CLR 0x4b +#define ADC5_GEN3_TM_HIGH_STS_CLR 0x4c +#define ADC5_GEN3_TM_LOW_STS_CLR 0x4d +#define ADC5_GEN3_CONV_ERR_CLR 0x4e +#define ADC5_GEN3_CONV_ERR_CLR_REQ BIT(0) + +#define ADC5_GEN3_SID 0x4f +#define ADC5_GEN3_SID_MASK GENMASK(3, 0) + +#define ADC5_GEN3_PERPH_CH 0x50 +#define ADC5_GEN3_CHAN_CONV_REQ BIT(7) + +#define ADC5_GEN3_TIMER_SEL 0x51 +#define ADC5_GEN3_TIME_IMMEDIATE 0x1 + +#define ADC5_GEN3_DIG_PARAM 0x52 +#define ADC5_GEN3_DIG_PARAM_CAL_SEL_MASK GENMASK(5, 4) +#define ADC5_GEN3_DIG_PARAM_DEC_RATIO_SEL_MASK GENMASK(3, 2) + +#define ADC5_GEN3_FAST_AVG 0x53 +#define ADC5_GEN3_FAST_AVG_CTL_EN BIT(7) +#define ADC5_GEN3_FAST_AVG_CTL_SAMPLES_MASK GENMASK(2, 0) + +#define ADC5_GEN3_ADC_CH_SEL_CTL 0x54 +#define ADC5_GEN3_DELAY_CTL 0x55 +#define ADC5_GEN3_HW_SETTLE_DELAY_MASK GENMASK(3, 0) + +#define ADC5_GEN3_CH_EN 0x56 +#define ADC5_GEN3_HIGH_THR_INT_EN BIT(1) +#define ADC5_GEN3_LOW_THR_INT_EN BIT(0) + +#define ADC5_GEN3_LOW_THR0 0x57 +#define ADC5_GEN3_LOW_THR1 0x58 +#define ADC5_GEN3_HIGH_THR0 0x59 +#define ADC5_GEN3_HIGH_THR1 0x5a + +#define ADC5_GEN3_CH_DATA0(channel) (0x5c + (channel) * 2) +#define ADC5_GEN3_CH_DATA1(channel) (0x5d + (channel) * 2) + +#define ADC5_GEN3_CONV_REQ 0xe5 +#define ADC5_GEN3_CONV_REQ_REQ BIT(0) + +#define ADC5_GEN3_VIRTUAL_SID_MASK GENMASK(15, 8) +#define ADC5_GEN3_CHANNEL_MASK GENMASK(7, 0) +#define ADC5_GEN3_V_CHAN(x) \ + 
(FIELD_PREP(ADC5_GEN3_VIRTUAL_SID_MASK, (x).sid) | (x).channel) + +/* ADC channels for PMIC5 Gen3 */ +#define ADC5_GEN3_REF_GND 0x00 +#define ADC5_GEN3_1P25VREF 0x01 +#define ADC5_GEN3_DIE_TEMP 0x03 +#define ADC5_GEN3_USB_SNS_V_16 0x11 +#define ADC5_GEN3_VIN_DIV16_MUX 0x12 +#define ADC5_GEN3_VPH_PWR 0x8e +#define ADC5_GEN3_VBAT_SNS_QBG 0x8f +/* 100k pull-up channels */ +#define ADC5_GEN3_AMUX1_THM_100K_PU 0x44 +#define ADC5_GEN3_AMUX2_THM_100K_PU 0x45 +#define ADC5_GEN3_AMUX3_THM_100K_PU 0x46 +#define ADC5_GEN3_AMUX4_THM_100K_PU 0x47 +#define ADC5_GEN3_AMUX5_THM_100K_PU 0x48 +#define ADC5_GEN3_AMUX6_THM_100K_PU 0x49 +#define ADC5_GEN3_AMUX1_GPIO_100K_PU 0x4a +#define ADC5_GEN3_AMUX2_GPIO_100K_PU 0x4b +#define ADC5_GEN3_AMUX3_GPIO_100K_PU 0x4c +#define ADC5_GEN3_AMUX4_GPIO_100K_PU 0x4d + +#define ADC5_MAX_CHANNEL 0xc0 + +enum adc5_cal_method { + ADC5_NO_CAL = 0, + ADC5_RATIOMETRIC_CAL, + ADC5_ABSOLUTE_CAL, +}; + +enum adc5_time_select { + MEAS_INT_DISABLE = 0, + MEAS_INT_IMMEDIATE, + MEAS_INT_50MS, + MEAS_INT_100MS, + MEAS_INT_1S, + MEAS_INT_NONE, +}; + +/** + * struct adc5_sdam_data - data per SDAM allocated for adc usage + * @base_addr: base address for the ADC SDAM peripheral. + * @irq_name: ADC IRQ name. + * @irq: ADC IRQ number. + */ +struct adc5_sdam_data { + u16 base_addr; + const char *irq_name; + int irq; +}; + +/** + * struct adc5_device_data - Top-level ADC device data + * @regmap: ADC peripheral register map field. + * @base: array of SDAM data. + * @num_sdams: number of ADC SDAM peripherals. + */ +struct adc5_device_data { + struct regmap *regmap; + struct adc5_sdam_data *base; + int num_sdams; +}; + +/** + * struct adc5_channel_common_prop - ADC channel properties (common to ADC and TM). + * @channel: channel number, refer to the channel list. + * @cal_method: calibration method. + * @decimation: sampling rate supported for the channel. + * @sid: ID of PMIC owning the channel. + * @label: Channel name used in device tree. 
+ * @prescale: channel scaling performed on the input signal. + * @hw_settle_time_us: the time between AMUX being configured and the + * start of conversion in uS. + * @avg_samples: ability to provide single result from the ADC + * that is an average of multiple measurements. + * @scale_fn_type: Represents the scaling function to convert voltage + * physical units desired by the client for the channel. + */ +struct adc5_channel_common_prop { + unsigned int channel; + enum adc5_cal_method cal_method; + unsigned int decimation; + unsigned int sid; + const char *label; + unsigned int prescale; + unsigned int hw_settle_time_us; + unsigned int avg_samples; + enum vadc_scale_fn_type scale_fn_type; +}; + +/** + * struct tm5_aux_dev_wrapper - wrapper structure around TM auxiliary device + * @aux_dev: TM auxiliary device structure. + * @dev_data: Top-level ADC device data. + * @tm_props: Array of common ADC channel properties for TM channels. + * @n_tm_channels: number of TM channels. + */ +struct tm5_aux_dev_wrapper { + struct auxiliary_device aux_dev; + struct adc5_device_data *dev_data; + struct adc5_channel_common_prop *tm_props; + unsigned int n_tm_channels; +}; + +int adc5_gen3_read(struct adc5_device_data *adc, unsigned int sdam_index, + u16 offset, u8 *data, int len); + +int adc5_gen3_write(struct adc5_device_data *adc, unsigned int sdam_index, + u16 offset, u8 *data, int len); + +int adc5_gen3_poll_wait_hs(struct adc5_device_data *adc, + unsigned int sdam_index); + +void adc5_gen3_update_dig_param(struct adc5_channel_common_prop *prop, + u8 *data); + +int adc5_gen3_status_clear(struct adc5_device_data *adc, + int sdam_index, u16 offset, u8 *val, int len); + +void adc5_gen3_mutex_lock(struct device *dev); +void adc5_gen3_mutex_unlock(struct device *dev); +int adc5_gen3_get_scaled_reading(struct device *dev, + struct adc5_channel_common_prop *common_props, + int *val); +int adc5_gen3_therm_code_to_temp(struct device *dev, + struct adc5_channel_common_prop 
*common_props, + u16 code, int *val); +void adc5_gen3_register_tm_event_notifier(struct device *dev, + void (*handler)(struct auxiliary_device *)); + +#endif /* QCOM_ADC5_GEN3_COMMON_H */ -- cgit v1.2.3 From 3d35d41169d000f4fbf3c23999b8443e1173efce Mon Sep 17 00:00:00 2001 From: Fabio Baltieri Date: Mon, 23 Feb 2026 13:25:55 -0800 Subject: Input: export input_default_setkeycode Export input_default_setkeycode so that a driver can set a custom setkeycode handler to take some driver specific action but still call the default handler at some point. Signed-off-by: Fabio Baltieri Reviewed-by: Tzung-Bi Shih Link: https://patch.msgid.link/20260222003717.471977-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 23 ++++++++++++++++++++--- include/linux/input.h | 4 ++++ 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/input/input.c b/drivers/input/input.c index a500e1e276c2..c227eaa6271a 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -800,14 +800,30 @@ static int input_default_getkeycode(struct input_dev *dev, return 0; } -static int input_default_setkeycode(struct input_dev *dev, - const struct input_keymap_entry *ke, - unsigned int *old_keycode) +/** + * input_default_setkeycode - default setkeycode method + * @dev: input device which keymap is being updated. + * @ke: new keymap entry. + * @old_keycode: pointer to the location where old keycode should be stored. + * + * This function is the default implementation of &input_dev.setkeycode() + * method. It is typically used when a driver does not provide its own + * implementation, but it is also exported so drivers can extend it. + * + * The function must be called with &input_dev.event_lock held. + * + * Return: 0 on success, or a negative error code on failure. 
+ */ +int input_default_setkeycode(struct input_dev *dev, + const struct input_keymap_entry *ke, + unsigned int *old_keycode) { unsigned int index; int error; int i; + lockdep_assert_held(&dev->event_lock); + if (!dev->keycodesize) return -EINVAL; @@ -861,6 +877,7 @@ static int input_default_setkeycode(struct input_dev *dev, __set_bit(ke->keycode, dev->keybit); return 0; } +EXPORT_SYMBOL(input_default_setkeycode); /** * input_get_keycode - retrieve keycode currently mapped to a given scancode diff --git a/include/linux/input.h b/include/linux/input.h index 7d7cb0593a63..06ca62328db1 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -517,6 +517,10 @@ INPUT_GENERATE_ABS_ACCESSORS(res, resolution) int input_scancode_to_scalar(const struct input_keymap_entry *ke, unsigned int *scancode); +int input_default_setkeycode(struct input_dev *dev, + const struct input_keymap_entry *ke, + unsigned int *old_keycode); + int input_get_keycode(struct input_dev *dev, struct input_keymap_entry *ke); int input_set_keycode(struct input_dev *dev, const struct input_keymap_entry *ke); -- cgit v1.2.3 From fa4f81a8c15d4018eb2053b093bf1584777e80d4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 19 Feb 2026 10:47:03 +0900 Subject: ata: libata-scsi: make ata_scsi_simulate() static ata_scsi_simulate() is called only from libata-scsi.c. Move this function definition as a static function before its call in __ata_scsi_queuecmd() and remove its declaration from include/linux/libata.h. No functional changes. 
Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke --- drivers/ata/libata-scsi.c | 147 +++++++++++++++++++++++----------------------- include/linux/libata.h | 1 - 2 files changed, 73 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 41918e21d0f8..ad628b398fc3 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -4420,6 +4420,79 @@ static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd) return NULL; } +/** + * ata_scsi_simulate - simulate SCSI command on ATA device + * @dev: the target device + * @cmd: SCSI command being sent to device. + * + * Interprets and directly executes a select list of SCSI commands + * that can be handled internally. + * + * LOCKING: + * spin_lock_irqsave(host lock) + */ +static void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) +{ + const u8 *scsicmd = cmd->cmnd; + u8 tmp8; + + switch (scsicmd[0]) { + case INQUIRY: + ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_inquiry); + break; + + case MODE_SENSE: + case MODE_SENSE_10: + ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_mode_sense); + break; + + case READ_CAPACITY: + case SERVICE_ACTION_IN_16: + ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_read_cap); + break; + + case REPORT_LUNS: + ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_report_luns); + break; + + case REQUEST_SENSE: + ata_scsi_set_sense(dev, cmd, 0, 0, 0); + break; + + /* if we reach this, then writeback caching is disabled, + * turning this into a no-op. 
+ */ + case SYNCHRONIZE_CACHE: + case SYNCHRONIZE_CACHE_16: + fallthrough; + + /* no-op's, complete with success */ + case REZERO_UNIT: + case SEEK_6: + case SEEK_10: + case TEST_UNIT_READY: + break; + + case SEND_DIAGNOSTIC: + tmp8 = scsicmd[1] & ~(1 << 3); + if (tmp8 != 0x4 || scsicmd[3] || scsicmd[4]) + ata_scsi_set_invalid_field(dev, cmd, 1, 0xff); + break; + + case MAINTENANCE_IN: + ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_maint_in); + break; + + /* all other commands */ + default: + ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x20, 0x0); + /* "Invalid command operation code" */ + break; + } + + scsi_done(cmd); +} + enum scsi_qc_status __ata_scsi_queuecmd(struct scsi_cmnd *scmd, struct ata_device *dev) { @@ -4522,80 +4595,6 @@ enum scsi_qc_status ata_scsi_queuecmd(struct Scsi_Host *shost, } EXPORT_SYMBOL_GPL(ata_scsi_queuecmd); -/** - * ata_scsi_simulate - simulate SCSI command on ATA device - * @dev: the target device - * @cmd: SCSI command being sent to device. - * - * Interprets and directly executes a select list of SCSI commands - * that can be handled internally. - * - * LOCKING: - * spin_lock_irqsave(host lock) - */ - -void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) -{ - const u8 *scsicmd = cmd->cmnd; - u8 tmp8; - - switch(scsicmd[0]) { - case INQUIRY: - ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_inquiry); - break; - - case MODE_SENSE: - case MODE_SENSE_10: - ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_mode_sense); - break; - - case READ_CAPACITY: - case SERVICE_ACTION_IN_16: - ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_read_cap); - break; - - case REPORT_LUNS: - ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_report_luns); - break; - - case REQUEST_SENSE: - ata_scsi_set_sense(dev, cmd, 0, 0, 0); - break; - - /* if we reach this, then writeback caching is disabled, - * turning this into a no-op. 
- */ - case SYNCHRONIZE_CACHE: - case SYNCHRONIZE_CACHE_16: - fallthrough; - - /* no-op's, complete with success */ - case REZERO_UNIT: - case SEEK_6: - case SEEK_10: - case TEST_UNIT_READY: - break; - - case SEND_DIAGNOSTIC: - tmp8 = scsicmd[1] & ~(1 << 3); - if (tmp8 != 0x4 || scsicmd[3] || scsicmd[4]) - ata_scsi_set_invalid_field(dev, cmd, 1, 0xff); - break; - - case MAINTENANCE_IN: - ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_maint_in); - break; - - /* all other commands */ - default: - ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x20, 0x0); - /* "Invalid command operation code" */ - break; - } - - scsi_done(cmd); -} - int ata_scsi_add_hosts(struct ata_host *host, const struct scsi_host_template *sht) { int i, rc; diff --git a/include/linux/libata.h b/include/linux/libata.h index 00346ce3af5e..db87c99e4189 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1205,7 +1205,6 @@ extern unsigned int ata_do_dev_read_id(struct ata_device *dev, struct ata_taskfile *tf, __le16 *id); extern void ata_qc_complete(struct ata_queued_cmd *qc); extern u64 ata_qc_get_active(struct ata_port *ap); -extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); extern int ata_std_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]); -- cgit v1.2.3 From ecfa23b486b22844855844202424bc1966cebb33 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 17 Feb 2026 12:26:15 +0100 Subject: jiffies: Remove unused __jiffy_arch_data The __jiffy_arch_data definition was added in 2017 by commit 60b0a8c3d248 ("frv: declare jiffies to be located in the .data section") for the needs of the frv port. The frv support was removed in 2018 by commit fd8773f9f544 ("arch: remove frv port") and no other architecture has required __jiffy_arch_data. Therefore, remove this unused definition. 
Signed-off-by: Petr Pavlu Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260217112638.1525094-1-petr.pavlu@suse.com --- include/linux/jiffies.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fdef2c155c27..1a393d160420 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -67,10 +67,6 @@ extern void register_refined_jiffies(long clock_tick_rate); /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -#ifndef __jiffy_arch_data -#define __jiffy_arch_data -#endif - /* * The 64-bit value is not atomic on 32-bit systems - you MUST NOT read it * without sampling the sequence number in jiffies_lock. @@ -83,7 +79,7 @@ extern void register_refined_jiffies(long clock_tick_rate); * See arch/ARCH/kernel/vmlinux.lds.S */ extern u64 __cacheline_aligned_in_smp jiffies_64; -extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies; +extern unsigned long volatile __cacheline_aligned_in_smp jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); -- cgit v1.2.3 From 2ecd012774bc2342f28f47620100a7ad9046f586 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 23 Feb 2026 16:31:06 -0800 Subject: IB/cache: avoid kernel-doc warnings Use the correct function parameters names to eliminate kernel-doc warnings: Warning: include/rdma/ib_cache.h:47 function parameter 'device_handle' not described in 'ib_get_cached_pkey' Warning: include/rdma/ib_cache.h:89 function parameter 'port_active' not described in 'ib_get_cached_port_state' (not adding missing function return value descriptions) Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260224003106.3172916-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- include/rdma/ib_cache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git 
a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index 2bf09b594d10..eed46d966e40 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -34,7 +34,7 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); /** * ib_get_cached_pkey - Returns a cached PKey table entry - * @device: The device to query. + * @device_handle: The device to query. * @port_num: The port number of the device to query. * @index: The index into the cached PKey table to query. * @pkey: The PKey value found at the specified index. @@ -80,7 +80,7 @@ int ib_get_cached_lmc(struct ib_device *device, * ib_get_cached_port_state - Returns a cached port state table entry * @device: The device to query. * @port_num: The port number of the device to query. - * @port_state: port_state for the specified port for that device. + * @port_active: port_state for the specified port for that device. * * ib_get_cached_port_state() fetches the specified port_state table entry stored in * the local software cache. 
-- cgit v1.2.3 From ff46d1392750444fab5ae5a0194764ffdc4ac0d2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 23 Feb 2026 16:31:20 -0800 Subject: RDMA/umem: fix kernel-doc warnings Add or correct kernel-doc comments to eliminate warnings: Warning: include/rdma/ib_umem.h:104 function parameter 'biter' not described in 'rdma_umem_for_each_dma_block' Warning: include/rdma/ib_umem.h:140 function parameter 'pgsz_bitmap' not described in 'ib_umem_find_best_pgoff' Warning: include/rdma/ib_umem.h:141 No description found for return value of 'ib_umem_find_best_pgoff' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260224003120.3173892-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- include/rdma/ib_umem.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 0a8e092c0ea8..09b7f7d4685e 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -94,6 +94,7 @@ static inline bool __rdma_umem_block_iter_next(struct ib_block_iter *biter) /** * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem * @umem: umem to iterate over + * @biter: block iterator variable * @pgsz: Page size to split the list into * * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The @@ -121,7 +122,7 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, * ib_umem_find_best_pgoff - Find best HW page size * * @umem: umem struct - * @pgsz_bitmap bitmap of HW supported page sizes + * @pgsz_bitmap: bitmap of HW supported page sizes * @pgoff_bitmask: Mask of bits that can be represented with an offset * * This is very similar to ib_umem_find_best_pgsz() except instead of accepting @@ -134,6 +135,9 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, * * If the pgoff_bitmask requires either alignment in the low bit or an * unavailable page size for the high bits, this function returns 0. 
+ * + * Returns: best HW page size for the parameters or 0 if none available + * for the given parameters. */ static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem, unsigned long pgsz_bitmap, -- cgit v1.2.3 From 16dc2d72de577de4b413ba01b1b4a80d31832022 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 23 Feb 2026 16:31:34 -0800 Subject: RDMA/iwcm: fix some kernel-doc issues in iw_cm.h Use the "typedef" keyword as needed. Correct 2 function parameter names. Warning: include/rdma/iw_cm.h:42 function parameter 'iw_cm_handler' not described in 'int' Warning: include/rdma/iw_cm.h:42 expecting prototype for iw_cm_handler(). Prototype was for int() instead Warning: include/rdma/iw_cm.h:53 function parameter 'iw_event_handler' not described in 'int' Warning: include/rdma/iw_cm.h:53 expecting prototype for iw_event_handler(). Prototype was for int() instead Warning: include/rdma/iw_cm.h:104 function parameter 'cm_handler' not described in 'iw_create_cm_id' Warning: include/rdma/iw_cm.h:158 function parameter 'private_data' not described in 'iw_cm_reject' (not adding missing return value kernel-doc descriptions) Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260224003134.3174856-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- include/rdma/iw_cm.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 2b22f153ef63..57b33edd9ce7 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -33,8 +33,8 @@ struct iw_cm_event { }; /** - * iw_cm_handler - Function to be called by the IW CM when delivering events - * to the client. + * typedef iw_cm_handler - Function to be called by the IW CM when delivering + * events to the client. * * @cm_id: The IW CM identifier associated with the event. * @event: Pointer to the event structure. 
@@ -43,9 +43,9 @@ typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id, struct iw_cm_event *event); /** - * iw_event_handler - Function called by the provider when delivering provider - * events to the IW CM. Returns either 0 indicating the event was processed - * or -errno if the event could not be processed. + * typedef iw_event_handler - Function called by the provider when delivering + * provider events to the IW CM. Returns either 0 indicating the event was + * processed or -errno if the event could not be processed. * * @cm_id: The IW CM identifier associated with the event. * @event: Pointer to the event structure. @@ -97,7 +97,7 @@ enum iw_flags { * iw_create_cm_id - Create an IW CM identifier. * * @device: The IB device on which to create the IW CM identier. - * @event_handler: User callback invoked to report events associated with the + * @cm_handler: User callback invoked to report events associated with the * returned IW CM identifier. * @context: User specified context associated with the id. */ @@ -147,7 +147,7 @@ int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); * iw_cm_reject - Reject an incoming connection request. * * @cm_id: Connection identifier associated with the request. - * @private_daa: Pointer to data to deliver to the remote peer as part of the + * @private_data: Pointer to data to deliver to the remote peer as part of the * reject message. * @private_data_len: The number of bytes in the private_data parameter. * -- cgit v1.2.3 From 2865500db9339bff85a504c7fbad0047ebbf9331 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 23 Feb 2026 16:31:49 -0800 Subject: RDMA/restrack: fix kernel-doc indicator Use "/**" to begin kernel-doc comments. 
This eliminates these kernel-doc warnings: Warning: include/rdma/restrack.h:123 struct member 'kref' not described in 'rdma_restrack_entry' Warning: include/rdma/restrack.h:123 struct member 'comp' not described in 'rdma_restrack_entry' (not adding missing return value kernel-doc descriptions) Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260224003149.3175815-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- include/rdma/restrack.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 8a9bcf77dace..451f99e3717d 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -87,11 +87,11 @@ struct rdma_restrack_entry { * query stage. */ u8 no_track : 1; - /* + /** * @kref: Protect destroy of the resource */ struct kref kref; - /* + /** * @comp: Signal that all consumers of resource are completed their work */ struct completion comp; -- cgit v1.2.3 From 6974ae5aa23b7f37182da6b66d7f58313a55a88e Mon Sep 17 00:00:00 2001 From: GyoungBo Min Date: Wed, 29 Oct 2025 18:37:28 +0530 Subject: dt-bindings: clock: Add ARTPEC-9 clock controller Add dt-schema for Axis ARTPEC-9 SoC clock controller. The Clock Management Unit (CMU) has a top-level block CMU_CMU which generates clocks for other blocks. 
Add device-tree binding definitions for following CMU blocks: - CMU_CMU - CMU_BUS - CMU_CORE - CMU_CPUCL - CMU_FSYS0 - CMU_FSYS1 - CMU_IMEM - CMU_PERI Signed-off-by: GyoungBo Min Reviewed-by: Kyunghwan Kim Signed-off-by: Ravi Patel Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20251029130731.51305-2-ravi.patel@samsung.com Signed-off-by: Krzysztof Kozlowski --- .../bindings/clock/axis,artpec9-clock.yaml | 232 +++++++++++++++++++++ include/dt-bindings/clock/axis,artpec9-clk.h | 195 +++++++++++++++++ 2 files changed, 427 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/axis,artpec9-clock.yaml create mode 100644 include/dt-bindings/clock/axis,artpec9-clk.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/axis,artpec9-clock.yaml b/Documentation/devicetree/bindings/clock/axis,artpec9-clock.yaml new file mode 100644 index 000000000000..63442b91e7ac --- /dev/null +++ b/Documentation/devicetree/bindings/clock/axis,artpec9-clock.yaml @@ -0,0 +1,232 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/axis,artpec9-clock.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Axis ARTPEC-9 SoC clock controller + +maintainers: + - Jesper Nilsson + +description: | + ARTPEC-9 clock controller is comprised of several CMU (Clock Management Unit) + units, generating clocks for different domains. Those CMU units are modeled + as separate device tree nodes, and might depend on each other. + The root clock in that root tree is an external clock: OSCCLK (25 MHz). + This external clock must be defined as a fixed-rate clock in dts. + + CMU_CMU is a top-level CMU, where all base clocks are prepared using PLLs and + dividers, all other clocks of function blocks (other CMUs) are usually + derived from CMU_CMU. + + Each clock is assigned an identifier and client nodes can use this identifier + to specify the clock which they consume. 
All clocks available for usage + in clock consumer nodes are defined as preprocessor macros in + 'include/dt-bindings/clock/axis,artpec9-clk.h' header. + +properties: + compatible: + enum: + - axis,artpec9-cmu-cmu + - axis,artpec9-cmu-bus + - axis,artpec9-cmu-core + - axis,artpec9-cmu-cpucl + - axis,artpec9-cmu-fsys0 + - axis,artpec9-cmu-fsys1 + - axis,artpec9-cmu-imem + - axis,artpec9-cmu-peri + + reg: + maxItems: 1 + + clocks: + minItems: 1 + maxItems: 5 + + clock-names: + minItems: 1 + maxItems: 5 + + "#clock-cells": + const: 1 + +required: + - compatible + - reg + - clocks + - clock-names + - "#clock-cells" + +allOf: + - if: + properties: + compatible: + const: axis,artpec9-cmu-cmu + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + + clock-names: + items: + - const: fin_pll + + - if: + properties: + compatible: + const: axis,artpec9-cmu-bus + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_BUS bus clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: bus + + - if: + properties: + compatible: + const: axis,artpec9-cmu-core + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_CORE main clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: main + + - if: + properties: + compatible: + const: axis,artpec9-cmu-cpucl + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_CPUCL switch clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: switch + + - if: + properties: + compatible: + const: axis,artpec9-cmu-fsys0 + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_FSYS0 bus clock (from CMU_CMU) + - description: CMU_FSYS0 IP clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: bus + - 
const: ip + + - if: + properties: + compatible: + const: axis,artpec9-cmu-fsys1 + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_FSYS1 scan0 clock (from CMU_CMU) + - description: CMU_FSYS1 scan1 clock (from CMU_CMU) + - description: CMU_FSYS1 bus clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: scan0 + - const: scan1 + - const: bus + + - if: + properties: + compatible: + const: axis,artpec9-cmu-imem + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_IMEM ACLK clock (from CMU_CMU) + - description: CMU_IMEM CA5 clock (from CMU_CMU) + - description: CMU_IMEM JPEG clock (from CMU_CMU) + - description: CMU_IMEM SSS clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: aclk + - const: ca5 + - const: jpeg + - const: sss + + - if: + properties: + compatible: + const: axis,artpec9-cmu-peri + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_PERI IP clock (from CMU_CMU) + - description: CMU_PERI DISP clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: ip + - const: disp + +additionalProperties: false + +examples: + # Clock controller node for CMU_FSYS1 + - | + #include + + soc { + #address-cells = <2>; + #size-cells = <2>; + + cmu_fsys1: clock-controller@14c10000 { + compatible = "axis,artpec9-cmu-fsys1"; + reg = <0x0 0x14c10000 0x0 0x4000>; + #clock-cells = <1>; + clocks = <&fin_pll>, + <&cmu_cmu CLK_DOUT_CMU_FSYS1_SCAN0>, + <&cmu_cmu CLK_DOUT_CMU_FSYS1_SCAN1>, + <&cmu_cmu CLK_DOUT_CMU_FSYS1_BUS>; + clock-names = "fin_pll", "scan0", "scan1", "bus"; + }; + }; +... 
diff --git a/include/dt-bindings/clock/axis,artpec9-clk.h b/include/dt-bindings/clock/axis,artpec9-clk.h new file mode 100644 index 000000000000..c6787be8d686 --- /dev/null +++ b/include/dt-bindings/clock/axis,artpec9-clk.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2025 Samsung Electronics Co., Ltd. + * https://www.samsung.com + * Copyright (c) 2025 Axis Communications AB. + * https://www.axis.com + * + * Device Tree binding constants for ARTPEC-9 clock controller. + */ + +#ifndef _DT_BINDINGS_CLOCK_ARTPEC9_H +#define _DT_BINDINGS_CLOCK_ARTPEC9_H + +/* CMU_CMU */ +#define CLK_FOUT_SHARED0_PLL 1 +#define CLK_DOUT_SHARED0_DIV2 2 +#define CLK_DOUT_SHARED0_DIV3 3 +#define CLK_DOUT_SHARED0_DIV4 4 +#define CLK_FOUT_SHARED1_PLL 5 +#define CLK_DOUT_SHARED1_DIV2 6 +#define CLK_DOUT_SHARED1_DIV3 7 +#define CLK_DOUT_SHARED1_DIV4 8 +#define CLK_FOUT_AUDIO_PLL 9 +#define CLK_DOUT_CMU_ADD 10 +#define CLK_DOUT_CMU_BUS 11 +#define CLK_DOUT_CMU_CDC_CORE 12 +#define CLK_DOUT_CMU_CORE_MAIN 13 +#define CLK_DOUT_CMU_CPUCL_SWITCH 14 +#define CLK_DOUT_CMU_DLP_CORE 15 +#define CLK_DOUT_CMU_FSYS0_BUS 16 +#define CLK_DOUT_CMU_FSYS0_IP 17 +#define CLK_DOUT_CMU_FSYS1_BUS 18 +#define CLK_DOUT_CMU_FSYS1_SCAN0 19 +#define CLK_DOUT_CMU_FSYS1_SCAN1 20 +#define CLK_DOUT_CMU_GPU_3D 21 +#define CLK_DOUT_CMU_GPU_2D 22 +#define CLK_DOUT_CMU_IMEM_ACLK 23 +#define CLK_DOUT_CMU_IMEM_CA5 24 +#define CLK_DOUT_CMU_IMEM_JPEG 25 +#define CLK_DOUT_CMU_IMEM_SSS 26 +#define CLK_DOUT_CMU_IPA_CORE 27 +#define CLK_DOUT_CMU_LCPU 28 +#define CLK_DOUT_CMU_MIF_SWITCH 29 +#define CLK_DOUT_CMU_MIF_BUSP 30 +#define CLK_DOUT_CMU_PERI_DISP 31 +#define CLK_DOUT_CMU_PERI_IP 32 +#define CLK_DOUT_CMU_RSP_CORE 33 +#define CLK_DOUT_CMU_TRFM 34 +#define CLK_DOUT_CMU_VIO_CORE_L 35 +#define CLK_DOUT_CMU_VIO_CORE 36 +#define CLK_DOUT_CMU_VIP0 37 +#define CLK_DOUT_CMU_VIP1 38 +#define CLK_DOUT_CMU_VPP_CORE 39 +#define CLK_DOUT_CMU_VIO_AUDIO 40 + +/* CMU_BUS */ +#define 
CLK_MOUT_BUS_ACLK_USER 1 + +/* CMU_CORE */ +#define CLK_MOUT_CORE_ACLK_USER 1 + +/* CMU_CPUCL */ +#define CLK_FOUT_CPUCL_PLL0 1 +#define CLK_MOUT_CPUCL_PLL0 2 +#define CLK_FOUT_CPUCL_PLL1 3 +#define CLK_MOUT_CPUCL_PLL_SCU 4 +#define CLK_MOUT_CPUCL_SWITCH_SCU_USER 5 +#define CLK_MOUT_CPUCL_SWITCH_USER 6 +#define CLK_DOUT_CPUCL_CPU 7 +#define CLK_DOUT_CPUCL_CLUSTER_PERIPHCLK 8 +#define CLK_DOUT_CPUCL_CLUSTER_GICCLK 9 +#define CLK_DOUT_CPUCL_CLUSTER_PCLK 10 +#define CLK_DOUT_CPUCL_CMUREF 11 +#define CLK_DOUT_CPUCL_CLUSTER_ATCLK 12 +#define CLK_DOUT_CPUCL_CLUSTER_SCU 13 +#define CLK_DOUT_CPUCL_DBG 14 +#define CLK_GOUT_CPUCL_SHORTSTOP 15 +#define CLK_GOUT_CPUCL_CLUSTER_CPU 16 +#define CLK_GOUT_CPUCL_CSSYS_IPCLKPORT_ATCLK 17 +#define CLK_GOUT_CPUCL_CSSYS_IPCLKPORT_PCLKDBG 18 + +/* CMU_FSYS0 */ +#define CLK_MOUT_FSYS0_BUS_USER 1 +#define CLK_MOUT_FSYS0_IP_USER 2 +#define CLK_MOUT_FSYS0_MAIN_USER 3 +#define CLK_DOUT_FSYS0_125 4 +#define CLK_DOUT_FSYS0_ADC 5 +#define CLK_DOUT_FSYS0_BUS_300 6 +#define CLK_DOUT_FSYS0_EQOS0 7 +#define CLK_DOUT_FSYS0_EQOS1 8 +#define CLK_DOUT_FSYS0_MMC_CARD0 9 +#define CLK_DOUT_FSYS0_MMC_CARD1 10 +#define CLK_DOUT_FSYS0_MMC_CARD2 11 +#define CLK_DOUT_FSYS0_QSPI 12 +#define CLK_DOUT_FSYS0_SFMC_NAND 13 +#define CLK_GOUT_FSYS0_EQOS_TOP0_IPCLKPORT_ACLK_I 14 +#define CLK_GOUT_FSYS0_EQOS_TOP0_IPCLKPORT_CLK_CSR_I 15 +#define CLK_GOUT_FSYS0_EQOS_TOP0_IPCLKPORT_I_RGMII_PHASE_CLK_250 16 +#define CLK_GOUT_FSYS0_EQOS_TOP0_IPCLKPORT_I_RGMII_TXCLK 17 +#define CLK_GOUT_FSYS0_EQOS_TOP1_IPCLKPORT_I_RGMII_PHASE_CLK_250 18 +#define CLK_GOUT_FSYS0_EQOS_TOP1_IPCLKPORT_I_RGMII_TXCLK 19 +#define CLK_GOUT_FSYS0_EQOS_TOP1_IPCLKPORT_ACLK_I 20 +#define CLK_GOUT_FSYS0_EQOS_TOP1_IPCLKPORT_CLK_CSR_I 21 +#define CLK_GOUT_FSYS0_I3C0_IPCLKPORT_I_APB_S_PCLK 22 +#define CLK_GOUT_FSYS0_I3C0_IPCLKPORT_I_CORE_CLK 23 +#define CLK_GOUT_FSYS0_I3C0_IPCLKPORT_I_DMA_CLK 24 +#define CLK_GOUT_FSYS0_I3C0_IPCLKPORT_I_HDR_TX_CLK 25 +#define CLK_GOUT_FSYS0_I3C1_IPCLKPORT_I_APB_S_PCLK 26 
+#define CLK_GOUT_FSYS0_I3C1_IPCLKPORT_I_CORE_CLK 27 +#define CLK_GOUT_FSYS0_I3C1_IPCLKPORT_I_DMA_CLK 28 +#define CLK_GOUT_FSYS0_I3C1_IPCLKPORT_I_HDR_TX_CLK 29 +#define CLK_GOUT_FSYS0_MMC0_IPCLKPORT_SDCLKIN 30 +#define CLK_GOUT_FSYS0_MMC1_IPCLKPORT_SDCLKIN 31 +#define CLK_GOUT_FSYS0_MMC2_IPCLKPORT_SDCLKIN 32 +#define CLK_GOUT_FSYS0_QSPI_IPCLKPORT_HCLK 33 +#define CLK_GOUT_FSYS0_QSPI_IPCLKPORT_SSI_CLK 34 +#define CLK_GOUT_FSYS0_SFMC_IPCLKPORT_I_ACLK_NAND 35 +#define CLK_GOUT_FSYS0_I2C0_IPCLKPORT_I_PCLK 36 +#define CLK_GOUT_FSYS0_I2C1_IPCLKPORT_I_PCLK 37 +#define CLK_GOUT_FSYS0_MMC0_IPCLKPORT_I_ACLK 38 +#define CLK_GOUT_FSYS0_MMC1_IPCLKPORT_I_ACLK 39 +#define CLK_GOUT_FSYS0_MMC2_IPCLKPORT_I_ACLK 40 +#define CLK_GOUT_FSYS0_PWM_IPCLKPORT_I_PCLK_S0 41 + +/* CMU_FSYS1 */ +#define CLK_FOUT_FSYS1_PLL 1 +#define CLK_MOUT_FSYS1_SCAN0_USER 2 +#define CLK_MOUT_FSYS1_SCAN1_USER 3 +#define CLK_MOUT_FSYS1_BUS_USER 4 +#define CLK_DOUT_FSYS1_200 5 +#define CLK_DOUT_FSYS1_BUS_300 6 +#define CLK_DOUT_FSYS1_OTP_MEM 7 +#define CLK_DOUT_FSYS1_PCIE_PHY_REFCLK_SYSPLL 8 +#define CLK_GOUT_FSYS1_IPCLKPORT_PCIE_PHY_APB2CR_PCLK_100 9 +#define CLK_GOUT_FSYS1_UART0_PCLK 10 +#define CLK_GOUT_FSYS1_UART0_SCLK_UART 11 +#define CLK_GOUT_FSYS1_IPCLKPORT_PCIE_PHY_APB2CR_PCLK_300 12 +#define CLK_GOUT_FSYS1_IPCLKPORT_PCIE_SUB_CON_X1_DBI_ACLK_SOC 13 +#define CLK_GOUT_FSYS1_IPCLKPORT_PCIE_SUB_CON_X1_MSTR_ACLK_SOC 14 +#define CLK_GOUT_FSYS1_IPCLKPORT_PCIE_SUB_CON_X1_SLV_ACLK_SOC 15 +#define CLK_GOUT_FSYS1_IPCLKPORT_PCIE_SUB_CON_X2_DBI_ACLK_SOC 16 +#define CLK_GOUT_FSYS1_IPCLKPORT_PCIE_SUB_CON_X2_MSTR_ACLK_SOC 17 +#define CLK_GOUT_FSYS1_IPCLKPORT_PCIE_SUB_CON_X2_SLV_ACLK_SOC 18 +#define CLK_GOUT_FSYS1_USB20DRD_IPCLKPORT_ACLK_PHYCTRL_20 19 +#define CLK_GOUT_FSYS1_USB20DRD_IPCLKPORT_BUS_CLK_EARLY 20 +#define CLK_GOUT_FSYS1_XHB_AHBBR_FSYS1_IPCLKPORT_CLK 21 +#define CLK_GOUT_FSYS1_XHB_USB_IPCLKPORT_CLK 22 + +/* CMU_IMEM */ +#define CLK_MOUT_IMEM_ACLK_USER 1 +#define CLK_MOUT_IMEM_CA5_USER 2 +#define 
CLK_MOUT_IMEM_SSS_USER 3 +#define CLK_MOUT_IMEM_JPEG_USER 4 +#define CLK_DOUT_IMEM_PCLK 5 +#define CLK_GOUT_IMEM_CA5_0_IPCLKPORT_ATCLK 6 +#define CLK_GOUT_IMEM_CA5_0_IPCLKPORT_CLKIN 7 +#define CLK_GOUT_IMEM_CA5_0_IPCLKPORT_PCLK_DBG 8 +#define CLK_GOUT_IMEM_CA5_1_IPCLKPORT_ATCLK 9 +#define CLK_GOUT_IMEM_CA5_1_IPCLKPORT_CLKIN 10 +#define CLK_GOUT_IMEM_CA5_1_IPCLKPORT_PCLK_DBG 11 +#define CLK_GOUT_IMEM_MCT0_PCLK 12 +#define CLK_GOUT_IMEM_MCT1_PCLK 13 +#define CLK_GOUT_IMEM_MCT2_PCLK 14 +#define CLK_GOUT_IMEM_MCT3_PCLK 15 +#define CLK_GOUT_IMEM_PCLK_TMU0_APBIF 16 + +/* CMU_PERI */ +#define CLK_MOUT_PERI_IP_USER 1 +#define CLK_MOUT_PERI_DISP_USER 2 +#define CLK_DOUT_PERI_125 3 +#define CLK_DOUT_PERI_PCLK 4 +#define CLK_DOUT_PERI_SPI 5 +#define CLK_DOUT_PERI_UART1 6 +#define CLK_DOUT_PERI_UART2 7 +#define CLK_GOUT_PERI_DMA4DSIM_IPCLKPORT_CLK_APB_CLK 8 +#define CLK_GOUT_PERI_DMA4DSIM_IPCLKPORT_CLK_AXI_CLK 9 +#define CLK_GOUT_PERI_I3C2_IPCLKPORT_I_APB_S_PCLK 10 +#define CLK_GOUT_PERI_I3C2_IPCLKPORT_I_CORE_CLK 11 +#define CLK_GOUT_PERI_I3C2_IPCLKPORT_I_DMA_CLK 12 +#define CLK_GOUT_PERI_I3C2_IPCLKPORT_I_HDR_TX_CLK 13 +#define CLK_GOUT_PERI_I3C3_IPCLKPORT_I_APB_S_PCLK 14 +#define CLK_GOUT_PERI_I3C3_IPCLKPORT_I_CORE_CLK 15 +#define CLK_GOUT_PERI_I3C3_IPCLKPORT_I_DMA_CLK 16 +#define CLK_GOUT_PERI_I3C3_IPCLKPORT_I_HDR_TX_CLK 17 +#define CLK_GOUT_PERI_APB_ASYNC_DSIM_IPCLKPORT_PCLKS 18 +#define CLK_GOUT_PERI_I2C2_IPCLKPORT_I_PCLK 19 +#define CLK_GOUT_PERI_I2C3_IPCLKPORT_I_PCLK 20 +#define CLK_GOUT_PERI_SPI0_PCLK 21 +#define CLK_GOUT_PERI_SPI0_SCLK_SPI 22 +#define CLK_GOUT_PERI_UART1_PCLK 23 +#define CLK_GOUT_PERI_UART1_SCLK_UART 24 +#define CLK_GOUT_PERI_UART2_PCLK 25 +#define CLK_GOUT_PERI_UART2_SCLK_UART 26 + +#endif /* _DT_BINDINGS_CLOCK_ARTPEC9_H */ -- cgit v1.2.3 From 196b2b95fec447c2c4460f753b277d840633fbef Mon Sep 17 00:00:00 2001 From: Mel Henning Date: Thu, 19 Feb 2026 15:05:54 -0500 Subject: drm/nouveau: Add DRM_IOCTL_NOUVEAU_GET_ZCULL_INFO Add kernel-side support for 
using the zcull hardware in nvidia gpus. zcull aims to improve memory bandwidth by using an early approximate depth test, similar to hierarchical Z on an AMD card. Add a new ioctl that exposes zcull information that has been read from the hardware. Userspace uses each of these parameters either in a heuristic for determining zcull region parameters or in the calculation of a buffer size. It appears the hardware hasn't changed its structure for these values since FERMI_C (circa 2011), so the assumption is that it won't change on us too quickly, and is therefore reasonable to include in UAPI. This bypasses the nvif layer and instead accesses nvkm_gr directly, which mirrors existing usage of nvkm_gr_units(). There is no nvif object for nvkm_gr yet, and adding one is not trivial. Signed-off-by: Mel Henning Link: https://patch.msgid.link/20260219-zcull3-v3-2-dbe6a716f104@darkrefraction.com Signed-off-by: Danilo Krummrich --- drivers/gpu/drm/nouveau/nouveau_abi16.c | 29 +++++++++++++++ drivers/gpu/drm/nouveau/nouveau_abi16.h | 1 + drivers/gpu/drm/nouveau/nouveau_drm.c | 1 + include/uapi/drm/nouveau_drm.h | 66 +++++++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c b/drivers/gpu/drm/nouveau/nouveau_abi16.c index f9201f2e73a3..7860877d909b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c @@ -333,6 +333,35 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS) return 0; } +int +nouveau_abi16_ioctl_get_zcull_info(ABI16_IOCTL_ARGS) +{ + struct nouveau_drm *drm = nouveau_drm(dev); + struct nvkm_gr *gr = nvxx_gr(drm); + struct drm_nouveau_get_zcull_info *out = data; + + if (gr->has_zcull_info) { + const struct nvkm_gr_zcull_info *i = &gr->zcull_info; + + out->width_align_pixels = i->width_align_pixels; + out->height_align_pixels = i->height_align_pixels; + out->pixel_squares_by_aliquots = i->pixel_squares_by_aliquots; + out->aliquot_total = 
i->aliquot_total; + out->zcull_region_byte_multiplier = i->zcull_region_byte_multiplier; + out->zcull_region_header_size = i->zcull_region_header_size; + out->zcull_subregion_header_size = i->zcull_subregion_header_size; + out->subregion_count = i->subregion_count; + out->subregion_width_align_pixels = i->subregion_width_align_pixels; + out->subregion_height_align_pixels = i->subregion_height_align_pixels; + out->ctxsw_size = i->ctxsw_size; + out->ctxsw_align = i->ctxsw_align; + + return 0; + } else { + return -ENOTTY; + } +} + int nouveau_abi16_ioctl_channel_alloc(ABI16_IOCTL_ARGS) { diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.h b/drivers/gpu/drm/nouveau/nouveau_abi16.h index af6b4e1cefd2..134b3ab58719 100644 --- a/drivers/gpu/drm/nouveau/nouveau_abi16.h +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.h @@ -6,6 +6,7 @@ struct drm_device *dev, void *data, struct drm_file *file_priv int nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS); +int nouveau_abi16_ioctl_get_zcull_info(ABI16_IOCTL_ARGS); int nouveau_abi16_ioctl_channel_alloc(ABI16_IOCTL_ARGS); int nouveau_abi16_ioctl_channel_free(ABI16_IOCTL_ARGS); int nouveau_abi16_ioctl_grobj_alloc(ABI16_IOCTL_ARGS); diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 17c114645d9f..5d8475e4895e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -1304,6 +1304,7 @@ nouveau_ioctls[] = { DRM_IOCTL_DEF_DRV(NOUVEAU_GROBJ_ALLOC, nouveau_abi16_ioctl_grobj_alloc, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(NOUVEAU_GET_ZCULL_INFO, nouveau_abi16_ioctl_get_zcull_info, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_BIND, nouveau_svmm_bind, DRM_RENDER_ALLOW), 
DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_RENDER_ALLOW), diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h index dd87f8f30793..1fa82fa6af38 100644 --- a/include/uapi/drm/nouveau_drm.h +++ b/include/uapi/drm/nouveau_drm.h @@ -432,6 +432,69 @@ struct drm_nouveau_exec { __u64 push_ptr; }; +struct drm_nouveau_get_zcull_info { + /** + * @width_align_pixels: required alignment for region widths, in pixels + * (typically #TPC's * 16). + */ + __u32 width_align_pixels; + /** + * @height_align_pixels: required alignment for region heights, in + * pixels (typically 32). + */ + __u32 height_align_pixels; + /** + * @pixel_squares_by_aliquots: the pixel area covered by an aliquot + * (typically #Zcull_banks * 16 * 16). + */ + __u32 pixel_squares_by_aliquots; + /** + * @aliquot_total: the total aliquot pool available in hardware + */ + __u32 aliquot_total; + /** + * @zcull_region_byte_multiplier: the size of an aliquot in bytes, which + * is used for save/restore operations on a region + */ + __u32 zcull_region_byte_multiplier; + /** + * @zcull_region_header_size: the region header size in bytes, which is + * used for save/restore operations on a region + */ + __u32 zcull_region_header_size; + /** + * @zcull_subregion_header_size: the subregion header size in bytes, + * which is used for save/restore operations on a region + */ + __u32 zcull_subregion_header_size; + /** + * @subregion_count: the total number of subregions the hardware + * supports + */ + __u32 subregion_count; + /** + * @subregion_width_align_pixels: required alignment for subregion + * widths, in pixels (typically #TPC's * 16). + */ + __u32 subregion_width_align_pixels; + /** + * @subregion_height_align_pixels: required alignment for subregion + * heights, in pixels + */ + __u32 subregion_height_align_pixels; + + /** + * @ctxsw_size: the size, in bytes, of a zcull context switching region. + * Will be zero if the kernel does not support zcull context switching. 
+ */ + __u32 ctxsw_size; + /** + * @ctxsw_align: the alignment, in bytes, of a zcull context switching + * region + */ + __u32 ctxsw_align; +}; + #define DRM_NOUVEAU_GETPARAM 0x00 #define DRM_NOUVEAU_SETPARAM 0x01 /* deprecated */ #define DRM_NOUVEAU_CHANNEL_ALLOC 0x02 @@ -445,6 +508,7 @@ struct drm_nouveau_exec { #define DRM_NOUVEAU_VM_INIT 0x10 #define DRM_NOUVEAU_VM_BIND 0x11 #define DRM_NOUVEAU_EXEC 0x12 +#define DRM_NOUVEAU_GET_ZCULL_INFO 0x13 #define DRM_NOUVEAU_GEM_NEW 0x40 #define DRM_NOUVEAU_GEM_PUSHBUF 0x41 #define DRM_NOUVEAU_GEM_CPU_PREP 0x42 @@ -513,6 +577,8 @@ struct drm_nouveau_svm_bind { #define DRM_IOCTL_NOUVEAU_VM_INIT DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_INIT, struct drm_nouveau_vm_init) #define DRM_IOCTL_NOUVEAU_VM_BIND DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_VM_BIND, struct drm_nouveau_vm_bind) #define DRM_IOCTL_NOUVEAU_EXEC DRM_IOWR(DRM_COMMAND_BASE + DRM_NOUVEAU_EXEC, struct drm_nouveau_exec) + +#define DRM_IOCTL_NOUVEAU_GET_ZCULL_INFO DRM_IOR (DRM_COMMAND_BASE + DRM_NOUVEAU_GET_ZCULL_INFO, struct drm_nouveau_get_zcull_info) #if defined(__cplusplus) } #endif -- cgit v1.2.3 From 369cc88049855269b7620426bda4fb9ce2a2d1ca Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Wed, 4 Feb 2026 16:33:20 +0100 Subject: drm/xe/uapi: Introduce a flag to disallow vm overcommit in fault mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some compute applications may try to allocate device memory to probe how much device memory is actually available, assuming that the application will be the only one running on the particular GPU. That strategy fails in fault mode since it allows VM overcommit. While this could be resolved in user-space it's further complicated by cgroups potentially restricting the amount of memory available to the application. Introduce a vm create flag, DRM_XE_VM_CREATE_NO_VM_OVERCOMMIT, that allows fault mode to mimic the behaviour of !fault mode WRT this. 
It blocks evicting same vm bos during VM_BIND processing. However, it does *not* block evicting same-vm bos during pagefault processing, preferring eviction rather than VM banning in OOM situations. Cc: John Falkowski Cc: Michal Mrozek Cc: Matthew Brost Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Link: https://patch.msgid.link/20260204153320.17989-1-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/xe/xe_vm.c | 11 +++++++++-- drivers/gpu/drm/xe/xe_vm.h | 7 +++++++ drivers/gpu/drm/xe/xe_vm_types.h | 1 + include/uapi/drm/xe_drm.h | 6 ++++++ 4 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index a46f11a71c37..550208ef63f8 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1941,7 +1941,8 @@ find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs) #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \ DRM_XE_VM_CREATE_FLAG_LR_MODE | \ - DRM_XE_VM_CREATE_FLAG_FAULT_MODE) + DRM_XE_VM_CREATE_FLAG_FAULT_MODE | \ + DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT) int xe_vm_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file) @@ -1980,12 +1981,18 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)) return -EINVAL; + if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) && + args->flags & DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT)) + return -EINVAL; + if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE) flags |= XE_VM_FLAG_SCRATCH_PAGE; if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) flags |= XE_VM_FLAG_LR_MODE; if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) flags |= XE_VM_FLAG_FAULT_MODE; + if (args->flags & DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT) + flags |= XE_VM_FLAG_NO_VM_OVERCOMMIT; vm = xe_vm_create(xe, flags, xef); if (IS_ERR(vm)) @@ -2906,7 +2913,7 @@ static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma 
*vma, err = drm_exec_lock_obj(exec, &bo->ttm.base); if (!err && validate) err = xe_bo_validate(bo, vm, - !xe_vm_in_preempt_fence_mode(vm) && + xe_vm_allow_vm_eviction(vm) && res_evict, exec); } diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h index 288115c7844a..f849e369432b 100644 --- a/drivers/gpu/drm/xe/xe_vm.h +++ b/drivers/gpu/drm/xe/xe_vm.h @@ -220,6 +220,13 @@ static inline bool xe_vm_in_preempt_fence_mode(struct xe_vm *vm) return xe_vm_in_lr_mode(vm) && !xe_vm_in_fault_mode(vm); } +static inline bool xe_vm_allow_vm_eviction(struct xe_vm *vm) +{ + return !xe_vm_in_lr_mode(vm) || + (xe_vm_in_fault_mode(vm) && + !(vm->flags & XE_VM_FLAG_NO_VM_OVERCOMMIT)); +} + int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q); void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q); diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 43203e90ee3e..1f6f7e30e751 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -232,6 +232,7 @@ struct xe_vm { #define XE_VM_FLAG_TILE_ID(flags) FIELD_GET(GENMASK(7, 6), flags) #define XE_VM_FLAG_SET_TILE_ID(tile) FIELD_PREP(GENMASK(7, 6), (tile)->id) #define XE_VM_FLAG_GSC BIT(8) +#define XE_VM_FLAG_NO_VM_OVERCOMMIT BIT(9) unsigned long flags; /** diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index c9e70f78e723..ef2565048bdf 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -969,6 +969,11 @@ struct drm_xe_gem_mmap_offset { * demand when accessed, and also allows per-VM overcommit of memory. * The xe driver internally uses recoverable pagefaults to implement * this. + * - %DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT - Requires also + * DRM_XE_VM_CREATE_FLAG_FAULT_MODE. This disallows per-VM overcommit + * but only during a &DRM_IOCTL_XE_VM_BIND operation with the + * %DRM_XE_VM_BIND_FLAG_IMMEDIATE flag set. 
This may be useful for + * user-space naively probing the amount of available memory. */ struct drm_xe_vm_create { /** @extensions: Pointer to the first extension struct, if any */ @@ -977,6 +982,7 @@ struct drm_xe_vm_create { #define DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE (1 << 0) #define DRM_XE_VM_CREATE_FLAG_LR_MODE (1 << 1) #define DRM_XE_VM_CREATE_FLAG_FAULT_MODE (1 << 2) +#define DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT (1 << 3) /** @flags: Flags */ __u32 flags; -- cgit v1.2.3 From 80930d81c4b0753ba2ca750708e4d2fcc0627dc8 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 23 Feb 2026 15:02:55 +0000 Subject: ASoC: soc_sdw_utils: Add device info for CS47L47 Add a device info entry for the Cirrus Logic CS47L47. CS47L47 has UAJ (headset speaker + mic + jack detect) and DMICs. The audio ports are similar to the CS42L45 so can be based on the CS42L45 code. Signed-off-by: Richard Fitzgerald Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20260223150256.326143-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc_sdw_utils.h | 2 + sound/soc/sdw_utils/Makefile | 1 + sound/soc/sdw_utils/soc_sdw_cs47l47.c | 80 +++++++++++++++++++++++++++++++++++ sound/soc/sdw_utils/soc_sdw_utils.c | 36 ++++++++++++++++ 4 files changed, 119 insertions(+) create mode 100644 sound/soc/sdw_utils/soc_sdw_cs47l47.c (limited to 'include') diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index 227347c8f0b3..98531e500cbb 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -259,6 +259,8 @@ int asoc_sdw_cs42l43_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_so int asoc_sdw_cs42l43_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_cs42l45_hs_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_cs42l45_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +int asoc_sdw_cs47l47_hs_rtd_init(struct 
snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +int asoc_sdw_cs47l47_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_cs_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_maxim_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); /* TI */ diff --git a/sound/soc/sdw_utils/Makefile b/sound/soc/sdw_utils/Makefile index e8bd5ffb1a6a..a8d091fd374b 100644 --- a/sound/soc/sdw_utils/Makefile +++ b/sound/soc/sdw_utils/Makefile @@ -6,6 +6,7 @@ snd-soc-sdw-utils-y := soc_sdw_utils.o soc_sdw_dmic.o soc_sdw_rt_dmic.o \ soc_sdw_bridge_cs35l56.o \ soc_sdw_cs42l42.o soc_sdw_cs42l43.o \ soc_sdw_cs42l45.o \ + soc_sdw_cs47l47.o \ soc_sdw_cs_amp.o \ soc_sdw_maxim.o \ soc_sdw_ti_amp.o diff --git a/sound/soc/sdw_utils/soc_sdw_cs47l47.c b/sound/soc/sdw_utils/soc_sdw_cs47l47.c new file mode 100644 index 000000000000..259ecf1e0a71 --- /dev/null +++ b/sound/soc/sdw_utils/soc_sdw_cs47l47.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Based on sof_sdw_cs42l45.c +// This file incorporates work covered by the following copyright notice: +// Copyright (c) 2023 Intel Corporation +// Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ +/* + * soc_sdw_cs47l47 - Helpers to handle CS47L47 from generic machine driver + */ +#include +#include +#include +#include +#include +#include +#include +#include + +static struct snd_soc_jack_pin soc_jack_pins[] = { + { + .pin = "cs47l47 OT 43 Headphone", + .mask = SND_JACK_HEADPHONE, + }, + { + .pin = "cs47l47 OT 45 Headset", + .mask = SND_JACK_HEADPHONE, + }, + { + .pin = "cs47l47 IT 31 Microphone", + .mask = SND_JACK_MICROPHONE, + }, + { + .pin = "cs47l47 IT 33 Headset", + .mask = SND_JACK_MICROPHONE, + }, +}; + +int asoc_sdw_cs47l47_hs_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai) +{ + struct snd_soc_card *card = rtd->card; + struct snd_soc_component *component = snd_soc_rtd_to_codec(rtd, 0)->component; + struct asoc_sdw_mc_private *ctx = snd_soc_card_get_drvdata(card); + struct snd_soc_jack *jack = &ctx->sdw_headset; + int ret; + + card->components = devm_kasprintf(card->dev, GFP_KERNEL, "%s hs:cs47l47", + card->components); + if (!card->components) + return -ENOMEM; + + ret = snd_soc_card_jack_new_pins(card, "Jack", SND_JACK_MECHANICAL | + SND_JACK_HEADSET | SND_JACK_LINEOUT, jack, + soc_jack_pins, ARRAY_SIZE(soc_jack_pins)); + if (ret) { + dev_err(card->dev, "Failed to create jack: %d\n", ret); + return ret; + } + + ret = snd_soc_component_set_jack(component, jack, NULL); + if (ret) { + dev_err(card->dev, "Failed to register jack: %d\n", ret); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_cs47l47_hs_rtd_init, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_cs47l47_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai) +{ + struct snd_soc_card *card = rtd->card; + + card->components = devm_kasprintf(card->dev, GFP_KERNEL, "%s mic:cs47l47-dmic", + card->components); + if (!card->components) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_cs47l47_dmic_rtd_init, "SND_SOC_SDW_UTILS"); diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index 
0e67d9f34cba..eeeb91dded9d 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -759,6 +759,42 @@ struct asoc_sdw_codec_info codec_info_list[] = { }, .aux_num = 1, }, + { + .part_id = 0x4747, + .name_prefix = "cs47l47", + .dais = { + { + .direction = {true, false}, + .codec_name = "snd_soc_sdca.UAJ.1", + .dai_name = "IT 41", + .dai_type = SOC_SDW_DAI_TYPE_JACK, + .dailink = {SOC_SDW_JACK_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID}, + .rtd_init = asoc_sdw_cs47l47_hs_rtd_init, + }, + { + .direction = {false, true}, + .codec_name = "snd_soc_sdca.SmartMic.0", + .dai_name = "OT 113", + .dai_type = SOC_SDW_DAI_TYPE_MIC, + .dailink = {SOC_SDW_UNUSED_DAI_ID, SOC_SDW_DMIC_DAI_ID}, + .rtd_init = asoc_sdw_cs47l47_dmic_rtd_init, + }, + { + .direction = {false, true}, + .codec_name = "snd_soc_sdca.UAJ.1", + .dai_name = "OT 36", + .dai_type = SOC_SDW_DAI_TYPE_JACK, + .dailink = {SOC_SDW_UNUSED_DAI_ID, SOC_SDW_JACK_IN_DAI_ID}, + }, + }, + .dai_num = 3, + .auxs = { + { + .codec_name = "snd_soc_sdca.HID.2", + }, + }, + .aux_num = 1, + }, { .part_id = 0xaaaa, /* generic codec mockup */ .name_prefix = "sdw_mockup_mmulti-function", -- cgit v1.2.3 From 38ab6557234d8629407a824be90e82514d6129a0 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Fri, 20 Feb 2026 17:01:11 +0100 Subject: regmap: sort header includes Sort the included headers to make spotting duplicates easier and avoid discussions on where to add new includes. 
Signed-off-by: Sander Vanheule Link: https://patch.msgid.link/20260220160112.543391-1-sander@svanheule.net Signed-off-by: Mark Brown --- include/linux/regmap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index caff2240bdab..c8a6a05bdba1 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -10,15 +10,15 @@ * Author: Mark Brown */ -#include -#include -#include +#include #include #include -#include -#include -#include #include +#include +#include +#include +#include +#include struct module; struct clk; -- cgit v1.2.3 From 37983fad7f3ef296fa0504c8e945987459dc5487 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Fri, 20 Feb 2026 17:01:12 +0100 Subject: regmap: define cleanup helper for regmap_field For temporary field allocation, the user has to perform manual cleanup, or rely on devm_regmap_field_alloc() to (eventually) clean up the allocated resources when an error occurs. Add a cleanup helper that takes care of freeing the allocated regmap_field whenever it goes out of scope. 
This can simplify this example: struct regmap_field *field = regmap_field_alloc(...); if (IS_ERR(field)) return PTR_ERR(field); int err = regmap_field_read(...); if (err) goto out; /* some logic that may also error */ err = regmap_field_write(...); out: regmap_field_free(field); return err; into the shorter: struct regmap_field *field __free(regmap_field) = regmap_field_alloc(...); if (IS_ERR(field)) return PTR_ERR(field); int err = regmap_field_read(...); if (err) return err; /* some logic that may also error */ return regmap_field_write(...); Signed-off-by: Sander Vanheule Link: https://patch.msgid.link/20260220160112.543391-2-sander@svanheule.net Signed-off-by: Mark Brown --- include/linux/regmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index c8a6a05bdba1..f1c5cb63c171 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -1460,6 +1461,8 @@ struct regmap_field *regmap_field_alloc(struct regmap *regmap, struct reg_field reg_field); void regmap_field_free(struct regmap_field *field); +DEFINE_FREE(regmap_field, struct regmap_field *, if (_T) regmap_field_free(_T)) + struct regmap_field *devm_regmap_field_alloc(struct device *dev, struct regmap *regmap, struct reg_field reg_field); void devm_regmap_field_free(struct device *dev, struct regmap_field *field); -- cgit v1.2.3 From aa8671af0c380c15989b88325a2d5a6c5341771d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 24 Feb 2026 12:10:43 +0100 Subject: PCI/PTM: Drop pci_enable_ptm() granularity parameter No pci_enable_ptm() callers supply the "granularity" pointer where the clock granularity would be returned. Drop the unused pci_enable_ptm() parameter. 
Signed-off-by: Mika Westerberg [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20260224111044.3487873-5-mika.westerberg@linux.intel.com --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- drivers/net/ethernet/intel/idpf/idpf_main.c | 2 +- drivers/net/ethernet/intel/igc/igc_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- drivers/pci/pcie/ptm.c | 11 +++-------- include/linux/pci.h | 4 ++-- 6 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index ebf48feffb30..b35c4e4ecd2a 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5028,7 +5028,7 @@ static int ice_init(struct ice_pf *pf) } if (pf->hw.mac_type == ICE_MAC_E830) { - err = pci_enable_ptm(pf->pdev, NULL); + err = pci_enable_ptm(pf->pdev); if (err) dev_dbg(dev, "PCIe PTM not supported by PCIe bus/controller\n"); } diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c index 0dd741dcfcdb..ab3c409e587b 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_main.c +++ b/drivers/net/ethernet/intel/idpf/idpf_main.c @@ -257,7 +257,7 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_free; } - err = pci_enable_ptm(pdev, NULL); + err = pci_enable_ptm(pdev); if (err) pci_dbg(pdev, "PCIe PTM is not supported by PCIe bus/controller\n"); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 27e5c2109138..b030acf94ac4 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7123,7 +7123,7 @@ static int igc_probe(struct pci_dev *pdev, if (err) goto err_pci_reg; - err = pci_enable_ptm(pdev, NULL); + err = pci_enable_ptm(pdev); if (err < 0) dev_info(&pdev->dev, "PCIe PTM not supported by PCIe 
bus/controller\n"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index fdc3ba20912e..0b94d4ed0ef6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -960,7 +960,7 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev, mlx5_pci_vsc_init(dev); - pci_enable_ptm(pdev, NULL); + pci_enable_ptm(pdev); return 0; diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 91a598ed534c..2c848ae4f15f 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -88,7 +88,7 @@ void pci_ptm_init(struct pci_dev *dev) if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM) - pci_enable_ptm(dev, NULL); + pci_enable_ptm(dev); } void pci_save_ptm_state(struct pci_dev *dev) @@ -182,15 +182,13 @@ static int __pci_enable_ptm(struct pci_dev *dev) /** * pci_enable_ptm() - Enable Precision Time Measurement * @dev: PCI device - * @granularity: pointer to return granularity * - * Enable Precision Time Measurement for @dev. If successful and - * @granularity is non-NULL, return the Effective Granularity. + * Enable Precision Time Measurement for @dev. * * Return: zero if successful, or -EINVAL if @dev lacks a PTM Capability or * is not a PTM Root and lacks an upstream path of PTM-enabled devices. 
*/ -int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +int pci_enable_ptm(struct pci_dev *dev) { int rc; char clock_desc[8]; @@ -201,9 +199,6 @@ int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) dev->ptm_enabled = 1; - if (granularity) - *granularity = dev->ptm_granularity; - switch (dev->ptm_granularity) { case 0: snprintf(clock_desc, sizeof(clock_desc), "unknown"); diff --git a/include/linux/pci.h b/include/linux/pci.h index 1c270f1d5123..8aaa72dcb164 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1975,11 +1975,11 @@ struct pci_ptm_debugfs { }; #ifdef CONFIG_PCIE_PTM -int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); +int pci_enable_ptm(struct pci_dev *dev); void pci_disable_ptm(struct pci_dev *dev); bool pcie_ptm_enabled(struct pci_dev *dev); #else -static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +static inline int pci_enable_ptm(struct pci_dev *dev) { return -EINVAL; } static inline void pci_disable_ptm(struct pci_dev *dev) { } static inline bool pcie_ptm_enabled(struct pci_dev *dev) -- cgit v1.2.3 From e02902dd493bf9c9b05353c761737ac514ad7a5c Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 23 Feb 2026 18:21:01 +0200 Subject: spi: add devm_spi_new_ancillary_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a devres-managed version of spi_new_ancillary_device() that automatically unregisters the ancillary SPI device when the parent device is removed. This follows the same devm_add_action_or_reset() pattern used by the other managed SPI functions (devm_spi_optimize_message, devm_spi_register_controller, etc.) and eliminates the need for drivers to open-code their own devm cleanup callbacks for ancillary devices. 
Acked-by: Nuno Sá Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20260223162110.156746-3-antoniu.miclaus@analog.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/spi/spi.h | 1 + 2 files changed, 41 insertions(+) (limited to 'include') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 8fbed4754de4..26cc10aa7533 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2747,6 +2747,46 @@ err_out: } EXPORT_SYMBOL_GPL(spi_new_ancillary_device); +static void devm_spi_unregister_device(void *spi) +{ + spi_unregister_device(spi); +} + +/** + * devm_spi_new_ancillary_device() - Register managed ancillary SPI device + * @spi: Pointer to the main SPI device registering the ancillary device + * @chip_select: Chip Select of the ancillary device + * + * Register an ancillary SPI device; for example some chips have a chip-select + * for normal device usage and another one for setup/firmware upload. + * + * This is the managed version of spi_new_ancillary_device(). The ancillary + * device will be unregistered automatically when the parent SPI device is + * unregistered. + * + * This may only be called from main SPI device's probe routine. 
+ * + * Return: Pointer to new ancillary device on success; ERR_PTR on failure + */ +struct spi_device *devm_spi_new_ancillary_device(struct spi_device *spi, + u8 chip_select) +{ + struct spi_device *ancillary; + int ret; + + ancillary = spi_new_ancillary_device(spi, chip_select); + if (IS_ERR(ancillary)) + return ancillary; + + ret = devm_add_action_or_reset(&spi->dev, devm_spi_unregister_device, + ancillary); + if (ret) + return ERR_PTR(ret); + + return ancillary; +} +EXPORT_SYMBOL_GPL(devm_spi_new_ancillary_device); + #ifdef CONFIG_ACPI struct acpi_spi_lookup { struct spi_controller *ctlr; diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index af7cfee7b8f6..1c9aab627583 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -387,6 +387,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) } extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 chip_select); +extern struct spi_device *devm_spi_new_ancillary_device(struct spi_device *spi, u8 chip_select); /* Use a define to avoid include chaining to get THIS_MODULE */ #define spi_register_driver(driver) \ -- cgit v1.2.3 From 477174ac35c510d0ed3043f5bd4fba25546a21ce Mon Sep 17 00:00:00 2001 From: David Carlier Date: Tue, 24 Feb 2026 05:56:37 +0000 Subject: sched_ext: Optimize sched_ext_entity layout for cache locality Reorder struct sched_ext_entity to place ops_state, ddsp_dsq_id, and ddsp_enq_flags immediately after dsq. These fields are accessed together in the do_enqueue_task() and finish_dispatch() hot paths but were previously spread across three different cache lines. Grouping them on the same cache line reduces cache misses on every enqueue and dispatch operation. 
Signed-off-by: David Carlier Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 4601e5ecb43c..0150b3fe6230 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -163,6 +163,9 @@ struct scx_dsq_list_node { */ struct sched_ext_entity { struct scx_dispatch_q *dsq; + atomic_long_t ops_state; + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; struct scx_dsq_list_node dsq_list; /* dispatch order */ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ u32 dsq_seq; @@ -174,7 +177,6 @@ struct sched_ext_entity { s32 selected_cpu; u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ - atomic_long_t ops_state; struct list_head runnable_node; /* rq->scx.runnable_list */ unsigned long runnable_at; @@ -182,8 +184,6 @@ struct sched_ext_entity { #ifdef CONFIG_SCHED_CORE u64 core_sched_at; /* see scx_prio_less() */ #endif - u64 ddsp_dsq_id; - u64 ddsp_enq_flags; /* BPF scheduler modifiable fields */ -- cgit v1.2.3 From 2550def53bbf2323894265e0e64363998bf9e5c3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Feb 2026 09:27:15 +0000 Subject: net: __lock_sock() can be static After commit 6511882cdd82 ("mptcp: allocate fwd memory separately on the rx and tx path") __lock_sock() can be static again. Make sure __lock_sock() is not inlined, so that lock_sock_nested() no longer needs a stack canary. 
Add a noinline attribute on lock_sock_nested() so that calls to lock_sock() from net/core/sock.c are not inlined, none of them are fast path to deserve that: - sockopt_lock_sock() - sock_set_reuseport() - sock_set_reuseaddr() - sock_set_mark() - sock_set_keepalive() - sock_no_linger() - sock_bindtoindex() - sk_wait_data() - sock_set_rcvbuf() $ scripts/bloat-o-meter -t vmlinux.old vmlinux add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-312 (-312) Function old new delta __lock_sock 192 188 -4 __lock_sock_fast 239 86 -153 lock_sock_nested 227 72 -155 Total: Before=24888707, After=24888395, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260223092716.3673939-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 - net/core/sock.c | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 66b56288c1d3..55b61e4b0d83 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1709,7 +1709,6 @@ static inline void lock_sock(struct sock *sk) lock_sock_nested(sk, 0); } -void __lock_sock(struct sock *sk); void __release_sock(struct sock *sk); void release_sock(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 693e6d80f501..cfb2a6209946 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3175,7 +3175,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -void __lock_sock(struct sock *sk) +static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { @@ -3774,14 +3774,14 @@ void sock_init_data(struct socket *sock, struct sock *sk) } EXPORT_SYMBOL(sock_init_data); -void lock_sock_nested(struct sock *sk, int subclass) +void noinline lock_sock_nested(struct sock *sk, int subclass) { /* The sk_lock has mutex_lock() semantics here. 
 */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); might_sleep(); spin_lock_bh(&sk->sk_lock.slock); - if (sock_owned_by_user_nocheck(sk)) + if (unlikely(sock_owned_by_user_nocheck(sk))) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock_bh(&sk->sk_lock.slock); @@ -3810,7 +3810,7 @@ bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) might_sleep(); spin_lock_bh(&sk->sk_lock.slock); - if (!sock_owned_by_user_nocheck(sk)) + if (likely(!sock_owned_by_user_nocheck(sk))) { /* * Fast path return with bottom halves disabled and * sock::sk_lock.slock held. -- cgit v1.2.3 From f033335937d6f72a13bb38d82422eef30da31972 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Feb 2026 09:34:45 +0000 Subject: udp: move udp6_csum_init() back to net/ipv6/udp.c This function has a single caller in net/ipv6/udp.c. Move it there so that the compiler can decide to (auto)inline it if it prefers to. IBT glue is removed anyway. With clang, we can see it was able to inline it and also inlined one other helper at the same time. UDPLITE removal will also help. 
$ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/2 grow/shrink: 1/0 up/down: 840/-785 (55) Function old new delta __udp6_lib_rcv 1247 2087 +840 __pfx_udp6_csum_init 16 - -16 udp6_csum_init 769 - -769 Total: Before=25074399, After=25074454, chg +0.00% Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260223093445.3696368-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip6_checksum.h | 2 -- net/ipv6/ip6_checksum.c | 47 ---------------------------------------------- net/ipv6/udp.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h index c8a96b888277..6677b3cc3972 100644 --- a/include/net/ip6_checksum.h +++ b/include/net/ip6_checksum.h @@ -82,6 +82,4 @@ static inline __sum16 udp_v6_check(int len, void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len); - -int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto); #endif diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 377717045f8f..8bb68a0cdfd6 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -62,53 +62,6 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, EXPORT_SYMBOL(csum_ipv6_magic); #endif -int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) -{ - int err; - - UDP_SKB_CB(skb)->partial_cov = 0; - UDP_SKB_CB(skb)->cscov = skb->len; - - if (proto == IPPROTO_UDPLITE) { - err = udplite_checksum_init(skb, uh); - if (err) - return err; - - if (UDP_SKB_CB(skb)->partial_cov) { - skb->csum = ip6_compute_pseudo(skb, proto); - return 0; - } - } - - /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) - * we accept a checksum of zero here. 
When we find the socket - * for the UDP packet we'll check if that socket allows zero checksum - * for IPv6 (set by socket option). - * - * Note, we are only interested in != 0 or == 0, thus the - * force to int. - */ - err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, - ip6_compute_pseudo); - if (err) - return err; - - if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { - /* If SW calculated the value, we know it's bad */ - if (skb->csum_complete_sw) - return 1; - - /* HW says the value is bad. Let's validate that. - * skb->csum is no longer the full packet checksum, - * so don't treat is as such. - */ - skb_checksum_complete_unset(skb); - } - - return 0; -} -EXPORT_SYMBOL(udp6_csum_init); - /* Function to set UDP checksum for an IPv6 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. */ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 010b909275dd..48f73401adf4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1069,6 +1069,52 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, return 0; } +static int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) +{ + int err; + + UDP_SKB_CB(skb)->partial_cov = 0; + UDP_SKB_CB(skb)->cscov = skb->len; + + if (proto == IPPROTO_UDPLITE) { + err = udplite_checksum_init(skb, uh); + if (err) + return err; + + if (UDP_SKB_CB(skb)->partial_cov) { + skb->csum = ip6_compute_pseudo(skb, proto); + return 0; + } + } + + /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) + * we accept a checksum of zero here. When we find the socket + * for the UDP packet we'll check if that socket allows zero checksum + * for IPv6 (set by socket option). + * + * Note, we are only interested in != 0 or == 0, thus the + * force to int. 
+ */ + err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + ip6_compute_pseudo); + if (err) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { + /* If SW calculated the value, we know it's bad */ + if (skb->csum_complete_sw) + return 1; + + /* HW says the value is bad. Let's validate that. + * skb->csum is no longer the full packet checksum, + * so don't treat is as such. + */ + skb_checksum_complete_unset(skb); + } + + return 0; +} + int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { -- cgit v1.2.3 From bd5e5e1d41d316c47dd9001104b62033992daf6e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Feb 2026 10:07:27 +0000 Subject: tcp: inline __tcp_v4_send_check() Inline __tcp_v4_send_check(), like __tcp_v6_send_check(). Move tcp_v4_send_check() to tcp_output.c close to its fast path caller (__tcp_transmit_skb()). Note __tcp_v4_send_check() is still out-of-line for tcp4_gso_segment() because it is called in an unlikely() section. 
$ scripts/bloat-o-meter -t vmlinux.0 vmlinux.1 add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-9 (-9) Function old new delta __tcp_v4_send_check 130 121 -9 Total: Before=25143100, After=25143091, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260223100729.3761597-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 10 +++++++++- net/ipv4/tcp_ipv4.c | 18 ------------------ net/ipv4/tcp_output.c | 10 +++++++++- 3 files changed, 18 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 40e72b9cb85f..3c84433f3d57 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2382,7 +2382,15 @@ void tcp_gro_complete(struct sk_buff *skb); static inline void tcp_gro_complete(struct sk_buff *skb) { } #endif -void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); +static inline void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, + __be32 daddr) +{ + struct tcphdr *th = tcp_hdr(skb); + + th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); +} static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6264fc0b2be5..da708aff0623 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -661,24 +661,6 @@ out: return 0; } -void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) -{ - struct tcphdr *th = tcp_hdr(skb); - - th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); -} - -/* This routine computes an IPv4 TCP checksum. 
*/ -void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) -{ - const struct inet_sock *inet = inet_sk(sk); - - __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); -} -EXPORT_IPV6_MOD(tcp_v4_send_check); - #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 326b58ff1118..29056d6fc787 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1496,7 +1496,15 @@ static void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); -INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); + +/* This routine computes an IPv4 TCP checksum. */ +void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) +{ + const struct inet_sock *inet = inet_sk(sk); + + __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); +} +EXPORT_IPV6_MOD(tcp_v4_send_check); /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial -- cgit v1.2.3 From 255688652b8c439eb35c68ec5cdac4aa63737d35 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Feb 2026 10:07:28 +0000 Subject: tcp: move tcp_v6_send_check() to tcp_output.c Move tcp_v6_send_check() so that __tcp_transmit_skb() can inline it. 
$ scripts/bloat-o-meter -t vmlinux.1 vmlinux.2 add/remove: 0/0 grow/shrink: 1/0 up/down: 105/0 (105) Function old new delta __tcp_transmit_skb 3321 3426 +105 Total: Before=25143091, After=25143196, chg +0.00% Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260223100729.3761597-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 3 ++- net/ipv4/tcp_output.c | 19 ++++++++++++++++--- net/ipv6/tcp_ipv6.c | 5 ----- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3c84433f3d57..feaddce9d805 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1132,7 +1132,8 @@ static inline int tcp_v6_sdif(const struct sk_buff *skb) extern const struct inet_connection_sock_af_ops ipv6_specific; -INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); +void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb); + INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); void tcp_v6_early_demux(struct sk_buff *skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 29056d6fc787..fdddb16630a5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1506,6 +1506,16 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) } EXPORT_IPV6_MOD(tcp_v4_send_check); +#if IS_ENABLED(CONFIG_IPV6) +#include + +void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) +{ + __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr); +} +EXPORT_IPV6_MOD(tcp_v6_send_check); +#endif + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. 
@@ -1667,9 +1677,12 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, /* BPF prog is the last one writing header option */ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); - INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, - tcp_v6_send_check, tcp_v4_send_check, - sk, skb); +#if IS_ENABLED(CONFIG_IPV6) + if (likely(icsk->icsk_af_ops->send_check == tcp_v6_send_check)) + tcp_v6_send_check(sk, skb); + else +#endif + tcp_v4_send_check(sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, rcv_nxt); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d10487b4e5bf..306ca0585b4a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2015,11 +2015,6 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), }; -INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) -{ - __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr); -} - const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, -- cgit v1.2.3 From fcd3d039fab693df3d41ac9bcb12fb4e8ddd69fe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Feb 2026 10:07:29 +0000 Subject: tcp: make tcp_v{4,6}_send_check() static tcp_v{4,6}_send_check() are only called from tcp_output.c and should be made static so that the compiler does not need to put an out of line copy of them. Remove (struct inet_connection_sock_af_ops) send_check field and use instead @net_header_len. Move @net_header_len close to @queue_xmit for data locality as both are used in TCP tx fast path. 
$ scripts/bloat-o-meter -t vmlinux.2 vmlinux.3 add/remove: 0/2 grow/shrink: 0/3 up/down: 0/-172 (-172) Function old new delta __tcp_transmit_skb 3426 3423 -3 tcp_v4_send_check 136 132 -4 mptcp_subflow_init 777 763 -14 __pfx_tcp_v6_send_check 16 - -16 tcp_v6_send_check 135 - -135 Total: Before=25143196, After=25143024, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260223100729.3761597-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 3 +-- include/net/tcp.h | 3 --- net/ipv4/tcp_ipv4.c | 1 - net/ipv4/tcp_output.c | 8 +++----- net/ipv6/tcp_ipv6.c | 2 -- net/mptcp/subflow.c | 1 - net/tls/tls_device_fallback.c | 3 --- 7 files changed, 4 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index ecb362025c4e..bbc9355871c7 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -34,7 +34,7 @@ struct tcp_congestion_ops; */ struct inet_connection_sock_af_ops { int (*queue_xmit)(struct sock *sk, struct sk_buff *skb, struct flowi *fl); - void (*send_check)(struct sock *sk, struct sk_buff *skb); + u16 net_header_len; int (*rebuild_header)(struct sock *sk); void (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb); int (*conn_request)(struct sock *sk, struct sk_buff *skb); @@ -43,7 +43,6 @@ struct inet_connection_sock_af_ops { struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req); - u16 net_header_len; int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, diff --git a/include/net/tcp.h b/include/net/tcp.h index feaddce9d805..dfcd38089f11 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -531,7 +531,6 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, * TCP v4 functions exported for the inet6 API */ -void 
tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); void tcp_ld_RTO_revert(struct sock *sk, u32 seq); @@ -1132,8 +1131,6 @@ static inline int tcp_v6_sdif(const struct sk_buff *skb) extern const struct inet_connection_sock_af_ops ipv6_specific; -void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb); - INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); void tcp_v6_early_demux(struct sk_buff *skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index da708aff0623..bd613e401d48 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2405,7 +2405,6 @@ EXPORT_IPV6_MOD(inet_sk_rx_dst_set); const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, - .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fdddb16630a5..1ef419c66a0e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1498,22 +1498,20 @@ INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); /* This routine computes an IPv4 TCP checksum. 
*/ -void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) +static void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) { const struct inet_sock *inet = inet_sk(sk); __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); } -EXPORT_IPV6_MOD(tcp_v4_send_check); #if IS_ENABLED(CONFIG_IPV6) #include -void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) { __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr); } -EXPORT_IPV6_MOD(tcp_v6_send_check); #endif /* This routine actually transmits TCP packets queued in by @@ -1678,7 +1676,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); #if IS_ENABLED(CONFIG_IPV6) - if (likely(icsk->icsk_af_ops->send_check == tcp_v6_send_check)) + if (likely(icsk->icsk_af_ops->net_header_len == sizeof(struct ipv6hdr))) tcp_v6_send_check(sk, skb); else #endif diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 306ca0585b4a..f17da56b449e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2017,7 +2017,6 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, - .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, @@ -2049,7 +2048,6 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { */ static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, - .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index f66129f1e649..dd79c5b37a6b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -2190,7 +2190,6 @@ void __init mptcp_subflow_init(void) 
subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; - subflow_v6m_specific.send_check = ipv4_specific.send_check; subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.rebuild_header = subflow_rebuild_header; diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 03d508a45aae..de7d86bdd7ec 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -149,9 +149,6 @@ static int tls_enc_records(struct aead_request *aead_req, return rc; } -/* Can't use icsk->icsk_af_ops->send_check here because the ip addresses - * might have been changed by NAT. - */ static void update_chksum(struct sk_buff *skb, int headln) { struct tcphdr *th = tcp_hdr(skb); -- cgit v1.2.3 From cd0aa651535092afd5d776bfe94e4fdf750f89c3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 23 Feb 2026 09:34:41 +0000 Subject: net: stmmac: pass interface mode into fix_mac_speed() method Pass the current interface mode reported by phylink into the fix_mac_speed() method. This will be used by qcom-ethqos for its "SGMII" configuration. 
Reviewed-by: Maxime Chevallier Tested-by: Mohd Ayaan Anwar Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vuSKv-0000000AScG-1zv6@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 11 +++++++---- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c | 11 +++++++---- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 ++- include/linux/stmmac.h | 3 ++- 8 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index d043bad4a862..0495437d3a6e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -112,7 +112,8 @@ static int dwc_qos_probe(struct platform_device *pdev, #define AUTO_CAL_STATUS 0x880c #define AUTO_CAL_STATUS_ACTIVE BIT(31) -static void tegra_eqos_fix_speed(void *bsp_priv, int speed, unsigned int mode) +static void tegra_eqos_fix_speed(void *bsp_priv, phy_interface_t interface, + int speed, unsigned int mode) { struct tegra_eqos *eqos = bsp_priv; bool needs_calibration = false; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index c4e85197629d..9f5a15b81f8a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -48,7 +48,8 @@ struct imx_dwmac_ops { int (*fix_soc_reset)(struct stmmac_priv *priv); int (*set_intf_mode)(struct imx_priv_data *dwmac, u8 phy_intf_sel); - void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); + void (*fix_mac_speed)(void *priv, phy_interface_t interface, + int 
speed, unsigned int mode); }; struct imx_priv_data { @@ -160,7 +161,8 @@ static int imx_dwmac_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, return stmmac_set_clk_tx_rate(bsp_priv, clk_tx_i, interface, speed); } -static void imx_dwmac_fix_speed(void *priv, int speed, unsigned int mode) +static void imx_dwmac_fix_speed(void *priv, phy_interface_t interface, + int speed, unsigned int mode) { struct plat_stmmacenet_data *plat_dat; struct imx_priv_data *dwmac = priv; @@ -185,13 +187,14 @@ static void imx_dwmac_fix_speed(void *priv, int speed, unsigned int mode) dev_err(dwmac->dev, "failed to set tx rate %lu\n", rate); } -static void imx93_dwmac_fix_speed(void *priv, int speed, unsigned int mode) +static void imx93_dwmac_fix_speed(void *priv, phy_interface_t interface, + int speed, unsigned int mode) { struct imx_priv_data *dwmac = priv; unsigned int iface; int ctrl, old_ctrl; - imx_dwmac_fix_speed(priv, speed, mode); + imx_dwmac_fix_speed(priv, interface, speed, mode); if (!dwmac || mode != MLO_AN_FIXED) return; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 815213223583..9c51c96223ad 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -143,7 +143,8 @@ static struct stmmac_pci_info loongson_gmac_pci_info = { .setup = loongson_gmac_data, }; -static void loongson_gnet_fix_speed(void *priv, int speed, unsigned int mode) +static void loongson_gnet_fix_speed(void *priv, phy_interface_t interface, + int speed, unsigned int mode) { struct loongson_data *ld = (struct loongson_data *)priv; struct net_device *ndev = dev_get_drvdata(ld->dev); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index bd5d3bf90400..9b29516a5a7c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ 
b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -643,7 +643,8 @@ static void ethqos_configure(struct qcom_ethqos *ethqos, int speed) return ethqos->configure_func(ethqos, speed); } -static void ethqos_fix_mac_speed(void *priv, int speed, unsigned int mode) +static void ethqos_fix_mac_speed(void *priv, phy_interface_t interface, + int speed, unsigned int mode) { struct qcom_ethqos *ethqos = priv; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 4c8991f3b38d..c6b99814d391 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -72,7 +72,8 @@ struct socfpga_dwmac { const struct socfpga_dwmac_ops *ops; }; -static void socfpga_dwmac_fix_mac_speed(void *bsp_priv, int speed, +static void socfpga_dwmac_fix_mac_speed(void *bsp_priv, + phy_interface_t interface, int speed, unsigned int mode) { struct socfpga_dwmac *dwmac = (struct socfpga_dwmac *)bsp_priv; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c index f50547b67fbc..6ebbf95d158f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c @@ -91,11 +91,13 @@ struct sti_dwmac { struct regmap *regmap; bool gmac_en; int speed; - void (*fix_retime_src)(void *priv, int speed, unsigned int mode); + void (*fix_retime_src)(void *priv, phy_interface_t interface, + int speed, unsigned int mode); }; struct sti_dwmac_of_data { - void (*fix_retime_src)(void *priv, int speed, unsigned int mode); + void (*fix_retime_src)(void *priv, phy_interface_t interface, + int speed, unsigned int mode); }; enum { @@ -114,7 +116,8 @@ static u32 stih4xx_tx_retime_val[] = { | STIH4XX_ETH_SEL_INTERNAL_NOTEXT_PHYCLK, }; -static void stih4xx_fix_retime_src(void *priv, int spd, unsigned int mode) +static void stih4xx_fix_retime_src(void *priv, phy_interface_t interface, + int spd, 
unsigned int mode) { struct sti_dwmac *dwmac = priv; u32 src = dwmac->tx_retime_src; @@ -170,7 +173,7 @@ static int sti_set_phy_intf_sel(void *bsp_priv, u8 phy_intf_sel) val = (dwmac->interface == PHY_INTERFACE_MODE_REVMII) ? 0 : ENMII; regmap_update_bits(regmap, reg, ENMII_MASK, val); - dwmac->fix_retime_src(dwmac, dwmac->speed, 0); + dwmac->fix_retime_src(dwmac, dwmac->interface, dwmac->speed, 0); return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 82375d34ad57..d7c730179a7f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1071,7 +1071,8 @@ static void stmmac_mac_link_up(struct phylink_config *config, } if (priv->plat->fix_mac_speed) - priv->plat->fix_mac_speed(priv->plat->bsp_priv, speed, mode); + priv->plat->fix_mac_speed(priv->plat->bsp_priv, interface, + speed, mode); if (!duplex) ctrl &= ~priv->hw->link.duplex; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 32352a216567..b96ae9dadfab 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -256,7 +256,8 @@ struct plat_stmmacenet_data { int (*set_phy_intf_sel)(void *priv, u8 phy_intf_sel); int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); - void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); + void (*fix_mac_speed)(void *priv, phy_interface_t interface, + int speed, unsigned int mode); int (*fix_soc_reset)(struct stmmac_priv *priv); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); -- cgit v1.2.3 From 539a6cf0844da56c32513b86305a7327760f9932 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Feb 2026 15:30:47 +0000 Subject: tcp: move inet6_csk_update_pmtu() to tcp_ipv6.c This function is only called from tcp_v6_mtu_reduced() and can be (auto)inlined by the compiler. 
Note that inet6_csk_route_socket() is no longer (auto)inlined, which is a good thing as it is slow path. $ scripts/bloat-o-meter -t vmlinux.0 vmlinux.1 add/remove: 0/2 grow/shrink: 2/0 up/down: 93/-129 (-36) Function old new delta tcp_v6_mtu_reduced 139 228 +89 inet6_csk_route_socket 486 490 +4 __pfx_inet6_csk_update_pmtu 16 - -16 inet6_csk_update_pmtu 113 - -113 Total: Before=25076512, After=25076476, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260223153047.886683-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet6_connection_sock.h | 4 +++- net/ipv6/inet6_connection_sock.c | 19 ++----------------- net/ipv6/tcp_ipv6.c | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index ece8dabd209a..b814e1acc512 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -18,6 +18,9 @@ struct sk_buff; struct sock; struct sockaddr; +struct dst_entry *inet6_csk_route_socket(struct sock *sk, + struct flowi6 *fl6); + struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct dst_entry *dst, struct flowi6 *fl6, @@ -25,5 +28,4 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); -struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu); #endif /* _INET6_CONNECTION_SOCK_H */ diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 11fc2f7de2fe..37534e116899 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -56,8 +56,8 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, return dst; } -static struct dst_entry *inet6_csk_route_socket(struct sock *sk, - struct flowi6 *fl6) +struct dst_entry *inet6_csk_route_socket(struct sock *sk, + struct flowi6 *fl6) { struct 
inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -118,18 +118,3 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused return res; } EXPORT_SYMBOL_GPL(inet6_csk_xmit); - -struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) -{ - struct flowi6 *fl6 = &inet_sk(sk)->cork.fl.u.ip6; - struct dst_entry *dst; - - dst = inet6_csk_route_socket(sk, fl6); - - if (IS_ERR(dst)) - return NULL; - dst->ops->update_pmtu(dst, sk, NULL, mtu, true); - - dst = inet6_csk_route_socket(sk, fl6); - return IS_ERR(dst) ? NULL : dst; -} diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f17da56b449e..07e07afb1ff1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -349,6 +349,21 @@ failure: return err; } +static struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) +{ + struct flowi6 *fl6 = &inet_sk(sk)->cork.fl.u.ip6; + struct dst_entry *dst; + + dst = inet6_csk_route_socket(sk, fl6); + + if (IS_ERR(dst)) + return NULL; + dst->ops->update_pmtu(dst, sk, NULL, mtu, true); + + dst = inet6_csk_route_socket(sk, fl6); + return IS_ERR(dst) ? NULL : dst; +} + static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; -- cgit v1.2.3 From fc1f97929ada9e923c3b0c70a999469eeb0b9f94 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 23 Feb 2026 23:07:18 +0000 Subject: bonding: Optimise is_netpoll_tx_blocked(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bond_start_xmit() spends some cycles in is_netpoll_tx_blocked(): if (unlikely(is_netpoll_tx_blocked(dev))) return NETDEV_TX_BUSY; because of the "pushf;pop reg" sequence (aka irqs_disabled()). Let's swap the conditions in is_netpoll_tx_blocked() and convert netpoll_block_tx to a static key. 
Before: 1.23 │ mov %gs:0x28,%rax 1.24 │ mov %rax,0x18(%rsp) 29.45 │ pushfq 0.50 │ pop %rax 0.47 │ test $0x200,%eax │ ↓ je 1b4 0.49 │ 32: lea 0x980(%rsi),%rbx After: 0.72 │ mov %gs:0x28,%rax 0.81 │ mov %rax,0x18(%rsp) 0.82 │ nop 2.77 │ 2a: lea 0x980(%rsi),%rbx Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260223230749.2376145-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 4 ++-- include/net/bonding.h | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 55a960da42b5..2f2dddcf9d30 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -206,7 +206,7 @@ MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where " /*----------------------------- Global variables ----------------------------*/ #ifdef CONFIG_NET_POLL_CONTROLLER -atomic_t netpoll_block_tx = ATOMIC_INIT(0); +DEFINE_STATIC_KEY_FALSE(netpoll_block_tx); #endif unsigned int bond_net_id __read_mostly; @@ -6589,7 +6589,7 @@ static void __exit bonding_exit(void) #ifdef CONFIG_NET_POLL_CONTROLLER /* Make sure we don't have an imbalance on our netpoll blocking */ - WARN_ON(atomic_read(&netpoll_block_tx)); + WARN_ON(static_branch_unlikely(&netpoll_block_tx)); #endif } diff --git a/include/net/bonding.h b/include/net/bonding.h index 4ad5521e7731..7f8fda2fdfcb 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -91,22 +91,22 @@ NETIF_F_GSO_ESP) #ifdef CONFIG_NET_POLL_CONTROLLER -extern atomic_t netpoll_block_tx; +DECLARE_STATIC_KEY_FALSE(netpoll_block_tx); static inline void block_netpoll_tx(void) { - atomic_inc(&netpoll_block_tx); + static_branch_inc(&netpoll_block_tx); } static inline void unblock_netpoll_tx(void) { - atomic_dec(&netpoll_block_tx); + static_branch_dec(&netpoll_block_tx); } static inline int 
is_netpoll_tx_blocked(struct net_device *dev) { - if (unlikely(netpoll_tx_running(dev))) - return atomic_read(&netpoll_block_tx); + if (static_branch_unlikely(&netpoll_block_tx)) + return netpoll_tx_running(dev); return 0; } #else -- cgit v1.2.3 From 46a9d97069cab311738c950d0fcef85a459c7b8f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 24 Feb 2026 11:06:38 +0900 Subject: ata: libata-eh: avoid unnecessary calls to ata_scsi_port_error_handler() When handling SCSI command timeouts, if we had no actual command timeouts (either because the command was a deferred qc or the completion path won the race with ata_scsi_cmd_error_handler()), we do not need to go through port error handling, as there were in fact no errors at all. Modify ata_scsi_cmd_error_handler() to return the number of commands that timed out and use this return value in ata_scsi_error() to call ata_scsi_port_error_handler() only if we had command timeouts, or if the port EH has already been scheduled due to failed commands. Otherwise, simply call scsi_eh_flush_done_q() to finish the completed commands without running the full port error handling. Signed-off-by: Damien Le Moal Reviewed-by: Martin K.
Petersen Reviewed-by: Niklas Cassel Signed-off-by: Niklas Cassel --- drivers/ata/libata-eh.c | 28 +++++++++++++++++++--------- include/linux/libata.h | 3 ++- 2 files changed, 21 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 72a22b6c9682..12c6740398fa 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -560,21 +560,27 @@ void ata_scsi_error(struct Scsi_Host *host) { struct ata_port *ap = ata_shost_to_port(host); unsigned long flags; + int nr_timedout; LIST_HEAD(eh_work_q); spin_lock_irqsave(host->host_lock, flags); list_splice_init(&host->eh_cmd_q, &eh_work_q); spin_unlock_irqrestore(host->host_lock, flags); - ata_scsi_cmd_error_handler(host, ap, &eh_work_q); - - /* If we timed raced normal completion and there is nothing to - recover nr_timedout == 0 why exactly are we doing error recovery ? */ - ata_scsi_port_error_handler(host, ap); + /* + * First check what errors we got with ata_scsi_cmd_error_handler(). + * If we had no command timeouts and EH is not scheduled for this port, + * meaning that we do not have any failed command, then there is no + * need to go through the full port error handling. We only need to + * flush the completed commands we have. + */ + nr_timedout = ata_scsi_cmd_error_handler(host, ap, &eh_work_q); + if (nr_timedout || ata_port_eh_scheduled(ap)) + ata_scsi_port_error_handler(host, ap); + else + scsi_eh_flush_done_q(&ap->eh_done_q); - /* finish or retry handled scmd's and clean up */ WARN_ON(!list_empty(&eh_work_q)); - } /** @@ -586,9 +592,11 @@ void ata_scsi_error(struct Scsi_Host *host) * process the given list of commands and return those finished to the * ap->eh_done_q. This function is the first part of the libata error * handler which processes a given list of failed commands. + * + * Return the number of commands that timed out. 
*/ -void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, - struct list_head *eh_work_q) +int ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, + struct list_head *eh_work_q) { int i; unsigned long flags; @@ -678,6 +686,8 @@ void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, ap->eh_tries = ATA_EH_MAX_TRIES; spin_unlock_irqrestore(ap->lock, flags); + + return nr_timedout; } EXPORT_SYMBOL(ata_scsi_cmd_error_handler); diff --git a/include/linux/libata.h b/include/linux/libata.h index db87c99e4189..5c085ef4eda7 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1225,7 +1225,8 @@ extern int ata_ncq_prio_enable(struct ata_port *ap, struct scsi_device *sdev, extern struct ata_device *ata_dev_pair(struct ata_device *adev); int ata_set_mode(struct ata_link *link, struct ata_device **r_failed_dev); extern void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap); -extern void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, struct list_head *eh_q); +int ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, + struct list_head *eh_q); /* * SATA specific code - drivers/ata/libata-sata.c -- cgit v1.2.3 From d9d5e1bdd18074ea27985c777ddc3a8a0b007468 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Mon, 16 Feb 2026 00:22:16 +0900 Subject: dmaengine: dw-edma: Add virtual IRQ for interrupt-emulation doorbells Interrupt emulation can assert the dw-edma IRQ line without updating the DONE/ABORT bits. With the shared read/write/common IRQ handlers, the driver cannot reliably distinguish such an emulated interrupt from a real one and leaving a level IRQ asserted may wedge the line. Allocate a dedicated, requestable Linux virtual IRQ (db_irq) for interrupt emulation and attach an irq_chip whose .irq_ack runs the core-specific deassert sequence (.ack_emulated_irq()). 
The physical dw-edma interrupt handlers raise this virtual IRQ via generic_handle_irq(), ensuring emulated IRQs are always deasserted. Export the virtual IRQ number (db_irq) and the doorbell register offset (db_offset) via struct dw_edma_chip so platform users can expose interrupt emulation as a doorbell. Without this, a single interrupt-emulation write can leave the level IRQ line asserted and cause the generic IRQ layer to disable it. Signed-off-by: Koichiro Den Reviewed-by: Frank Li Link: https://patch.msgid.link/20260215152216.3393561-3-den@valinux.co.jp Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 127 +++++++++++++++++++++++++++++++++++-- include/linux/dma/edma.h | 6 ++ 2 files changed, 128 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index e7d698b352d3..cd34a3ea602d 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -663,7 +663,96 @@ static void dw_edma_abort_interrupt(struct dw_edma_chan *chan) chan->status = EDMA_ST_IDLE; } -static inline irqreturn_t dw_edma_interrupt_write(int irq, void *data) +static void dw_edma_emul_irq_ack(struct irq_data *d) +{ + struct dw_edma *dw = irq_data_get_irq_chip_data(d); + + dw_edma_core_ack_emulated_irq(dw); +} + +/* + * irq_chip implementation for interrupt-emulation doorbells. + * + * The emulated source has no mask/unmask mechanism. With handle_level_irq(), + * the flow is therefore: + * 1) .irq_ack() deasserts the source + * 2) registered handlers (if any) are dispatched + * Since deassertion is already done in .irq_ack(), handlers do not need to take + * care of it, hence IRQCHIP_ONESHOT_SAFE. 
+ */ +static struct irq_chip dw_edma_emul_irqchip = { + .name = "dw-edma-emul", + .irq_ack = dw_edma_emul_irq_ack, + .flags = IRQCHIP_ONESHOT_SAFE | IRQCHIP_SKIP_SET_WAKE, +}; + +static int dw_edma_emul_irq_alloc(struct dw_edma *dw) +{ + struct dw_edma_chip *chip = dw->chip; + int virq; + + chip->db_irq = 0; + chip->db_offset = ~0; + + /* + * Only meaningful when the core provides the deassert sequence + * for interrupt emulation. + */ + if (!dw->core->ack_emulated_irq) + return 0; + + /* + * Allocate a single, requestable Linux virtual IRQ number. + * Use >= 1 so that 0 can remain a "not available" sentinel. + */ + virq = irq_alloc_desc(NUMA_NO_NODE); + if (virq < 0) + return virq; + + irq_set_chip_and_handler(virq, &dw_edma_emul_irqchip, handle_level_irq); + irq_set_chip_data(virq, dw); + irq_set_noprobe(virq); + + chip->db_irq = virq; + chip->db_offset = dw_edma_core_db_offset(dw); + + return 0; +} + +static void dw_edma_emul_irq_free(struct dw_edma *dw) +{ + struct dw_edma_chip *chip = dw->chip; + + if (!chip) + return; + if (chip->db_irq <= 0) + return; + + irq_free_descs(chip->db_irq, 1); + chip->db_irq = 0; + chip->db_offset = ~0; +} + +static inline irqreturn_t dw_edma_interrupt_emulated(void *data) +{ + struct dw_edma_irq *dw_irq = data; + struct dw_edma *dw = dw_irq->dw; + int db_irq = dw->chip->db_irq; + + if (db_irq > 0) { + /* + * Interrupt emulation may assert the IRQ line without updating the + * normal DONE/ABORT status bits. With a shared IRQ handler we + * cannot reliably detect such events by status registers alone, so + * always perform the core-specific deassert sequence. 
+ */ + generic_handle_irq(db_irq); + return IRQ_HANDLED; + } + return IRQ_NONE; +} + +static inline irqreturn_t dw_edma_interrupt_write_inner(int irq, void *data) { struct dw_edma_irq *dw_irq = data; @@ -672,7 +761,7 @@ static inline irqreturn_t dw_edma_interrupt_write(int irq, void *data) dw_edma_abort_interrupt); } -static inline irqreturn_t dw_edma_interrupt_read(int irq, void *data) +static inline irqreturn_t dw_edma_interrupt_read_inner(int irq, void *data) { struct dw_edma_irq *dw_irq = data; @@ -681,12 +770,33 @@ static inline irqreturn_t dw_edma_interrupt_read(int irq, void *data) dw_edma_abort_interrupt); } -static irqreturn_t dw_edma_interrupt_common(int irq, void *data) +static inline irqreturn_t dw_edma_interrupt_write(int irq, void *data) +{ + irqreturn_t ret = IRQ_NONE; + + ret |= dw_edma_interrupt_write_inner(irq, data); + ret |= dw_edma_interrupt_emulated(data); + + return ret; +} + +static inline irqreturn_t dw_edma_interrupt_read(int irq, void *data) { irqreturn_t ret = IRQ_NONE; - ret |= dw_edma_interrupt_write(irq, data); - ret |= dw_edma_interrupt_read(irq, data); + ret |= dw_edma_interrupt_read_inner(irq, data); + ret |= dw_edma_interrupt_emulated(data); + + return ret; +} + +static inline irqreturn_t dw_edma_interrupt_common(int irq, void *data) +{ + irqreturn_t ret = IRQ_NONE; + + ret |= dw_edma_interrupt_write_inner(irq, data); + ret |= dw_edma_interrupt_read_inner(irq, data); + ret |= dw_edma_interrupt_emulated(data); return ret; } @@ -973,6 +1083,11 @@ int dw_edma_probe(struct dw_edma_chip *chip) if (err) return err; + /* Allocate a dedicated virtual IRQ for interrupt-emulation doorbells */ + err = dw_edma_emul_irq_alloc(dw); + if (err) + dev_warn(dev, "Failed to allocate emulation IRQ: %d\n", err); + /* Setup write/read channels */ err = dw_edma_channel_setup(dw, wr_alloc, rd_alloc); if (err) @@ -988,6 +1103,7 @@ int dw_edma_probe(struct dw_edma_chip *chip) err_irq_free: for (i = (dw->nr_irqs - 1); i >= 0; i--) 
free_irq(chip->ops->irq_vector(dev, i), &dw->irq[i]); + dw_edma_emul_irq_free(dw); return err; } @@ -1010,6 +1126,7 @@ int dw_edma_remove(struct dw_edma_chip *chip) /* Free irqs */ for (i = (dw->nr_irqs - 1); i >= 0; i--) free_irq(chip->ops->irq_vector(dev, i), &dw->irq[i]); + dw_edma_emul_irq_free(dw); /* Deregister eDMA device */ dma_async_device_unregister(&dw->dma); diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h index 270b5458aecf..9da53c75e49b 100644 --- a/include/linux/dma/edma.h +++ b/include/linux/dma/edma.h @@ -73,6 +73,8 @@ enum dw_edma_chip_flags { * @ll_region_rd: DMA descriptor link list memory for read channel * @dt_region_wr: DMA data memory for write channel * @dt_region_rd: DMA data memory for read channel + * @db_irq: Virtual IRQ dedicated to interrupt emulation + * @db_offset: Offset from DMA register base * @mf: DMA register map format * @dw: struct dw_edma that is filled by dw_edma_probe() */ @@ -94,6 +96,10 @@ struct dw_edma_chip { struct dw_edma_region dt_region_wr[EDMA_MAX_WR_CH]; struct dw_edma_region dt_region_rd[EDMA_MAX_RD_CH]; + /* interrupt emulation */ + int db_irq; + resource_size_t db_offset; + enum dw_edma_map_format mf; struct dw_edma *dw; -- cgit v1.2.3 From 0289ada4a31661016a0611a41a4886bb958e9985 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 9 Feb 2026 12:44:33 +0000 Subject: coresight: Fix memory leak in coresight_alloc_device_name() The memory leak detector reports: echo clear > /sys/kernel/debug/kmemleak modprobe coresight_funnel rmmod coresight_funnel # Scan memory leak and report it echo scan > /sys/kernel/debug/kmemleak cat /sys/kernel/debug/kmemleak unreferenced object 0xffff0008020c7200 (size 64): comm "modprobe", pid 410, jiffies 4295333721 hex dump (first 32 bytes): d8 da fe 7e 09 00 ff ff e8 2e ff 7e 09 00 ff ff ...~.......~.... b0 6c ff 7e 09 00 ff ff 30 83 00 7f 09 00 ff ff .l.~....0....... 
backtrace (crc 4116a690): kmemleak_alloc+0xd8/0xf8 __kmalloc_node_track_caller_noprof+0x2c8/0x6f0 krealloc_node_align_noprof+0x13c/0x2c8 coresight_alloc_device_name+0xe4/0x158 [coresight] 0xffffd327ecef8394 0xffffd327ecef85ec amba_probe+0x118/0x1c8 really_probe+0xc8/0x3f0 __driver_probe_device+0x88/0x190 driver_probe_device+0x44/0x120 __driver_attach+0x100/0x238 bus_for_each_dev+0x84/0xf0 driver_attach+0x2c/0x40 bus_add_driver+0x128/0x258 driver_register+0x64/0x138 __amba_driver_register+0x2c/0x48 The memory leak is caused by not freeing the device list that maintains device indices. This device list preserves stable device indices across unbind and rebind device operations, so it does not share the same lifetime as a device instance and must only be freed when the module is unloaded. Some modules do not implement a module exit callback because they are registered using module_platform_driver(). As a result, the device list cannot be released during module exit for those modules. Fix this by moving the device list into the core layer. As a general solution, instead of maintaining a static list in each driver, drivers now allocate device lists via coresight_allocate_device_list() and device indices via coresight_allocate_device_idx(). The list is released only when the core module is unloaded by calling coresight_release_device_list(), avoiding the leak.
Fixes: 0f5f9b6ba9e1 ("coresight: Use platform agnostic names") Reviewed-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20260209-arm_coresight_refactor_dev_register-v4-1-62d6042f76f7@arm.com --- drivers/hwtracing/coresight/coresight-catu.c | 4 +- drivers/hwtracing/coresight/coresight-core.c | 127 +++++++++++++++------ drivers/hwtracing/coresight/coresight-ctcu-core.c | 4 +- drivers/hwtracing/coresight/coresight-cti-core.c | 19 ++- drivers/hwtracing/coresight/coresight-dummy.c | 7 +- drivers/hwtracing/coresight/coresight-etb10.c | 4 +- drivers/hwtracing/coresight/coresight-funnel.c | 4 +- drivers/hwtracing/coresight/coresight-replicator.c | 4 +- drivers/hwtracing/coresight/coresight-stm.c | 4 +- drivers/hwtracing/coresight/coresight-tmc-core.c | 12 +- drivers/hwtracing/coresight/coresight-tnoc.c | 4 +- drivers/hwtracing/coresight/coresight-tpda.c | 4 +- drivers/hwtracing/coresight/coresight-tpdm.c | 4 +- drivers/hwtracing/coresight/coresight-tpiu.c | 4 +- drivers/hwtracing/coresight/ultrasoc-smb.c | 4 +- include/linux/coresight.h | 14 +-- 16 files changed, 120 insertions(+), 103 deletions(-) (limited to 'include') diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c index dfd035852b12..ce71dcddfca2 100644 --- a/drivers/hwtracing/coresight/coresight-catu.c +++ b/drivers/hwtracing/coresight/coresight-catu.c @@ -30,8 +30,6 @@ #define catu_dbg(x, ...) 
do {} while (0) #endif -DEFINE_CORESIGHT_DEVLIST(catu_devs, "catu"); - struct catu_etr_buf { struct tmc_sg_table *catu_table; dma_addr_t sladdr; @@ -530,7 +528,7 @@ static int __catu_probe(struct device *dev, struct resource *res) if (ret) return ret; - catu_desc.name = coresight_alloc_device_name(&catu_devs, dev); + catu_desc.name = coresight_alloc_device_name("catu", dev); if (!catu_desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c index 80e26396ad0a..6881fdc5da92 100644 --- a/drivers/hwtracing/coresight/coresight-core.c +++ b/drivers/hwtracing/coresight/coresight-core.c @@ -53,6 +53,9 @@ struct coresight_node { const u32 coresight_barrier_pkt[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; EXPORT_SYMBOL_GPL(coresight_barrier_pkt); +/* List maintains the device index */ +static LIST_HEAD(coresight_dev_idx_list); + static const struct cti_assoc_op *cti_assoc_ops; void coresight_set_cti_ops(const struct cti_assoc_op *cti_op) @@ -1438,22 +1441,55 @@ void coresight_unregister(struct coresight_device *csdev) } EXPORT_SYMBOL_GPL(coresight_unregister); +static struct coresight_dev_list * +coresight_allocate_device_list(const char *prefix) +{ + struct coresight_dev_list *list; -/* - * coresight_search_device_idx - Search the fwnode handle of a device - * in the given dev_idx list. Must be called with the coresight_mutex held. - * - * Returns the index of the entry, when found. Otherwise, -ENOENT. 
- */ -static int coresight_search_device_idx(struct coresight_dev_list *dict, - struct fwnode_handle *fwnode) + /* Check if have already allocated */ + list_for_each_entry(list, &coresight_dev_idx_list, node) { + if (!strcmp(list->pfx, prefix)) + return list; + } + + list = kzalloc(sizeof(*list), GFP_KERNEL); + if (!list) + return NULL; + + list->pfx = kstrdup(prefix, GFP_KERNEL); + if (!list->pfx) { + kfree(list); + return NULL; + } + + list_add(&list->node, &coresight_dev_idx_list); + return list; +} + +static int coresight_allocate_device_idx(struct coresight_dev_list *list, + struct device *dev) { - int i; + struct fwnode_handle **fwnode_list; + struct fwnode_handle *fwnode = dev_fwnode(dev); + int idx; + + for (idx = 0; idx < list->nr_idx; idx++) + if (list->fwnode_list[idx] == fwnode) + return idx; + + /* Make space for the new entry */ + idx = list->nr_idx; + fwnode_list = krealloc_array(list->fwnode_list, + idx + 1, sizeof(*list->fwnode_list), + GFP_KERNEL); + if (!fwnode_list) + return -ENOMEM; - for (i = 0; i < dict->nr_idx; i++) - if (dict->fwnode_list[i] == fwnode) - return i; - return -ENOENT; + fwnode_list[idx] = fwnode; + list->fwnode_list = fwnode_list; + list->nr_idx = idx + 1; + + return idx; } static bool coresight_compare_type(enum coresight_dev_type type_a, @@ -1527,45 +1563,63 @@ bool coresight_loses_context_with_cpu(struct device *dev) EXPORT_SYMBOL_GPL(coresight_loses_context_with_cpu); /* - * coresight_alloc_device_name - Get an index for a given device in the - * device index list specific to a driver. An index is allocated for a - * device and is tracked with the fwnode_handle to prevent allocating + * coresight_alloc_device_name - Get an index for a given device in the list + * specific to a driver (presented by the prefix string). 
An index is allocated + * for a device and is tracked with the fwnode_handle to prevent allocating * duplicate indices for the same device (e.g, if we defer probing of * a device due to dependencies), in case the index is requested again. */ -char *coresight_alloc_device_name(struct coresight_dev_list *dict, - struct device *dev) +char *coresight_alloc_device_name(const char *prefix, struct device *dev) { - int idx; + struct coresight_dev_list *list; char *name = NULL; - struct fwnode_handle **list; + int idx; mutex_lock(&coresight_mutex); - idx = coresight_search_device_idx(dict, dev_fwnode(dev)); - if (idx < 0) { - /* Make space for the new entry */ - idx = dict->nr_idx; - list = krealloc_array(dict->fwnode_list, - idx + 1, sizeof(*dict->fwnode_list), - GFP_KERNEL); - if (ZERO_OR_NULL_PTR(list)) { - idx = -ENOMEM; - goto done; - } + list = coresight_allocate_device_list(prefix); + if (!list) + goto done; - list[idx] = dev_fwnode(dev); - dict->fwnode_list = list; - dict->nr_idx = idx + 1; - } + idx = coresight_allocate_device_idx(list, dev); - name = devm_kasprintf(dev, GFP_KERNEL, "%s%d", dict->pfx, idx); + /* + * If index allocation fails, the device list is not released here; + * it is instead freed later by coresight_release_device_list() when + * the coresight_core module is unloaded. + */ + if (idx < 0) + goto done; + + name = devm_kasprintf(dev, GFP_KERNEL, "%s%d", list->pfx, idx); done: mutex_unlock(&coresight_mutex); return name; } EXPORT_SYMBOL_GPL(coresight_alloc_device_name); +static void coresight_release_device_list(void) +{ + struct coresight_dev_list *list, *next; + int i; + + /* + * There is no need to take coresight_mutex; this is during core module + * unloading, no race condition with other modules. 
+ */ + + list_for_each_entry_safe(list, next, &coresight_dev_idx_list, node) { + for (i = 0; i < list->nr_idx; i++) + list->fwnode_list[i] = NULL; + list->nr_idx = 0; + list_del(&list->node); + + kfree(list->pfx); + kfree(list->fwnode_list); + kfree(list); + } +} + const struct bus_type coresight_bustype = { .name = "coresight", }; @@ -1639,6 +1693,7 @@ static void __exit coresight_exit(void) &coresight_notifier); etm_perf_exit(); bus_unregister(&coresight_bustype); + coresight_release_device_list(); } module_init(coresight_init); diff --git a/drivers/hwtracing/coresight/coresight-ctcu-core.c b/drivers/hwtracing/coresight/coresight-ctcu-core.c index abed15eb72b4..6813ae6e929b 100644 --- a/drivers/hwtracing/coresight/coresight-ctcu-core.c +++ b/drivers/hwtracing/coresight/coresight-ctcu-core.c @@ -19,8 +19,6 @@ #include "coresight-ctcu.h" #include "coresight-priv.h" -DEFINE_CORESIGHT_DEVLIST(ctcu_devs, "ctcu"); - #define ctcu_writel(drvdata, val, offset) __raw_writel((val), drvdata->base + offset) #define ctcu_readl(drvdata, offset) __raw_readl(drvdata->base + offset) @@ -187,7 +185,7 @@ static int ctcu_probe(struct platform_device *pdev) void __iomem *base; int i, ret; - desc.name = coresight_alloc_device_name(&ctcu_devs, dev); + desc.name = coresight_alloc_device_name("ctcu", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c index 7a8f1ef6b94e..fddc8f31b91d 100644 --- a/drivers/hwtracing/coresight/coresight-cti-core.c +++ b/drivers/hwtracing/coresight/coresight-cti-core.c @@ -48,15 +48,6 @@ static int nr_cti_cpu; /* quick lookup list for CPU bound CTIs when power handling */ static struct cti_drvdata *cti_cpu_drvdata[NR_CPUS]; -/* - * CTI naming. CTI bound to cores will have the name cti_cpu where - * N is the CPU ID. System CTIs will have the name cti_sys where I - * is an index allocated by order of discovery. 
- * - * CTI device name list - for CTI not bound to cores. - */ -DEFINE_CORESIGHT_DEVLIST(cti_sys_devs, "cti_sys"); - /* write set of regs to hardware - call with spinlock claimed */ void cti_write_all_hw_regs(struct cti_drvdata *drvdata) { @@ -889,12 +880,18 @@ static int cti_probe(struct amba_device *adev, const struct amba_id *id) /* default to powered - could change on PM notifications */ drvdata->config.hw_powered = true; - /* set up device name - will depend if cpu bound or otherwise */ + /* + * Set up device name - will depend if cpu bound or otherwise. + * + * CTI bound to cores will have the name cti_cpu where N is the + * CPU ID. System CTIs will have the name cti_sys where I is an + * index allocated by order of discovery. + */ if (drvdata->ctidev.cpu >= 0) cti_desc.name = devm_kasprintf(dev, GFP_KERNEL, "cti_cpu%d", drvdata->ctidev.cpu); else - cti_desc.name = coresight_alloc_device_name(&cti_sys_devs, dev); + cti_desc.name = coresight_alloc_device_name("cti_sys", dev); if (!cti_desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-dummy.c b/drivers/hwtracing/coresight/coresight-dummy.c index 14322c99e29d..c176a2f57300 100644 --- a/drivers/hwtracing/coresight/coresight-dummy.c +++ b/drivers/hwtracing/coresight/coresight-dummy.c @@ -19,9 +19,6 @@ struct dummy_drvdata { u8 traceid; }; -DEFINE_CORESIGHT_DEVLIST(source_devs, "dummy_source"); -DEFINE_CORESIGHT_DEVLIST(sink_devs, "dummy_sink"); - static int dummy_source_enable(struct coresight_device *csdev, struct perf_event *event, enum cs_mode mode, __maybe_unused struct coresight_path *path) @@ -126,7 +123,7 @@ static int dummy_probe(struct platform_device *pdev) if (of_device_is_compatible(node, "arm,coresight-dummy-source")) { - desc.name = coresight_alloc_device_name(&source_devs, dev); + desc.name = coresight_alloc_device_name("dummy_source", dev); if (!desc.name) return -ENOMEM; @@ -155,7 +152,7 @@ static int dummy_probe(struct platform_device *pdev) drvdata->traceid = 
(u8)trace_id; } else if (of_device_is_compatible(node, "arm,coresight-dummy-sink")) { - desc.name = coresight_alloc_device_name(&sink_devs, dev); + desc.name = coresight_alloc_device_name("dummy_sink", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c index 6657602d8f2e..b952a1d47f12 100644 --- a/drivers/hwtracing/coresight/coresight-etb10.c +++ b/drivers/hwtracing/coresight/coresight-etb10.c @@ -63,8 +63,6 @@ #define ETB_FFSR_BIT 1 #define ETB_FRAME_SIZE_WORDS 4 -DEFINE_CORESIGHT_DEVLIST(etb_devs, "etb"); - /** * struct etb_drvdata - specifics associated to an ETB component * @base: memory mapped base address for this component. @@ -722,7 +720,7 @@ static int etb_probe(struct amba_device *adev, const struct amba_id *id) struct resource *res = &adev->res; struct coresight_desc desc = { 0 }; - desc.name = coresight_alloc_device_name(&etb_devs, dev); + desc.name = coresight_alloc_device_name("etb", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c index 3b248e54471a..3f56ceccd8c9 100644 --- a/drivers/hwtracing/coresight/coresight-funnel.c +++ b/drivers/hwtracing/coresight/coresight-funnel.c @@ -30,8 +30,6 @@ #define FUNNEL_HOLDTIME (0x7 << FUNNEL_HOLDTIME_SHFT) #define FUNNEL_ENSx_MASK 0xff -DEFINE_CORESIGHT_DEVLIST(funnel_devs, "funnel"); - /** * struct funnel_drvdata - specifics associated to a funnel component * @base: memory mapped base address for this component. 
@@ -223,7 +221,7 @@ static int funnel_probe(struct device *dev, struct resource *res) of_device_is_compatible(dev->of_node, "arm,coresight-funnel")) dev_warn_once(dev, "Uses OBSOLETE CoreSight funnel binding\n"); - desc.name = coresight_alloc_device_name(&funnel_devs, dev); + desc.name = coresight_alloc_device_name("funnel", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c index e6472658235d..07fc04f53b88 100644 --- a/drivers/hwtracing/coresight/coresight-replicator.c +++ b/drivers/hwtracing/coresight/coresight-replicator.c @@ -24,8 +24,6 @@ #define REPLICATOR_IDFILTER0 0x000 #define REPLICATOR_IDFILTER1 0x004 -DEFINE_CORESIGHT_DEVLIST(replicator_devs, "replicator"); - /** * struct replicator_drvdata - specifics associated to a replicator component * @base: memory mapped base address for this component. Also indicates @@ -230,7 +228,7 @@ static int replicator_probe(struct device *dev, struct resource *res) dev_warn_once(dev, "Uses OBSOLETE CoreSight replicator binding\n"); - desc.name = coresight_alloc_device_name(&replicator_devs, dev); + desc.name = coresight_alloc_device_name("replicator", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c index e68529bf89c9..aca6cec7885a 100644 --- a/drivers/hwtracing/coresight/coresight-stm.c +++ b/drivers/hwtracing/coresight/coresight-stm.c @@ -110,8 +110,6 @@ struct channel_space { unsigned long *guaranteed; }; -DEFINE_CORESIGHT_DEVLIST(stm_devs, "stm"); - /** * struct stm_drvdata - specifics associated to an STM component * @base: memory mapped base address for this component. 
@@ -834,7 +832,7 @@ static int __stm_probe(struct device *dev, struct resource *res) struct resource ch_res; struct coresight_desc desc = { 0 }; - desc.name = coresight_alloc_device_name(&stm_devs, dev); + desc.name = coresight_alloc_device_name("stm", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c index 36599c431be6..58b469ee73b4 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-core.c +++ b/drivers/hwtracing/coresight/coresight-tmc-core.c @@ -32,10 +32,6 @@ #include "coresight-priv.h" #include "coresight-tmc.h" -DEFINE_CORESIGHT_DEVLIST(etb_devs, "tmc_etb"); -DEFINE_CORESIGHT_DEVLIST(etf_devs, "tmc_etf"); -DEFINE_CORESIGHT_DEVLIST(etr_devs, "tmc_etr"); - int tmc_wait_for_tmcready(struct tmc_drvdata *drvdata) { struct coresight_device *csdev = drvdata->csdev; @@ -777,7 +773,7 @@ static int __tmc_probe(struct device *dev, struct resource *res) struct coresight_platform_data *pdata = NULL; struct tmc_drvdata *drvdata; struct coresight_desc desc = { 0 }; - struct coresight_dev_list *dev_list = NULL; + const char *dev_list = NULL; drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL); if (!drvdata) @@ -827,7 +823,7 @@ static int __tmc_probe(struct device *dev, struct resource *res) desc.type = CORESIGHT_DEV_TYPE_SINK; desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_BUFFER; desc.ops = &tmc_etb_cs_ops; - dev_list = &etb_devs; + dev_list = "tmc_etb"; break; case TMC_CONFIG_TYPE_ETR: desc.groups = coresight_etr_groups; @@ -839,7 +835,7 @@ static int __tmc_probe(struct device *dev, struct resource *res) goto out; idr_init(&drvdata->idr); mutex_init(&drvdata->idr_mutex); - dev_list = &etr_devs; + dev_list = "tmc_etr"; break; case TMC_CONFIG_TYPE_ETF: desc.groups = coresight_etf_groups; @@ -847,7 +843,7 @@ static int __tmc_probe(struct device *dev, struct resource *res) desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_BUFFER; desc.subtype.link_subtype 
= CORESIGHT_DEV_SUBTYPE_LINK_FIFO; desc.ops = &tmc_etf_cs_ops; - dev_list = &etf_devs; + dev_list = "tmc_etf"; break; default: pr_err("%s: Unsupported TMC config\n", desc.name); diff --git a/drivers/hwtracing/coresight/coresight-tnoc.c b/drivers/hwtracing/coresight/coresight-tnoc.c index 1128612e70a7..96a25877b824 100644 --- a/drivers/hwtracing/coresight/coresight-tnoc.c +++ b/drivers/hwtracing/coresight/coresight-tnoc.c @@ -47,8 +47,6 @@ struct trace_noc_drvdata { int atid; }; -DEFINE_CORESIGHT_DEVLIST(trace_noc_devs, "traceNoc"); - static void trace_noc_enable_hw(struct trace_noc_drvdata *drvdata) { u32 val; @@ -191,7 +189,7 @@ static int _tnoc_probe(struct device *dev, struct resource *res) struct coresight_desc desc = { 0 }; int ret; - desc.name = coresight_alloc_device_name(&trace_noc_devs, dev); + desc.name = coresight_alloc_device_name("traceNoc", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-tpda.c b/drivers/hwtracing/coresight/coresight-tpda.c index 7055f8f13427..89c8f71f0aff 100644 --- a/drivers/hwtracing/coresight/coresight-tpda.c +++ b/drivers/hwtracing/coresight/coresight-tpda.c @@ -20,8 +20,6 @@ #include "coresight-trace-id.h" #include "coresight-tpdm.h" -DEFINE_CORESIGHT_DEVLIST(tpda_devs, "tpda"); - static void tpda_clear_element_size(struct coresight_device *csdev) { struct tpda_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); @@ -585,7 +583,7 @@ static int tpda_probe(struct amba_device *adev, const struct amba_id *id) if (ret) return ret; - desc.name = coresight_alloc_device_name(&tpda_devs, dev); + desc.name = coresight_alloc_device_name("tpda", dev); if (!desc.name) return -ENOMEM; desc.type = CORESIGHT_DEV_TYPE_LINK; diff --git a/drivers/hwtracing/coresight/coresight-tpdm.c b/drivers/hwtracing/coresight/coresight-tpdm.c index 06e0a905a67d..da77bdaad0a4 100644 --- a/drivers/hwtracing/coresight/coresight-tpdm.c +++ b/drivers/hwtracing/coresight/coresight-tpdm.c @@ -19,8 +19,6 @@ #include 
"coresight-priv.h" #include "coresight-tpdm.h" -DEFINE_CORESIGHT_DEVLIST(tpdm_devs, "tpdm"); - static bool tpdm_has_dsb_dataset(struct tpdm_drvdata *drvdata) { return (drvdata->datasets & TPDM_PIDR0_DS_DSB); @@ -1416,7 +1414,7 @@ static int tpdm_probe(struct device *dev, struct resource *res) } /* Set up coresight component description */ - desc.name = coresight_alloc_device_name(&tpdm_devs, dev); + desc.name = coresight_alloc_device_name("tpdm", dev); if (!desc.name) return -ENOMEM; desc.type = CORESIGHT_DEV_TYPE_SOURCE; diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c index aaa44bc521c3..b8560b140e0f 100644 --- a/drivers/hwtracing/coresight/coresight-tpiu.c +++ b/drivers/hwtracing/coresight/coresight-tpiu.c @@ -49,8 +49,6 @@ #define FFCR_FON_MAN BIT(6) #define FFCR_STOP_FI BIT(12) -DEFINE_CORESIGHT_DEVLIST(tpiu_devs, "tpiu"); - /* * @base: memory mapped base address for this component. * @atclk: optional clock for the core parts of the TPIU. 
@@ -134,7 +132,7 @@ static int __tpiu_probe(struct device *dev, struct resource *res) struct coresight_desc desc = { 0 }; int ret; - desc.name = coresight_alloc_device_name(&tpiu_devs, dev); + desc.name = coresight_alloc_device_name("tpiu", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/ultrasoc-smb.c b/drivers/hwtracing/coresight/ultrasoc-smb.c index 8f7922a5e534..5776f63468fa 100644 --- a/drivers/hwtracing/coresight/ultrasoc-smb.c +++ b/drivers/hwtracing/coresight/ultrasoc-smb.c @@ -17,8 +17,6 @@ #include "coresight-priv.h" #include "ultrasoc-smb.h" -DEFINE_CORESIGHT_DEVLIST(sink_devs, "ultra_smb"); - #define ULTRASOC_SMB_DSM_UUID "82ae1283-7f6a-4cbe-aa06-53e8fb24db18" static bool smb_buffer_not_empty(struct smb_drv_data *drvdata) @@ -478,7 +476,7 @@ static int smb_register_sink(struct platform_device *pdev, desc.pdata = pdata; desc.dev = &pdev->dev; desc.groups = smb_sink_groups; - desc.name = coresight_alloc_device_name(&sink_devs, &pdev->dev); + desc.name = coresight_alloc_device_name("ultra_smb", &pdev->dev); if (!desc.name) { dev_err(&pdev->dev, "Failed to alloc coresight device name"); return -ENOMEM; diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2b48be97fcd0..2131febebee9 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -306,24 +306,19 @@ struct coresight_device { * coresight_dev_list - Mapping for devices to "name" index for device * names. * + * @node: Node on the global device index list. * @nr_idx: Number of entries already allocated. * @pfx: Prefix pattern for device name. * @fwnode_list: Array of fwnode_handles associated with each allocated * index, upto nr_idx entries. 
*/ struct coresight_dev_list { + struct list_head node; int nr_idx; - const char *pfx; + char *pfx; struct fwnode_handle **fwnode_list; }; -#define DEFINE_CORESIGHT_DEVLIST(var, dev_pfx) \ -static struct coresight_dev_list (var) = { \ - .pfx = dev_pfx, \ - .nr_idx = 0, \ - .fwnode_list = NULL, \ -} - #define to_coresight_device(d) container_of(d, struct coresight_device, dev) /** @@ -663,8 +658,7 @@ void coresight_clear_self_claim_tag(struct csdev_access *csa); void coresight_clear_self_claim_tag_unlocked(struct csdev_access *csa); void coresight_disclaim_device(struct coresight_device *csdev); void coresight_disclaim_device_unlocked(struct coresight_device *csdev); -char *coresight_alloc_device_name(struct coresight_dev_list *devs, - struct device *dev); +char *coresight_alloc_device_name(const char *prefix, struct device *dev); bool coresight_loses_context_with_cpu(struct device *dev); -- cgit v1.2.3 From e736a223ab150689b639a60c70a9490d884971ad Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Tue, 17 Feb 2026 11:23:03 +0000 Subject: RDMA/efa: Expose new extended max inline buff size Add new extended max inline query and report the new value to userspace. 
Reviewed-by: Firas Jahjah Reviewed-by: Michael Margolin Signed-off-by: Yonatan Nachum Link: https://patch.msgid.link/20260217112304.36849-3-ynachum@amazon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 15 ++++++++++++++- drivers/infiniband/hw/efa/efa_com_cmd.c | 15 +++++++++++++++ drivers/infiniband/hw/efa/efa_com_cmd.h | 3 ++- drivers/infiniband/hw/efa/efa_verbs.c | 3 ++- include/uapi/rdma/efa-abi.h | 5 +++-- 5 files changed, 36 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index 5bbc765b6e3f..ad34ea5da6b0 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -42,6 +42,7 @@ enum efa_admin_aq_feature_id { EFA_ADMIN_HW_HINTS = 5, EFA_ADMIN_HOST_INFO = 6, EFA_ADMIN_EVENT_QUEUE_ATTR = 7, + EFA_ADMIN_QUEUE_ATTR_2 = 9, }; /* QP transport type */ @@ -751,7 +752,12 @@ struct efa_admin_feature_queue_attr_desc_1 { /* Maximum number of WQEs per Send Queue */ u32 max_sq_depth; - /* Maximum size of data that can be sent inline in a Send WQE */ + /* + * Maximum size of data that can be sent inline in a Send WQE + * (deprecated by + * efa_admin_feature_queue_attr_desc_2::inline_buf_size_ex on + * supporting devices) + */ u32 inline_buf_size; /* Maximum number of buffer descriptors per Recv Queue */ @@ -805,6 +811,11 @@ struct efa_admin_feature_queue_attr_desc_1 { u16 max_tx_batch; }; +struct efa_admin_feature_queue_attr_desc_2 { + /* Maximum size of data that can be sent inline in a Send WQE */ + u16 inline_buf_size_ex; +}; + struct efa_admin_event_queue_attr_desc { /* The maximum number of event queues supported */ u32 max_eq; @@ -874,6 +885,8 @@ struct efa_admin_get_feature_resp { struct efa_admin_feature_queue_attr_desc_1 queue_attr_1; + struct efa_admin_feature_queue_attr_desc_2 queue_attr_2; + struct efa_admin_event_queue_attr_desc event_queue_attr; 
struct efa_admin_hw_hints hw_hints; diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 592c420e4473..63c7f07806a8 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -505,6 +505,21 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, result->max_tx_batch = resp.u.queue_attr_1.max_tx_batch; result->min_sq_depth = resp.u.queue_attr_1.min_sq_depth; + if (efa_com_check_supported_feature_id(edev, EFA_ADMIN_QUEUE_ATTR_2)) { + err = efa_com_get_feature(edev, &resp, + EFA_ADMIN_QUEUE_ATTR_2); + if (err) { + ibdev_err_ratelimited( + edev->efa_dev, + "Failed to get queue attributes2 %d\n", err); + return err; + } + + result->inline_buf_size_ex = resp.u.queue_attr_2.inline_buf_size_ex; + } else { + result->inline_buf_size_ex = result->inline_buf_size; + } + err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR); if (err) { ibdev_err_ratelimited(edev->efa_dev, diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 3ac2686abba1..ef15b3c38429 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2026 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_COM_CMD_H_ @@ -127,6 +127,7 @@ struct efa_com_get_device_attr_result { u32 max_cq; u32 max_cq_depth; /* cqes */ u32 inline_buf_size; + u32 inline_buf_size_ex; u32 max_mr; u32 max_pd; u32 max_ah; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index b5b93b42e6c4..6eb8cf8ecf80 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. 
All rights reserved. + * Copyright 2018-2026 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include @@ -1988,6 +1988,7 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH; resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq; resp.inline_buf_size = dev->dev_attr.inline_buf_size; + resp.inline_buf_size_ex = dev->dev_attr.inline_buf_size_ex; resp.max_llq_size = dev->dev_attr.max_llq_size; resp.max_tx_batch = dev->dev_attr.max_tx_batch; resp.min_sq_wr = dev->dev_attr.min_sq_depth; diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 98b71b9979f8..13225b038124 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2026 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_ABI_USER_H @@ -44,7 +44,8 @@ struct efa_ibv_alloc_ucontext_resp { __u32 max_llq_size; /* bytes */ __u16 max_tx_batch; /* units of 64 bytes */ __u16 min_sq_wr; - __u8 reserved_a0[4]; + __u16 inline_buf_size_ex; + __u8 reserved_b0[2]; }; struct efa_ibv_alloc_pd_resp { -- cgit v1.2.3 From 6094ea64c69520ed1e770e7c79c43412de202bfa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 13 Feb 2026 12:57:37 +0200 Subject: RDMA: Move DMA block iterator logic into dedicated files The DMA iterator logic was mixed into verbs and umem-specific code, forcing all users to include rdma/ib_umem.h. Move the block iterator logic into iter.c and rdma/iter.h so that rdma/ib_umem.h and rdma/ib_verbs.h can be separated in a follow-up patch. 
Link: https://patch.msgid.link/20260213-refactor-umem-v1-1-f3be85847922@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/iter.c | 43 ++++++++++++++ drivers/infiniband/core/verbs.c | 38 ------------ drivers/infiniband/hw/bnxt_re/qplib_res.c | 2 +- drivers/infiniband/hw/cxgb4/mem.c | 2 +- drivers/infiniband/hw/efa/efa_verbs.c | 2 +- drivers/infiniband/hw/erdma/erdma_verbs.c | 2 +- drivers/infiniband/hw/hns/hns_roce_alloc.c | 2 +- drivers/infiniband/hw/ionic/ionic_ibdev.h | 2 +- drivers/infiniband/hw/irdma/main.h | 2 +- drivers/infiniband/hw/mana/mana_ib.h | 2 +- drivers/infiniband/hw/mlx4/mr.c | 1 + drivers/infiniband/hw/mlx5/mem.c | 1 + drivers/infiniband/hw/mlx5/umr.c | 1 + drivers/infiniband/hw/mthca/mthca_provider.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- drivers/infiniband/hw/qedr/verbs.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma.h | 2 +- include/rdma/ib_umem.h | 32 ---------- include/rdma/ib_verbs.h | 48 --------------- include/rdma/iter.h | 88 ++++++++++++++++++++++++++++ 21 files changed, 147 insertions(+), 131 deletions(-) create mode 100644 drivers/infiniband/core/iter.c create mode 100644 include/rdma/iter.h (limited to 'include') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index a2a7a9d2e0d3..deffb03c4574 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ nldev.o restrack.o counters.o ib_core_uverbs.o \ - trace.o lag.o + trace.o lag.o iter.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git a/drivers/infiniband/core/iter.c b/drivers/infiniband/core/iter.c new file mode 100644 index 000000000000..8e543d100657 --- /dev/null +++ b/drivers/infiniband/core/iter.c @@ -0,0 +1,43 @@ +// 
SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. */ + +#include +#include + +void __rdma_block_iter_start(struct ib_block_iter *biter, + struct scatterlist *sglist, unsigned int nents, + unsigned long pgsz) +{ + memset(biter, 0, sizeof(struct ib_block_iter)); + biter->__sg = sglist; + biter->__sg_nents = nents; + + /* Driver provides best block size to use */ + biter->__pg_bit = __fls(pgsz); +} +EXPORT_SYMBOL(__rdma_block_iter_start); + +bool __rdma_block_iter_next(struct ib_block_iter *biter) +{ + unsigned int block_offset; + unsigned int delta; + + if (!biter->__sg_nents || !biter->__sg) + return false; + + biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; + block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); + delta = BIT_ULL(biter->__pg_bit) - block_offset; + + while (biter->__sg_nents && biter->__sg && + sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) { + delta -= sg_dma_len(biter->__sg) - biter->__sg_advance; + biter->__sg_advance = 0; + biter->__sg = sg_next(biter->__sg); + biter->__sg_nents--; + } + biter->__sg_advance += delta; + + return true; +} +EXPORT_SYMBOL(__rdma_block_iter_next); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 575b4a4b200b..dc2c46f3bf64 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -3154,44 +3154,6 @@ int rdma_init_netdev(struct ib_device *device, u32 port_num, } EXPORT_SYMBOL(rdma_init_netdev); -void __rdma_block_iter_start(struct ib_block_iter *biter, - struct scatterlist *sglist, unsigned int nents, - unsigned long pgsz) -{ - memset(biter, 0, sizeof(struct ib_block_iter)); - biter->__sg = sglist; - biter->__sg_nents = nents; - - /* Driver provides best block size to use */ - biter->__pg_bit = __fls(pgsz); -} -EXPORT_SYMBOL(__rdma_block_iter_start); - -bool __rdma_block_iter_next(struct ib_block_iter *biter) -{ - unsigned int block_offset; - unsigned int 
delta; - - if (!biter->__sg_nents || !biter->__sg) - return false; - - biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; - block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); - delta = BIT_ULL(biter->__pg_bit) - block_offset; - - while (biter->__sg_nents && biter->__sg && - sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) { - delta -= sg_dma_len(biter->__sg) - biter->__sg_advance; - biter->__sg_advance = 0; - biter->__sg = sg_next(biter->__sg); - biter->__sg_nents--; - } - biter->__sg_advance += delta; - - return true; -} -EXPORT_SYMBOL(__rdma_block_iter_next); - /** * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct * for the drivers. diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 341bae3d8a1d..41ad8c2018fd 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include "roce_hsi.h" #include "qplib_res.h" diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index b8d49abde099..9fde78b74690 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -32,9 +32,9 @@ #include #include -#include #include #include +#include #include "iw_cxgb4.h" diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index bb59c02b807c..1ef9da94b98f 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -9,9 +9,9 @@ #include #include -#include #include #include +#include #include #define UVERBS_MODULE_NAME efa_ib #include diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 9f74aadc3047..04136a0281aa 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include 
#include #include "erdma.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 8e802f118bc9..142c86f462fa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -32,7 +32,7 @@ */ #include -#include +#include #include "hns_roce_device.h" void hns_roce_buf_free(struct hns_roce_dev *hr_dev, struct hns_roce_buf *buf) diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.h b/drivers/infiniband/hw/ionic/ionic_ibdev.h index 82fda1e3cdb6..63828240d659 100644 --- a/drivers/infiniband/hw/ionic/ionic_ibdev.h +++ b/drivers/infiniband/hw/ionic/ionic_ibdev.h @@ -4,9 +4,9 @@ #ifndef _IONIC_IBDEV_H_ #define _IONIC_IBDEV_H_ -#include #include #include +#include #include #include diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index d320d1a228b3..3d49bd57bae7 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -37,8 +37,8 @@ #include #include #include -#include #include +#include #include #include "osdep.h" #include "defs.h" diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index e447acfd2071..a7c8c0fd7019 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 77a72d2b0dd2..650b4a9121ff 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -33,6 +33,7 @@ #include #include +#include #include "mlx4_ib.h" diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index af321f6ef7f5..75d5b5672b5c 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -31,6 +31,7 @@ */ #include +#include #include "mlx5_ib.h" /* diff --git a/drivers/infiniband/hw/mlx5/umr.c 
b/drivers/infiniband/hw/mlx5/umr.c index 4e562e0dd9e1..29488fba21a0 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -2,6 +2,7 @@ /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */ #include +#include #include "mlx5_ib.h" #include "umr.h" #include "wr.h" diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index ef0635064fba..ee1d3583bbb9 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -35,8 +35,8 @@ */ #include -#include #include +#include #include #include diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index e89be2fbd5eb..c73d4bbee71f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -45,9 +45,9 @@ #include #include #include -#include #include #include +#include #include #include "ocrdma.h" diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 33b4a0e6d3a8..2fa9e07710d3 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -39,9 +39,9 @@ #include #include #include -#include #include #include +#include #include #include diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h index 763ddc6f25d1..23e547d4b3a7 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h @@ -53,8 +53,8 @@ #include #include #include -#include #include +#include #include #include "pvrdma_ring.h" diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 09b7f7d4685e..db92d4623647 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -75,38 +75,6 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem) { return ib_umem_num_dma_blocks(umem, PAGE_SIZE); } - -static inline void __rdma_umem_block_iter_start(struct ib_block_iter 
*biter, - struct ib_umem *umem, - unsigned long pgsz) -{ - __rdma_block_iter_start(biter, umem->sgt_append.sgt.sgl, - umem->sgt_append.sgt.nents, pgsz); - biter->__sg_advance = ib_umem_offset(umem) & ~(pgsz - 1); - biter->__sg_numblocks = ib_umem_num_dma_blocks(umem, pgsz); -} - -static inline bool __rdma_umem_block_iter_next(struct ib_block_iter *biter) -{ - return __rdma_block_iter_next(biter) && biter->__sg_numblocks--; -} - -/** - * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem - * @umem: umem to iterate over - * @biter: block iterator variable - * @pgsz: Page size to split the list into - * - * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The - * returned DMA blocks will be aligned to pgsz and span the range: - * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz) - * - * Performs exactly ib_umem_num_dma_blocks() iterations. - */ -#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \ - for (__rdma_umem_block_iter_start(biter, umem, pgsz); \ - __rdma_umem_block_iter_next(biter);) - #ifdef CONFIG_INFINIBAND_USER_MEM struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3f3827e1c711..7bdd77ed7e20 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2959,22 +2959,6 @@ struct ib_client { u8 no_kverbs_req:1; }; -/* - * IB block DMA iterator - * - * Iterates the DMA-mapped SGL in contiguous memory blocks aligned - * to a HW supported page size. 
- */ -struct ib_block_iter { - /* internal states */ - struct scatterlist *__sg; /* sg holding the current aligned block */ - dma_addr_t __dma_addr; /* unaligned DMA address of this block */ - size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */ - unsigned int __sg_nents; /* number of SG entries */ - unsigned int __sg_advance; /* number of bytes to advance in sg in next step */ - unsigned int __pg_bit; /* alignment of current block */ -}; - struct ib_device *_ib_alloc_device(size_t size, struct net *net); #define ib_alloc_device(drv_struct, member) \ container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ @@ -3003,38 +2987,6 @@ void ib_unregister_device_queued(struct ib_device *ib_dev); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); -void __rdma_block_iter_start(struct ib_block_iter *biter, - struct scatterlist *sglist, - unsigned int nents, - unsigned long pgsz); -bool __rdma_block_iter_next(struct ib_block_iter *biter); - -/** - * rdma_block_iter_dma_address - get the aligned dma address of the current - * block held by the block iterator. - * @biter: block iterator holding the memory block - */ -static inline dma_addr_t -rdma_block_iter_dma_address(struct ib_block_iter *biter) -{ - return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1); -} - -/** - * rdma_for_each_block - iterate over contiguous memory blocks of the sg list - * @sglist: sglist to iterate over - * @biter: block iterator holding the memory block - * @nents: maximum number of sg entries to iterate over - * @pgsz: best HW supported page size to use - * - * Callers may use rdma_block_iter_dma_address() to get each - * blocks aligned DMA address. 
- */ -#define rdma_for_each_block(sglist, biter, nents, pgsz) \ - for (__rdma_block_iter_start(biter, sglist, nents, \ - pgsz); \ - __rdma_block_iter_next(biter);) - /** * ib_get_client_data - Get IB client context * @device:Device to get context for diff --git a/include/rdma/iter.h b/include/rdma/iter.h new file mode 100644 index 000000000000..19d64ef04ba9 --- /dev/null +++ b/include/rdma/iter.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. */ + +#ifndef _RDMA_ITER_H_ +#define _RDMA_ITER_H_ + +#include +#include + +/** + * IB block DMA iterator + * + * Iterates the DMA-mapped SGL in contiguous memory blocks aligned + * to a HW supported page size. + */ +struct ib_block_iter { + /* internal states */ + struct scatterlist *__sg; /* sg holding the current aligned block */ + dma_addr_t __dma_addr; /* unaligned DMA address of this block */ + size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */ + unsigned int __sg_nents; /* number of SG entries */ + unsigned int __sg_advance; /* number of bytes to advance in sg in next step */ + unsigned int __pg_bit; /* alignment of current block */ +}; + +void __rdma_block_iter_start(struct ib_block_iter *biter, + struct scatterlist *sglist, + unsigned int nents, + unsigned long pgsz); +bool __rdma_block_iter_next(struct ib_block_iter *biter); + +/** + * rdma_block_iter_dma_address - get the aligned dma address of the current + * block held by the block iterator. 
+ * @biter: block iterator holding the memory block + */ +static inline dma_addr_t +rdma_block_iter_dma_address(struct ib_block_iter *biter) +{ + return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1); +} + +/** + * rdma_for_each_block - iterate over contiguous memory blocks of the sg list + * @sglist: sglist to iterate over + * @biter: block iterator holding the memory block + * @nents: maximum number of sg entries to iterate over + * @pgsz: best HW supported page size to use + * + * Callers may use rdma_block_iter_dma_address() to get each + * blocks aligned DMA address. + */ +#define rdma_for_each_block(sglist, biter, nents, pgsz) \ + for (__rdma_block_iter_start(biter, sglist, nents, \ + pgsz); \ + __rdma_block_iter_next(biter);) + +static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter, + struct ib_umem *umem, + unsigned long pgsz) +{ + __rdma_block_iter_start(biter, umem->sgt_append.sgt.sgl, + umem->sgt_append.sgt.nents, pgsz); + biter->__sg_advance = ib_umem_offset(umem) & ~(pgsz - 1); + biter->__sg_numblocks = ib_umem_num_dma_blocks(umem, pgsz); +} + +static inline bool __rdma_umem_block_iter_next(struct ib_block_iter *biter) +{ + return __rdma_block_iter_next(biter) && biter->__sg_numblocks--; +} + +/** + * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem + * @umem: umem to iterate over + * @pgsz: Page size to split the list into + * + * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The + * returned DMA blocks will be aligned to pgsz and span the range: + * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz) + * + * Performs exactly ib_umem_num_dma_blocks() iterations. 
+ */ +#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \ + for (__rdma_umem_block_iter_start(biter, umem, pgsz); \ + __rdma_umem_block_iter_next(biter);) + +#endif /* _RDMA_ITER_H_ */ -- cgit v1.2.3 From 2ae3c4f6eae911946d0971f377fd00543d2a933e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 13 Feb 2026 12:57:38 +0200 Subject: RDMA/umem: Allow including ib_umem header from any location Including ib_umem.h currently triggers circular dependency errors. These issues can be resolved by removing the include of ib_verbs.h, which was only needed to resolve the struct ib_device pointer. >> depmod: ERROR: Cycle detected: ib_core -> ib_uverbs -> ib_core >> depmod: ERROR: Found 2 modules in dependency cycles! make[3]: *** [scripts/Makefile.modinst:132: depmod] Error 1 make[3]: Target '__modinst' not remade because of errors. make[2]: *** [Makefile:1960: modules_install] Error 2 make[1]: *** [Makefile:248: __sub-make] Error 2 make[1]: Target 'modules_install' not remade because of errors. make: *** [Makefile:248: __sub-make] Error 2 make: Target 'modules_install' not remade because of errors. Link: https://patch.msgid.link/20260213-refactor-umem-v1-2-f3be85847922@nvidia.com Signed-off-by: Leon Romanovsky --- include/rdma/ib_umem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index db92d4623647..d0772a1ed802 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -10,8 +10,8 @@ #include #include #include -#include +struct ib_device; struct ib_ucontext; struct ib_umem_odp; struct dma_buf_attach_ops; -- cgit v1.2.3 From e3104fe9217b08c12df8041c50d11df1159ef330 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 13 Feb 2026 12:57:39 +0200 Subject: RDMA/umem: Remove unnecessary includes and defines from ib_umem header The ib_umem header no longer requires the removed includes or forward declarations, so drop them to reduce clutter. 
Link: https://patch.msgid.link/20260213-refactor-umem-v1-3-f3be85847922@nvidia.com Signed-off-by: Leon Romanovsky --- include/rdma/ib_umem.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index d0772a1ed802..1cc1d4077353 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -7,13 +7,9 @@ #ifndef IB_UMEM_H #define IB_UMEM_H -#include #include -#include struct ib_device; -struct ib_ucontext; -struct ib_umem_odp; struct dma_buf_attach_ops; struct ib_umem { -- cgit v1.2.3 From 25c741048891c4d3fc627cd5220e2cae4bab42a1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 13 Feb 2026 12:57:41 +0200 Subject: RDMA/core: Manage CQ umem in core code In the current implementation, CQ umem is handled both by ib_core and the driver. ib_core sometimes creates and destroys it, while the driver also destroys it. Store the umem in struct ib_cq and ensure that only ib_core manages its lifetime, relying solely on its internal reference counter. 
Link: https://patch.msgid.link/20260213-refactor-umem-v1-5-f3be85847922@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 1 + drivers/infiniband/core/uverbs_std_types_cq.c | 7 ++++++- drivers/infiniband/core/verbs.c | 2 ++ drivers/infiniband/hw/efa/efa_verbs.c | 24 +++++++++++------------- include/rdma/ib_verbs.h | 1 + 6 files changed, 22 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index cff4fcca2c34..4eef7b76fe46 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -283,7 +283,7 @@ EXPORT_SYMBOL(ib_umem_get); */ void ib_umem_release(struct ib_umem *umem) { - if (!umem) + if (IS_ERR_OR_NULL(umem)) return; if (umem->is_dmabuf) return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem)); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 758ed4ae5f7a..87f327fc1f4e 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1085,6 +1085,7 @@ static int create_cq(struct uverbs_attr_bundle *attrs, return uverbs_response(attrs, &resp, sizeof(resp)); err_free: + ib_umem_release(cq->umem); rdma_restrack_put(&cq->res); kfree(cq); err_file: diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index fab5d914029d..05809f9ff0f6 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -186,6 +186,11 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; + /* + * If UMEM is not provided here, legacy drivers will set it during + * CQ creation based on their internal udata. 
+ */ + cq->umem = umem; atomic_set(&cq->usecnt, 0); rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); @@ -206,7 +211,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( return ret; err_free: - ib_umem_release(umem); + ib_umem_release(cq->umem); rdma_restrack_put(&cq->res); kfree(cq); err_event_file: diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index dc2c46f3bf64..29694145ce5f 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -2249,6 +2250,7 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) if (ret) return ret; + ib_umem_release(cq->umem); rdma_restrack_del(&cq->res); kfree(cq); return ret; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 1ef9da94b98f..7180d31218c5 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1083,15 +1083,14 @@ int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); efa_destroy_cq_idx(dev, cq->cq_idx); - efa_cq_user_mmap_entries_remove(cq); + if (cq->cpu_addr) + efa_cq_user_mmap_entries_remove(cq); if (cq->eq) { xa_erase(&dev->cqs_xa, cq->cq_idx); synchronize_irq(cq->eq->irq.irqn); } - if (cq->umem) - ib_umem_release(cq->umem); - else + if (cq->cpu_addr) efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE); return 0; } @@ -1212,22 +1211,20 @@ int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->ucontext = ucontext; cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs); - if (umem) { - if (umem->length < cq->size) { + if (ibcq->umem) { + if (ibcq->umem->length < cq->size) { ibdev_dbg(&dev->ibdev, "External memory too small\n"); err = -EINVAL; goto err_out; } - if (!ib_umem_is_contiguous(umem)) { + if (!ib_umem_is_contiguous(ibcq->umem)) { ibdev_dbg(&dev->ibdev, 
"Non contiguous CQ unsupported\n"); err = -EINVAL; goto err_out; } - cq->cpu_addr = NULL; - cq->dma_addr = ib_umem_start_dma_addr(umem); - cq->umem = umem; + cq->dma_addr = ib_umem_start_dma_addr(ibcq->umem); } else { cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, DMA_FROM_DEVICE); @@ -1259,7 +1256,7 @@ int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->ibcq.cqe = result.actual_depth; WARN_ON_ONCE(entries != result.actual_depth); - if (!umem) + if (cq->cpu_addr) err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid); if (err) { @@ -1296,11 +1293,12 @@ err_xa_erase: if (cq->eq) xa_erase(&dev->cqs_xa, cq->cq_idx); err_remove_mmap: - efa_cq_user_mmap_entries_remove(cq); + if (cq->cpu_addr) + efa_cq_user_mmap_entries_remove(cq); err_destroy_cq: efa_destroy_cq_idx(dev, cq->cq_idx); err_free_mapped: - if (!umem) + if (cq->cpu_addr) efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE); err_out: diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7bdd77ed7e20..8531eed7b394 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1650,6 +1650,7 @@ struct ib_cq { u8 interrupt:1; u8 shared:1; unsigned int comp_vector; + struct ib_umem *umem; /* * Implementation details of the RDMA core, don't use in drivers: -- cgit v1.2.3 From 584ec74748e6fea9042dbd4fd516b025fbe38372 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 13 Feb 2026 12:57:43 +0200 Subject: RDMA/core: Prepare create CQ path for API unification Ensure that .create_cq_umem() and .create_cq() follow the same API contract, allowing drivers to be gradually migrated to the umem-aware CQ management flow. 
Link: https://patch.msgid.link/20260213-refactor-umem-v1-7-f3be85847922@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 5 ++++- drivers/infiniband/core/uverbs_std_types_cq.c | 16 +++++++++++----- drivers/infiniband/core/verbs.c | 6 +++++- drivers/infiniband/hw/efa/efa.h | 6 ++---- drivers/infiniband/hw/efa/efa_main.c | 3 +-- drivers/infiniband/hw/efa/efa_verbs.c | 10 ++-------- include/rdma/ib_verbs.h | 3 +-- 8 files changed, 27 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 1b5f1ee0a557..c7b227e2e657 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2700,7 +2700,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); - SET_DEVICE_OP(dev_ops, create_cq_umem); + SET_DEVICE_OP(dev_ops, create_user_cq); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 87f327fc1f4e..7322ea4cfcbf 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1068,7 +1068,10 @@ static int create_cq(struct uverbs_attr_bundle *attrs, rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, NULL); - ret = ib_dev->ops.create_cq(cq, &attr, attrs); + if (ib_dev->ops.create_user_cq) + ret = ib_dev->ops.create_user_cq(cq, &attr, attrs); + else + ret = ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) goto err_free; rdma_restrack_add(&cq->res); diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index 05809f9ff0f6..b999d8d62694 100644 --- 
a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -78,7 +78,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( int buffer_fd; int ret; - if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_cq_umem) || !ib_dev->ops.destroy_cq) + if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_user_cq) || + !ib_dev->ops.destroy_cq) return -EOPNOTSUPP; ret = uverbs_copy_from(&attr.comp_vector, attrs, @@ -130,7 +131,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD) || uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) || - !ib_dev->ops.create_cq_umem) { + !ib_dev->ops.create_user_cq) { ret = -EINVAL; goto err_event_file; } @@ -155,7 +156,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( goto err_event_file; if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA) || - !ib_dev->ops.create_cq_umem) { + !ib_dev->ops.create_user_cq) { ret = -EINVAL; goto err_event_file; } @@ -196,11 +197,16 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, NULL); - ret = umem ? 
ib_dev->ops.create_cq_umem(cq, &attr, umem, attrs) : - ib_dev->ops.create_cq(cq, &attr, attrs); + if (ib_dev->ops.create_user_cq) + ret = ib_dev->ops.create_user_cq(cq, &attr, attrs); + else + ret = ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) goto err_free; + /* Check that driver didn't overrun existing umem */ + WARN_ON(umem && cq->umem != umem); + obj->uevent.uobject.object = cq; obj->uevent.uobject.user_handle = user_handle; rdma_restrack_add(&cq->res); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 29694145ce5f..22179954b880 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2204,7 +2204,6 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, return ERR_PTR(-ENOMEM); cq->device = device; - cq->uobject = NULL; cq->comp_handler = comp_handler; cq->event_handler = event_handler; cq->cq_context = cq_context; @@ -2219,6 +2218,11 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, kfree(cq); return ERR_PTR(ret); } + /* + * We are in kernel verbs flow and drivers are not allowed + * to set umem pointer, it needs to stay NULL. 
+ */ + WARN_ON_ONCE(cq->umem); rdma_restrack_add(&cq->res); return cq; diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 96f9c3bc98b2..00b19f2ba3da 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -161,10 +161,8 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); -int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct uverbs_attr_bundle *attrs); -int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_umem *umem, struct uverbs_attr_bundle *attrs); +int efa_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs); struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_dmah *dmah, diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index c1397086dc47..03c237c8c81e 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -371,8 +371,7 @@ static const struct ib_device_ops efa_dev_ops = { .alloc_hw_device_stats = efa_alloc_hw_device_stats, .alloc_pd = efa_alloc_pd, .alloc_ucontext = efa_alloc_ucontext, - .create_cq = efa_create_cq, - .create_cq_umem = efa_create_cq_umem, + .create_user_cq = efa_create_user_cq, .create_qp = efa_create_qp, .create_user_ah = efa_create_ah, .dealloc_pd = efa_dealloc_pd, diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 776ae5103706..9d683cb30cba 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1130,8 +1130,8 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, return 0; } -int efa_create_cq_umem(struct ib_cq *ibcq, const struct 
ib_cq_init_attr *attr, - struct ib_umem *umem, struct uverbs_attr_bundle *attrs) +int efa_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs) { struct ib_udata *udata = &attrs->driver_udata; struct efa_ucontext *ucontext = rdma_udata_to_drv_context( @@ -1306,12 +1306,6 @@ err_out: return err; } -int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct uverbs_attr_bundle *attrs) -{ - return efa_create_cq_umem(ibcq, attr, NULL, attrs); -} - static int umem_to_page_list(struct efa_dev *dev, struct ib_umem *umem, u64 *page_list, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8531eed7b394..1b77fd88d0fb 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2538,9 +2538,8 @@ struct ib_device_ops { int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata); int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); - int (*create_cq_umem)(struct ib_cq *cq, + int (*create_user_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, - struct ib_umem *umem, struct uverbs_attr_bundle *attrs); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); -- cgit v1.2.3 From 2e7af192697ef2a71c76fd57860b0fcd02754e14 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Fri, 12 Sep 2025 07:38:29 +0200 Subject: sched/deadline: Add reporting of runtime left & abs deadline to sched_getattr() for DEADLINE tasks The SCHED_DEADLINE scheduler allows reading the statically configured run-time, deadline, and period parameters through the sched_getattr() system call. However, there is no immediate way to access, from user space, the current parameters used within the scheduler: the instantaneous runtime left in the current cycle, as well as the current absolute deadline. 
The `flags' sched_getattr() parameter, so far mandated to contain zero, now supports the SCHED_GETATTR_FLAG_DL_DYNAMIC=1 flag, to request retrieval of the leftover runtime and absolute deadline, converted to a CLOCK_MONOTONIC reference, instead of the statically configured parameters. This feature is useful for adaptive SCHED_DEADLINE tasks that need to modify their behavior depending on whether or not there is enough runtime left in the current period, and/or what is the current absolute deadline. Notes: - before returning the instantaneous parameters, the runtime is updated; - the abs deadline is returned shifted from rq_clock() to ktime_get_ns(), in CLOCK_MONOTONIC reference; this causes multiple invocations from the same period to return values that may differ for a few ns (showing some small drift), albeit the deadline doesn't move, in rq_clock() reference; - the abs deadline value returned to user-space, as unsigned 64-bit value, can represent nearly 585 years since boot time; - setting flags=0 provides the old behavior (retrieve static parameters). See also the notes from discussion held at OSPM 2025 on the topic "Making user space aware of current deadline-scheduler parameters". 
Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Tested-by: Matteo Martelli Link: https://patch.msgid.link/20250912053937.31636-2-tommaso.cucinotta@santannapisa.it --- include/uapi/linux/sched.h | 3 +++ kernel/sched/deadline.c | 19 ++++++++++++++++--- kernel/sched/sched.h | 2 +- kernel/sched/syscalls.c | 16 +++++++++++----- 4 files changed, 31 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 359a14cc76a4..52b69ce89368 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -146,4 +146,7 @@ struct clone_args { SCHED_FLAG_KEEP_ALL | \ SCHED_FLAG_UTIL_CLAMP) +/* Only for sched_getattr() own flag param, if task is SCHED_DEADLINE */ +#define SCHED_GETATTR_FLAG_DL_DYNAMIC 0x01 + #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2de5727b94b4..9e253a825f39 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3617,13 +3617,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } -void __getparam_dl(struct task_struct *p, struct sched_attr *attr) +void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags) { struct sched_dl_entity *dl_se = &p->dl; + struct rq *rq = task_rq(p); + u64 adj_deadline; attr->sched_priority = p->rt_priority; - attr->sched_runtime = dl_se->dl_runtime; - attr->sched_deadline = dl_se->dl_deadline; + if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) { + guard(raw_spinlock_irq)(&rq->__lock); + update_rq_clock(rq); + if (task_current(rq, p)) + update_curr_dl(rq); + + attr->sched_runtime = dl_se->runtime; + adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns(); + attr->sched_deadline = adj_deadline; + } else { + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + } attr->sched_period = dl_se->dl_period; 
attr->sched_flags &= ~SCHED_DL_FLAGS; attr->sched_flags |= dl_se->flags; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8bf2f7d524cd..fa2237e89bee 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -356,7 +356,7 @@ extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); -extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 6f10db3646e7..a288ac0a633d 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -881,10 +881,10 @@ err_size: return -E2BIG; } -static void get_params(struct task_struct *p, struct sched_attr *attr) +static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags) { if (task_has_dl_policy(p)) { - __getparam_dl(p, attr); + __getparam_dl(p, attr, flags); } else if (task_has_rt_policy(p)) { attr->sched_priority = p->rt_priority; } else { @@ -950,7 +950,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, return -ESRCH; if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); + get_params(p, &attr, 0); return sched_setattr(p, &attr); } @@ -1035,7 +1035,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags)) + usize < SCHED_ATTR_SIZE_VER0)) return -EINVAL; scoped_guard (rcu) { @@ -1043,6 
+1043,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, if (!p) return -ESRCH; + if (flags) { + if (!task_has_dl_policy(p) || + flags != SCHED_GETATTR_FLAG_DL_DYNAMIC) + return -EINVAL; + } + retval = security_task_getscheduler(p); if (retval) return retval; @@ -1050,7 +1056,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); + get_params(p, &kattr, flags); kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK -- cgit v1.2.3 From 59509da0cb51dc48e4edc57d7d3ef1d424c58fc9 Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Wed, 4 Feb 2026 09:32:17 +0100 Subject: mtd: Move struct mtd_concat definition to header file To enable a more generic approach for concatenating MTD devices, struct mtd_concat should be accessible beyond the mtdconcat driver. Therefore, the definition is being moved to a header file. 
Signed-off-by: Amit Kumar Mahapatra Signed-off-by: Luca Ceresoli Signed-off-by: Miquel Raynal --- drivers/mtd/mtdconcat.c | 12 ------------ include/linux/mtd/concat.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 9eb5d919d9ba..241d15235d01 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -20,18 +20,6 @@ #include -/* - * Our storage structure: - * Subdev points to an array of pointers to struct mtd_info objects - * which is allocated along with this structure - * - */ -struct mtd_concat { - struct mtd_info mtd; - int num_subdev; - struct mtd_info **subdev; -}; - /* * how to calculate the size required for the above structure, * including the pointer array subdev points to: diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h index d6f653e07426..b42d9af87c4e 100644 --- a/include/linux/mtd/concat.h +++ b/include/linux/mtd/concat.h @@ -9,6 +9,18 @@ #define MTD_CONCAT_H +/* + * Our storage structure: + * Subdev points to an array of pointers to struct mtd_info objects + * which is allocated along with this structure + * + */ +struct mtd_concat { + struct mtd_info mtd; + int num_subdev; + struct mtd_info **subdev; +}; + struct mtd_info *mtd_concat_create( struct mtd_info *subdev[], /* subdevices to concatenate */ int num_devs, /* number of subdevices */ -- cgit v1.2.3 From 43db6366fc2de02050e66389f5628d3fdc9af10a Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Wed, 4 Feb 2026 09:32:18 +0100 Subject: mtd: Add driver for concatenating devices Introducing CONFIG_MTD_VIRT_CONCAT to separate the legacy flow from the new approach, where only the concatenated partition is registered as an MTD device, while the individual partitions that form it are not registered independently, as they are typically not required by the user. 
CONFIG_MTD_VIRT_CONCAT is a boolean configuration option that depends on CONFIG_MTD_PARTITIONED_MASTER. When enabled, it allows flash nodes to be exposed as individual MTD devices along with the other partitions. The solution focuses on fixed-partitions description only as it depends on device boundaries. It supports multiple sets of concatenated devices, each comprising two or more partitions. flash@0 { reg = <0>; partitions { compatible = "fixed-partitions"; part0@0 { part-concat-next = <&flash0_part1>; label = "part0_0"; reg = <0x0 0x800000>; }; flash0_part1: part1@800000 { label = "part0_1"; reg = <800000 0x800000>; }; part2@1000000 { part-concat-next = <&flash1_part0>; label = "part0_2"; reg = <0x800000 0x800000>; }; }; }; flash@1 { reg = <1>; partitions { compatible = "fixed-partitions"; flash1_part0: part1@0 { label = "part1_0"; reg = <0x0 0x800000>; }; part1@800000 { label = "part1_1"; reg = <0x800000 0x800000>; }; }; }; The partitions that gets created are flash@0 part0_0-part0_1-concat flash@1 part1_1 part0_2-part1_0-concat Suggested-by: Bernhard Frauendienst Suggested-by: Miquel Raynal Signed-off-by: Amit Kumar Mahapatra Signed-off-by: Luca Ceresoli Signed-off-by: Miquel Raynal --- drivers/mtd/Kconfig | 9 ++ drivers/mtd/Makefile | 1 + drivers/mtd/mtd_virt_concat.c | 363 ++++++++++++++++++++++++++++++++++++++++++ drivers/mtd/mtdcore.c | 21 +++ drivers/mtd/mtdpart.c | 6 + include/linux/mtd/concat.h | 51 +++++- 6 files changed, 450 insertions(+), 1 deletion(-) create mode 100644 drivers/mtd/mtd_virt_concat.c (limited to 'include') diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index 796a2eccbef0..0421c6208de7 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -206,6 +206,15 @@ config MTD_PARTITIONED_MASTER the parent of the partition device be the master device, rather than what lies behind the master. 
+config MTD_VIRT_CONCAT + bool "Virtual concatenated MTD devices" + depends on MTD_PARTITIONED_MASTER + help + The driver enables the creation of virtual MTD device by + concatenating multiple physical MTD devices into a single + entity. This allows for the creation of partitions larger than + the individual physical chips, extending across chip boundaries. + source "drivers/mtd/chips/Kconfig" source "drivers/mtd/maps/Kconfig" diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile index 593d0593a038..7b6dd53e8150 100644 --- a/drivers/mtd/Makefile +++ b/drivers/mtd/Makefile @@ -6,6 +6,7 @@ # Core functionality. obj-$(CONFIG_MTD) += mtd.o mtd-y := mtdcore.o mtdsuper.o mtdconcat.o mtdpart.o mtdchar.o +mtd-$(CONFIG_MTD_VIRT_CONCAT) += mtd_virt_concat.o obj-y += parsers/ diff --git a/drivers/mtd/mtd_virt_concat.c b/drivers/mtd/mtd_virt_concat.c new file mode 100644 index 000000000000..aea88d1c9bc5 --- /dev/null +++ b/drivers/mtd/mtd_virt_concat.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Virtual concat MTD device driver + * + * Copyright (C) 2018 Bernhard Frauendienst + * Author: Bernhard Frauendienst + */ + +#include +#include +#include "mtdcore.h" +#include +#include +#include +#include +#include + +#define CONCAT_PROP "part-concat-next" +#define CONCAT_POSTFIX "concat" +#define MIN_DEV_PER_CONCAT 1 + +static LIST_HEAD(concat_node_list); + +/** + * struct mtd_virt_concat_node - components of a concatenation + * @head: List handle + * @count: Number of nodes + * @nodes: Pointer to the nodes (partitions) to concatenate + * @concat: Concatenation container + */ +struct mtd_virt_concat_node { + struct list_head head; + unsigned int count; + struct device_node **nodes; + struct mtd_concat *concat; +}; + +/** + * mtd_is_part_concat - Check if the device is already part + * of a concatenated device + * @dev: pointer to 'device_node' + * + * Return: true if the device is already part of a concatenation, + * false otherwise. 
+ */ +static bool mtd_is_part_concat(struct device_node *dev) +{ + struct mtd_virt_concat_node *item; + int idx; + + list_for_each_entry(item, &concat_node_list, head) { + for (idx = 0; idx < item->count; idx++) { + if (item->nodes[idx] == dev) + return true; + } + } + return false; +} + +static void mtd_virt_concat_put_mtd_devices(struct mtd_concat *concat) +{ + int i; + + for (i = 0; i < concat->num_subdev; i++) + put_mtd_device(concat->subdev[i]); +} + +void mtd_virt_concat_destroy_joins(void) +{ + struct mtd_virt_concat_node *item, *tmp; + struct mtd_info *mtd; + + list_for_each_entry_safe(item, tmp, &concat_node_list, head) { + mtd = &item->concat->mtd; + if (item->concat) { + mtd_device_unregister(mtd); + kfree(mtd->name); + mtd_concat_destroy(mtd); + mtd_virt_concat_put_mtd_devices(item->concat); + } + } +} + +/** + * mtd_virt_concat_destroy - Destroy the concat that includes the mtd object + * @mtd: pointer to 'mtd_info' + * + * Return: 0 on success, -error otherwise. + */ +int mtd_virt_concat_destroy(struct mtd_info *mtd) +{ + struct mtd_info *child, *master = mtd_get_master(mtd); + struct mtd_virt_concat_node *item, *tmp; + struct mtd_concat *concat; + int idx, ret = 0; + bool is_mtd_found; + + list_for_each_entry_safe(item, tmp, &concat_node_list, head) { + is_mtd_found = false; + + /* Find the concat item that hold the mtd device */ + for (idx = 0; idx < item->count; idx++) { + if (item->nodes[idx] == mtd->dev.of_node) { + is_mtd_found = true; + break; + } + } + if (!is_mtd_found) + continue; + concat = item->concat; + + /* + * Since this concatenated device is being removed, retrieve + * all MTD devices that are part of it and register them + * individually. 
+ */ + for (idx = 0; idx < concat->num_subdev; idx++) { + child = concat->subdev[idx]; + if (child->dev.of_node != mtd->dev.of_node) { + ret = add_mtd_device(child); + if (ret) + goto out; + } + } + /* Destroy the concat */ + if (concat->mtd.name) { + del_mtd_device(&concat->mtd); + kfree(concat->mtd.name); + mtd_concat_destroy(&concat->mtd); + mtd_virt_concat_put_mtd_devices(item->concat); + } + + for (idx = 0; idx < item->count; idx++) + of_node_put(item->nodes[idx]); + + kfree(item->nodes); + kfree(item); + } + return 0; +out: + mutex_lock(&master->master.partitions_lock); + list_del(&child->part.node); + mutex_unlock(&master->master.partitions_lock); + kfree(mtd->name); + kfree(mtd); + + return ret; +} + +/** + * mtd_virt_concat_create_item - Create a concat item + * @parts: pointer to 'device_node' + * @count: number of mtd devices that make up + * the concatenated device. + * + * Return: 0 on success, -error otherwise. + */ +static int mtd_virt_concat_create_item(struct device_node *parts, + unsigned int count) +{ + struct mtd_virt_concat_node *item; + struct mtd_concat *concat; + int i; + + for (i = 0; i < (count - 1); i++) { + if (mtd_is_part_concat(of_parse_phandle(parts, CONCAT_PROP, i))) + return 0; + } + + item = kzalloc(sizeof(*item), GFP_KERNEL); + if (!item) + return -ENOMEM; + + item->count = count; + item->nodes = kcalloc(count, sizeof(*item->nodes), GFP_KERNEL); + if (!item->nodes) { + kfree(item); + return -ENOMEM; + } + + /* + * The partition in which "part-concat-next" property + * is defined is the first device in the list of concat + * devices. 
+ */ + item->nodes[0] = parts; + + for (i = 1; i < count; i++) + item->nodes[i] = of_parse_phandle(parts, CONCAT_PROP, (i - 1)); + + concat = kzalloc(sizeof(*concat), GFP_KERNEL); + if (!concat) { + kfree(item); + return -ENOMEM; + } + + concat->subdev = kcalloc(count, sizeof(*concat->subdev), GFP_KERNEL); + if (!concat->subdev) { + kfree(item); + kfree(concat); + return -ENOMEM; + } + item->concat = concat; + + list_add_tail(&item->head, &concat_node_list); + + return 0; +} + +void mtd_virt_concat_destroy_items(void) +{ + struct mtd_virt_concat_node *item, *temp; + int i; + + list_for_each_entry_safe(item, temp, &concat_node_list, head) { + for (i = 0; i < item->count; i++) + of_node_put(item->nodes[i]); + + kfree(item->nodes); + kfree(item); + } +} + +/** + * mtd_virt_concat_create_add - Add a mtd device to the concat list + * @mtd: pointer to 'mtd_info' + * + * Return: true on success, false otherwise. + */ +bool mtd_virt_concat_add(struct mtd_info *mtd) +{ + struct mtd_virt_concat_node *item; + struct mtd_concat *concat; + int idx; + + list_for_each_entry(item, &concat_node_list, head) { + concat = item->concat; + for (idx = 0; idx < item->count; idx++) { + if (item->nodes[idx] == mtd->dev.of_node) { + concat->subdev[concat->num_subdev++] = mtd; + return true; + } + } + } + return false; +} + +/** + * mtd_virt_concat_node_create - List all the concatenations found in DT + * + * Return: 0 on success, -error otherwise. 
+ */ +int mtd_virt_concat_node_create(void) +{ + struct device_node *parts = NULL; + int ret = 0, count = 0; + + /* List all the concatenations found in DT */ + do { + parts = of_find_node_with_property(parts, CONCAT_PROP); + if (!of_device_is_available(parts)) + continue; + + if (mtd_is_part_concat(parts)) + continue; + + count = of_count_phandle_with_args(parts, CONCAT_PROP, NULL); + if (count < MIN_DEV_PER_CONCAT) + continue; + + /* + * The partition in which "part-concat-next" property is defined + * is also part of the concat device, so increament count by 1. + */ + count++; + + ret = mtd_virt_concat_create_item(parts, count); + if (ret) { + of_node_put(parts); + goto destroy_items; + } + } while (parts); + + return ret; + +destroy_items: + mtd_virt_concat_destroy_items(); + + return ret; +} + +/** + * mtd_virt_concat_create_join - Create and register the concatenated + * MTD device. + * + * Return: 0 on success, -error otherwise. + */ +int mtd_virt_concat_create_join(void) +{ + struct mtd_virt_concat_node *item; + struct mtd_concat *concat; + struct mtd_info *mtd; + ssize_t name_sz; + int ret, idx; + char *name; + + list_for_each_entry(item, &concat_node_list, head) { + concat = item->concat; + /* + * Check if item->count != concat->num_subdev, it indicates + * that the MTD information for all devices included in the + * concatenation are not handy, concat MTD device can't be + * created hence switch to next concat device. 
+ */ + if (item->count != concat->num_subdev) { + continue; + } else { + /* Calculate the legth of the name of the virtual device */ + for (idx = 0, name_sz = 0; idx < concat->num_subdev; idx++) + name_sz += (strlen(concat->subdev[idx]->name) + 1); + name_sz += strlen(CONCAT_POSTFIX); + name = kmalloc(name_sz + 1, GFP_KERNEL); + if (!name) { + mtd_virt_concat_put_mtd_devices(concat); + return -ENOMEM; + } + + ret = 0; + for (idx = 0; idx < concat->num_subdev; idx++) { + ret += sprintf((name + ret), "%s-", + concat->subdev[idx]->name); + } + sprintf((name + ret), CONCAT_POSTFIX); + + if (concat->mtd.name) { + ret = memcmp(concat->mtd.name, name, name_sz); + if (ret == 0) + continue; + } + mtd = mtd_concat_create(concat->subdev, concat->num_subdev, name); + if (!mtd) { + kfree(name); + return -ENXIO; + } + concat->mtd = *mtd; + /* Arbitrary set the first device as parent */ + concat->mtd.dev.parent = concat->subdev[0]->dev.parent; + concat->mtd.dev = concat->subdev[0]->dev; + + /* Add the mtd device */ + ret = add_mtd_device(&concat->mtd); + if (ret) + goto destroy_concat; + } + } + + return 0; + +destroy_concat: + mtd_concat_destroy(mtd); + + return ret; +} diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 64808493b4f5..576537774628 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -34,6 +34,7 @@ #include #include +#include #include "mtdcore.h" @@ -1120,6 +1121,12 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, goto out; } + if (IS_REACHABLE(CONFIG_MTD_VIRT_CONCAT)) { + ret = mtd_virt_concat_node_create(); + if (ret < 0) + goto out; + } + /* Prefer parsed partitions over driver-provided fallback */ ret = parse_mtd_partitions(mtd, types, parser_data); if (ret == -EPROBE_DEFER) @@ -1137,6 +1144,11 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, if (ret) goto out; + if (IS_REACHABLE(CONFIG_MTD_VIRT_CONCAT)) { + ret = mtd_virt_concat_create_join(); + if (ret < 0) + goto 
out; + } /* * FIXME: some drivers unfortunately call this function more than once. * So we have to check if we've already assigned the reboot notifier. @@ -1186,6 +1198,11 @@ int mtd_device_unregister(struct mtd_info *master) nvmem_unregister(master->otp_user_nvmem); nvmem_unregister(master->otp_factory_nvmem); + if (IS_REACHABLE(CONFIG_MTD_VIRT_CONCAT)) { + err = mtd_virt_concat_destroy(master); + if (err) + return err; + } err = del_mtd_partitions(master); if (err) return err; @@ -2621,6 +2638,10 @@ err_reg: static void __exit cleanup_mtd(void) { + if (IS_REACHABLE(CONFIG_MTD_VIRT_CONCAT)) { + mtd_virt_concat_destroy_joins(); + mtd_virt_concat_destroy_items(); + } debugfs_remove_recursive(dfs_dir_mtd); cleanup_mtdchar(); if (proc_mtd) diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index e016cfbc7224..795a94e6b482 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "mtdcore.h" @@ -409,6 +410,11 @@ int add_mtd_partitions(struct mtd_info *parent, goto err_del_partitions; } + if (IS_REACHABLE(CONFIG_MTD_VIRT_CONCAT)) { + if (mtd_virt_concat_add(child)) + continue; + } + mutex_lock(&master->master.partitions_lock); list_add_tail(&child->part.node, &parent->partitions); mutex_unlock(&master->master.partitions_lock); diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h index b42d9af87c4e..2cd9d48958a8 100644 --- a/include/linux/mtd/concat.h +++ b/include/linux/mtd/concat.h @@ -28,5 +28,54 @@ struct mtd_info *mtd_concat_create( void mtd_concat_destroy(struct mtd_info *mtd); -#endif +/** + * mtd_virt_concat_node_create - Create a component for concatenation + * + * Returns a positive number representing the no. of devices found for + * concatenation, or a negative error code. + * + * List all the devices for concatenations found in DT and create a + * component for concatenation. 
+ */ +int mtd_virt_concat_node_create(void); + +/** + * mtd_virt_concat_add - add mtd_info object to the list of subdevices for concatenation + * @mtd: pointer to new MTD device info structure + * + * Returns true if the mtd_info object is added successfully else returns false. + * + * The mtd_info object is added to the list of subdevices for concatenation. + * It returns true if a match is found, and false if all subdevices have + * already been added or if the mtd_info object does not match any of the + * intended MTD devices. + */ +bool mtd_virt_concat_add(struct mtd_info *mtd); +/** + * mtd_virt_concat_create_join - Create and register the concatenated MTD device + * + * Returns 0 on succes, or a negative error code. + * + * Creates and registers the concatenated MTD device + */ +int mtd_virt_concat_create_join(void); + +/** + * mtd_virt_concat_destroy - Remove the concat that includes a specific mtd device + * as one of its components. + * @mtd: pointer to MTD device info structure. + * + * Returns 0 on succes, or a negative error code. + * + * If the mtd_info object is part of a concatenated device, all other MTD devices + * within that concat are registered individually. The concatenated device is then + * removed, along with its concatenation component. + * + */ +int mtd_virt_concat_destroy(struct mtd_info *mtd); + +void mtd_virt_concat_destroy_joins(void); +void mtd_virt_concat_destroy_items(void); + +#endif -- cgit v1.2.3 From 43479bb3703f17da6cdfaa2a7f4b93db9c6908bc Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 5 Feb 2026 19:49:15 +0100 Subject: mtd: spinand: Clean the flags section Mention that we are declaring the main SPI NAND flags with a comment. Align the values with tabs. 
Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 6a024cf1c53a..58abd306ebe3 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -477,8 +477,9 @@ struct spinand_ecc_info { const struct mtd_ooblayout_ops *ooblayout; }; -#define SPINAND_HAS_QE_BIT BIT(0) -#define SPINAND_HAS_CR_FEAT_BIT BIT(1) +/* SPI NAND flags */ +#define SPINAND_HAS_QE_BIT BIT(0) +#define SPINAND_HAS_CR_FEAT_BIT BIT(1) #define SPINAND_HAS_PROG_PLANE_SELECT_BIT BIT(2) #define SPINAND_HAS_READ_PLANE_SELECT_BIT BIT(3) #define SPINAND_NO_RAW_ACCESS BIT(4) -- cgit v1.2.3 From 7717fbb14028be5735acb911aeb7553b7c662418 Mon Sep 17 00:00:00 2001 From: Eric Woudstra Date: Tue, 24 Feb 2026 16:50:30 +0100 Subject: net: pppoe: avoid zero-length arrays in struct pppoe_hdr Jakub Kicinski reported following issue in upcoming patches: W=1 C=1 GCC build gives us: net/bridge/netfilter/nf_conntrack_bridge.c: note: in included file (through ../include/linux/if_pppox.h, ../include/uapi/linux/netfilter_bridge.h, ../include/linux/netfilter_bridge.h): include/uapi/linux/if_pppox.h: 153:29: warning: array of flexible structures sparse doesn't like that hdr has a zero-length array which overlaps proto. The kernel code doesn't currently need those arrays. PPPoE connection is functional after applying this patch. 
Reviewed-by: Nikolay Aleksandrov Reviewed-by: Kees Cook Signed-off-by: Eric Woudstra Link: https://patch.msgid.link/20260224155030.106918-1-ericwouds@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ppp/pppoe.c | 2 +- include/uapi/linux/if_pppox.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 4275b393a454..7900cc3212a5 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -885,7 +885,7 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, skb->protocol = cpu_to_be16(ETH_P_PPP_SES); ph = skb_put(skb, total_len + sizeof(struct pppoe_hdr)); - start = (char *)&ph->tag[0]; + start = (char *)ph + sizeof(*ph); error = memcpy_from_msg(start, m, total_len); if (error < 0) { diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h index 9abd80dcc46f..29b804aa7474 100644 --- a/include/uapi/linux/if_pppox.h +++ b/include/uapi/linux/if_pppox.h @@ -122,7 +122,9 @@ struct sockaddr_pppol2tpv3in6 { struct pppoe_tag { __be16 tag_type; __be16 tag_len; +#ifndef __KERNEL__ char tag_data[]; +#endif } __attribute__ ((packed)); /* Tag identifiers */ @@ -150,7 +152,9 @@ struct pppoe_hdr { __u8 code; __be16 sid; __be16 length; +#ifndef __KERNEL__ struct pppoe_tag tag[]; +#endif } __packed; /* Length of entire PPPoE + PPP header */ -- cgit v1.2.3 From 74455a5b4326add2499cb4a1f9706154b3a1eab4 Mon Sep 17 00:00:00 2001 From: Jiejian Wu Date: Tue, 24 Feb 2026 21:50:40 +0100 Subject: ipvs: make ip_vs_svc_table and ip_vs_svc_fwm_table per netns Current ipvs uses one global mutex "__ip_vs_mutex" to keep the global "ip_vs_svc_table" and "ip_vs_svc_fwm_table" safe. But when there are tens of thousands of services from different netns in the table, it takes a long time to look up the table, for example, using "ipvsadm -ln" from different netns simultaneously. 
We make "ip_vs_svc_table" and "ip_vs_svc_fwm_table" per netns, and we add "service_mutex" per netns to keep these two tables safe instead of the global "__ip_vs_mutex" in current version. To this end, looking up services from different netns simultaneously will not get stuck, shortening the time consumption in large-scale deployment. It can be reproduced using the simple scripts below. init.sh: #!/bin/bash for((i=1;i<=4;i++));do ip netns add ns$i ip netns exec ns$i ip link set dev lo up ip netns exec ns$i sh add-services.sh done add-services.sh: #!/bin/bash for((i=0;i<30000;i++)); do ipvsadm -A -t 10.10.10.10:$((80+$i)) -s rr done runtest.sh: #!/bin/bash for((i=1;i<4;i++));do ip netns exec ns$i ipvsadm -ln > /dev/null & done ip netns exec ns4 ipvsadm -ln > /dev/null Run "sh init.sh" to initiate the network environment. Then run "time ./runtest.sh" to evaluate the time consumption. Our testbed is a 4-core Intel Xeon ECS. The result of the original version is around 8 seconds, while the result of the modified version is only 0.8 seconds. 
Signed-off-by: Jiejian Wu Co-developed-by: Dust Li Signed-off-by: Dust Li Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260224205048.4718-2-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/ip_vs.h | 13 ++++ net/netfilter/ipvs/ip_vs_ctl.c | 167 ++++++++++++++++++----------------------- net/netfilter/ipvs/ip_vs_est.c | 18 ++--- 3 files changed, 94 insertions(+), 104 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 29a36709e7f3..074a204ec6db 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -33,6 +33,12 @@ #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE BIT(IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) @@ -1041,6 +1047,13 @@ struct netns_ipvs { */ unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ + + /* the service mutex that protect svc_table and svc_fwm_table */ + struct mutex service_mutex; + /* the service table hashed by */ + struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; + /* the service table hashed by fwmark */ + struct hlist_head svc_fwm_table[IP_VS_SVC_TAB_SIZE]; }; #define DEFAULT_SYNC_THRESHOLD 3 diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 068702894377..d871273ce917 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -48,7 +48,7 @@ MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); -DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ +static struct lock_class_key __ipvs_service_key; /* sysctl variables */ @@ -293,17 +293,6 @@ ip_vs_use_count_dec(void) } -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE (1 
<< IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) - -/* the service table hashed by */ -static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; -/* the service table hashed by fwmark */ -static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* @@ -338,8 +327,8 @@ static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 } /* - * Hashes a service in the ip_vs_svc_table by - * or in the ip_vs_svc_fwm_table by fwmark. + * Hashes a service in the svc_table by + * or in the svc_fwm_table by fwmark. * Should be called with locked tables. */ static int ip_vs_svc_hash(struct ip_vs_service *svc) @@ -354,17 +343,17 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) if (svc->fwmark == 0) { /* - * Hash it by in ip_vs_svc_table + * Hash it by in svc_table */ hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); - hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); + hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); - hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + hlist_add_head_rcu(&svc->f_list, &svc->ipvs->svc_fwm_table[hash]); } svc->flags |= IP_VS_SVC_F_HASHED; @@ -413,12 +402,9 @@ __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, /* Check for "full" addressed entries */ hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); - hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { - if ((svc->af == af) - && ip_vs_addr_equal(af, &svc->addr, vaddr) - && (svc->port == vport) - && (svc->protocol == protocol) - && (svc->ipvs == ipvs)) { + hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { + if (svc->af == af && ip_vs_addr_equal(af, &svc->addr, vaddr) && + svc->port == vport && svc->protocol == protocol) { /* HIT */ return svc; } @@ -440,9 +426,8 @@ __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, 
int af, __u32 fwmark) /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); - hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark && svc->af == af - && (svc->ipvs == ipvs)) { + hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark && svc->af == af) { /* HIT */ return svc; } @@ -1701,10 +1686,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) * Flush the service table hashed by */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], + hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx], s_list) { - if (svc->ipvs == ipvs) - ip_vs_unlink_service(svc, cleanup); + ip_vs_unlink_service(svc, cleanup); } } @@ -1712,10 +1696,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) * Flush the service table hashed by fwmark */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], + hlist_for_each_entry_safe(svc, n, &ipvs->svc_fwm_table[idx], f_list) { - if (svc->ipvs == ipvs) - ip_vs_unlink_service(svc, cleanup); + ip_vs_unlink_service(svc, cleanup); } } @@ -1732,12 +1715,12 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list) struct net *net; /* Check for "full" addressed entries */ - mutex_lock(&__ip_vs_mutex); list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); + mutex_lock(&ipvs->service_mutex); ip_vs_flush(ipvs, true); + mutex_unlock(&ipvs->service_mutex); } - mutex_unlock(&__ip_vs_mutex); } /* Put all references for device (dst_cache) */ @@ -1775,25 +1758,20 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, 
&ip_vs_svc_table[idx], s_list) { - if (svc->ipvs == ipvs) { - list_for_each_entry(dest, &svc->destinations, - n_list) { - ip_vs_forget_dev(dest, dev); - } + hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + ip_vs_forget_dev(dest, dev); } } - hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (svc->ipvs == ipvs) { - list_for_each_entry(dest, &svc->destinations, - n_list) { - ip_vs_forget_dev(dest, dev); - } + hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + ip_vs_forget_dev(dest, dev); } - } } @@ -1802,7 +1780,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, ip_vs_forget_dev(dest, dev); } spin_unlock_bh(&ipvs->dest_trash_lock); - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return NOTIFY_DONE; } @@ -1826,16 +1804,14 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (svc->ipvs == ipvs) - ip_vs_zero_service(svc); + hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { + ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (svc->ipvs == ipvs) - ip_vs_zero_service(svc); + hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { + ip_vs_zero_service(svc); } } @@ -2306,9 +2282,9 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { - if ((svc->ipvs == ipvs) && pos-- == 0) { - iter->table = ip_vs_svc_table; + hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) { + if (pos-- == 0) { + iter->table = ipvs->svc_table; iter->bucket = 
idx; return svc; } @@ -2317,10 +2293,10 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], + hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[idx], f_list) { - if ((svc->ipvs == ipvs) && pos-- == 0) { - iter->table = ip_vs_svc_fwm_table; + if (pos-- == 0) { + iter->table = ipvs->svc_fwm_table; iter->bucket = idx; return svc; } @@ -2343,6 +2319,8 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct hlist_node *e; struct ip_vs_iter *iter; struct ip_vs_service *svc; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); ++*pos; if (v == SEQ_START_TOKEN) @@ -2351,7 +2329,7 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) svc = v; iter = seq->private; - if (iter->table == ip_vs_svc_table) { + if (iter->table == ipvs->svc_table) { /* next service in table hashed by protocol */ e = rcu_dereference(hlist_next_rcu(&svc->s_list)); if (e) @@ -2359,13 +2337,13 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, - &ip_vs_svc_table[iter->bucket], + &ipvs->svc_table[iter->bucket], s_list) { return svc; } } - iter->table = ip_vs_svc_fwm_table; + iter->table = ipvs->svc_fwm_table; iter->bucket = -1; goto scan_fwmark; } @@ -2378,7 +2356,7 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, - &ip_vs_svc_fwm_table[iter->bucket], + &ipvs->svc_fwm_table[iter->bucket], f_list) return svc; } @@ -2414,7 +2392,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) if (svc->ipvs != ipvs) return 0; - if (iter->table == ip_vs_svc_table) { + if (iter->table == ipvs->svc_table) { #ifdef CONFIG_IP_VS_IPV6 if 
(svc->af == AF_INET6) seq_printf(seq, "%s [%pI6]:%04X %s ", @@ -2736,7 +2714,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) return ret; } - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ ret = ip_vs_flush(ipvs, false); @@ -2833,7 +2811,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) } out_unlock: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return ret; } @@ -2871,9 +2849,9 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET || (svc->ipvs != ipvs)) + if (svc->af != AF_INET) continue; if (count >= get->num_services) @@ -2890,9 +2868,9 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, } for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET || (svc->ipvs != ipvs)) + if (svc->af != AF_INET) continue; if (count >= get->num_services) @@ -3061,7 +3039,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: { @@ -3160,7 +3138,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } out: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return ret; } @@ -3395,10 +3373,10 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); - mutex_lock(&__ip_vs_mutex); + 
mutex_lock(&ipvs->service_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start || (svc->ipvs != ipvs)) + hlist_for_each_entry(svc, &ipvs->svc_table[i], s_list) { + if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -3408,8 +3386,8 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start || (svc->ipvs != ipvs)) + hlist_for_each_entry(svc, &ipvs->svc_fwm_table[i], f_list) { + if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -3419,7 +3397,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } nla_put_failure: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); cb->args[0] = idx; return skb->len; @@ -3608,7 +3586,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); /* Try to find the service for which to dump destinations */ if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) @@ -3633,7 +3611,7 @@ nla_put_failure: cb->args[0] = idx; out_err: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return skb->len; } @@ -3916,7 +3894,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) cmd = info->genlhdr->cmd; - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); if (cmd == IPVS_CMD_FLUSH) { ret = ip_vs_flush(ipvs, false); @@ -4028,7 +4006,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) } out: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return ret; } @@ -4058,7 +4036,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) if 
(!msg) return -ENOMEM; - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); if (reply == NULL) @@ -4126,7 +4104,7 @@ nla_put_failure: out_err: nlmsg_free(msg); out: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); return ret; } @@ -4243,6 +4221,7 @@ static struct genl_family ip_vs_genl_family __ro_after_init = { .small_ops = ip_vs_genl_ops, .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops), .resv_start_op = IPVS_CMD_FLUSH + 1, + .parallel_ops = 1, }; static int __init ip_vs_genl_register(void) @@ -4425,6 +4404,13 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) int ret = -ENOMEM; int idx; + /* Initialize service_mutex, svc_table, svc_fwm_table per netns */ + __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_HLIST_HEAD(&ipvs->svc_table[idx]); + INIT_HLIST_HEAD(&ipvs->svc_fwm_table[idx]); + } + /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) INIT_HLIST_HEAD(&ipvs->rs_table[idx]); @@ -4529,17 +4515,8 @@ void ip_vs_unregister_nl_ioctl(void) int __init ip_vs_control_init(void) { - int idx; int ret; - /* Initialize svc_table, ip_vs_svc_fwm_table */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); - INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - - smp_wmb(); /* Do we really need it now ? 
*/ - ret = register_netdevice_notifier(&ip_vs_dst_notifier); if (ret < 0) return ret; diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 77f4f637ff67..dc207172ca9f 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -602,7 +602,7 @@ static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) while (1) { int max = 16; - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); while (max-- > 0) { est = hlist_entry_safe(ipvs->est_temp_list.first, @@ -622,12 +622,12 @@ static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) } goto unlock; } - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); cond_resched(); } unlock: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); } /* Calculate limits for all kthreads */ @@ -647,9 +647,9 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) u64 val; INIT_HLIST_HEAD(&chain); - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); kd = ipvs->est_kt_arr[0]; - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); s = kd ? kd->calc_stats : NULL; if (!s) goto out; @@ -748,7 +748,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period @@ -815,9 +815,9 @@ walk_chain: /* Give chance estimators to be added (to est_temp_list) * and deleted (releasing kthread contexts) */ - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); cond_resched(); - mutex_lock(&__ip_vs_mutex); + mutex_lock(&ipvs->service_mutex); /* Current kt released ? 
*/ if (id >= ipvs->est_kt_count) @@ -893,7 +893,7 @@ unlock2: mutex_unlock(&ipvs->est_mutex); unlock: - mutex_unlock(&__ip_vs_mutex); + mutex_unlock(&ipvs->service_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) -- cgit v1.2.3 From b24ae1a387e404e832385448ccad30cb03520e45 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 24 Feb 2026 21:50:42 +0100 Subject: ipvs: use single svc table fwmark based services and non-fwmark based services can be hashed in same service table. This reduces the burden of working with two tables. Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260224205048.4718-4-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/ip_vs.h | 8 +-- net/netfilter/ipvs/ip_vs_ctl.c | 146 ++++++----------------------------------- 2 files changed, 22 insertions(+), 132 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 074a204ec6db..b5a5a5efe3cc 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -679,8 +679,7 @@ struct ip_vs_dest_user_kern { * forwarding entries. 
*/ struct ip_vs_service { - struct hlist_node s_list; /* for normal service table */ - struct hlist_node f_list; /* for fwmark-based service table */ + struct hlist_node s_list; /* node in service table */ atomic_t refcnt; /* reference counter */ u16 af; /* address family */ @@ -1050,10 +1049,7 @@ struct netns_ipvs { /* the service mutex that protect svc_table and svc_fwm_table */ struct mutex service_mutex; - /* the service table hashed by */ - struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; - /* the service table hashed by fwmark */ - struct hlist_head svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; /* Services */ }; #define DEFAULT_SYNC_THRESHOLD 3 diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b9eaf048a29f..2ef1f99dada6 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -328,7 +328,7 @@ static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 /* * Hashes a service in the svc_table by - * or in the svc_fwm_table by fwmark. + * or by fwmark. * Should be called with locked tables. */ static int ip_vs_svc_hash(struct ip_vs_service *svc) @@ -343,18 +343,17 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) if (svc->fwmark == 0) { /* - * Hash it by in svc_table + * Hash it by */ hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); - hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); } else { /* - * Hash it by fwmark in svc_fwm_table + * Hash it by fwmark */ hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); - hlist_add_head_rcu(&svc->f_list, &svc->ipvs->svc_fwm_table[hash]); } + hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); svc->flags |= IP_VS_SVC_F_HASHED; /* increase its refcnt because it is referenced by the svc table */ @@ -364,7 +363,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* - * Unhashes a service from svc_table / svc_fwm_table. 
+ * Unhashes a service from svc_table. * Should be called with locked tables. */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) @@ -375,13 +374,8 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) return 0; } - if (svc->fwmark == 0) { - /* Remove it from the svc_table table */ - hlist_del_rcu(&svc->s_list); - } else { - /* Remove it from the svc_fwm_table table */ - hlist_del_rcu(&svc->f_list); - } + /* Remove it from svc_table */ + hlist_del_rcu(&svc->s_list); svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); @@ -404,7 +398,8 @@ __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { if (svc->af == af && ip_vs_addr_equal(af, &svc->addr, vaddr) && - svc->port == vport && svc->protocol == protocol) { + svc->port == vport && svc->protocol == protocol && + !svc->fwmark) { /* HIT */ return svc; } @@ -426,7 +421,7 @@ __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); - hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[hash], f_list) { + hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { if (svc->fwmark == fwmark && svc->af == af) { /* HIT */ return svc; @@ -1682,26 +1677,11 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) struct ip_vs_service *svc; struct hlist_node *n; - /* - * Flush the service table hashed by - */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx], - s_list) { + s_list) ip_vs_unlink_service(svc, cleanup); - } } - - /* - * Flush the service table hashed by fwmark - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_safe(svc, n, &ipvs->svc_fwm_table[idx], - f_list) { - ip_vs_unlink_service(svc, cleanup); - } - } - return 0; } @@ -1764,11 +1744,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, 
list_for_each_entry_rcu(dest, &svc->destinations, n_list) ip_vs_forget_dev(dest, dev); - - hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[idx], f_list) - list_for_each_entry_rcu(dest, &svc->destinations, - n_list) - ip_vs_forget_dev(dest, dev); } rcu_read_unlock(); @@ -1802,15 +1777,8 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { + hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) ip_vs_zero_service(svc); - } - } - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { - ip_vs_zero_service(svc); - } } ip_vs_zero_stats(&ipvs->tot_stats->s); @@ -2246,7 +2214,6 @@ static struct ctl_table vs_vars[] = { struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ - struct hlist_head *table; int bucket; }; @@ -2269,7 +2236,6 @@ static inline const char *ip_vs_fwd_name(unsigned int flags) } -/* Get the Nth entry in the two lists */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); @@ -2278,29 +2244,14 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) int idx; struct ip_vs_service *svc; - /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) { if (pos-- == 0) { - iter->table = ipvs->svc_table; - iter->bucket = idx; - return svc; - } - } - } - - /* keep looking in fwmark */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[idx], - f_list) { - if (pos-- == 0) { - iter->table = ipvs->svc_fwm_table; iter->bucket = idx; return svc; } } } - return NULL; } @@ -2327,38 +2278,17 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) svc = v; iter = seq->private; - if 
(iter->table == ipvs->svc_table) { - /* next service in table hashed by protocol */ - e = rcu_dereference(hlist_next_rcu(&svc->s_list)); - if (e) - return hlist_entry(e, struct ip_vs_service, s_list); - - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - hlist_for_each_entry_rcu(svc, - &ipvs->svc_table[iter->bucket], - s_list) { - return svc; - } - } - - iter->table = ipvs->svc_fwm_table; - iter->bucket = -1; - goto scan_fwmark; - } - - /* next service in hashed by fwmark */ - e = rcu_dereference(hlist_next_rcu(&svc->f_list)); + e = rcu_dereference(hlist_next_rcu(&svc->s_list)); if (e) - return hlist_entry(e, struct ip_vs_service, f_list); + return hlist_entry(e, struct ip_vs_service, s_list); - scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, - &ipvs->svc_fwm_table[iter->bucket], - f_list) + &ipvs->svc_table[iter->bucket], + s_list) { return svc; + } } - return NULL; } @@ -2380,17 +2310,12 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) seq_puts(seq, " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); } else { - struct net *net = seq_file_net(seq); - struct netns_ipvs *ipvs = net_ipvs(net); const struct ip_vs_service *svc = v; - const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); char *sched_name = sched ? 
sched->name : "none"; - if (svc->ipvs != ipvs) - return 0; - if (iter->table == ipvs->svc_table) { + if (!svc->fwmark) { #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) seq_printf(seq, "%s [%pI6]:%04X %s ", @@ -2865,24 +2790,6 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, } } - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_fwm_table[idx], f_list) { - /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) - continue; - - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } out: return ret; } @@ -3383,17 +3290,6 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, } } - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_fwm_table[i], f_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - } - nla_put_failure: rcu_read_unlock(); cb->args[0] = idx; @@ -4403,12 +4299,10 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) int ret = -ENOMEM; int idx; - /* Initialize service_mutex, svc_table, svc_fwm_table per netns */ + /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) INIT_HLIST_HEAD(&ipvs->svc_table[idx]); - INIT_HLIST_HEAD(&ipvs->svc_fwm_table[idx]); - } /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) -- cgit v1.2.3 From c59bd9e62e060bb5cd4d697b73bbe6f23a723345 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 24 Feb 2026 21:50:44 +0100 Subject: ipvs: use more counters to avoid service lookups When new connection is created we can lookup for services multiple times to 
support fallback options. We already have some counters to skip specific lookups because it costs CPU cycles for hash calculation, etc. Add more counters for fwmark/non-fwmark services (fwm_services and nonfwm_services) and make all counters per address family. Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260224205048.4718-6-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/ip_vs.h | 24 +++++++++--- net/netfilter/ipvs/ip_vs_core.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 86 ++++++++++++++++++++++++----------------- 3 files changed, 69 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b5a5a5efe3cc..f2291be36409 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -271,6 +271,18 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, pr_err(msg, ##__VA_ARGS__); \ } while (0) +/* For arrays per family */ +enum { + IP_VS_AF_INET, + IP_VS_AF_INET6, + IP_VS_AF_MAX +}; + +static inline int ip_vs_af_index(int af) +{ + return af == AF_INET6 ? IP_VS_AF_INET6 : IP_VS_AF_INET; +} + /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) @@ -940,17 +952,17 @@ struct netns_ipvs { /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. 
*/ - int num_services; /* no of virtual services */ - int num_services6; /* IPv6 virtual services */ - /* Trash for destinations */ struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ /* Service counters */ - atomic_t ftpsvc_counter; - atomic_t nullsvc_counter; - atomic_t conn_out_counter; + atomic_t num_services[IP_VS_AF_MAX]; /* Services */ + atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ + atomic_t nonfwm_services[IP_VS_AF_MAX];/* Services */ + atomic_t ftpsvc_counter[IP_VS_AF_MAX]; /* FTPPORT */ + atomic_t nullsvc_counter[IP_VS_AF_MAX];/* Zero port */ + atomic_t conn_out_counter[IP_VS_AF_MAX];/* out conn */ #ifdef CONFIG_SYSCTL /* delayed work for expiring no dest connections */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 90d56f92c0f6..869f18e0e835 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1400,7 +1400,7 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat return handle_response(af, skb, pd, cp, &iph, hooknum); /* Check for real-server-started requests */ - if (atomic_read(&ipvs->conn_out_counter)) { + if (atomic_read(&ipvs->conn_out_counter[ip_vs_af_index(af)])) { /* Currently only for UDP: * connection oriented protocols typically use * ephemeral ports for outgoing connections, so diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7c0e2d9b5b98..564e09f0f90a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -436,35 +436,42 @@ struct ip_vs_service * ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { - struct ip_vs_service *svc; + struct ip_vs_service *svc = NULL; + int af_id = ip_vs_af_index(af); /* * Check the table hashed by fwmark first */ - if (fwmark) { + if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) { svc = 
__ip_vs_svc_fwm_find(ipvs, af, fwmark); if (svc) goto out; } + if (!atomic_read(&ipvs->nonfwm_services[af_id])) + goto out; + /* * Check the table hashed by * for "full" addressed entries */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); + if (svc) + goto out; - if (!svc && protocol == IPPROTO_TCP && - atomic_read(&ipvs->ftpsvc_counter) && + if (protocol == IPPROTO_TCP && + atomic_read(&ipvs->ftpsvc_counter[af_id]) && (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); + if (svc) + goto out; } - if (svc == NULL - && atomic_read(&ipvs->nullsvc_counter)) { + if (atomic_read(&ipvs->nullsvc_counter[af_id])) { /* * Check if the catch-all port (port zero) exists */ @@ -1352,6 +1359,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, { int ret = 0; struct ip_vs_scheduler *sched = NULL; + int af_id = ip_vs_af_index(u->af); struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; int ret_hooks = -1; @@ -1396,8 +1404,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif - if ((u->af == AF_INET && !ipvs->num_services) || - (u->af == AF_INET6 && !ipvs->num_services6)) { + if (!atomic_read(&ipvs->num_services[af_id])) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) goto out_err; @@ -1444,21 +1451,21 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, /* Update the virtual service counters */ if (svc->port == FTPPORT) - atomic_inc(&ipvs->ftpsvc_counter); - else if (svc->port == 0) - atomic_inc(&ipvs->nullsvc_counter); + atomic_inc(&ipvs->ftpsvc_counter[af_id]); + else if (!svc->port && !svc->fwmark) + atomic_inc(&ipvs->nullsvc_counter[af_id]); if (pe && pe->conn_out) - atomic_inc(&ipvs->conn_out_counter); + atomic_inc(&ipvs->conn_out_counter[af_id]); /* Bind the ct 
retriever */ RCU_INIT_POINTER(svc->pe, pe); pe = NULL; - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) - ipvs->num_services++; - else if (svc->af == AF_INET6) - ipvs->num_services6++; + if (svc->fwmark) + atomic_inc(&ipvs->fwm_services[af_id]); + else + atomic_inc(&ipvs->nonfwm_services[af_id]); + atomic_inc(&ipvs->num_services[af_id]); /* Hash the service into the service table */ ip_vs_svc_hash(svc); @@ -1503,6 +1510,8 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; bool new_pe_conn_out, old_pe_conn_out; + struct netns_ipvs *ipvs = svc->ipvs; + int af_id = ip_vs_af_index(svc->af); /* * Lookup the scheduler, by 'u->sched_name' @@ -1571,9 +1580,9 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) new_pe_conn_out = (pe && pe->conn_out) ? true : false; old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; if (new_pe_conn_out && !old_pe_conn_out) - atomic_inc(&svc->ipvs->conn_out_counter); + atomic_inc(&ipvs->conn_out_counter[af_id]); if (old_pe_conn_out && !new_pe_conn_out) - atomic_dec(&svc->ipvs->conn_out_counter); + atomic_dec(&ipvs->conn_out_counter[af_id]); } out: @@ -1593,16 +1602,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_scheduler *old_sched; struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; + int af_id = ip_vs_af_index(svc->af); - if (svc->af == AF_INET) { - ipvs->num_services--; - if (!ipvs->num_services) - ip_vs_unregister_hooks(ipvs, svc->af); - } else if (svc->af == AF_INET6) { - ipvs->num_services6--; - if (!ipvs->num_services6) - ip_vs_unregister_hooks(ipvs, svc->af); - } + atomic_dec(&ipvs->num_services[af_id]); + if (!atomic_read(&ipvs->num_services[af_id])) + ip_vs_unregister_hooks(ipvs, svc->af); + if (svc->fwmark) + atomic_dec(&ipvs->fwm_services[af_id]); + else + atomic_dec(&ipvs->nonfwm_services[af_id]); 
ip_vs_stop_estimator(svc->ipvs, &svc->stats); @@ -1614,7 +1622,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* Unbind persistence engine, keep svc->pe */ old_pe = rcu_dereference_protected(svc->pe, 1); if (old_pe && old_pe->conn_out) - atomic_dec(&ipvs->conn_out_counter); + atomic_dec(&ipvs->conn_out_counter[af_id]); ip_vs_pe_put(old_pe); /* @@ -1629,9 +1637,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) * Update the virtual service counters */ if (svc->port == FTPPORT) - atomic_dec(&ipvs->ftpsvc_counter); - else if (svc->port == 0) - atomic_dec(&ipvs->nullsvc_counter); + atomic_dec(&ipvs->ftpsvc_counter[af_id]); + else if (!svc->port && !svc->fwmark) + atomic_dec(&ipvs->nullsvc_counter[af_id]); /* * Free the service if nobody refers to it @@ -2961,7 +2969,8 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; info.size = ip_vs_conn_tab_size; - info.num_services = ipvs->num_services; + info.num_services = + atomic_read(&ipvs->num_services[IP_VS_AF_INET]); if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; } @@ -4307,9 +4316,14 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); - atomic_set(&ipvs->ftpsvc_counter, 0); - atomic_set(&ipvs->nullsvc_counter, 0); - atomic_set(&ipvs->conn_out_counter, 0); + for (idx = 0; idx < IP_VS_AF_MAX; idx++) { + atomic_set(&ipvs->num_services[idx], 0); + atomic_set(&ipvs->fwm_services[idx], 0); + atomic_set(&ipvs->nonfwm_services[idx], 0); + atomic_set(&ipvs->ftpsvc_counter[idx], 0); + atomic_set(&ipvs->nullsvc_counter[idx], 0); + atomic_set(&ipvs->conn_out_counter[idx], 0); + } INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); -- cgit v1.2.3 From 09b71fb459468b408f3fa3e2b75d20113374f057 Mon Sep 17 
00:00:00 2001 From: Julian Anastasov Date: Tue, 24 Feb 2026 21:50:45 +0100 Subject: ipvs: no_cport and dropentry counters can be per-net Change the no_cport counters to be per-net and address family. This should reduce the extra conn lookups done during present NO_CPORT connections. By changing from global to per-net dropentry counters, one net will not affect the drop rate of another net. Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260224205048.4718-7-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/ip_vs.h | 2 ++ net/netfilter/ipvs/ip_vs_conn.c | 64 +++++++++++++++++++++++++---------------- 2 files changed, 41 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index f2291be36409..ad8a16146ac5 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -948,6 +948,7 @@ struct netns_ipvs { #endif /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ + atomic_t no_cport_conns[IP_VS_AF_MAX]; /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. 
*/ @@ -973,6 +974,7 @@ struct netns_ipvs { int drop_counter; int old_secure_tcp; atomic_t dropentry; + s8 dropentry_counters[8]; /* locks in ctl.c */ spinlock_t dropentry_lock; /* drop entry handling */ spinlock_t droppacket_lock; /* drop packet handling */ diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 50cc492c7553..66057db63d02 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -59,9 +59,6 @@ static struct hlist_head *ip_vs_conn_tab __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; -/* counter for no client port connections */ -static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); - /* random value for IPVS connection hash */ static unsigned int ip_vs_conn_rnd __read_mostly; @@ -294,10 +291,16 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) struct ip_vs_conn *cp; cp = __ip_vs_conn_in_get(p); - if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { - struct ip_vs_conn_param cport_zero_p = *p; - cport_zero_p.cport = 0; - cp = __ip_vs_conn_in_get(&cport_zero_p); + if (!cp) { + struct netns_ipvs *ipvs = p->ipvs; + int af_id = ip_vs_af_index(p->af); + + if (atomic_read(&ipvs->no_cport_conns[af_id])) { + struct ip_vs_conn_param cport_zero_p = *p; + + cport_zero_p.cport = 0; + cp = __ip_vs_conn_in_get(&cport_zero_p); + } } IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", @@ -490,9 +493,12 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { if (ip_vs_conn_unhash(cp)) { + struct netns_ipvs *ipvs = cp->ipvs; + int af_id = ip_vs_af_index(cp->af); + spin_lock_bh(&cp->lock); if (cp->flags & IP_VS_CONN_F_NO_CPORT) { - atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ipvs->no_cport_conns[af_id]); cp->flags &= ~IP_VS_CONN_F_NO_CPORT; cp->cport = cport; } @@ -891,8 +897,11 @@ static void ip_vs_conn_expire(struct timer_list *t) if (unlikely(cp->app != NULL)) 
ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) - atomic_dec(&ip_vs_conn_no_cport_cnt); + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + int af_id = ip_vs_af_index(cp->af); + + atomic_dec(&ipvs->no_cport_conns[af_id]); + } if (cp->flags & IP_VS_CONN_F_ONE_PACKET) ip_vs_conn_rcu_free(&cp->rcu_head); else @@ -999,8 +1008,11 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, cp->out_seq.delta = 0; atomic_inc(&ipvs->conn_count); - if (flags & IP_VS_CONN_F_NO_CPORT) - atomic_inc(&ip_vs_conn_no_cport_cnt); + if (unlikely(flags & IP_VS_CONN_F_NO_CPORT)) { + int af_id = ip_vs_af_index(cp->af); + + atomic_inc(&ipvs->no_cport_conns[af_id]); + } /* Bind the connection with a destination server */ cp->dest = NULL; @@ -1257,6 +1269,7 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = { }; #endif +#ifdef CONFIG_SYSCTL /* Randomly drop connection entries before running out of memory * Can be used for DATA and CTL conns. For TPL conns there are exceptions: @@ -1266,12 +1279,7 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = { */ static inline int todrop_entry(struct ip_vs_conn *cp) { - /* - * The drop rate array needs tuning for real environments. - * Called from timer bh only => no locking - */ - static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - static signed char todrop_counter[9] = {0}; + struct netns_ipvs *ipvs = cp->ipvs; int i; /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 
@@ -1280,15 +1288,17 @@ static inline int todrop_entry(struct ip_vs_conn *cp) if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) return 0; - /* Don't drop the entry if its number of incoming packets is not - located in [0, 8] */ + /* Drop only conns with number of incoming packets in [1..8] range */ i = atomic_read(&cp->in_pkts); - if (i > 8 || i < 0) return 0; + if (i > 8 || i < 1) + return 0; - if (!todrop_rate[i]) return 0; - if (--todrop_counter[i] > 0) return 0; + i--; + if (--ipvs->dropentry_counters[i] > 0) + return 0; - todrop_counter[i] = todrop_rate[i]; + /* Prefer to drop conns with less number of incoming packets */ + ipvs->dropentry_counters[i] = i + 1; return 1; } @@ -1368,7 +1378,7 @@ drop: } rcu_read_unlock(); } - +#endif /* * Flush all the connection entries in the ip_vs_conn_tab @@ -1450,7 +1460,11 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) */ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) { + int idx; + atomic_set(&ipvs->conn_count, 0); + for (idx = 0; idx < IP_VS_AF_MAX; idx++) + atomic_set(&ipvs->no_cport_conns[idx], 0); #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, -- cgit v1.2.3 From 6b94d081f81dd524626f7aab2b98a9de335edb72 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 24 Feb 2026 21:50:48 +0100 Subject: netfilter: nf_tables: remove register tracking infrastructure This facility was disabled in commit 9e539c5b6d9c ("netfilter: nf_tables: disable expression reduction infra"), because not all nft_exprs guarantee they will update the destination register: some may set NFT_BREAK instead to cancel evaluation of the rule. This has been dead code ever since. There are no plans to salvage this at this time, so remove this. 
Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260224205048.4718-10-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/netfilter/nf_tables.h | 32 ---------- include/net/netfilter/nft_fib.h | 2 - include/net/netfilter/nft_meta.h | 3 - net/bridge/netfilter/nft_meta_bridge.c | 20 ------ net/bridge/netfilter/nft_reject_bridge.c | 1 - net/ipv4/netfilter/nft_dup_ipv4.c | 1 - net/ipv4/netfilter/nft_fib_ipv4.c | 2 - net/ipv4/netfilter/nft_reject_ipv4.c | 1 - net/ipv6/netfilter/nft_dup_ipv6.c | 1 - net/ipv6/netfilter/nft_fib_ipv6.c | 2 - net/ipv6/netfilter/nft_reject_ipv6.c | 1 - net/netfilter/nf_tables_api.c | 67 -------------------- net/netfilter/nft_bitwise.c | 104 ------------------------------- net/netfilter/nft_byteorder.c | 11 ---- net/netfilter/nft_cmp.c | 3 - net/netfilter/nft_compat.c | 10 --- net/netfilter/nft_connlimit.c | 1 - net/netfilter/nft_counter.c | 1 - net/netfilter/nft_ct.c | 46 -------------- net/netfilter/nft_dup_netdev.c | 1 - net/netfilter/nft_dynset.c | 1 - net/netfilter/nft_exthdr.c | 34 ---------- net/netfilter/nft_fib.c | 42 ------------- net/netfilter/nft_fib_inet.c | 1 - net/netfilter/nft_fib_netdev.c | 1 - net/netfilter/nft_flow_offload.c | 1 - net/netfilter/nft_fwd_netdev.c | 2 - net/netfilter/nft_hash.c | 36 ----------- net/netfilter/nft_immediate.c | 12 ---- net/netfilter/nft_last.c | 1 - net/netfilter/nft_limit.c | 2 - net/netfilter/nft_log.c | 1 - net/netfilter/nft_lookup.c | 12 ---- net/netfilter/nft_masq.c | 3 - net/netfilter/nft_meta.c | 45 ------------- net/netfilter/nft_nat.c | 2 - net/netfilter/nft_numgen.c | 22 ------- net/netfilter/nft_objref.c | 2 - net/netfilter/nft_osf.c | 25 -------- net/netfilter/nft_payload.c | 47 -------------- net/netfilter/nft_queue.c | 2 - net/netfilter/nft_quota.c | 1 - net/netfilter/nft_range.c | 1 - net/netfilter/nft_redir.c | 3 - net/netfilter/nft_reject_inet.c | 1 - net/netfilter/nft_reject_netdev.c | 1 - net/netfilter/nft_rt.c | 1 - net/netfilter/nft_socket.c | 26 
-------- net/netfilter/nft_synproxy.c | 1 - net/netfilter/nft_tproxy.c | 1 - net/netfilter/nft_tunnel.c | 26 -------- net/netfilter/nft_xfrm.c | 27 -------- 52 files changed, 693 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 426534a711b0..40e8106e71f0 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -122,17 +122,6 @@ struct nft_regs { }; }; -struct nft_regs_track { - struct { - const struct nft_expr *selector; - const struct nft_expr *bitwise; - u8 num_reg; - } regs[NFT_REG32_NUM]; - - const struct nft_expr *cur; - const struct nft_expr *last; -}; - /* Store/load an u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit @@ -427,8 +416,6 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); -bool nft_expr_reduce_bitwise(struct nft_regs_track *track, - const struct nft_expr *expr); struct nft_set_ext; @@ -941,7 +928,6 @@ struct nft_offload_ctx; * @destroy_clone: destruction clone function * @dump: function to dump parameters * @validate: validate expression, called during loop detection - * @reduce: reduce expression * @gc: garbage collection expression * @offload: hardware offload expression * @offload_action: function to report true/false to allocate one slot or not in the flow @@ -975,8 +961,6 @@ struct nft_expr_ops { bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr); - bool (*reduce)(struct nft_regs_track *track, - const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, @@ -1954,20 +1938,4 @@ static inline u64 nft_net_tstamp(const struct net *net) return 
nft_pernet(net)->tstamp; } -#define __NFT_REDUCE_READONLY 1UL -#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY - -void nft_reg_track_update(struct nft_regs_track *track, - const struct nft_expr *expr, u8 dreg, u8 len); -void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); -void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); - -static inline bool nft_reg_track_cmp(struct nft_regs_track *track, - const struct nft_expr *expr, u8 dreg) -{ - return track->regs[dreg].selector && - track->regs[dreg].selector->ops == expr->ops && - track->regs[dreg].num_reg == 0; -} - #endif /* _NET_NF_TABLES_H */ diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 7370fba844ef..e0422456f27b 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -66,6 +66,4 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct net_device *dev); -bool nft_fib_reduce(struct nft_regs_track *track, - const struct nft_expr *expr); #endif diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index d602263590fe..f74e63290603 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -43,9 +43,6 @@ void nft_meta_set_destroy(const struct nft_ctx *ctx, int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr); -bool nft_meta_get_reduce(struct nft_regs_track *track, - const struct nft_expr *expr); - struct nft_inner_tun_ctx; void nft_meta_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt, diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index b7af36bbd306..7763e78abb00 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -112,7 +112,6 @@ static const struct nft_expr_ops nft_meta_bridge_get_ops = { 
.eval = nft_meta_bridge_get_eval, .init = nft_meta_bridge_get_init, .dump = nft_meta_get_dump, - .reduce = nft_meta_get_reduce, }; static void nft_meta_bridge_set_eval(const struct nft_expr *expr, @@ -159,24 +158,6 @@ static int nft_meta_bridge_set_init(const struct nft_ctx *ctx, return 0; } -static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - int i; - - for (i = 0; i < NFT_REG32_NUM; i++) { - if (!track->regs[i].selector) - continue; - - if (track->regs[i].selector->ops != &nft_meta_bridge_get_ops) - continue; - - __nft_reg_track_cancel(track, i); - } - - return false; -} - static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -202,7 +183,6 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = { .init = nft_meta_bridge_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, - .reduce = nft_meta_bridge_set_reduce, .validate = nft_meta_bridge_set_validate, }; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 1cb5c16e97b7..cd2b04236a99 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -184,7 +184,6 @@ static const struct nft_expr_ops nft_reject_bridge_ops = { .init = nft_reject_init, .dump = nft_reject_dump, .validate = nft_reject_bridge_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_reject_bridge_type __read_mostly = { diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index ef5dd88107dd..d53a65ddbd7b 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -76,7 +76,6 @@ static const struct nft_expr_ops nft_dup_ipv4_ops = { .eval = nft_dup_ipv4_eval, .init = nft_dup_ipv4_init, .dump = nft_dup_ipv4_dump, - .reduce = NFT_REDUCE_READONLY, }; static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = { diff --git 
a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 82af6cd76d13..9d0c6d75109b 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -163,7 +163,6 @@ static const struct nft_expr_ops nft_fib4_type_ops = { .init = nft_fib_init, .dump = nft_fib_dump, .validate = nft_fib_validate, - .reduce = nft_fib_reduce, }; static const struct nft_expr_ops nft_fib4_ops = { @@ -173,7 +172,6 @@ static const struct nft_expr_ops nft_fib4_ops = { .init = nft_fib_init, .dump = nft_fib_dump, .validate = nft_fib_validate, - .reduce = nft_fib_reduce, }; static const struct nft_expr_ops * diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index 6cb213bb7256..55fc23a8f7a7 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -45,7 +45,6 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = { .init = nft_reject_init, .dump = nft_reject_dump, .validate = nft_reject_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 492a811828a7..95ec27b3971c 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -74,7 +74,6 @@ static const struct nft_expr_ops nft_dup_ipv6_ops = { .eval = nft_dup_ipv6_eval, .init = nft_dup_ipv6_init, .dump = nft_dup_ipv6_dump, - .reduce = NFT_REDUCE_READONLY, }; static const struct nla_policy nft_dup_ipv6_policy[NFTA_DUP_MAX + 1] = { diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 421036a3605b..dc375b725b28 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -225,7 +225,6 @@ static const struct nft_expr_ops nft_fib6_type_ops = { .init = nft_fib_init, .dump = nft_fib_dump, .validate = nft_fib_validate, - .reduce = nft_fib_reduce, }; static const struct nft_expr_ops 
nft_fib6_ops = { @@ -235,7 +234,6 @@ static const struct nft_expr_ops nft_fib6_ops = { .init = nft_fib_init, .dump = nft_fib_dump, .validate = nft_fib_validate, - .reduce = nft_fib_reduce, }; static const struct nft_expr_ops * diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 5c61294f410e..ed69c768797e 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -46,7 +46,6 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = { .init = nft_reject_init, .dump = nft_reject_dump, .validate = nft_reject_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 23ef775897a4..c8d5f7e93dfd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -932,58 +932,6 @@ static int nft_delflowtable(struct nft_ctx *ctx, return 0; } -static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg) -{ - int i; - - for (i = track->regs[dreg].num_reg; i > 0; i--) - __nft_reg_track_cancel(track, dreg - i); -} - -static void __nft_reg_track_update(struct nft_regs_track *track, - const struct nft_expr *expr, - u8 dreg, u8 num_reg) -{ - track->regs[dreg].selector = expr; - track->regs[dreg].bitwise = NULL; - track->regs[dreg].num_reg = num_reg; -} - -void nft_reg_track_update(struct nft_regs_track *track, - const struct nft_expr *expr, u8 dreg, u8 len) -{ - unsigned int regcount; - int i; - - __nft_reg_track_clobber(track, dreg); - - regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE); - for (i = 0; i < regcount; i++, dreg++) - __nft_reg_track_update(track, expr, dreg, i); -} -EXPORT_SYMBOL_GPL(nft_reg_track_update); - -void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len) -{ - unsigned int regcount; - int i; - - __nft_reg_track_clobber(track, dreg); - - regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE); - for (i = 0; i < 
regcount; i++, dreg++) - __nft_reg_track_cancel(track, dreg); -} -EXPORT_SYMBOL_GPL(nft_reg_track_cancel); - -void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg) -{ - track->regs[dreg].selector = NULL; - track->regs[dreg].bitwise = NULL; - track->regs[dreg].num_reg = 0; -} -EXPORT_SYMBOL_GPL(__nft_reg_track_cancel); - /* * Tables */ @@ -10172,16 +10120,9 @@ void nf_tables_trans_destroy_flush_work(struct net *net) } EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); -static bool nft_expr_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - return false; -} - static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) { const struct nft_expr *expr, *last; - struct nft_regs_track track = {}; unsigned int size, data_size; void *data, *data_boundary; struct nft_rule_dp *prule; @@ -10218,15 +10159,7 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha return -ENOMEM; size = 0; - track.last = nft_expr_last(rule); nft_rule_for_each_expr(expr, last, rule) { - track.cur = expr; - - if (nft_expr_reduce(&track, expr)) { - expr = track.cur; - continue; - } - if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary)) return -ENOMEM; diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index d550910aabec..a4ff781f334d 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -391,61 +391,12 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx, return 0; } -static bool nft_bitwise_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_bitwise *priv = nft_expr_priv(expr); - const struct nft_bitwise *bitwise; - unsigned int regcount; - u8 dreg; - int i; - - if (!track->regs[priv->sreg].selector) - return false; - - bitwise = nft_expr_priv(track->regs[priv->dreg].selector); - if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector && - track->regs[priv->sreg].num_reg == 0 && - 
track->regs[priv->dreg].bitwise && - track->regs[priv->dreg].bitwise->ops == expr->ops && - priv->sreg == bitwise->sreg && - priv->sreg2 == bitwise->sreg2 && - priv->dreg == bitwise->dreg && - priv->op == bitwise->op && - priv->len == bitwise->len && - !memcmp(&priv->mask, &bitwise->mask, sizeof(priv->mask)) && - !memcmp(&priv->xor, &bitwise->xor, sizeof(priv->xor)) && - !memcmp(&priv->data, &bitwise->data, sizeof(priv->data))) { - track->cur = expr; - return true; - } - - if (track->regs[priv->sreg].bitwise || - track->regs[priv->sreg].num_reg != 0) { - nft_reg_track_cancel(track, priv->dreg, priv->len); - return false; - } - - if (priv->sreg != priv->dreg) { - nft_reg_track_update(track, track->regs[priv->sreg].selector, - priv->dreg, priv->len); - } - - dreg = priv->dreg; - regcount = DIV_ROUND_UP(priv->len, NFT_REG32_SIZE); - for (i = 0; i < regcount; i++, dreg++) - track->regs[dreg].bitwise = expr; - - return false; -} - static const struct nft_expr_ops nft_bitwise_ops = { .type = &nft_bitwise_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)), .eval = nft_bitwise_eval, .init = nft_bitwise_init, .dump = nft_bitwise_dump, - .reduce = nft_bitwise_reduce, .offload = nft_bitwise_offload, }; @@ -548,48 +499,12 @@ static int nft_bitwise_fast_offload(struct nft_offload_ctx *ctx, return 0; } -static bool nft_bitwise_fast_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); - const struct nft_bitwise_fast_expr *bitwise; - - if (!track->regs[priv->sreg].selector) - return false; - - bitwise = nft_expr_priv(track->regs[priv->dreg].selector); - if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector && - track->regs[priv->dreg].bitwise && - track->regs[priv->dreg].bitwise->ops == expr->ops && - priv->sreg == bitwise->sreg && - priv->dreg == bitwise->dreg && - priv->mask == bitwise->mask && - priv->xor == bitwise->xor) { - track->cur = expr; - return true; - } - - if 
(track->regs[priv->sreg].bitwise) { - nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE); - return false; - } - - if (priv->sreg != priv->dreg) { - track->regs[priv->dreg].selector = - track->regs[priv->sreg].selector; - } - track->regs[priv->dreg].bitwise = expr; - - return false; -} - const struct nft_expr_ops nft_bitwise_fast_ops = { .type = &nft_bitwise_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise_fast_expr)), .eval = NULL, /* inlined */ .init = nft_bitwise_fast_init, .dump = nft_bitwise_fast_dump, - .reduce = nft_bitwise_fast_reduce, .offload = nft_bitwise_fast_offload, }; @@ -626,22 +541,3 @@ struct nft_expr_type nft_bitwise_type __read_mostly = { .maxattr = NFTA_BITWISE_MAX, .owner = THIS_MODULE, }; - -bool nft_expr_reduce_bitwise(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_expr *last = track->last; - const struct nft_expr *next; - - if (expr == last) - return false; - - next = nft_expr_next(expr); - if (next->ops == &nft_bitwise_ops) - return nft_bitwise_reduce(track, next); - else if (next->ops == &nft_bitwise_fast_ops) - return nft_bitwise_fast_reduce(track, next); - - return false; -} -EXPORT_SYMBOL_GPL(nft_expr_reduce_bitwise); diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index af9206a3afd1..744878773dac 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -170,23 +170,12 @@ nla_put_failure: return -1; } -static bool nft_byteorder_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - struct nft_byteorder *priv = nft_expr_priv(expr); - - nft_reg_track_cancel(track, priv->dreg, priv->len); - - return false; -} - static const struct nft_expr_ops nft_byteorder_ops = { .type = &nft_byteorder_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), .eval = nft_byteorder_eval, .init = nft_byteorder_init, .dump = nft_byteorder_dump, - .reduce = nft_byteorder_reduce, }; struct nft_expr_type nft_byteorder_type __read_mostly = { diff 
--git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 2605f43737bc..b61dc9c3383e 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -190,7 +190,6 @@ static const struct nft_expr_ops nft_cmp_ops = { .eval = nft_cmp_eval, .init = nft_cmp_init, .dump = nft_cmp_dump, - .reduce = NFT_REDUCE_READONLY, .offload = nft_cmp_offload, }; @@ -282,7 +281,6 @@ const struct nft_expr_ops nft_cmp_fast_ops = { .eval = NULL, /* inlined */ .init = nft_cmp_fast_init, .dump = nft_cmp_fast_dump, - .reduce = NFT_REDUCE_READONLY, .offload = nft_cmp_fast_offload, }; @@ -376,7 +374,6 @@ const struct nft_expr_ops nft_cmp16_fast_ops = { .eval = NULL, /* inlined */ .init = nft_cmp16_fast_init, .dump = nft_cmp16_fast_dump, - .reduce = NFT_REDUCE_READONLY, .offload = nft_cmp16_fast_offload, }; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 08f620311b03..5021a01ba42c 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -778,14 +778,6 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = { static struct nft_expr_type nft_match_type; -static bool nft_match_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct xt_match *match = expr->ops->data; - - return strcmp(match->name, "comment") == 0; -} - static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -828,7 +820,6 @@ nft_match_select_ops(const struct nft_ctx *ctx, ops->dump = nft_match_dump; ops->validate = nft_match_validate; ops->data = match; - ops->reduce = nft_match_reduce; matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize)); if (matchsize > NFT_MATCH_LARGE_THRESH) { @@ -918,7 +909,6 @@ nft_target_select_ops(const struct nft_ctx *ctx, ops->dump = nft_target_dump; ops->validate = nft_target_validate; ops->data = target; - ops->reduce = NFT_REDUCE_READONLY; if (family == NFPROTO_BRIDGE) ops->eval = nft_target_eval_bridge; diff --git 
a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 657764774a2d..47d817983e81 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -258,7 +258,6 @@ static const struct nft_expr_ops nft_connlimit_ops = { .destroy_clone = nft_connlimit_destroy_clone, .dump = nft_connlimit_dump, .gc = nft_connlimit_gc, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_connlimit_type __read_mostly = { diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 169ae93688bc..3fa6369790f4 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -313,7 +313,6 @@ static const struct nft_expr_ops nft_counter_ops = { .destroy_clone = nft_counter_destroy, .dump = nft_counter_dump, .clone = nft_counter_clone, - .reduce = NFT_REDUCE_READONLY, .offload = nft_counter_offload, .offload_stats = nft_counter_offload_stats, }; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 6f2ae7cad731..b6abd5f8de92 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -698,29 +698,6 @@ nla_put_failure: return -1; } -static bool nft_ct_get_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_ct *priv = nft_expr_priv(expr); - const struct nft_ct *ct; - - if (!nft_reg_track_cmp(track, expr, priv->dreg)) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - ct = nft_expr_priv(track->regs[priv->dreg].selector); - if (priv->key != ct->key) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - if (!track->regs[priv->dreg].bitwise) - return true; - - return nft_expr_reduce_bitwise(track, expr); -} - static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { @@ -755,27 +732,8 @@ static const struct nft_expr_ops nft_ct_get_ops = { .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, - .reduce = nft_ct_get_reduce, }; -static bool 
nft_ct_set_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - int i; - - for (i = 0; i < NFT_REG32_NUM; i++) { - if (!track->regs[i].selector) - continue; - - if (track->regs[i].selector->ops != &nft_ct_get_ops) - continue; - - __nft_reg_track_cancel(track, i); - } - - return false; -} - #ifdef CONFIG_MITIGATION_RETPOLINE static const struct nft_expr_ops nft_ct_get_fast_ops = { .type = &nft_ct_type, @@ -784,7 +742,6 @@ static const struct nft_expr_ops nft_ct_get_fast_ops = { .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, - .reduce = nft_ct_set_reduce, }; #endif @@ -795,7 +752,6 @@ static const struct nft_expr_ops nft_ct_set_ops = { .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, - .reduce = nft_ct_set_reduce, }; #ifdef CONFIG_NF_CONNTRACK_ZONES @@ -806,7 +762,6 @@ static const struct nft_expr_ops nft_ct_set_zone_ops = { .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, - .reduce = nft_ct_set_reduce, }; #endif @@ -876,7 +831,6 @@ static const struct nft_expr_ops nft_notrack_ops = { .type = &nft_notrack_type, .size = NFT_EXPR_SIZE(0), .eval = nft_notrack_eval, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_notrack_type __read_mostly = { diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index 0573f96ce079..06866799e946 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -80,7 +80,6 @@ static const struct nft_expr_ops nft_dup_netdev_ops = { .eval = nft_dup_netdev_eval, .init = nft_dup_netdev_init, .dump = nft_dup_netdev_dump, - .reduce = NFT_REDUCE_READONLY, .offload = nft_dup_netdev_offload, .offload_action = nft_dup_netdev_offload_action, }; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 7807d8129664..6bff6287e7d5 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -421,7 +421,6 @@ static const struct nft_expr_ops 
nft_dynset_ops = { .activate = nft_dynset_activate, .deactivate = nft_dynset_deactivate, .dump = nft_dynset_dump, - .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_dynset_type __read_mostly = { diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 7eedf4e3ae9c..5f01269a49bd 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -702,40 +702,12 @@ static int nft_exthdr_dump_strip(struct sk_buff *skb, return nft_exthdr_dump_common(skb, priv); } -static bool nft_exthdr_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_exthdr *priv = nft_expr_priv(expr); - const struct nft_exthdr *exthdr; - - if (!nft_reg_track_cmp(track, expr, priv->dreg)) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - exthdr = nft_expr_priv(track->regs[priv->dreg].selector); - if (priv->type != exthdr->type || - priv->op != exthdr->op || - priv->flags != exthdr->flags || - priv->offset != exthdr->offset || - priv->len != exthdr->len) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - if (!track->regs[priv->dreg].bitwise) - return true; - - return nft_expr_reduce_bitwise(track, expr); -} - static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_ipv6_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, - .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_ipv4_ops = { @@ -744,7 +716,6 @@ static const struct nft_expr_ops nft_exthdr_ipv4_ops = { .eval = nft_exthdr_ipv4_eval, .init = nft_exthdr_ipv4_init, .dump = nft_exthdr_dump, - .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_tcp_ops = { @@ -753,7 +724,6 @@ static const struct nft_expr_ops nft_exthdr_tcp_ops = { .eval = nft_exthdr_tcp_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, - .reduce = nft_exthdr_reduce, }; static const 
struct nft_expr_ops nft_exthdr_tcp_set_ops = { @@ -762,7 +732,6 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { .eval = nft_exthdr_tcp_set_eval, .init = nft_exthdr_tcp_set_init, .dump = nft_exthdr_dump_set, - .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = { @@ -771,7 +740,6 @@ static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = { .eval = nft_exthdr_tcp_strip_eval, .init = nft_exthdr_tcp_strip_init, .dump = nft_exthdr_dump_strip, - .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_exthdr_sctp_ops = { @@ -780,7 +748,6 @@ static const struct nft_expr_ops nft_exthdr_sctp_ops = { .eval = nft_exthdr_sctp_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, - .reduce = nft_exthdr_reduce, }; #ifdef CONFIG_NFT_EXTHDR_DCCP @@ -790,7 +757,6 @@ static const struct nft_expr_ops nft_exthdr_dccp_ops = { .eval = nft_exthdr_dccp_eval, .init = nft_exthdr_dccp_init, .dump = nft_exthdr_dump, - .reduce = nft_exthdr_reduce, }; #endif diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 96e02a83c045..f7dc0e54375f 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -162,48 +162,6 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv, } EXPORT_SYMBOL_GPL(nft_fib_store_result); -bool nft_fib_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_fib *priv = nft_expr_priv(expr); - unsigned int len = NFT_REG32_SIZE; - const struct nft_fib *fib; - - switch (priv->result) { - case NFT_FIB_RESULT_OIF: - break; - case NFT_FIB_RESULT_OIFNAME: - if (priv->flags & NFTA_FIB_F_PRESENT) - len = NFT_REG32_SIZE; - else - len = IFNAMSIZ; - break; - case NFT_FIB_RESULT_ADDRTYPE: - break; - default: - WARN_ON_ONCE(1); - break; - } - - if (!nft_reg_track_cmp(track, expr, priv->dreg)) { - nft_reg_track_update(track, expr, priv->dreg, len); - return false; - } - - fib = nft_expr_priv(track->regs[priv->dreg].selector); - if (priv->result 
!= fib->result || - priv->flags != fib->flags) { - nft_reg_track_update(track, expr, priv->dreg, len); - return false; - } - - if (!track->regs[priv->dreg].bitwise) - return true; - - return false; -} -EXPORT_SYMBOL_GPL(nft_fib_reduce); - MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Query routing table from nftables"); MODULE_AUTHOR("Florian Westphal "); diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c index 666a3741d20b..a88d44e163d1 100644 --- a/net/netfilter/nft_fib_inet.c +++ b/net/netfilter/nft_fib_inet.c @@ -49,7 +49,6 @@ static const struct nft_expr_ops nft_fib_inet_ops = { .init = nft_fib_init, .dump = nft_fib_dump, .validate = nft_fib_validate, - .reduce = nft_fib_reduce, }; static struct nft_expr_type nft_fib_inet_type __read_mostly = { diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c index 9121ec64e918..3f3478abd845 100644 --- a/net/netfilter/nft_fib_netdev.c +++ b/net/netfilter/nft_fib_netdev.c @@ -58,7 +58,6 @@ static const struct nft_expr_ops nft_fib_netdev_ops = { .init = nft_fib_init, .dump = nft_fib_dump, .validate = nft_fib_validate, - .reduce = nft_fib_reduce, }; static struct nft_expr_type nft_fib_netdev_type __read_mostly = { diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 179d0e59e2b5..32b4281038dd 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -225,7 +225,6 @@ static const struct nft_expr_ops nft_flow_offload_ops = { .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_flow_offload_type __read_mostly = { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 152a9fb4d23a..ad48dcd45abe 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -218,7 +218,6 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .init = 
nft_fwd_neigh_init, .dump = nft_fwd_neigh_dump, .validate = nft_fwd_validate, - .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_fwd_netdev_ops = { @@ -228,7 +227,6 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .init = nft_fwd_netdev_init, .dump = nft_fwd_netdev_dump, .validate = nft_fwd_validate, - .reduce = NFT_REDUCE_READONLY, .offload = nft_fwd_netdev_offload, .offload_action = nft_fwd_netdev_offload_action, }; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 5d034bbb6913..1cf41e0a0e0c 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -166,16 +166,6 @@ nla_put_failure: return -1; } -static bool nft_jhash_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_jhash *priv = nft_expr_priv(expr); - - nft_reg_track_cancel(track, priv->dreg, sizeof(u32)); - - return false; -} - static int nft_symhash_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { @@ -196,30 +186,6 @@ nla_put_failure: return -1; } -static bool nft_symhash_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - struct nft_symhash *priv = nft_expr_priv(expr); - struct nft_symhash *symhash; - - if (!nft_reg_track_cmp(track, expr, priv->dreg)) { - nft_reg_track_update(track, expr, priv->dreg, sizeof(u32)); - return false; - } - - symhash = nft_expr_priv(track->regs[priv->dreg].selector); - if (priv->offset != symhash->offset || - priv->modulus != symhash->modulus) { - nft_reg_track_update(track, expr, priv->dreg, sizeof(u32)); - return false; - } - - if (!track->regs[priv->dreg].bitwise) - return true; - - return false; -} - static struct nft_expr_type nft_hash_type; static const struct nft_expr_ops nft_jhash_ops = { .type = &nft_hash_type, @@ -227,7 +193,6 @@ static const struct nft_expr_ops nft_jhash_ops = { .eval = nft_jhash_eval, .init = nft_jhash_init, .dump = nft_jhash_dump, - .reduce = nft_jhash_reduce, }; static const struct nft_expr_ops 
nft_symhash_ops = { @@ -236,7 +201,6 @@ static const struct nft_expr_ops nft_symhash_ops = { .eval = nft_symhash_eval, .init = nft_symhash_init, .dump = nft_symhash_dump, - .reduce = nft_symhash_reduce, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 02ee5fb69871..37c29947b380 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -320,17 +320,6 @@ static bool nft_immediate_offload_action(const struct nft_expr *expr) return false; } -static bool nft_immediate_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_immediate_expr *priv = nft_expr_priv(expr); - - if (priv->dreg != NFT_REG_VERDICT) - nft_reg_track_cancel(track, priv->dreg, priv->dlen); - - return false; -} - static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), @@ -341,7 +330,6 @@ static const struct nft_expr_ops nft_imm_ops = { .destroy = nft_immediate_destroy, .dump = nft_immediate_dump, .validate = nft_immediate_validate, - .reduce = nft_immediate_reduce, .offload = nft_immediate_offload, .offload_action = nft_immediate_offload_action, }; diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index de1b6066bfa8..e845779268d3 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -125,7 +125,6 @@ static const struct nft_expr_ops nft_last_ops = { .destroy = nft_last_destroy, .clone = nft_last_clone, .dump = nft_last_dump, - .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_last_type __read_mostly = { diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 21d26b79b460..0daeb0b23c20 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -243,7 +243,6 @@ static const struct nft_expr_ops nft_limit_pkts_ops = { .destroy = nft_limit_pkts_destroy, .clone = nft_limit_pkts_clone, .dump = nft_limit_pkts_dump, - .reduce = 
NFT_REDUCE_READONLY, }; static void nft_limit_bytes_eval(const struct nft_expr *expr, @@ -299,7 +298,6 @@ static const struct nft_expr_ops nft_limit_bytes_ops = { .dump = nft_limit_bytes_dump, .clone = nft_limit_bytes_clone, .destroy = nft_limit_bytes_destroy, - .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index bf01cf8a8907..da0c0d1c9cea 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -235,7 +235,6 @@ static const struct nft_expr_ops nft_log_ops = { .init = nft_log_init, .destroy = nft_log_destroy, .dump = nft_log_dump, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_log_type __read_mostly = { diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index fc2d7c5d83c8..e4e619027542 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -266,17 +266,6 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, return 0; } -static bool nft_lookup_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_lookup *priv = nft_expr_priv(expr); - - if (priv->set->flags & NFT_SET_MAP) - nft_reg_track_cancel(track, priv->dreg, priv->set->dlen); - - return false; -} - static const struct nft_expr_ops nft_lookup_ops = { .type = &nft_lookup_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), @@ -287,7 +276,6 @@ static const struct nft_expr_ops nft_lookup_ops = { .destroy = nft_lookup_destroy, .dump = nft_lookup_dump, .validate = nft_lookup_validate, - .reduce = nft_lookup_reduce, }; struct nft_expr_type nft_lookup_type __read_mostly = { diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 868bd4d73555..2b01128737a3 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -143,7 +143,6 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = { .destroy = nft_masq_ipv4_destroy, .dump = nft_masq_dump, .validate = nft_masq_validate, - .reduce = 
NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { @@ -171,7 +170,6 @@ static const struct nft_expr_ops nft_masq_ipv6_ops = { .destroy = nft_masq_ipv6_destroy, .dump = nft_masq_dump, .validate = nft_masq_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { @@ -213,7 +211,6 @@ static const struct nft_expr_ops nft_masq_inet_ops = { .destroy = nft_masq_inet_destroy, .dump = nft_masq_dump, .validate = nft_masq_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_masq_inet_type __read_mostly = { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 05cd1e6e6a2f..983158274c68 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -742,60 +742,16 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx, return 0; } -bool nft_meta_get_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_meta *priv = nft_expr_priv(expr); - const struct nft_meta *meta; - - if (!nft_reg_track_cmp(track, expr, priv->dreg)) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - meta = nft_expr_priv(track->regs[priv->dreg].selector); - if (priv->key != meta->key || - priv->dreg != meta->dreg) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - if (!track->regs[priv->dreg].bitwise) - return true; - - return nft_expr_reduce_bitwise(track, expr); -} -EXPORT_SYMBOL_GPL(nft_meta_get_reduce); - static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_get_eval, .init = nft_meta_get_init, .dump = nft_meta_get_dump, - .reduce = nft_meta_get_reduce, .validate = nft_meta_get_validate, .offload = nft_meta_get_offload, }; -static bool nft_meta_set_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - int i; - - for (i = 0; i < NFT_REG32_NUM; i++) 
{ - if (!track->regs[i].selector) - continue; - - if (track->regs[i].selector->ops != &nft_meta_get_ops) - continue; - - __nft_reg_track_cancel(track, i); - } - - return false; -} - static const struct nft_expr_ops nft_meta_set_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), @@ -803,7 +759,6 @@ static const struct nft_expr_ops nft_meta_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, - .reduce = nft_meta_set_reduce, .validate = nft_meta_set_validate, }; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 6e21f72c5b57..e32cd9fbc7c2 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -320,7 +320,6 @@ static const struct nft_expr_ops nft_nat_ops = { .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_nat_type __read_mostly = { @@ -351,7 +350,6 @@ static const struct nft_expr_ops nft_nat_inet_ops = { .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_inet_nat_type __read_mostly = { diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index bd058babfc82..06e87dfd76e7 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -84,16 +84,6 @@ err: return err; } -static bool nft_ng_inc_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_ng_inc *priv = nft_expr_priv(expr); - - nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE); - - return false; -} - static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, u32 modulus, enum nft_ng_types type, u32 offset) { @@ -178,16 +168,6 @@ static int nft_ng_random_dump(struct sk_buff *skb, priv->offset); } -static bool nft_ng_random_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_ng_random *priv = 
nft_expr_priv(expr); - - nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE); - - return false; -} - static struct nft_expr_type nft_ng_type; static const struct nft_expr_ops nft_ng_inc_ops = { .type = &nft_ng_type, @@ -196,7 +176,6 @@ static const struct nft_expr_ops nft_ng_inc_ops = { .init = nft_ng_inc_init, .destroy = nft_ng_inc_destroy, .dump = nft_ng_inc_dump, - .reduce = nft_ng_inc_reduce, }; static const struct nft_expr_ops nft_ng_random_ops = { @@ -205,7 +184,6 @@ static const struct nft_expr_ops nft_ng_random_ops = { .eval = nft_ng_random_eval, .init = nft_ng_random_init, .dump = nft_ng_random_dump, - .reduce = nft_ng_random_reduce, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 1a62e384766a..633cce69568f 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -123,7 +123,6 @@ static const struct nft_expr_ops nft_objref_ops = { .deactivate = nft_objref_deactivate, .dump = nft_objref_dump, .validate = nft_objref_validate, - .reduce = NFT_REDUCE_READONLY, }; struct nft_objref_map { @@ -245,7 +244,6 @@ static const struct nft_expr_ops nft_objref_map_ops = { .destroy = nft_objref_map_destroy, .dump = nft_objref_map_dump, .validate = nft_objref_map_validate, - .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 1c0b493ef0a9..39ccd67ed265 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -127,30 +127,6 @@ static int nft_osf_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hooks); } -static bool nft_osf_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - struct nft_osf *priv = nft_expr_priv(expr); - struct nft_osf *osf; - - if (!nft_reg_track_cmp(track, expr, priv->dreg)) { - nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN); - return false; - } - - osf = 
nft_expr_priv(track->regs[priv->dreg].selector); - if (priv->flags != osf->flags || - priv->ttl != osf->ttl) { - nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN); - return false; - } - - if (!track->regs[priv->dreg].bitwise) - return true; - - return false; -} - static struct nft_expr_type nft_osf_type; static const struct nft_expr_ops nft_osf_op = { .eval = nft_osf_eval, @@ -159,7 +135,6 @@ static const struct nft_expr_ops nft_osf_op = { .dump = nft_osf_dump, .type = &nft_osf_type, .validate = nft_osf_validate, - .reduce = nft_osf_reduce, }; static struct nft_expr_type nft_osf_type __read_mostly = { diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index b0214418f75a..973d56af03ff 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -250,31 +250,6 @@ nla_put_failure: return -1; } -static bool nft_payload_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_payload *priv = nft_expr_priv(expr); - const struct nft_payload *payload; - - if (!nft_reg_track_cmp(track, expr, priv->dreg)) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - payload = nft_expr_priv(track->regs[priv->dreg].selector); - if (priv->base != payload->base || - priv->offset != payload->offset || - priv->len != payload->len) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - if (!track->regs[priv->dreg].bitwise) - return true; - - return nft_expr_reduce_bitwise(track, expr); -} - static bool nft_payload_offload_mask(struct nft_offload_reg *reg, u32 priv_len, u32 field_len) { @@ -578,7 +553,6 @@ static const struct nft_expr_ops nft_payload_ops = { .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, - .reduce = nft_payload_reduce, .offload = nft_payload_offload, }; @@ -588,7 +562,6 @@ const struct nft_expr_ops nft_payload_fast_ops = { .eval = nft_payload_eval, .init = nft_payload_init, .dump = 
nft_payload_dump, - .reduce = nft_payload_reduce, .offload = nft_payload_offload, }; @@ -1012,32 +985,12 @@ nla_put_failure: return -1; } -static bool nft_payload_set_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - int i; - - for (i = 0; i < NFT_REG32_NUM; i++) { - if (!track->regs[i].selector) - continue; - - if (track->regs[i].selector->ops != &nft_payload_ops && - track->regs[i].selector->ops != &nft_payload_fast_ops) - continue; - - __nft_reg_track_cancel(track, i); - } - - return false; -} - static const struct nft_expr_ops nft_payload_set_ops = { .type = &nft_payload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)), .eval = nft_payload_set_eval, .init = nft_payload_set_init, .dump = nft_payload_set_dump, - .reduce = nft_payload_set_reduce, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 344fe311878f..8eb13a02942e 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -191,7 +191,6 @@ static const struct nft_expr_ops nft_queue_ops = { .init = nft_queue_init, .dump = nft_queue_dump, .validate = nft_queue_validate, - .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_queue_sreg_ops = { @@ -201,7 +200,6 @@ static const struct nft_expr_ops nft_queue_sreg_ops = { .init = nft_queue_sreg_init, .dump = nft_queue_sreg_dump, .validate = nft_queue_validate, - .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index cb6c0e04ff67..bb3cf3d16e79 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -266,7 +266,6 @@ static const struct nft_expr_ops nft_quota_ops = { .destroy = nft_quota_destroy, .clone = nft_quota_clone, .dump = nft_quota_dump, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_quota_type __read_mostly = { diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index ea382f7bbd78..cbb02644b836 
100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -138,7 +138,6 @@ static const struct nft_expr_ops nft_range_ops = { .eval = nft_range_eval, .init = nft_range_init, .dump = nft_range_dump, - .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_range_type __read_mostly = { diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 95eedad85c83..58ae802db8f5 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -146,7 +146,6 @@ static const struct nft_expr_ops nft_redir_ipv4_ops = { .destroy = nft_redir_ipv4_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { @@ -174,7 +173,6 @@ static const struct nft_expr_ops nft_redir_ipv6_ops = { .destroy = nft_redir_ipv6_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { @@ -203,7 +201,6 @@ static const struct nft_expr_ops nft_redir_inet_ops = { .destroy = nft_redir_inet_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_redir_inet_type __read_mostly = { diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 49020e67304a..dcae83ddc32e 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -79,7 +79,6 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .init = nft_reject_init, .dump = nft_reject_dump, .validate = nft_reject_inet_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c index 2558ce1505d9..b53e81e4ca75 100644 --- a/net/netfilter/nft_reject_netdev.c +++ b/net/netfilter/nft_reject_netdev.c @@ -158,7 +158,6 @@ static const struct nft_expr_ops 
nft_reject_netdev_ops = { .init = nft_reject_init, .dump = nft_reject_dump, .validate = nft_reject_netdev_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_reject_netdev_type __read_mostly = { diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index dc50b9a5bd68..ad527f3596c0 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -195,7 +195,6 @@ static const struct nft_expr_ops nft_rt_get_ops = { .init = nft_rt_get_init, .dump = nft_rt_get_dump, .validate = nft_rt_validate, - .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_rt_type __read_mostly = { diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 36affbb697c2..c55a1310226a 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -249,31 +249,6 @@ static int nft_socket_dump(struct sk_buff *skb, return 0; } -static bool nft_socket_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_socket *priv = nft_expr_priv(expr); - const struct nft_socket *socket; - - if (!nft_reg_track_cmp(track, expr, priv->dreg)) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - socket = nft_expr_priv(track->regs[priv->dreg].selector); - if (priv->key != socket->key || - priv->dreg != socket->dreg || - priv->level != socket->level) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - if (!track->regs[priv->dreg].bitwise) - return true; - - return nft_expr_reduce_bitwise(track, expr); -} - static int nft_socket_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -296,7 +271,6 @@ static const struct nft_expr_ops nft_socket_ops = { .init = nft_socket_init, .dump = nft_socket_dump, .validate = nft_socket_validate, - .reduce = nft_socket_reduce, }; static struct nft_expr_type nft_socket_type __read_mostly = { diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index b71ef18b0e8c..8e452a874969 100644 
--- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -292,7 +292,6 @@ static const struct nft_expr_ops nft_synproxy_ops = { .dump = nft_synproxy_dump, .type = &nft_synproxy_type, .validate = nft_synproxy_validate, - .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_synproxy_type __read_mostly = { diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index 50481280abd2..f2101af8c867 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -331,7 +331,6 @@ static const struct nft_expr_ops nft_tproxy_ops = { .init = nft_tproxy_init, .destroy = nft_tproxy_destroy, .dump = nft_tproxy_dump, - .reduce = NFT_REDUCE_READONLY, .validate = nft_tproxy_validate, }; diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index a12486ae089d..f5cadba91417 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -124,31 +124,6 @@ nla_put_failure: return -1; } -static bool nft_tunnel_get_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_tunnel *priv = nft_expr_priv(expr); - const struct nft_tunnel *tunnel; - - if (!nft_reg_track_cmp(track, expr, priv->dreg)) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - tunnel = nft_expr_priv(track->regs[priv->dreg].selector); - if (priv->key != tunnel->key || - priv->dreg != tunnel->dreg || - priv->mode != tunnel->mode) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - if (!track->regs[priv->dreg].bitwise) - return true; - - return false; -} - static struct nft_expr_type nft_tunnel_type; static const struct nft_expr_ops nft_tunnel_get_ops = { .type = &nft_tunnel_type, @@ -156,7 +131,6 @@ static const struct nft_expr_ops nft_tunnel_get_ops = { .eval = nft_tunnel_get_eval, .init = nft_tunnel_get_init, .dump = nft_tunnel_get_dump, - .reduce = nft_tunnel_get_reduce, }; static struct nft_expr_type nft_tunnel_type __read_mostly = { diff 
--git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 3210cfc966ab..7ffe6a2690d1 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -259,32 +259,6 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e return nft_chain_validate_hooks(ctx->chain, hooks); } -static bool nft_xfrm_reduce(struct nft_regs_track *track, - const struct nft_expr *expr) -{ - const struct nft_xfrm *priv = nft_expr_priv(expr); - const struct nft_xfrm *xfrm; - - if (!nft_reg_track_cmp(track, expr, priv->dreg)) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - xfrm = nft_expr_priv(track->regs[priv->dreg].selector); - if (priv->key != xfrm->key || - priv->dreg != xfrm->dreg || - priv->dir != xfrm->dir || - priv->spnum != xfrm->spnum) { - nft_reg_track_update(track, expr, priv->dreg, priv->len); - return false; - } - - if (!track->regs[priv->dreg].bitwise) - return true; - - return nft_expr_reduce_bitwise(track, expr); -} - static struct nft_expr_type nft_xfrm_type; static const struct nft_expr_ops nft_xfrm_get_ops = { .type = &nft_xfrm_type, @@ -293,7 +267,6 @@ static const struct nft_expr_ops nft_xfrm_get_ops = { .init = nft_xfrm_get_init, .dump = nft_xfrm_get_dump, .validate = nft_xfrm_validate, - .reduce = nft_xfrm_reduce, }; static struct nft_expr_type nft_xfrm_type __read_mostly = { -- cgit v1.2.3 From 15c9ed1d8286dc0297f01347dc74f5a8cbc173de Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Tue, 24 Feb 2026 09:50:52 +0800 Subject: pppoe: remove kernel-mode relay support The kernel-mode PPPoE relay feature and its two associated ioctls (PPPOEIOCSFWD and PPPOEIOCDFWD) are not used by any existing userspace PPPoE implementations. The most commonly-used package, RP-PPPoE [1], handles the relaying entirely in userspace. This legacy code has remained in the driver since its introduction in kernel 2.3.99-pre7 for over two decades, but has served no practical purpose. Remove the unused relay code. 
[1] https://dianne.skoll.ca/projects/rp-pppoe/ Signed-off-by: Qingfang Deng Acked-by: Arnd Bergmann Reviewed-by: Guillaume Nault Link: https://patch.msgid.link/20260224015053.42472-1-dqfext@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ppp/pppoe.c | 79 ------------------------------------------- drivers/net/ppp/pppox.c | 3 -- include/linux/if_pppox.h | 6 ---- include/uapi/linux/if_pppox.h | 10 ------ 4 files changed, 98 deletions(-) (limited to 'include') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 7900cc3212a5..1ac61c273b28 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -237,25 +237,6 @@ static inline struct pppox_sock *get_item(struct pppoe_net *pn, __be16 sid, return po; } -static inline struct pppox_sock *__get_item_by_addr(struct net *net, - struct sockaddr_pppox *sp) -{ - struct net_device *dev; - struct pppoe_net *pn; - struct pppox_sock *pppox_sock = NULL; - - int ifindex; - - dev = dev_get_by_name_rcu(net, sp->sa_addr.pppoe.dev); - if (dev) { - ifindex = dev->ifindex; - pn = pppoe_pernet(net); - pppox_sock = __get_item(pn, sp->sa_addr.pppoe.sid, - sp->sa_addr.pppoe.remote, ifindex); - } - return pppox_sock; -} - static inline void delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { @@ -369,7 +350,6 @@ static struct notifier_block pppoe_notifier = { static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); - struct pppox_sock *relay_po; /* Backlog receive. 
Semantics of backlog rcv preclude any code from * executing in lock_sock()/release_sock() bounds; meaning sk->sk_state @@ -378,17 +358,6 @@ static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb) if (sk->sk_state & PPPOX_BOUND) { ppp_input(&po->chan, skb); - } else if (sk->sk_state & PPPOX_RELAY) { - relay_po = __get_item_by_addr(sock_net(sk), - &po->pppoe_relay); - if (relay_po == NULL) - goto abort_kfree; - - if ((sk_pppox(relay_po)->sk_state & PPPOX_CONNECTED) == 0) - goto abort_kfree; - - if (!__pppoe_xmit(sk_pppox(relay_po), skb)) - goto abort_kfree; } else { if (sock_queue_rcv_skb(sk, skb)) goto abort_kfree; @@ -656,7 +625,6 @@ static int pppoe_connect(struct socket *sock, struct sockaddr_unsized *uservaddr po->pppoe_ifindex = 0; memset(&po->pppoe_pa, 0, sizeof(po->pppoe_pa)); - memset(&po->pppoe_relay, 0, sizeof(po->pppoe_relay)); memset(&po->chan, 0, sizeof(po->chan)); po->next = NULL; po->num = 0; @@ -783,53 +751,6 @@ static int pppoe_ioctl(struct socket *sock, unsigned int cmd, err = 0; break; - case PPPOEIOCSFWD: - { - struct pppox_sock *relay_po; - - err = -EBUSY; - if (sk->sk_state & (PPPOX_BOUND | PPPOX_DEAD)) - break; - - err = -ENOTCONN; - if (!(sk->sk_state & PPPOX_CONNECTED)) - break; - - /* PPPoE address from the user specifies an outbound - PPPoE address which frames are forwarded to */ - err = -EFAULT; - if (copy_from_user(&po->pppoe_relay, - (void __user *)arg, - sizeof(struct sockaddr_pppox))) - break; - - err = -EINVAL; - if (po->pppoe_relay.sa_family != AF_PPPOX || - po->pppoe_relay.sa_protocol != PX_PROTO_OE) - break; - - /* Check that the socket referenced by the address - actually exists. 
*/ - rcu_read_lock(); - relay_po = __get_item_by_addr(sock_net(sk), &po->pppoe_relay); - rcu_read_unlock(); - if (!relay_po) - break; - - sk->sk_state |= PPPOX_RELAY; - err = 0; - break; - } - - case PPPOEIOCDFWD: - err = -EALREADY; - if (!(sk->sk_state & PPPOX_RELAY)) - break; - - sk->sk_state &= ~PPPOX_RELAY; - err = 0; - break; - default: err = -ENOTTY; } diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c index 08364f10a43f..5861a2f6ce3e 100644 --- a/drivers/net/ppp/pppox.c +++ b/drivers/net/ppp/pppox.c @@ -102,9 +102,6 @@ EXPORT_SYMBOL(pppox_ioctl); #ifdef CONFIG_COMPAT int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - if (cmd == PPPOEIOCSFWD32) - cmd = PPPOEIOCSFWD; - return pppox_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index db45d6f1c4f4..8bbf676c2a85 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -25,8 +25,6 @@ struct pppoe_opt { struct net_device *dev; /* device associated with socket*/ int ifindex; /* ifindex of device associated with socket */ struct pppoe_addr pa; /* what this socket is bound to*/ - struct sockaddr_pppox relay; /* what socket data will be - relayed to (PPPoE relaying) */ struct work_struct padt_work;/* Work item for handling PADT */ }; @@ -53,7 +51,6 @@ struct pppox_sock { #define pppoe_dev proto.pppoe.dev #define pppoe_ifindex proto.pppoe.ifindex #define pppoe_pa proto.pppoe.pa -#define pppoe_relay proto.pppoe.relay static inline struct pppox_sock *pppox_sk(struct sock *sk) { @@ -80,14 +77,11 @@ extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */ extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); extern int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); -#define PPPOEIOCSFWD32 _IOW(0xB1 ,0, compat_size_t) - /* PPPoX socket states */ enum { PPPOX_NONE = 0, /* initial state */ PPPOX_CONNECTED = 1, /* 
connection established ==TCP_ESTABLISHED */ PPPOX_BOUND = 2, /* bound to ppp device */ - PPPOX_RELAY = 4, /* forwarding is enabled */ PPPOX_DEAD = 16 /* dead, useless, please clean me up!*/ }; diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h index 29b804aa7474..7ae044d71fb7 100644 --- a/include/uapi/linux/if_pppox.h +++ b/include/uapi/linux/if_pppox.h @@ -103,16 +103,6 @@ struct sockaddr_pppol2tpv3in6 { struct pppol2tpv3in6_addr pppol2tp; } __packed; -/********************************************************************* - * - * ioctl interface for defining forwarding of connections - * - ********************************************************************/ - -#define PPPOEIOCSFWD _IOW(0xB1 ,0, size_t) -#define PPPOEIOCDFWD _IO(0xB1 ,1) -/*#define PPPOEIOCGFWD _IOWR(0xB1,2, size_t)*/ - /* Codes to identify message types */ #define PADI_CODE 0x09 #define PADO_CODE 0x07 -- cgit v1.2.3 From 4916f2e2f3fc9aef289fcd07949301e5c29094c2 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 24 Feb 2026 02:02:14 +0000 Subject: bonding: print churn state via netlink Currently, the churn state is printed only in sysfs. Add netlink support so users could get the state via netlink. 
Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20260224020215.6012-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_netlink.c | 9 +++++++++ include/uapi/linux/if_link.h | 2 ++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 286f11c517f7..ea1a80e658ae 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -29,6 +29,8 @@ static size_t bond_get_slave_size(const struct net_device *bond_dev, nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */ nla_total_size(sizeof(s32)) + /* IFLA_BOND_SLAVE_PRIO */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_ACTOR_PORT_PRIO */ + nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_AD_CHURN_ACTOR_STATE */ + nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_AD_CHURN_PARTNER_STATE */ 0; } @@ -77,6 +79,13 @@ static int bond_fill_slave_info(struct sk_buff *skb, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, ad_port->partner_oper.port_state)) goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_CHURN_ACTOR_STATE, + ad_port->sm_churn_actor_state)) + goto nla_put_failure; + if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_CHURN_PARTNER_STATE, + ad_port->sm_churn_partner_state)) + goto nla_put_failure; } if (nla_put_u16(skb, IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e9b5f79e1ee1..83a96c56b8ca 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1568,6 +1568,8 @@ enum { IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, IFLA_BOND_SLAVE_PRIO, IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, + IFLA_BOND_SLAVE_AD_CHURN_ACTOR_STATE, + IFLA_BOND_SLAVE_AD_CHURN_PARTNER_STATE, __IFLA_BOND_SLAVE_MAX, }; -- cgit v1.2.3 From 1ae2f435350ec05224a39995c3a680aa6fdae5a5 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 23 Feb 2026 16:29:37 +0100 Subject: ACPI: x86: cmos_rtc: Create a CMOS RTC platform device Make the CMOS RTC ACPI scan handler create a platform device that will be used subsequently by rtc-cmos for driver binding on x86 systems with ACPI and update add_rtc_cmos() to skip registering a fallback platform device for the CMOS RTC when the above one has been registered. Signed-off-by: Rafael J. Wysocki Acked-by: Dave Hansen # x86 Link: https://patch.msgid.link/1962427.tdWV9SEqCh@rafael.j.wysocki --- arch/x86/kernel/rtc.c | 4 ++++ drivers/acpi/x86/cmos_rtc.c | 8 ++++++++ include/linux/acpi.h | 4 ++++ 3 files changed, 16 insertions(+) (limited to 'include') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 51a849a79c98..b112178e8185 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -2,6 +2,7 @@ /* * RTC related functions */ +#include #include #include #include @@ -146,6 +147,9 @@ static __init int add_rtc_cmos(void) } } #endif + if (cmos_rtc_platform_device_present) + return 0; + if (!x86_platform.legacy.rtc) return -ENODEV; diff --git a/drivers/acpi/x86/cmos_rtc.c b/drivers/acpi/x86/cmos_rtc.c index 45db7e51cbe6..bdd66dfd4a44 100644 --- a/drivers/acpi/x86/cmos_rtc.c +++ b/drivers/acpi/x86/cmos_rtc.c @@ -24,6 +24,8 @@ static const struct acpi_device_id acpi_cmos_rtc_ids[] = { {} }; +bool cmos_rtc_platform_device_present; + static bool cmos_rtc_space_handler_present __read_mostly; static acpi_status acpi_cmos_rtc_space_handler(u32 function, @@ -103,6 +105,12 @@ static int acpi_cmos_rtc_attach(struct acpi_device *adev, if (ret < 0) return ret; + if (IS_ERR_OR_NULL(acpi_create_platform_device(adev, NULL))) { + pr_err("Failed to create CMOS-RTC platform device\n"); + return 0; + } else { + cmos_rtc_platform_device_present = true; + } return 1; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4d2f0bed7a06..2bdb801cee01 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -791,6 +791,8 @@ const char 
*acpi_get_subsystem_id(acpi_handle handle); int acpi_mrrm_max_mem_region(void); #endif +extern bool cmos_rtc_platform_device_present; + #else /* !CONFIG_ACPI */ #define acpi_disabled 1 @@ -1116,6 +1118,8 @@ static inline int acpi_mrrm_max_mem_region(void) return 1; } +#define cmos_rtc_platform_device_present false + #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HMAT -- cgit v1.2.3 From 2a78e42104444f948698f1225deaf515e9b7224d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Feb 2026 16:30:21 +0100 Subject: ACPI: x86/rtc-cmos: Use platform device for driver binding Modify the rtc-cmos driver to bind to a platform device on systems with ACPI via acpi_match_table and advertise the CMOS RTC ACPI device IDs for driver auto-loading. Note that adding the requisite device IDs to it and exposing them via MODULE_DEVICE_TABLE() is sufficient for this purpose. Since the ACPI device IDs in question are the same as for the CMOS RTC ACPI scan handler, put them into a common header file and use the definition from there in both places. Additionally, to prevent a PNP device from being created for the CMOS RTC if a platform one is present already, make is_cmos_rtc_device() check cmos_rtc_platform_device_present introduced previously. Signed-off-by: Rafael J.
Wysocki Acked-by: Alexandre Belloni Link: https://patch.msgid.link/13969123.uLZWGnKmhe@rafael.j.wysocki --- drivers/acpi/acpi_pnp.c | 2 +- drivers/acpi/x86/cmos_rtc.c | 5 +---- drivers/rtc/rtc-cmos.c | 10 ++++++++++ include/linux/acpi.h | 6 ++++++ 4 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpi_pnp.c b/drivers/acpi/acpi_pnp.c index 85d9f78619a2..4ad8f56d1a5d 100644 --- a/drivers/acpi/acpi_pnp.c +++ b/drivers/acpi/acpi_pnp.c @@ -368,7 +368,7 @@ static int is_cmos_rtc_device(struct acpi_device *adev) { "PNP0B02" }, {""}, }; - return !acpi_match_device_ids(adev, ids); + return !cmos_rtc_platform_device_present && !acpi_match_device_ids(adev, ids); } bool acpi_is_pnp_device(struct acpi_device *adev) diff --git a/drivers/acpi/x86/cmos_rtc.c b/drivers/acpi/x86/cmos_rtc.c index bdd66dfd4a44..a6df5b991c96 100644 --- a/drivers/acpi/x86/cmos_rtc.c +++ b/drivers/acpi/x86/cmos_rtc.c @@ -18,10 +18,7 @@ #include "../internal.h" static const struct acpi_device_id acpi_cmos_rtc_ids[] = { - { "PNP0B00" }, - { "PNP0B01" }, - { "PNP0B02" }, - {} + ACPI_CMOS_RTC_IDS }; bool cmos_rtc_platform_device_present; diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 0743c6acd6e2..7457f42fd6f0 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -27,6 +27,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -1476,6 +1477,14 @@ static __init void cmos_of_init(struct platform_device *pdev) #else static inline void cmos_of_init(struct platform_device *pdev) {} #endif + +#ifdef CONFIG_ACPI +static const struct acpi_device_id acpi_cmos_rtc_ids[] = { + ACPI_CMOS_RTC_IDS +}; +MODULE_DEVICE_TABLE(acpi, acpi_cmos_rtc_ids); +#endif + /*----------------------------------------------------------------*/ /* Platform setup should have set up an RTC device, when PNP is @@ -1530,6 +1539,7 @@ static struct platform_driver cmos_platform_driver = { .name = driver_name, .pm = &cmos_pm_ops, 
.of_match_table = of_match_ptr(of_cmos_match), + .acpi_match_table = ACPI_PTR(acpi_cmos_rtc_ids), } }; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 2bdb801cee01..5ecdcdaf31aa 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -791,6 +791,12 @@ const char *acpi_get_subsystem_id(acpi_handle handle); int acpi_mrrm_max_mem_region(void); #endif +#define ACPI_CMOS_RTC_IDS \ + { "PNP0B00", }, \ + { "PNP0B01", }, \ + { "PNP0B02", }, \ + { "", } + extern bool cmos_rtc_platform_device_present; #else /* !CONFIG_ACPI */ -- cgit v1.2.3 From e8d1eb65193ce93283f8f56a069eee5d548a6b70 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Feb 2026 16:33:27 +0100 Subject: ACPI: TAD/x86: cmos_rtc: Consolidate address space handler setup On x86, as a rule the CMOS RTC address space handler is set up by the CMOS RTC ACPI scan handler attach callback, acpi_cmos_rtc_attach(), but if the ACPI namespace does not contain a CMOS RTC device object, the CMOS RTC address space handler installation is taken care of the ACPI TAD (Timer and Alarm Device) driver. This is not particularly straightforward and can be avoided by adding the ACPI TAD device ID to the CMOS RTC ACPI scan handler which will cause it to create a platform device for ACPI TAD after installing the CMOS RTC address space handler. One related detail that needs to be taken care of, though, is that the creation of an ACPI TAD platform device should not cause cmos_rtc_platform_device_present to be set, since this may cause add_rtc_cmos() to suppress the creation of a fallback CMOS RTC platform device which may not be the right thing to do (for instance, due to the fact that the ACPI TAD driver is missing an RTC class device interface). 
After doing the above, the CMOS RTC address space handler installation and removal can be dropped from the ACPI TAD driver (which allows it to be simplified quite a bit), acpi_remove_cmos_rtc_space_handler() can be dropped and acpi_install_cmos_rtc_space_handler() can be made static. Update the code as per the above. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/23028644.EfDdHjke4D@rafael.j.wysocki --- drivers/acpi/acpi_tad.c | 27 +++++---------------------- drivers/acpi/x86/cmos_rtc.c | 26 +++++--------------------- include/acpi/acpi_bus.h | 9 --------- 3 files changed, 10 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpi_tad.c b/drivers/acpi/acpi_tad.c index 6d870d97ada6..4f5089fc023d 100644 --- a/drivers/acpi/acpi_tad.c +++ b/drivers/acpi/acpi_tad.c @@ -563,7 +563,6 @@ static int acpi_tad_disable_timer(struct device *dev, u32 timer_id) static void acpi_tad_remove(struct platform_device *pdev) { struct device *dev = &pdev->dev; - acpi_handle handle = ACPI_HANDLE(dev); struct acpi_tad_driver_data *dd = dev_get_drvdata(dev); device_init_wakeup(dev, false); @@ -587,7 +586,6 @@ static void acpi_tad_remove(struct platform_device *pdev) pm_runtime_suspend(dev); pm_runtime_disable(dev); - acpi_remove_cmos_rtc_space_handler(handle); } static int acpi_tad_probe(struct platform_device *pdev) @@ -599,11 +597,6 @@ static int acpi_tad_probe(struct platform_device *pdev) unsigned long long caps; int ret; - ret = acpi_install_cmos_rtc_space_handler(handle); - if (ret < 0) { - dev_info(dev, "Unable to install space handler\n"); - return -ENODEV; - } /* * Initialization failure messages are mostly about firmware issues, so * print them at the "info" level. 
@@ -611,27 +604,22 @@ static int acpi_tad_probe(struct platform_device *pdev) status = acpi_evaluate_integer(handle, "_GCP", NULL, &caps); if (ACPI_FAILURE(status)) { dev_info(dev, "Unable to get capabilities\n"); - ret = -ENODEV; - goto remove_handler; + return -ENODEV; } if (!(caps & ACPI_TAD_AC_WAKE)) { dev_info(dev, "Unsupported capabilities\n"); - ret = -ENODEV; - goto remove_handler; + return -ENODEV; } if (!acpi_has_method(handle, "_PRW")) { dev_info(dev, "Missing _PRW\n"); - ret = -ENODEV; - goto remove_handler; + return -ENODEV; } dd = devm_kzalloc(dev, sizeof(*dd), GFP_KERNEL); - if (!dd) { - ret = -ENOMEM; - goto remove_handler; - } + if (!dd) + return -ENOMEM; dd->capabilities = caps; dev_set_drvdata(dev, dd); @@ -673,11 +661,6 @@ static int acpi_tad_probe(struct platform_device *pdev) fail: acpi_tad_remove(pdev); - /* Don't fallthrough because cmos rtc space handler is removed in acpi_tad_remove() */ - return ret; - -remove_handler: - acpi_remove_cmos_rtc_space_handler(handle); return ret; } diff --git a/drivers/acpi/x86/cmos_rtc.c b/drivers/acpi/x86/cmos_rtc.c index a6df5b991c96..ced334e19896 100644 --- a/drivers/acpi/x86/cmos_rtc.c +++ b/drivers/acpi/x86/cmos_rtc.c @@ -18,13 +18,12 @@ #include "../internal.h" static const struct acpi_device_id acpi_cmos_rtc_ids[] = { + { "ACPI000E", 1 }, /* ACPI Time and Alarm Device (TAD) */ ACPI_CMOS_RTC_IDS }; bool cmos_rtc_platform_device_present; -static bool cmos_rtc_space_handler_present __read_mostly; - static acpi_status acpi_cmos_rtc_space_handler(u32 function, acpi_physical_address address, u32 bits, u64 *value64, @@ -56,8 +55,9 @@ static acpi_status acpi_cmos_rtc_space_handler(u32 function, return AE_BAD_PARAMETER; } -int acpi_install_cmos_rtc_space_handler(acpi_handle handle) +static int acpi_install_cmos_rtc_space_handler(acpi_handle handle) { + static bool cmos_rtc_space_handler_present __read_mostly; acpi_status status; if (cmos_rtc_space_handler_present) @@ -76,22 +76,6 @@ int 
acpi_install_cmos_rtc_space_handler(acpi_handle handle) return 1; } -EXPORT_SYMBOL_GPL(acpi_install_cmos_rtc_space_handler); - -void acpi_remove_cmos_rtc_space_handler(acpi_handle handle) -{ - acpi_status status; - - if (cmos_rtc_space_handler_present) - return; - - status = acpi_remove_address_space_handler(handle, - ACPI_ADR_SPACE_CMOS, - acpi_cmos_rtc_space_handler); - if (ACPI_FAILURE(status)) - pr_err("Failed to remove CMOS-RTC address space handler\n"); -} -EXPORT_SYMBOL_GPL(acpi_remove_cmos_rtc_space_handler); static int acpi_cmos_rtc_attach(struct acpi_device *adev, const struct acpi_device_id *id) @@ -103,9 +87,9 @@ static int acpi_cmos_rtc_attach(struct acpi_device *adev, return ret; if (IS_ERR_OR_NULL(acpi_create_platform_device(adev, NULL))) { - pr_err("Failed to create CMOS-RTC platform device\n"); + pr_err("Failed to create a platform device for %s\n", (char *)id->id); return 0; - } else { + } else if (!id->driver_data) { cmos_rtc_platform_device_present = true; } return 1; diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index aad1a95e6863..be6d9032a161 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -760,8 +760,6 @@ int acpi_disable_wakeup_device_power(struct acpi_device *dev); #ifdef CONFIG_X86 bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status); bool acpi_quirk_skip_acpi_ac_and_battery(void); -int acpi_install_cmos_rtc_space_handler(acpi_handle handle); -void acpi_remove_cmos_rtc_space_handler(acpi_handle handle); int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip); #else static inline bool acpi_device_override_status(struct acpi_device *adev, @@ -773,13 +771,6 @@ static inline bool acpi_quirk_skip_acpi_ac_and_battery(void) { return false; } -static inline int acpi_install_cmos_rtc_space_handler(acpi_handle handle) -{ - return 1; -} -static inline void acpi_remove_cmos_rtc_space_handler(acpi_handle handle) -{ -} static inline int 
acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip) { -- cgit v1.2.3 From 501efdcb3b3ab099fc0ce2f6e668b1c4095dd476 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 25 Feb 2026 14:01:18 +0000 Subject: ASoC: SDCA: Pull the Q7.8 volume helpers out of soc-ops It is cleaner to keep the SDCA code contained and not update the core code for things that are unlikely to see reuse outside of SDCA. Move the Q7.8 volume helpers back into the SDCA core code. Reviewed-by: Pierre-Louis Bossart Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260225140118.402695-5-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc.h | 1 - sound/soc/sdca/sdca_asoc.c | 67 +++++++++++++++++++++++++++++++++++++++++++++- sound/soc/soc-ops.c | 56 ++++++++------------------------------ 3 files changed, 77 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 7d8376c8e1be..172bd68e1315 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1239,7 +1239,6 @@ struct soc_mixer_control { unsigned int sign_bit; unsigned int invert:1; unsigned int autodisable:1; - unsigned int sdca_q78:1; #ifdef CONFIG_SND_SOC_TOPOLOGY struct snd_soc_dobj dobj; #endif diff --git a/sound/soc/sdca/sdca_asoc.c b/sound/soc/sdca/sdca_asoc.c index e6f7c2778bec..a342a4e56717 100644 --- a/sound/soc/sdca/sdca_asoc.c +++ b/sound/soc/sdca/sdca_asoc.c @@ -805,6 +805,70 @@ int sdca_asoc_populate_dapm(struct device *dev, struct sdca_function_data *funct } EXPORT_SYMBOL_NS(sdca_asoc_populate_dapm, "SND_SOC_SDCA"); +static int q78_write(struct snd_soc_component *component, + struct soc_mixer_control *mc, + unsigned int reg, const int val) +{ + unsigned int mask = GENMASK(mc->sign_bit, 0); + unsigned int reg_val; + + if (val < 0 || val > mc->max - mc->min) + return -EINVAL; + + reg_val = (val + mc->min) * mc->shift; + + return snd_soc_component_update_bits(component, reg, mask, reg_val); +} + +static 
int q78_put_volsw(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct soc_mixer_control *mc = (struct soc_mixer_control *)kcontrol->private_value; + struct snd_soc_component *component = snd_kcontrol_chip(kcontrol); + int ret; + + ret = q78_write(component, mc, mc->reg, ucontrol->value.integer.value[0]); + if (ret < 0) + return ret; + + if (snd_soc_volsw_is_stereo(mc)) { + int err; /* Don't drop change flag */ + + err = q78_write(component, mc, mc->rreg, ucontrol->value.integer.value[1]); + if (err) + return err; + } + + return ret; +} + +static int q78_read(struct snd_soc_component *component, + struct soc_mixer_control *mc, unsigned int reg) +{ + unsigned int reg_val; + int val; + + reg_val = snd_soc_component_read(component, reg); + + val = (sign_extend32(reg_val, mc->sign_bit) / mc->shift) - mc->min; + + return val & GENMASK(mc->sign_bit, 0); +} + +static int q78_get_volsw(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct soc_mixer_control *mc = (struct soc_mixer_control *)kcontrol->private_value; + struct snd_soc_component *component = snd_kcontrol_chip(kcontrol); + + ucontrol->value.integer.value[0] = q78_read(component, mc, mc->reg); + + if (snd_soc_volsw_is_stereo(mc)) + ucontrol->value.integer.value[1] = q78_read(component, mc, mc->rreg); + + return 0; +} + static int control_limit_kctl(struct device *dev, struct sdca_entity *entity, struct sdca_control *control, @@ -845,10 +909,11 @@ static int control_limit_kctl(struct device *dev, mc->max = max / step; mc->shift = step; mc->sign_bit = 15; - mc->sdca_q78 = 1; kctl->tlv.p = tlv; kctl->access |= SNDRV_CTL_ELEM_ACCESS_TLV_READ; + kctl->get = q78_get_volsw; + kctl->put = q78_put_volsw; return 0; } diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 8ae6609ca961..0d633f38cfdc 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -110,29 +110,6 @@ int snd_soc_put_enum_double(struct snd_kcontrol *kcontrol, } 
EXPORT_SYMBOL_GPL(snd_soc_put_enum_double); -static int sdca_soc_q78_reg_to_ctl(struct soc_mixer_control *mc, unsigned int reg_val, - unsigned int mask, unsigned int shift, int max, - bool sx) -{ - int val = reg_val; - - if (WARN_ON(!mc->shift)) - return -EINVAL; - - val = sign_extend32(val, mc->sign_bit); - - return ((val / mc->shift) - mc->min) & mask; -} - -static unsigned int sdca_soc_q78_ctl_to_reg(struct soc_mixer_control *mc, int val, - unsigned int mask, unsigned int shift, int max) -{ - if (WARN_ON(!mc->shift)) - return -EINVAL; - - return ((val + mc->min) * mc->shift) & mask; -} - static int soc_mixer_reg_to_ctl(struct soc_mixer_control *mc, unsigned int reg_val, unsigned int mask, unsigned int shift, int max, bool sx) @@ -226,27 +203,19 @@ static int soc_put_volsw(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol, struct soc_mixer_control *mc, int mask, int max) { - unsigned int (*ctl_to_reg)(struct soc_mixer_control *, int, unsigned int, unsigned int, int); struct snd_soc_component *component = snd_kcontrol_chip(kcontrol); unsigned int val1, val_mask; unsigned int val2 = 0; bool double_r = false; int ret; - if (mc->sdca_q78) { - ctl_to_reg = sdca_soc_q78_ctl_to_reg; - val_mask = mask; - } else { - ctl_to_reg = soc_mixer_ctl_to_reg; - val_mask = mask << mc->shift; - } - ret = soc_mixer_valid_ctl(mc, ucontrol->value.integer.value[0], max); if (ret) return ret; - val1 = ctl_to_reg(mc, ucontrol->value.integer.value[0], + val1 = soc_mixer_ctl_to_reg(mc, ucontrol->value.integer.value[0], mask, mc->shift, max); + val_mask = mask << mc->shift; if (snd_soc_volsw_is_stereo(mc)) { ret = soc_mixer_valid_ctl(mc, ucontrol->value.integer.value[1], max); @@ -254,10 +223,14 @@ static int soc_put_volsw(struct snd_kcontrol *kcontrol, return ret; if (mc->reg == mc->rreg) { - val1 |= ctl_to_reg(mc, ucontrol->value.integer.value[1], mask, mc->rshift, max); + val1 |= soc_mixer_ctl_to_reg(mc, + ucontrol->value.integer.value[1], + mask, mc->rshift, max); 
val_mask |= mask << mc->rshift; } else { - val2 = ctl_to_reg(mc, ucontrol->value.integer.value[1], mask, mc->shift, max); + val2 = soc_mixer_ctl_to_reg(mc, + ucontrol->value.integer.value[1], + mask, mc->shift, max); double_r = true; } } @@ -281,28 +254,21 @@ static int soc_get_volsw(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol, struct soc_mixer_control *mc, int mask, int max, bool sx) { - int (*reg_to_ctl)(struct soc_mixer_control *, unsigned int, unsigned int, - unsigned int, int, bool); struct snd_soc_component *component = snd_kcontrol_chip(kcontrol); unsigned int reg_val; int val; - if (mc->sdca_q78) - reg_to_ctl = sdca_soc_q78_reg_to_ctl; - else - reg_to_ctl = soc_mixer_reg_to_ctl; - reg_val = snd_soc_component_read(component, mc->reg); - val = reg_to_ctl(mc, reg_val, mask, mc->shift, max, sx); + val = soc_mixer_reg_to_ctl(mc, reg_val, mask, mc->shift, max, sx); ucontrol->value.integer.value[0] = val; if (snd_soc_volsw_is_stereo(mc)) { if (mc->reg == mc->rreg) { - val = reg_to_ctl(mc, reg_val, mask, mc->rshift, max, sx); + val = soc_mixer_reg_to_ctl(mc, reg_val, mask, mc->rshift, max, sx); } else { reg_val = snd_soc_component_read(component, mc->rreg); - val = reg_to_ctl(mc, reg_val, mask, mc->shift, max, sx); + val = soc_mixer_reg_to_ctl(mc, reg_val, mask, mc->shift, max, sx); } ucontrol->value.integer.value[1] = val; -- cgit v1.2.3 From 2a62dc74726b03b76bab4641ee54b88b6eb7a1d5 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 25 Feb 2026 16:49:09 +0200 Subject: drm/i915/dpt: move create/destroy to parent interface Move the DPT create/destroy calls to the display parent interface. With this, we can remove the dummy xe implementation. 
Reviewed-by: Juha-Pekka Heikkila Link: https://patch.msgid.link/9753b21466c668872f468ccff827eab7be034b0c.1772030909.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dpt.h | 3 --- drivers/gpu/drm/i915/display/intel_fb.c | 8 ++++---- drivers/gpu/drm/i915/display/intel_parent.c | 17 +++++++++++++++++ drivers/gpu/drm/i915/display/intel_parent.h | 8 ++++++++ drivers/gpu/drm/i915/i915_dpt.c | 11 ++++++++--- drivers/gpu/drm/i915/i915_dpt.h | 9 +++++++++ drivers/gpu/drm/i915/i915_driver.c | 2 ++ drivers/gpu/drm/xe/display/xe_fb_pin.c | 20 -------------------- include/drm/intel/display_parent_interface.h | 9 +++++++++ 9 files changed, 57 insertions(+), 30 deletions(-) create mode 100644 drivers/gpu/drm/i915/i915_dpt.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_dpt.h b/drivers/gpu/drm/i915/display/intel_dpt.h index 79d9bb80941a..e05b3a716310 100644 --- a/drivers/gpu/drm/i915/display/intel_dpt.h +++ b/drivers/gpu/drm/i915/display/intel_dpt.h @@ -8,18 +8,15 @@ #include -struct drm_gem_object; struct i915_address_space; struct i915_vma; struct intel_display; -void intel_dpt_destroy(struct i915_address_space *vm); struct i915_vma *intel_dpt_pin_to_ggtt(struct i915_address_space *vm, unsigned int alignment); void intel_dpt_unpin_from_ggtt(struct i915_address_space *vm); void intel_dpt_suspend(struct intel_display *display); void intel_dpt_resume(struct intel_display *display); -struct i915_address_space *intel_dpt_create(struct drm_gem_object *obj, size_t size); u64 intel_dpt_offset(struct i915_vma *dpt_vma); #endif /* __INTEL_DPT_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_fb.c b/drivers/gpu/drm/i915/display/intel_fb.c index 4ee884639ac2..f718eb139d69 100644 --- a/drivers/gpu/drm/i915/display/intel_fb.c +++ b/drivers/gpu/drm/i915/display/intel_fb.c @@ -16,7 +16,6 @@ #include "intel_display_core.h" #include "intel_display_types.h" #include "intel_display_utils.h" -#include "intel_dpt.h" #include 
"intel_fb.h" #include "intel_fb_bo.h" #include "intel_frontbuffer.h" @@ -2104,12 +2103,13 @@ int intel_plane_compute_gtt(struct intel_plane_state *plane_state) static void intel_user_framebuffer_destroy(struct drm_framebuffer *fb) { + struct intel_display *display = to_intel_display(fb->dev); struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); drm_framebuffer_cleanup(fb); if (intel_fb_uses_dpt(fb)) - intel_dpt_destroy(intel_fb->dpt_vm); + intel_parent_dpt_destroy(display, intel_fb->dpt_vm); intel_fb_bo_framebuffer_fini(intel_fb_bo(fb)); @@ -2311,7 +2311,7 @@ int intel_framebuffer_init(struct intel_framebuffer *intel_fb, if (intel_fb_needs_pot_stride_remap(intel_fb)) size = intel_remapped_info_size(&intel_fb->remapped_view.gtt.remapped); - vm = intel_dpt_create(obj, size); + vm = intel_parent_dpt_create(display, obj, size); if (IS_ERR(vm)) { drm_dbg_kms(display->drm, "failed to create DPT\n"); ret = PTR_ERR(vm); @@ -2331,7 +2331,7 @@ int intel_framebuffer_init(struct intel_framebuffer *intel_fb, err_free_dpt: if (intel_fb_uses_dpt(fb)) - intel_dpt_destroy(intel_fb->dpt_vm); + intel_parent_dpt_destroy(display, intel_fb->dpt_vm); err_bo_framebuffer_fini: intel_fb_bo_framebuffer_fini(obj); err_frontbuffer_put: diff --git a/drivers/gpu/drm/i915/display/intel_parent.c b/drivers/gpu/drm/i915/display/intel_parent.c index 7f73695a0444..c43e3518a139 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.c +++ b/drivers/gpu/drm/i915/display/intel_parent.c @@ -23,6 +23,23 @@ #include "intel_display_core.h" #include "intel_parent.h" +/* dpt */ +struct i915_address_space *intel_parent_dpt_create(struct intel_display *display, + struct drm_gem_object *obj, + size_t size) +{ + if (display->parent->dpt) + return display->parent->dpt->create(obj, size); + + return NULL; +} + +void intel_parent_dpt_destroy(struct intel_display *display, struct i915_address_space *vm) +{ + if (display->parent->dpt) + display->parent->dpt->destroy(vm); +} + /* hdcp */ ssize_t 
intel_parent_hdcp_gsc_msg_send(struct intel_display *display, struct intel_hdcp_gsc_context *gsc_context, diff --git a/drivers/gpu/drm/i915/display/intel_parent.h b/drivers/gpu/drm/i915/display/intel_parent.h index 04782bb26b61..88860e471a0d 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.h +++ b/drivers/gpu/drm/i915/display/intel_parent.h @@ -7,12 +7,20 @@ #include struct dma_fence; +struct drm_gem_object; struct drm_scanout_buffer; +struct i915_address_space; struct intel_display; struct intel_hdcp_gsc_context; struct intel_panic; struct intel_stolen_node; +/* dpt */ +struct i915_address_space *intel_parent_dpt_create(struct intel_display *display, + struct drm_gem_object *obj, + size_t size); +void intel_parent_dpt_destroy(struct intel_display *display, struct i915_address_space *vm); + /* hdcp */ ssize_t intel_parent_hdcp_gsc_msg_send(struct intel_display *display, struct intel_hdcp_gsc_context *gsc_context, diff --git a/drivers/gpu/drm/i915/i915_dpt.c b/drivers/gpu/drm/i915/i915_dpt.c index cd98b06d2685..5237d057119e 100644 --- a/drivers/gpu/drm/i915/i915_dpt.c +++ b/drivers/gpu/drm/i915/i915_dpt.c @@ -4,6 +4,7 @@ */ #include +#include #include "display/intel_display_core.h" #include "display/intel_display_rpm.h" @@ -242,8 +243,7 @@ void intel_dpt_suspend(struct intel_display *display) mutex_unlock(&display->drm->mode_config.fb_lock); } -struct i915_address_space * -intel_dpt_create(struct drm_gem_object *obj, size_t size) +static struct i915_address_space *i915_dpt_create(struct drm_gem_object *obj, size_t size) { struct drm_i915_private *i915 = to_i915(obj->dev); struct drm_i915_gem_object *dpt_obj; @@ -308,7 +308,7 @@ intel_dpt_create(struct drm_gem_object *obj, size_t size) return &dpt->vm; } -void intel_dpt_destroy(struct i915_address_space *vm) +static void i915_dpt_destroy(struct i915_address_space *vm) { struct i915_dpt *dpt = i915_vm_to_dpt(vm); @@ -320,3 +320,8 @@ u64 intel_dpt_offset(struct i915_vma *dpt_vma) { return 
i915_vma_offset(dpt_vma); } + +const struct intel_display_dpt_interface i915_display_dpt_interface = { + .create = i915_dpt_create, + .destroy = i915_dpt_destroy, +}; diff --git a/drivers/gpu/drm/i915/i915_dpt.h b/drivers/gpu/drm/i915/i915_dpt.h new file mode 100644 index 000000000000..494cd4af3bcd --- /dev/null +++ b/drivers/gpu/drm/i915/i915_dpt.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright 2026 Intel Corporation */ + +#ifndef __I915_DPT_H__ +#define __I915_DPT_H__ + +extern const struct intel_display_dpt_interface i915_display_dpt_interface; + +#endif /* __I915_DPT_H__ */ diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index 6d8fbf845bc2..31a608ccab00 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -91,6 +91,7 @@ #include "i915_debugfs.h" #include "i915_display_pc8.h" +#include "i915_dpt.h" #include "i915_driver.h" #include "i915_drm_client.h" #include "i915_drv.h" @@ -761,6 +762,7 @@ static bool vgpu_active(struct drm_device *drm) } static const struct intel_display_parent_interface parent = { + .dpt = &i915_display_dpt_interface, .dsb = &i915_display_dsb_interface, .hdcp = &i915_display_hdcp_interface, .initial_plane = &i915_display_initial_plane_interface, diff --git a/drivers/gpu/drm/xe/display/xe_fb_pin.c b/drivers/gpu/drm/xe/display/xe_fb_pin.c index 36eb6c0b9d76..4cb37717d3b4 100644 --- a/drivers/gpu/drm/xe/display/xe_fb_pin.c +++ b/drivers/gpu/drm/xe/display/xe_fb_pin.c @@ -8,7 +8,6 @@ #include "i915_vma.h" #include "intel_display_core.h" #include "intel_display_types.h" -#include "intel_dpt.h" #include "intel_fb.h" #include "intel_fb_pin.h" #include "intel_fbdev.h" @@ -452,25 +451,6 @@ void intel_plane_unpin_fb(struct intel_plane_state *old_plane_state) old_plane_state->ggtt_vma = NULL; } -/* - * For Xe introduce dummy intel_dpt_create which just return NULL, - * intel_dpt_destroy which does nothing, and fake intel_dpt_ofsset returning 0; - */ -struct 
i915_address_space *intel_dpt_create(struct drm_gem_object *obj, size_t size) -{ - return NULL; -} - -void intel_dpt_destroy(struct i915_address_space *vm) -{ - return; -} - -u64 intel_dpt_offset(struct i915_vma *dpt_vma) -{ - return 0; -} - void intel_fb_get_map(struct i915_vma *vma, struct iosys_map *map) { *map = vma->bo->vmap; diff --git a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index 41f4afe7928c..48abbe187d61 100644 --- a/include/drm/intel/display_parent_interface.h +++ b/include/drm/intel/display_parent_interface.h @@ -13,6 +13,7 @@ struct drm_framebuffer; struct drm_gem_object; struct drm_plane_state; struct drm_scanout_buffer; +struct i915_address_space; struct i915_vma; struct intel_dsb_buffer; struct intel_hdcp_gsc_context; @@ -23,6 +24,11 @@ struct ref_tracker; /* Keep struct definitions sorted */ +struct intel_display_dpt_interface { + struct i915_address_space *(*create)(struct drm_gem_object *obj, size_t size); + void (*destroy)(struct i915_address_space *vm); +}; + struct intel_display_dsb_interface { u32 (*ggtt_offset)(struct intel_dsb_buffer *dsb_buf); void (*write)(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val); @@ -124,6 +130,9 @@ struct intel_display_stolen_interface { * check the optional pointers. */ struct intel_display_parent_interface { + /** @dsb: DPT interface. Optional. */ + const struct intel_display_dpt_interface *dpt; + /** @dsb: DSB buffer interface */ const struct intel_display_dsb_interface *dsb; -- cgit v1.2.3 From 3834ea7499ca2c88e0f67bb6929668f78bb67127 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 25 Feb 2026 16:49:10 +0200 Subject: drm/i915/dpt: move suspend/resume to parent interface Add per-vm DPT suspend/resume calls to the display parent interface, and lift the generic code away from i915 specific code. 
Reviewed-by: Juha-Pekka Heikkila Link: https://patch.msgid.link/080945a49559ec1f5183ad409e1526736e828d90.1772030909.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dpt.h | 3 -- drivers/gpu/drm/i915/display/intel_dpt_common.c | 59 ++++++++++++++++++++ drivers/gpu/drm/i915/display/intel_dpt_common.h | 3 ++ drivers/gpu/drm/i915/display/intel_parent.c | 12 +++++ drivers/gpu/drm/i915/display/intel_parent.h | 2 + drivers/gpu/drm/i915/i915_dpt.c | 72 +++++-------------------- drivers/gpu/drm/i915/i915_driver.c | 2 +- include/drm/intel/display_parent_interface.h | 2 + 8 files changed, 91 insertions(+), 64 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_dpt.h b/drivers/gpu/drm/i915/display/intel_dpt.h index e05b3a716310..0482af43e946 100644 --- a/drivers/gpu/drm/i915/display/intel_dpt.h +++ b/drivers/gpu/drm/i915/display/intel_dpt.h @@ -10,13 +10,10 @@ struct i915_address_space; struct i915_vma; -struct intel_display; struct i915_vma *intel_dpt_pin_to_ggtt(struct i915_address_space *vm, unsigned int alignment); void intel_dpt_unpin_from_ggtt(struct i915_address_space *vm); -void intel_dpt_suspend(struct intel_display *display); -void intel_dpt_resume(struct intel_display *display); u64 intel_dpt_offset(struct i915_vma *dpt_vma); #endif /* __INTEL_DPT_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_dpt_common.c b/drivers/gpu/drm/i915/display/intel_dpt_common.c index 5eb88d51dba1..6551318b037b 100644 --- a/drivers/gpu/drm/i915/display/intel_dpt_common.c +++ b/drivers/gpu/drm/i915/display/intel_dpt_common.c @@ -7,6 +7,7 @@ #include "intel_display_regs.h" #include "intel_display_types.h" #include "intel_dpt_common.h" +#include "intel_parent.h" #include "skl_universal_plane_regs.h" void intel_dpt_configure(struct intel_crtc *crtc) @@ -33,3 +34,61 @@ void intel_dpt_configure(struct intel_crtc *crtc) CHICKEN_MISC_DISABLE_DPT); } } + +/** + * intel_dpt_suspend - suspend the memory mapping for all 
DPT FBs during system suspend + * @display: display device instance + * + * Suspend the memory mapping during system suspend for all framebuffers which + * are mapped to HW via a GGTT->DPT page table. + * + * This function must be called before the mappings in GGTT are suspended calling + * i915_ggtt_suspend(). + */ +void intel_dpt_suspend(struct intel_display *display) +{ + struct drm_framebuffer *drm_fb; + + if (!HAS_DISPLAY(display)) + return; + + mutex_lock(&display->drm->mode_config.fb_lock); + + drm_for_each_fb(drm_fb, display->drm) { + struct intel_framebuffer *fb = to_intel_framebuffer(drm_fb); + + if (fb->dpt_vm) + intel_parent_dpt_suspend(display, fb->dpt_vm); + } + + mutex_unlock(&display->drm->mode_config.fb_lock); +} + +/** + * intel_dpt_resume - restore the memory mapping for all DPT FBs during system resume + * @display: display device instance + * + * Restore the memory mapping during system resume for all framebuffers which + * are mapped to HW via a GGTT->DPT page table. The content of these page + * tables are not stored in the hibernation image during S4 and S3RST->S4 + * transitions, so here we reprogram the PTE entries in those tables. + * + * This function must be called after the mappings in GGTT have been restored calling + * i915_ggtt_resume(). 
+ */ +void intel_dpt_resume(struct intel_display *display) +{ + struct drm_framebuffer *drm_fb; + + if (!HAS_DISPLAY(display)) + return; + + mutex_lock(&display->drm->mode_config.fb_lock); + drm_for_each_fb(drm_fb, display->drm) { + struct intel_framebuffer *fb = to_intel_framebuffer(drm_fb); + + if (fb->dpt_vm) + intel_parent_dpt_resume(display, fb->dpt_vm); + } + mutex_unlock(&display->drm->mode_config.fb_lock); +} diff --git a/drivers/gpu/drm/i915/display/intel_dpt_common.h b/drivers/gpu/drm/i915/display/intel_dpt_common.h index 6d7de405126a..11bd495693b2 100644 --- a/drivers/gpu/drm/i915/display/intel_dpt_common.h +++ b/drivers/gpu/drm/i915/display/intel_dpt_common.h @@ -7,7 +7,10 @@ #define __INTEL_DPT_COMMON_H__ struct intel_crtc; +struct intel_display; void intel_dpt_configure(struct intel_crtc *crtc); +void intel_dpt_suspend(struct intel_display *display); +void intel_dpt_resume(struct intel_display *display); #endif /* __INTEL_DPT_COMMON_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_parent.c b/drivers/gpu/drm/i915/display/intel_parent.c index c43e3518a139..a79ea775bde2 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.c +++ b/drivers/gpu/drm/i915/display/intel_parent.c @@ -40,6 +40,18 @@ void intel_parent_dpt_destroy(struct intel_display *display, struct i915_address display->parent->dpt->destroy(vm); } +void intel_parent_dpt_suspend(struct intel_display *display, struct i915_address_space *vm) +{ + if (display->parent->dpt) + display->parent->dpt->suspend(vm); +} + +void intel_parent_dpt_resume(struct intel_display *display, struct i915_address_space *vm) +{ + if (display->parent->dpt) + display->parent->dpt->resume(vm); +} + /* hdcp */ ssize_t intel_parent_hdcp_gsc_msg_send(struct intel_display *display, struct intel_hdcp_gsc_context *gsc_context, diff --git a/drivers/gpu/drm/i915/display/intel_parent.h b/drivers/gpu/drm/i915/display/intel_parent.h index 88860e471a0d..be577ce10c21 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.h +++ 
b/drivers/gpu/drm/i915/display/intel_parent.h @@ -20,6 +20,8 @@ struct i915_address_space *intel_parent_dpt_create(struct intel_display *display struct drm_gem_object *obj, size_t size); void intel_parent_dpt_destroy(struct intel_display *display, struct i915_address_space *vm); +void intel_parent_dpt_suspend(struct intel_display *display, struct i915_address_space *vm); +void intel_parent_dpt_resume(struct intel_display *display, struct i915_address_space *vm); /* hdcp */ ssize_t intel_parent_hdcp_gsc_msg_send(struct intel_display *display, diff --git a/drivers/gpu/drm/i915/i915_dpt.c b/drivers/gpu/drm/i915/i915_dpt.c index 5237d057119e..635127ee5505 100644 --- a/drivers/gpu/drm/i915/i915_dpt.c +++ b/drivers/gpu/drm/i915/i915_dpt.c @@ -8,9 +8,7 @@ #include "display/intel_display_core.h" #include "display/intel_display_rpm.h" -#include "display/intel_display_types.h" #include "display/intel_dpt.h" -#include "display/intel_fb.h" #include "gem/i915_gem_domain.h" #include "gem/i915_gem_internal.h" #include "gem/i915_gem_lmem.h" @@ -185,64 +183,6 @@ void intel_dpt_unpin_from_ggtt(struct i915_address_space *vm) i915_vma_put(dpt->vma); } -/** - * intel_dpt_resume - restore the memory mapping for all DPT FBs during system resume - * @display: display device instance - * - * Restore the memory mapping during system resume for all framebuffers which - * are mapped to HW via a GGTT->DPT page table. The content of these page - * tables are not stored in the hibernation image during S4 and S3RST->S4 - * transitions, so here we reprogram the PTE entries in those tables. - * - * This function must be called after the mappings in GGTT have been restored calling - * i915_ggtt_resume(). 
- */ -void intel_dpt_resume(struct intel_display *display) -{ - struct drm_framebuffer *drm_fb; - - if (!HAS_DISPLAY(display)) - return; - - mutex_lock(&display->drm->mode_config.fb_lock); - drm_for_each_fb(drm_fb, display->drm) { - struct intel_framebuffer *fb = to_intel_framebuffer(drm_fb); - - if (fb->dpt_vm) - i915_ggtt_resume_vm(fb->dpt_vm, true); - } - mutex_unlock(&display->drm->mode_config.fb_lock); -} - -/** - * intel_dpt_suspend - suspend the memory mapping for all DPT FBs during system suspend - * @display: display device instance - * - * Suspend the memory mapping during system suspend for all framebuffers which - * are mapped to HW via a GGTT->DPT page table. - * - * This function must be called before the mappings in GGTT are suspended calling - * i915_ggtt_suspend(). - */ -void intel_dpt_suspend(struct intel_display *display) -{ - struct drm_framebuffer *drm_fb; - - if (!HAS_DISPLAY(display)) - return; - - mutex_lock(&display->drm->mode_config.fb_lock); - - drm_for_each_fb(drm_fb, display->drm) { - struct intel_framebuffer *fb = to_intel_framebuffer(drm_fb); - - if (fb->dpt_vm) - i915_ggtt_suspend_vm(fb->dpt_vm, true); - } - - mutex_unlock(&display->drm->mode_config.fb_lock); -} - static struct i915_address_space *i915_dpt_create(struct drm_gem_object *obj, size_t size) { struct drm_i915_private *i915 = to_i915(obj->dev); @@ -316,6 +256,16 @@ static void i915_dpt_destroy(struct i915_address_space *vm) i915_vm_put(&dpt->vm); } +static void i915_dpt_suspend(struct i915_address_space *vm) +{ + i915_ggtt_suspend_vm(vm, true); +} + +static void i915_dpt_resume(struct i915_address_space *vm) +{ + i915_ggtt_resume_vm(vm, true); +} + u64 intel_dpt_offset(struct i915_vma *dpt_vma) { return i915_vma_offset(dpt_vma); @@ -324,4 +274,6 @@ u64 intel_dpt_offset(struct i915_vma *dpt_vma) const struct intel_display_dpt_interface i915_display_dpt_interface = { .create = i915_dpt_create, .destroy = i915_dpt_destroy, + .suspend = i915_dpt_suspend, + .resume = 
i915_dpt_resume, }; diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index 31a608ccab00..570626f8a554 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -59,7 +59,7 @@ #include "display/intel_display_power.h" #include "display/intel_dmc.h" #include "display/intel_dp.h" -#include "display/intel_dpt.h" +#include "display/intel_dpt_common.h" #include "display/intel_dram.h" #include "display/intel_encoder.h" #include "display/intel_fbdev.h" diff --git a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index 48abbe187d61..2af4d6e99fd0 100644 --- a/include/drm/intel/display_parent_interface.h +++ b/include/drm/intel/display_parent_interface.h @@ -27,6 +27,8 @@ struct ref_tracker; struct intel_display_dpt_interface { struct i915_address_space *(*create)(struct drm_gem_object *obj, size_t size); void (*destroy)(struct i915_address_space *vm); + void (*suspend)(struct i915_address_space *vm); + void (*resume)(struct i915_address_space *vm); }; struct intel_display_dsb_interface { -- cgit v1.2.3 From 4226479f912e829ffba3993438ebc64dac90ae18 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 25 Feb 2026 16:49:16 +0200 Subject: drm/i915/dpt: pass opaque struct intel_dpt around instead of i915_address_space struct i915_address_space is used in an opaque fashion in the display parent interface, but it's just one include away from being non-opaque. And anyway the name is rather specific. Switch to using the struct intel_dpt instead, which embeds struct i915_address_space anyway. With the definition hidden in i915_dpt.c, this can't be accidentally made non-opaque, and the type seems rather more generic anyway. We do have to add a new helper i915_dpt_to_vm(), as there's one case in intel_fb_pin_to_dpt() that requires direct access to struct i915_address_space. But this just underlines the point about opacity. 
Reviewed-by: Juha-Pekka Heikkila Link: https://patch.msgid.link/daa39178c0b0305b010564952d691f06e3cd63ca.1772030909.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_display_types.h | 2 +- drivers/gpu/drm/i915/display/intel_dpt.c | 8 +++--- drivers/gpu/drm/i915/display/intel_fb.c | 14 +++++----- drivers/gpu/drm/i915/display/intel_fb_pin.c | 11 ++++---- drivers/gpu/drm/i915/display/intel_parent.c | 17 ++++++------ drivers/gpu/drm/i915/display/intel_parent.h | 13 +++++----- drivers/gpu/drm/i915/i915_dpt.c | 30 +++++++++++----------- drivers/gpu/drm/i915/i915_dpt.h | 6 +++-- include/drm/intel/display_parent_interface.h | 10 ++++---- 9 files changed, 56 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h index e8e4af03a6a6..8a2b37c7bccf 100644 --- a/drivers/gpu/drm/i915/display/intel_display_types.h +++ b/drivers/gpu/drm/i915/display/intel_display_types.h @@ -145,7 +145,7 @@ struct intel_framebuffer { struct intel_fb_view remapped_view; }; - struct i915_address_space *dpt_vm; + struct intel_dpt *dpt; unsigned int min_alignment; unsigned int vtd_guard; diff --git a/drivers/gpu/drm/i915/display/intel_dpt.c b/drivers/gpu/drm/i915/display/intel_dpt.c index dffd500d378e..145dc9511116 100644 --- a/drivers/gpu/drm/i915/display/intel_dpt.c +++ b/drivers/gpu/drm/i915/display/intel_dpt.c @@ -57,8 +57,8 @@ void intel_dpt_suspend(struct intel_display *display) drm_for_each_fb(drm_fb, display->drm) { struct intel_framebuffer *fb = to_intel_framebuffer(drm_fb); - if (fb->dpt_vm) - intel_parent_dpt_suspend(display, fb->dpt_vm); + if (fb->dpt) + intel_parent_dpt_suspend(display, fb->dpt); } mutex_unlock(&display->drm->mode_config.fb_lock); @@ -87,8 +87,8 @@ void intel_dpt_resume(struct intel_display *display) drm_for_each_fb(drm_fb, display->drm) { struct intel_framebuffer *fb = to_intel_framebuffer(drm_fb); - if (fb->dpt_vm) 
- intel_parent_dpt_resume(display, fb->dpt_vm); + if (fb->dpt) + intel_parent_dpt_resume(display, fb->dpt); } mutex_unlock(&display->drm->mode_config.fb_lock); } diff --git a/drivers/gpu/drm/i915/display/intel_fb.c b/drivers/gpu/drm/i915/display/intel_fb.c index f718eb139d69..6be07d8a7e81 100644 --- a/drivers/gpu/drm/i915/display/intel_fb.c +++ b/drivers/gpu/drm/i915/display/intel_fb.c @@ -2109,7 +2109,7 @@ static void intel_user_framebuffer_destroy(struct drm_framebuffer *fb) drm_framebuffer_cleanup(fb); if (intel_fb_uses_dpt(fb)) - intel_parent_dpt_destroy(display, intel_fb->dpt_vm); + intel_parent_dpt_destroy(display, intel_fb->dpt); intel_fb_bo_framebuffer_fini(intel_fb_bo(fb)); @@ -2305,20 +2305,20 @@ int intel_framebuffer_init(struct intel_framebuffer *intel_fb, if (intel_fb_uses_dpt(fb)) { struct drm_gem_object *obj = intel_fb_bo(&intel_fb->base); - struct i915_address_space *vm; + struct intel_dpt *dpt; size_t size = 0; if (intel_fb_needs_pot_stride_remap(intel_fb)) size = intel_remapped_info_size(&intel_fb->remapped_view.gtt.remapped); - vm = intel_parent_dpt_create(display, obj, size); - if (IS_ERR(vm)) { + dpt = intel_parent_dpt_create(display, obj, size); + if (IS_ERR(dpt)) { drm_dbg_kms(display->drm, "failed to create DPT\n"); - ret = PTR_ERR(vm); + ret = PTR_ERR(dpt); goto err_frontbuffer_put; } - intel_fb->dpt_vm = vm; + intel_fb->dpt = dpt; } ret = drm_framebuffer_init(display->drm, fb, &intel_fb_funcs); @@ -2331,7 +2331,7 @@ int intel_framebuffer_init(struct intel_framebuffer *intel_fb, err_free_dpt: if (intel_fb_uses_dpt(fb)) - intel_parent_dpt_destroy(display, intel_fb->dpt_vm); + intel_parent_dpt_destroy(display, intel_fb->dpt); err_bo_framebuffer_fini: intel_fb_bo_framebuffer_fini(obj); err_frontbuffer_put: diff --git a/drivers/gpu/drm/i915/display/intel_fb_pin.c b/drivers/gpu/drm/i915/display/intel_fb_pin.c index d2e4200f2cef..738d77a1468a 100644 --- a/drivers/gpu/drm/i915/display/intel_fb_pin.c +++ 
b/drivers/gpu/drm/i915/display/intel_fb_pin.c @@ -27,13 +27,14 @@ intel_fb_pin_to_dpt(const struct drm_framebuffer *fb, const struct i915_gtt_view *view, unsigned int alignment, unsigned long *out_flags, - struct i915_address_space *vm) + struct intel_dpt *dpt) { struct drm_device *dev = fb->dev; struct intel_display *display = to_intel_display(dev); struct drm_i915_private *dev_priv = to_i915(dev); struct drm_gem_object *_obj = intel_fb_bo(fb); struct drm_i915_gem_object *obj = to_intel_bo(_obj); + struct i915_address_space *vm = i915_dpt_to_vm(dpt); struct i915_gem_ww_ctx ww; struct i915_vma *vma; int ret; @@ -284,7 +285,7 @@ int intel_plane_pin_fb(struct intel_plane_state *plane_state, } else { unsigned int alignment = intel_plane_fb_min_alignment(plane_state); - vma = i915_dpt_pin_to_ggtt(fb->dpt_vm, alignment / 512); + vma = i915_dpt_pin_to_ggtt(fb->dpt, alignment / 512); if (IS_ERR(vma)) return PTR_ERR(vma); @@ -292,9 +293,9 @@ int intel_plane_pin_fb(struct intel_plane_state *plane_state, vma = intel_fb_pin_to_dpt(&fb->base, &plane_state->view.gtt, alignment, &plane_state->flags, - fb->dpt_vm); + fb->dpt); if (IS_ERR(vma)) { - i915_dpt_unpin_from_ggtt(fb->dpt_vm); + i915_dpt_unpin_from_ggtt(fb->dpt); plane_state->ggtt_vma = NULL; return PTR_ERR(vma); } @@ -346,7 +347,7 @@ void intel_plane_unpin_fb(struct intel_plane_state *old_plane_state) vma = fetch_and_zero(&old_plane_state->ggtt_vma); if (vma) - i915_dpt_unpin_from_ggtt(fb->dpt_vm); + i915_dpt_unpin_from_ggtt(fb->dpt); } } diff --git a/drivers/gpu/drm/i915/display/intel_parent.c b/drivers/gpu/drm/i915/display/intel_parent.c index a79ea775bde2..7044632ef3fc 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.c +++ b/drivers/gpu/drm/i915/display/intel_parent.c @@ -24,9 +24,8 @@ #include "intel_parent.h" /* dpt */ -struct i915_address_space *intel_parent_dpt_create(struct intel_display *display, - struct drm_gem_object *obj, - size_t size) +struct intel_dpt *intel_parent_dpt_create(struct intel_display 
*display, + struct drm_gem_object *obj, size_t size) { if (display->parent->dpt) return display->parent->dpt->create(obj, size); @@ -34,22 +33,22 @@ struct i915_address_space *intel_parent_dpt_create(struct intel_display *display return NULL; } -void intel_parent_dpt_destroy(struct intel_display *display, struct i915_address_space *vm) +void intel_parent_dpt_destroy(struct intel_display *display, struct intel_dpt *dpt) { if (display->parent->dpt) - display->parent->dpt->destroy(vm); + display->parent->dpt->destroy(dpt); } -void intel_parent_dpt_suspend(struct intel_display *display, struct i915_address_space *vm) +void intel_parent_dpt_suspend(struct intel_display *display, struct intel_dpt *dpt) { if (display->parent->dpt) - display->parent->dpt->suspend(vm); + display->parent->dpt->suspend(dpt); } -void intel_parent_dpt_resume(struct intel_display *display, struct i915_address_space *vm) +void intel_parent_dpt_resume(struct intel_display *display, struct intel_dpt *dpt) { if (display->parent->dpt) - display->parent->dpt->resume(vm); + display->parent->dpt->resume(dpt); } /* hdcp */ diff --git a/drivers/gpu/drm/i915/display/intel_parent.h b/drivers/gpu/drm/i915/display/intel_parent.h index be577ce10c21..002234e81ce6 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.h +++ b/drivers/gpu/drm/i915/display/intel_parent.h @@ -9,19 +9,18 @@ struct dma_fence; struct drm_gem_object; struct drm_scanout_buffer; -struct i915_address_space; struct intel_display; +struct intel_dpt; struct intel_hdcp_gsc_context; struct intel_panic; struct intel_stolen_node; /* dpt */ -struct i915_address_space *intel_parent_dpt_create(struct intel_display *display, - struct drm_gem_object *obj, - size_t size); -void intel_parent_dpt_destroy(struct intel_display *display, struct i915_address_space *vm); -void intel_parent_dpt_suspend(struct intel_display *display, struct i915_address_space *vm); -void intel_parent_dpt_resume(struct intel_display *display, struct i915_address_space *vm); 
+struct intel_dpt *intel_parent_dpt_create(struct intel_display *display, + struct drm_gem_object *obj, size_t size); +void intel_parent_dpt_destroy(struct intel_display *display, struct intel_dpt *dpt); +void intel_parent_dpt_suspend(struct intel_display *display, struct intel_dpt *dpt); +void intel_parent_dpt_resume(struct intel_display *display, struct intel_dpt *dpt); /* hdcp */ ssize_t intel_parent_hdcp_gsc_msg_send(struct intel_display *display, diff --git a/drivers/gpu/drm/i915/i915_dpt.c b/drivers/gpu/drm/i915/i915_dpt.c index baf45d70c152..9f47bb563c85 100644 --- a/drivers/gpu/drm/i915/i915_dpt.c +++ b/drivers/gpu/drm/i915/i915_dpt.c @@ -33,6 +33,11 @@ i915_vm_to_dpt(struct i915_address_space *vm) return container_of(vm, struct intel_dpt, vm); } +struct i915_address_space *i915_dpt_to_vm(struct intel_dpt *dpt) +{ + return &dpt->vm; +} + static void gen8_set_pte(void __iomem *addr, gen8_pte_t pte) { writeq(pte, addr); @@ -121,11 +126,10 @@ static void dpt_cleanup(struct i915_address_space *vm) i915_gem_object_put(dpt->obj); } -struct i915_vma *i915_dpt_pin_to_ggtt(struct i915_address_space *vm, unsigned int alignment) +struct i915_vma *i915_dpt_pin_to_ggtt(struct intel_dpt *dpt, unsigned int alignment) { - struct drm_i915_private *i915 = vm->i915; + struct drm_i915_private *i915 = dpt->vm.i915; struct intel_display *display = i915->display; - struct intel_dpt *dpt = i915_vm_to_dpt(vm); struct ref_tracker *wakeref; struct i915_vma *vma; void __iomem *iomem; @@ -173,15 +177,13 @@ struct i915_vma *i915_dpt_pin_to_ggtt(struct i915_address_space *vm, unsigned in return err ? 
ERR_PTR(err) : vma; } -void i915_dpt_unpin_from_ggtt(struct i915_address_space *vm) +void i915_dpt_unpin_from_ggtt(struct intel_dpt *dpt) { - struct intel_dpt *dpt = i915_vm_to_dpt(vm); - i915_vma_unpin_iomap(dpt->vma); i915_vma_put(dpt->vma); } -static struct i915_address_space *i915_dpt_create(struct drm_gem_object *obj, size_t size) +static struct intel_dpt *i915_dpt_create(struct drm_gem_object *obj, size_t size) { struct drm_i915_private *i915 = to_i915(obj->dev); struct drm_i915_gem_object *dpt_obj; @@ -243,25 +245,23 @@ static struct i915_address_space *i915_dpt_create(struct drm_gem_object *obj, si dpt->obj = dpt_obj; dpt->obj->is_dpt = true; - return &dpt->vm; + return dpt; } -static void i915_dpt_destroy(struct i915_address_space *vm) +static void i915_dpt_destroy(struct intel_dpt *dpt) { - struct intel_dpt *dpt = i915_vm_to_dpt(vm); - dpt->obj->is_dpt = false; i915_vm_put(&dpt->vm); } -static void i915_dpt_suspend(struct i915_address_space *vm) +static void i915_dpt_suspend(struct intel_dpt *dpt) { - i915_ggtt_suspend_vm(vm, true); + i915_ggtt_suspend_vm(&dpt->vm, true); } -static void i915_dpt_resume(struct i915_address_space *vm) +static void i915_dpt_resume(struct intel_dpt *dpt) { - i915_ggtt_resume_vm(vm, true); + i915_ggtt_resume_vm(&dpt->vm, true); } u64 i915_dpt_offset(struct i915_vma *dpt_vma) diff --git a/drivers/gpu/drm/i915/i915_dpt.h b/drivers/gpu/drm/i915/i915_dpt.h index 3b76e9760600..08dbe444fe18 100644 --- a/drivers/gpu/drm/i915/i915_dpt.h +++ b/drivers/gpu/drm/i915/i915_dpt.h @@ -8,9 +8,11 @@ struct i915_address_space; struct i915_vma; +struct intel_dpt; -struct i915_vma *i915_dpt_pin_to_ggtt(struct i915_address_space *vm, unsigned int alignment); -void i915_dpt_unpin_from_ggtt(struct i915_address_space *vm); +struct i915_address_space *i915_dpt_to_vm(struct intel_dpt *dpt); +struct i915_vma *i915_dpt_pin_to_ggtt(struct intel_dpt *dpt, unsigned int alignment); +void i915_dpt_unpin_from_ggtt(struct intel_dpt *dpt); u64 
i915_dpt_offset(struct i915_vma *dpt_vma); extern const struct intel_display_dpt_interface i915_display_dpt_interface; diff --git a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index 2af4d6e99fd0..50da825ec06c 100644 --- a/include/drm/intel/display_parent_interface.h +++ b/include/drm/intel/display_parent_interface.h @@ -13,8 +13,8 @@ struct drm_framebuffer; struct drm_gem_object; struct drm_plane_state; struct drm_scanout_buffer; -struct i915_address_space; struct i915_vma; +struct intel_dpt; struct intel_dsb_buffer; struct intel_hdcp_gsc_context; struct intel_initial_plane_config; @@ -25,10 +25,10 @@ struct ref_tracker; /* Keep struct definitions sorted */ struct intel_display_dpt_interface { - struct i915_address_space *(*create)(struct drm_gem_object *obj, size_t size); - void (*destroy)(struct i915_address_space *vm); - void (*suspend)(struct i915_address_space *vm); - void (*resume)(struct i915_address_space *vm); + struct intel_dpt *(*create)(struct drm_gem_object *obj, size_t size); + void (*destroy)(struct intel_dpt *dpt); + void (*suspend)(struct intel_dpt *dpt); + void (*resume)(struct intel_dpt *dpt); }; struct intel_display_dsb_interface { -- cgit v1.2.3 From 2974aa42e6696a1d95b727d677dc01a71af5b998 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Fri, 27 Feb 2026 01:17:19 +0000 Subject: ASoC: remove snd_soc_pcm_subclass enum snd_soc_pcm_subclass was added in the v3.1 commit b8c0dab9bf337 ("ASoC: core - PCM mutex per rtd"), but has never been used during these 15 years. Let's remove it. 
Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/878qcfyogw.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 7bf7ce085516..b1c5dad26edb 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -422,11 +422,6 @@ struct snd_soc_jack_pin; #include #include -enum snd_soc_pcm_subclass { - SND_SOC_PCM_CLASS_PCM = 0, - SND_SOC_PCM_CLASS_BE = 1, -}; - int snd_soc_register_card(struct snd_soc_card *card); void snd_soc_unregister_card(struct snd_soc_card *card); int devm_snd_soc_register_card(struct device *dev, struct snd_soc_card *card); @@ -999,7 +994,6 @@ struct snd_soc_card { /* Mutex for PCM operations */ struct mutex pcm_mutex; - enum snd_soc_pcm_subclass pcm_subclass; int (*probe)(struct snd_soc_card *card); int (*late_probe)(struct snd_soc_card *card); @@ -1519,7 +1513,7 @@ static inline void _snd_soc_dapm_mutex_assert_held_d(struct snd_soc_dapm_context */ static inline void _snd_soc_dpcm_mutex_lock_c(struct snd_soc_card *card) { - mutex_lock_nested(&card->pcm_mutex, card->pcm_subclass); + mutex_lock(&card->pcm_mutex); } static inline void _snd_soc_dpcm_mutex_unlock_c(struct snd_soc_card *card) -- cgit v1.2.3 From c8e9b1d9febc83ee94944695a07cfd40a1b29743 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:12:20 -0800 Subject: dmaengine: fsl-edma: fix all kernel-doc warnings Use the correct kernel-doc format and struct member names to eliminate these kernel-doc warnings: Warning: include/linux/platform_data/dma-mcf-edma.h:35 struct member 'dma_channels' not described in 'mcf_edma_platform_data' Warning: include/linux/platform_data/dma-mcf-edma.h:35 struct member 'slave_map' not described in 'mcf_edma_platform_data' Warning: include/linux/platform_data/dma-mcf-edma.h:35 struct member 'slavecnt' not described in 'mcf_edma_platform_data' Signed-off-by: 
Randy Dunlap Link: https://patch.msgid.link/20260226051220.548566-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-mcf-edma.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/platform_data/dma-mcf-edma.h b/include/linux/platform_data/dma-mcf-edma.h index d718ccfa3421..0b31af66a1ac 100644 --- a/include/linux/platform_data/dma-mcf-edma.h +++ b/include/linux/platform_data/dma-mcf-edma.h @@ -26,8 +26,9 @@ bool mcf_edma_filter_fn(struct dma_chan *chan, void *param); /** * struct mcf_edma_platform_data - platform specific data for eDMA engine * - * @ver The eDMA module version. - * @dma_channels The number of eDMA channels. + * @dma_channels: The number of eDMA channels. + * @slave_map: Slave device map + * @slavecnt: Number of entries in @slave_map */ struct mcf_edma_platform_data { int dma_channels; -- cgit v1.2.3 From fd6dad4e1ae296b67b87291256878a58dad36c93 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 24 Feb 2026 15:14:24 +0900 Subject: netmem: remove the pp fields from net_iov MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the pp fields in net_iov have no users, remove them from net_iov and clean up. Signed-off-by: Byungchul Park Reviewed-by: Simon Horman Reviewed-by: Pavel Begunkov Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260224061424.11219-1-byungchul@sk.com Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) (limited to 'include') diff --git a/include/net/netmem.h b/include/net/netmem.h index a96b3e5e5574..a6d65ced5231 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -93,23 +93,7 @@ enum net_iov_type { * supported. 
*/ struct net_iov { - union { - struct netmem_desc desc; - - /* XXX: The following part should be removed once all - * the references to them are converted so as to be - * accessed via netmem_desc e.g. niov->desc.pp instead - * of niov->pp. - */ - struct { - unsigned long _flags; - unsigned long pp_magic; - struct page_pool *pp; - unsigned long _pp_mapping_pad; - unsigned long dma_addr; - atomic_long_t pp_ref_count; - }; - }; + struct netmem_desc desc; struct net_iov_area *owner; enum net_iov_type type; }; @@ -123,26 +107,6 @@ struct net_iov_area { unsigned long base_virtual; }; -/* net_iov is union'ed with struct netmem_desc mirroring struct page, so - * the page_pool can access these fields without worrying whether the - * underlying fields are accessed via netmem_desc or directly via - * net_iov, until all the references to them are converted so as to be - * accessed via netmem_desc e.g. niov->desc.pp instead of niov->pp. - * - * The non-net stack fields of struct page are private to the mm stack - * and must never be mirrored to net_iov. 
- */ -#define NET_IOV_ASSERT_OFFSET(desc, iov) \ - static_assert(offsetof(struct netmem_desc, desc) == \ - offsetof(struct net_iov, iov)) -NET_IOV_ASSERT_OFFSET(_flags, _flags); -NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic); -NET_IOV_ASSERT_OFFSET(pp, pp); -NET_IOV_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad); -NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); -NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); -#undef NET_IOV_ASSERT_OFFSET - static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov) { return niov->owner; -- cgit v1.2.3 From e3f33adfa3a3be16ef59ed849fbbd10e966e98b0 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 26 Feb 2026 15:01:50 +0200 Subject: drm/i915/overlay: Convert overlay to parent interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the direct i915_overlay_*() calls from the display side to go over a new parent interface instead. v2: Correctly handle the ERR_PTR returned by i915_overlay_obj_lookup() (Jani) v3: Rebase due to the NULL check in intel_overlay_cleanup() Reviewed-by: Jani Nikula Signed-off-by: Ville Syrjälä Link: https://patch.msgid.link/20260226130150.16816-1-ville.syrjala@linux.intel.com --- drivers/gpu/drm/i915/display/intel_overlay.c | 36 ++++++------- drivers/gpu/drm/i915/display/intel_overlay.h | 30 ----------- drivers/gpu/drm/i915/display/intel_parent.c | 76 ++++++++++++++++++++++++++++ drivers/gpu/drm/i915/display/intel_parent.h | 25 +++++++++ drivers/gpu/drm/i915/i915_driver.c | 2 + drivers/gpu/drm/i915/i915_overlay.c | 57 +++++++++++++-------- drivers/gpu/drm/i915/i915_overlay.h | 34 +------------ drivers/gpu/drm/xe/Makefile | 1 + include/drm/intel/display_parent_interface.h | 33 ++++++++++++ 9 files changed, 193 insertions(+), 101 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_overlay.c b/drivers/gpu/drm/i915/display/intel_overlay.c index 497bd4ec2224..12a325ceae6f 100644 --- 
a/drivers/gpu/drm/i915/display/intel_overlay.c +++ b/drivers/gpu/drm/i915/display/intel_overlay.c @@ -30,13 +30,13 @@ #include #include -#include "i915_overlay.h" #include "intel_color_regs.h" #include "intel_de.h" #include "intel_display_regs.h" #include "intel_display_types.h" #include "intel_frontbuffer.h" #include "intel_overlay.h" +#include "intel_parent.h" #include "intel_pfit_regs.h" /* Limits for overlay size. According to intel doc, the real limits are: @@ -199,7 +199,7 @@ void intel_overlay_reset(struct intel_display *display) overlay->old_yscale = 0; overlay->crtc = NULL; - i915_overlay_reset(display->drm); + intel_parent_overlay_reset(display); } static int packed_depth_bytes(u32 format) @@ -477,19 +477,19 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, drm_WARN_ON(display->drm, !drm_modeset_is_locked(&display->drm->mode_config.connection_mutex)); - ret = i915_overlay_release_old_vid(display->drm); + ret = intel_parent_overlay_release_old_vid(display); if (ret != 0) return ret; atomic_inc(&display->restore.pending_fb_pin); - vma = i915_overlay_pin_fb(display->drm, obj, &offset); + vma = intel_parent_overlay_pin_fb(display, obj, &offset); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out_pin_section; } - if (!i915_overlay_is_active(display->drm)) { + if (!intel_parent_overlay_is_active(display)) { const struct intel_crtc_state *crtc_state = overlay->crtc->config; u32 oconfig = 0; @@ -505,7 +505,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, OCONF_PIPE_A : OCONF_PIPE_B; iowrite32(oconfig, ®s->OCONFIG); - ret = i915_overlay_on(display->drm, INTEL_FRONTBUFFER_OVERLAY(pipe)); + ret = intel_parent_overlay_on(display, INTEL_FRONTBUFFER_OVERLAY(pipe)); if (ret != 0) goto out_unpin; } @@ -563,14 +563,14 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, if (tmp & (1 << 17)) drm_dbg(display->drm, "overlay underrun, DOVSTA: %x\n", tmp); - ret = i915_overlay_continue(display->drm, vma, 
scale_changed); + ret = intel_parent_overlay_continue(display, vma, scale_changed); if (ret) goto out_unpin; return 0; out_unpin: - i915_overlay_unpin_fb(display->drm, vma); + intel_parent_overlay_unpin_fb(display, vma); out_pin_section: atomic_dec(&display->restore.pending_fb_pin); @@ -585,14 +585,14 @@ int intel_overlay_switch_off(struct intel_overlay *overlay) drm_WARN_ON(display->drm, !drm_modeset_is_locked(&display->drm->mode_config.connection_mutex)); - ret = i915_overlay_recover_from_interrupt(display->drm); + ret = intel_parent_overlay_recover_from_interrupt(display); if (ret != 0) return ret; - if (!i915_overlay_is_active(display->drm)) + if (!intel_parent_overlay_is_active(display)) return 0; - ret = i915_overlay_release_old_vid(display->drm); + ret = intel_parent_overlay_release_old_vid(display); if (ret != 0) return ret; @@ -601,7 +601,7 @@ int intel_overlay_switch_off(struct intel_overlay *overlay) overlay->crtc->overlay = NULL; overlay->crtc = NULL; - return i915_overlay_off(display->drm); + return intel_parent_overlay_off(display); } static int check_overlay_possible_on_crtc(struct intel_overlay *overlay, @@ -822,13 +822,13 @@ int intel_overlay_put_image_ioctl(struct drm_device *dev, void *data, return -ENOENT; crtc = to_intel_crtc(drmmode_crtc); - obj = i915_overlay_obj_lookup(dev, file_priv, params->bo_handle); + obj = intel_parent_overlay_obj_lookup(display, file_priv, params->bo_handle); if (IS_ERR(obj)) return PTR_ERR(obj); drm_modeset_lock_all(dev); - ret = i915_overlay_recover_from_interrupt(dev); + ret = intel_parent_overlay_recover_from_interrupt(display); if (ret != 0) goto out_unlock; @@ -998,7 +998,7 @@ int intel_overlay_attrs_ioctl(struct drm_device *dev, void *data, if (DISPLAY_VER(display) == 2) goto out_unlock; - if (i915_overlay_is_active(display->drm)) { + if (intel_parent_overlay_is_active(display)) { ret = -EBUSY; goto out_unlock; } @@ -1036,8 +1036,8 @@ void intel_overlay_setup(struct intel_display *display) if (!overlay) return; 
- regs = i915_overlay_setup(display->drm, - OVERLAY_NEEDS_PHYSICAL(display)); + regs = intel_parent_overlay_setup(display, + OVERLAY_NEEDS_PHYSICAL(display)); if (IS_ERR(regs)) goto out_free; @@ -1071,7 +1071,7 @@ void intel_overlay_cleanup(struct intel_display *display) if (!display->overlay) return; - i915_overlay_cleanup(display->drm); + intel_parent_overlay_cleanup(display); kfree(display->overlay); display->overlay = NULL; diff --git a/drivers/gpu/drm/i915/display/intel_overlay.h b/drivers/gpu/drm/i915/display/intel_overlay.h index 4ef6882b9acb..a4291d6dd528 100644 --- a/drivers/gpu/drm/i915/display/intel_overlay.h +++ b/drivers/gpu/drm/i915/display/intel_overlay.h @@ -14,7 +14,6 @@ struct drm_printer; struct intel_display; struct intel_overlay; -#ifdef I915 void intel_overlay_setup(struct intel_display *display); bool intel_overlay_available(struct intel_display *display); void intel_overlay_cleanup(struct intel_display *display); @@ -24,34 +23,5 @@ int intel_overlay_put_image_ioctl(struct drm_device *dev, void *data, int intel_overlay_attrs_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); void intel_overlay_reset(struct intel_display *display); -#else -static inline void intel_overlay_setup(struct intel_display *display) -{ -} -static inline bool intel_overlay_available(struct intel_display *display) -{ - return false; -} -static inline void intel_overlay_cleanup(struct intel_display *display) -{ -} -static inline int intel_overlay_switch_off(struct intel_overlay *overlay) -{ - return 0; -} -static inline int intel_overlay_put_image_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv) -{ - return 0; -} -static inline int intel_overlay_attrs_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv) -{ - return 0; -} -static inline void intel_overlay_reset(struct intel_display *display) -{ -} -#endif #endif /* __INTEL_OVERLAY_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_parent.c 
b/drivers/gpu/drm/i915/display/intel_parent.c index 7044632ef3fc..89f78ca1cd15 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.c +++ b/drivers/gpu/drm/i915/display/intel_parent.c @@ -87,6 +87,82 @@ void intel_parent_irq_synchronize(struct intel_display *display) display->parent->irq->synchronize(display->drm); } +/* overlay */ +bool intel_parent_overlay_is_active(struct intel_display *display) +{ + return display->parent->overlay->is_active(display->drm); +} + +int intel_parent_overlay_on(struct intel_display *display, + u32 frontbuffer_bits) +{ + return display->parent->overlay->overlay_on(display->drm, + frontbuffer_bits); +} + +int intel_parent_overlay_continue(struct intel_display *display, + struct i915_vma *vma, + bool load_polyphase_filter) +{ + return display->parent->overlay->overlay_continue(display->drm, vma, + load_polyphase_filter); +} + +int intel_parent_overlay_off(struct intel_display *display) +{ + return display->parent->overlay->overlay_off(display->drm); +} + +int intel_parent_overlay_recover_from_interrupt(struct intel_display *display) +{ + return display->parent->overlay->recover_from_interrupt(display->drm); +} + +int intel_parent_overlay_release_old_vid(struct intel_display *display) +{ + return display->parent->overlay->release_old_vid(display->drm); +} + +void intel_parent_overlay_reset(struct intel_display *display) +{ + display->parent->overlay->reset(display->drm); +} + +struct i915_vma *intel_parent_overlay_pin_fb(struct intel_display *display, + struct drm_gem_object *obj, + u32 *offset) +{ + return display->parent->overlay->pin_fb(display->drm, obj, offset); +} + +void intel_parent_overlay_unpin_fb(struct intel_display *display, + struct i915_vma *vma) +{ + return display->parent->overlay->unpin_fb(display->drm, vma); +} + +struct drm_gem_object *intel_parent_overlay_obj_lookup(struct intel_display *display, + struct drm_file *filp, + u32 handle) +{ + return display->parent->overlay->obj_lookup(display->drm, + filp, handle); 
+} + +void __iomem *intel_parent_overlay_setup(struct intel_display *display, + bool needs_physical) +{ + if (drm_WARN_ON_ONCE(display->drm, !display->parent->overlay)) + return ERR_PTR(-ENODEV); + + return display->parent->overlay->setup(display->drm, needs_physical); +} + +void intel_parent_overlay_cleanup(struct intel_display *display) +{ + display->parent->overlay->cleanup(display->drm); +} + /* panic */ struct intel_panic *intel_parent_panic_alloc(struct intel_display *display) { diff --git a/drivers/gpu/drm/i915/display/intel_parent.h b/drivers/gpu/drm/i915/display/intel_parent.h index 002234e81ce6..2317482ef072 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.h +++ b/drivers/gpu/drm/i915/display/intel_parent.h @@ -7,8 +7,10 @@ #include struct dma_fence; +struct drm_file; struct drm_gem_object; struct drm_scanout_buffer; +struct i915_vma; struct intel_display; struct intel_dpt; struct intel_hdcp_gsc_context; @@ -36,6 +38,29 @@ void intel_parent_hdcp_gsc_context_free(struct intel_display *display, bool intel_parent_irq_enabled(struct intel_display *display); void intel_parent_irq_synchronize(struct intel_display *display); +/* overlay */ +bool intel_parent_overlay_is_active(struct intel_display *display); +int intel_parent_overlay_on(struct intel_display *display, + u32 frontbuffer_bits); +int intel_parent_overlay_continue(struct intel_display *display, + struct i915_vma *vma, + bool load_polyphase_filter); +int intel_parent_overlay_off(struct intel_display *display); +int intel_parent_overlay_recover_from_interrupt(struct intel_display *display); +int intel_parent_overlay_release_old_vid(struct intel_display *display); +void intel_parent_overlay_reset(struct intel_display *display); +struct i915_vma *intel_parent_overlay_pin_fb(struct intel_display *display, + struct drm_gem_object *obj, + u32 *offset); +void intel_parent_overlay_unpin_fb(struct intel_display *display, + struct i915_vma *vma); +struct drm_gem_object 
*intel_parent_overlay_obj_lookup(struct intel_display *display, + struct drm_file *filp, + u32 handle); +void __iomem *intel_parent_overlay_setup(struct intel_display *display, + bool needs_physical); +void intel_parent_overlay_cleanup(struct intel_display *display); + /* panic */ struct intel_panic *intel_parent_panic_alloc(struct intel_display *display); int intel_parent_panic_setup(struct intel_display *display, struct intel_panic *panic, struct drm_scanout_buffer *sb); diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index 31a608ccab00..5f77e891604d 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -107,6 +107,7 @@ #include "i915_ioctl.h" #include "i915_irq.h" #include "i915_memcpy.h" +#include "i915_overlay.h" #include "i915_panic.h" #include "i915_perf.h" #include "i915_query.h" @@ -767,6 +768,7 @@ static const struct intel_display_parent_interface parent = { .hdcp = &i915_display_hdcp_interface, .initial_plane = &i915_display_initial_plane_interface, .irq = &i915_display_irq_interface, + .overlay = &i915_display_overlay_interface, .panic = &i915_display_panic_interface, .pc8 = &i915_display_pc8_interface, .pcode = &i915_display_pcode_interface, diff --git a/drivers/gpu/drm/i915/i915_overlay.c b/drivers/gpu/drm/i915/i915_overlay.c index 61869747c6bb..28518dbb5b8e 100644 --- a/drivers/gpu/drm/i915/i915_overlay.c +++ b/drivers/gpu/drm/i915/i915_overlay.c @@ -5,6 +5,7 @@ #include +#include #include #include "gem/i915_gem_internal.h" @@ -88,7 +89,7 @@ alloc_request(struct i915_overlay *overlay, void (*fn)(struct i915_overlay *)) return rq; } -bool i915_overlay_is_active(struct drm_device *drm) +static bool i915_overlay_is_active(struct drm_device *drm) { struct drm_i915_private *i915 = to_i915(drm); struct i915_overlay *overlay = i915->overlay; @@ -97,8 +98,8 @@ bool i915_overlay_is_active(struct drm_device *drm) } /* overlay needs to be disable in OCMD reg */ -int 
i915_overlay_on(struct drm_device *drm, - u32 frontbuffer_bits) +static int i915_overlay_on(struct drm_device *drm, + u32 frontbuffer_bits) { struct drm_i915_private *i915 = to_i915(drm); struct i915_overlay *overlay = i915->overlay; @@ -159,9 +160,9 @@ static void i915_overlay_flip_prepare(struct i915_overlay *overlay, } /* overlay needs to be enabled in OCMD reg */ -int i915_overlay_continue(struct drm_device *drm, - struct i915_vma *vma, - bool load_polyphase_filter) +static int i915_overlay_continue(struct drm_device *drm, + struct i915_vma *vma, + bool load_polyphase_filter) { struct drm_i915_private *i915 = to_i915(drm); struct i915_overlay *overlay = i915->overlay; @@ -210,7 +211,8 @@ static void i915_overlay_release_old_vma(struct i915_overlay *overlay) i915_vma_put(vma); } -static void i915_overlay_release_old_vid_tail(struct i915_overlay *overlay) +static void +i915_overlay_release_old_vid_tail(struct i915_overlay *overlay) { i915_overlay_release_old_vma(overlay); } @@ -237,7 +239,7 @@ static void i915_overlay_last_flip_retire(struct i915_active *active) } /* overlay needs to be disabled in OCMD reg */ -int i915_overlay_off(struct drm_device *drm) +static int i915_overlay_off(struct drm_device *drm) { struct drm_i915_private *i915 = to_i915(drm); struct i915_overlay *overlay = i915->overlay; @@ -286,7 +288,7 @@ int i915_overlay_off(struct drm_device *drm) * Recover from an interruption due to a signal. * We have to be careful not to repeat work forever an make forward progress. */ -int i915_overlay_recover_from_interrupt(struct drm_device *drm) +static int i915_overlay_recover_from_interrupt(struct drm_device *drm) { struct drm_i915_private *i915 = to_i915(drm); struct i915_overlay *overlay = i915->overlay; @@ -299,7 +301,7 @@ int i915_overlay_recover_from_interrupt(struct drm_device *drm) * Needs to be called before the overlay register are changed * via intel_overlay_(un)map_regs. 
*/ -int i915_overlay_release_old_vid(struct drm_device *drm) +static int i915_overlay_release_old_vid(struct drm_device *drm) { struct drm_i915_private *i915 = to_i915(drm); struct i915_overlay *overlay = i915->overlay; @@ -337,7 +339,7 @@ int i915_overlay_release_old_vid(struct drm_device *drm) return i915_active_wait(&overlay->last_flip); } -void i915_overlay_reset(struct drm_device *drm) +static void i915_overlay_reset(struct drm_device *drm) { struct drm_i915_private *i915 = to_i915(drm); struct i915_overlay *overlay = i915->overlay; @@ -348,9 +350,9 @@ void i915_overlay_reset(struct drm_device *drm) overlay->frontbuffer_bits = 0; } -struct i915_vma *i915_overlay_pin_fb(struct drm_device *drm, - struct drm_gem_object *obj, - u32 *offset) +static struct i915_vma *i915_overlay_pin_fb(struct drm_device *drm, + struct drm_gem_object *obj, + u32 *offset) { struct drm_i915_gem_object *new_bo = to_intel_bo(obj); struct i915_gem_ww_ctx ww; @@ -379,13 +381,13 @@ retry: return vma; } -void i915_overlay_unpin_fb(struct drm_device *drm, - struct i915_vma *vma) +static void i915_overlay_unpin_fb(struct drm_device *drm, + struct i915_vma *vma) { i915_vma_unpin(vma); } -struct drm_gem_object * +static struct drm_gem_object * i915_overlay_obj_lookup(struct drm_device *drm, struct drm_file *file_priv, u32 handle) @@ -444,8 +446,8 @@ err_put_bo: return err; } -void __iomem *i915_overlay_setup(struct drm_device *drm, - bool needs_physical) +static void __iomem *i915_overlay_setup(struct drm_device *drm, + bool needs_physical) { struct drm_i915_private *i915 = to_i915(drm); struct intel_engine_cs *engine; @@ -477,7 +479,7 @@ void __iomem *i915_overlay_setup(struct drm_device *drm, return overlay->regs; } -void i915_overlay_cleanup(struct drm_device *drm) +static void i915_overlay_cleanup(struct drm_device *drm) { struct drm_i915_private *i915 = to_i915(drm); struct i915_overlay *overlay; @@ -498,3 +500,18 @@ void i915_overlay_cleanup(struct drm_device *drm) kfree(overlay); } + 
+const struct intel_display_overlay_interface i915_display_overlay_interface = { + .is_active = i915_overlay_is_active, + .overlay_on = i915_overlay_on, + .overlay_continue = i915_overlay_continue, + .overlay_off = i915_overlay_off, + .recover_from_interrupt = i915_overlay_recover_from_interrupt, + .release_old_vid = i915_overlay_release_old_vid, + .reset = i915_overlay_reset, + .obj_lookup = i915_overlay_obj_lookup, + .pin_fb = i915_overlay_pin_fb, + .unpin_fb = i915_overlay_unpin_fb, + .setup = i915_overlay_setup, + .cleanup = i915_overlay_cleanup, +}; diff --git a/drivers/gpu/drm/i915/i915_overlay.h b/drivers/gpu/drm/i915/i915_overlay.h index f553de2abeaa..f8053eb8d189 100644 --- a/drivers/gpu/drm/i915/i915_overlay.h +++ b/drivers/gpu/drm/i915/i915_overlay.h @@ -6,38 +6,6 @@ #ifndef __I915_OVERLAY_H__ #define __I915_OVERLAY_H__ -#include - -struct drm_device; -struct drm_file; -struct drm_gem_object; -struct i915_vma; - -bool i915_overlay_is_active(struct drm_device *drm); -int i915_overlay_on(struct drm_device *drm, - u32 frontbuffer_bits); -int i915_overlay_continue(struct drm_device *drm, - struct i915_vma *vma, - bool load_polyphase_filter); -int i915_overlay_off(struct drm_device *drm); -int i915_overlay_recover_from_interrupt(struct drm_device *drm); -int i915_overlay_release_old_vid(struct drm_device *drm); - -void i915_overlay_reset(struct drm_device *drm); - -struct i915_vma *i915_overlay_pin_fb(struct drm_device *drm, - struct drm_gem_object *obj, - u32 *offset); -void i915_overlay_unpin_fb(struct drm_device *drm, - struct i915_vma *vma); - -struct drm_gem_object * -i915_overlay_obj_lookup(struct drm_device *drm, - struct drm_file *file_priv, - u32 handle); - -void __iomem *i915_overlay_setup(struct drm_device *drm, - bool needs_physical); -void i915_overlay_cleanup(struct drm_device *drm); +extern const struct intel_display_overlay_interface i915_display_overlay_interface; #endif /* __I915_OVERLAY_H__ */ diff --git a/drivers/gpu/drm/xe/Makefile 
b/drivers/gpu/drm/xe/Makefile index a630466b3d72..c4fb9f13371a 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -303,6 +303,7 @@ xe-$(CONFIG_DRM_XE_DISPLAY) += \ i915-display/intel_modeset_lock.o \ i915-display/intel_modeset_setup.o \ i915-display/intel_modeset_verify.o \ + i915-display/intel_overlay.o \ i915-display/intel_panel.o \ i915-display/intel_parent.o \ i915-display/intel_pch.o \ diff --git a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index 50da825ec06c..b4b0f58ae3ee 100644 --- a/include/drm/intel/display_parent_interface.h +++ b/include/drm/intel/display_parent_interface.h @@ -9,6 +9,7 @@ struct dma_fence; struct drm_crtc; struct drm_device; +struct drm_file; struct drm_framebuffer; struct drm_gem_object; struct drm_plane_state; @@ -63,6 +64,35 @@ struct intel_display_irq_interface { void (*synchronize)(struct drm_device *drm); }; +struct intel_display_overlay_interface { + bool (*is_active)(struct drm_device *drm); + + int (*overlay_on)(struct drm_device *drm, + u32 frontbuffer_bits); + int (*overlay_continue)(struct drm_device *drm, + struct i915_vma *vma, + bool load_polyphase_filter); + int (*overlay_off)(struct drm_device *drm); + int (*recover_from_interrupt)(struct drm_device *drm); + int (*release_old_vid)(struct drm_device *drm); + + void (*reset)(struct drm_device *drm); + + struct i915_vma *(*pin_fb)(struct drm_device *drm, + struct drm_gem_object *obj, + u32 *offset); + void (*unpin_fb)(struct drm_device *drm, + struct i915_vma *vma); + + struct drm_gem_object *(*obj_lookup)(struct drm_device *drm, + struct drm_file *filp, + u32 handle); + + void __iomem *(*setup)(struct drm_device *drm, + bool needs_physical); + void (*cleanup)(struct drm_device *drm); +}; + struct intel_display_panic_interface { struct intel_panic *(*alloc)(void); int (*setup)(struct intel_panic *panic, struct drm_scanout_buffer *sb); @@ -150,6 +180,9 @@ struct intel_display_parent_interface { /** 
@panic: Panic interface */ const struct intel_display_panic_interface *panic; + /** @overlay: Overlay. Optional. */ + const struct intel_display_overlay_interface *overlay; + /** @pc8: PC8 interface. Optional. */ const struct intel_display_pc8_interface *pc8; -- cgit v1.2.3 From b2d51bc1601c762c63f19c119589a0a0c44bc8ec Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 26 Feb 2026 10:20:23 +0100 Subject: gpio: generic: Don't use 'proxy' headers Update header inclusions to follow IWYU (Include What You Use) principle. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://patch.msgid.link/20260226092023.4096921-1-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mmio.c | 4 +--- include/linux/gpio/generic.h | 8 +++++++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index edbcaad57d00..0941d034a49c 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -42,18 +42,16 @@ o ` ~~~~\___/~~~~ ` controller in FPGA is ,.` #include #include -#include #include -#include #include #include +#include #include #include #include #include #include #include -#include #include #include diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index ff566dc9c3cb..de43c06c83ef 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -3,9 +3,15 @@ #ifndef __LINUX_GPIO_GENERIC_H #define __LINUX_GPIO_GENERIC_H +#include +#include #include -#include +#include +#include #include +#include + +#include struct device; -- cgit v1.2.3 From fa4a3a95139e7293c1333a33bd7b19e7261e3bd0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 18:20:06 +0100 Subject: gpio: introduce a header for symbols shared by suppliers and consumers GPIO_LINE_DIRECTION_IN/OUT definitions are used both in supplier (GPIO controller drivers) as well as consumer code. 
In order to not force the consumers to include gpio/driver.h or - even worse - to redefine these values, create a new header file - gpio/defs.h - and move them over there. Include this header from both gpio/consumer.h and gpio/driver.h. Reviewed-by: Linus Walleij Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20260223172006.204268-1-bartosz.golaszewski@oss.qualcomm.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 2 ++ include/linux/gpio/defs.h | 9 +++++++++ include/linux/gpio/driver.h | 5 ++--- 3 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 include/linux/gpio/defs.h (limited to 'include') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 0d8408582918..3efb5cb1e1d1 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -6,6 +6,8 @@ #include #include +#include "defs.h" + struct acpi_device; struct device; struct fwnode_handle; diff --git a/include/linux/gpio/defs.h b/include/linux/gpio/defs.h new file mode 100644 index 000000000000..b69fd7c041b2 --- /dev/null +++ b/include/linux/gpio/defs.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_GPIO_DEFS_H +#define __LINUX_GPIO_DEFS_H + +#define GPIO_LINE_DIRECTION_IN 1 +#define GPIO_LINE_DIRECTION_OUT 0 + +#endif /* __LINUX_GPIO_DEFS_H */ diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index fabe2baf7b50..5f5ddcbfa445 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -20,6 +20,8 @@ #include #endif +#include "defs.h" + struct device; struct irq_chip; struct irq_data; @@ -42,9 +44,6 @@ union gpio_irq_fwspec { #endif }; -#define GPIO_LINE_DIRECTION_IN 1 -#define GPIO_LINE_DIRECTION_OUT 0 - /** * struct gpio_irq_chip - GPIO interrupt controller */ -- cgit v1.2.3 From 0a93d30861617ecf207dcc4c6c736435fac36dae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:35:42 +0100 Subject: 
hrtimer: Provide a static branch based hrtimer_hres_enabled() The scheduler evaluates this via hrtimer_is_hres_active() every time it has to update HRTICK. This needs to follow three pointers, which is expensive. Provide a static branch based mechanism to avoid that. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.136503358@kernel.org --- include/linux/hrtimer.h | 13 +++++++++---- kernel/time/hrtimer.c | 28 +++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 74adbd4e7003..c9ca105ba009 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -153,17 +153,22 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) } #ifdef CONFIG_HIGH_RES_TIMERS +extern unsigned int hrtimer_resolution; struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); -extern unsigned int hrtimer_resolution; +extern struct static_key_false hrtimer_highres_enabled_key; -#else +static inline bool hrtimer_highres_enabled(void) +{ + return static_branch_likely(&hrtimer_highres_enabled_key); +} +#else /* CONFIG_HIGH_RES_TIMERS */ #define hrtimer_resolution (unsigned int)LOW_RES_NSEC - -#endif +static inline bool hrtimer_highres_enabled(void) { return false; } +#endif /* !CONFIG_HIGH_RES_TIMERS */ static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3088db419aa6..67917ce696d4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -126,6 +126,25 @@ static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) return likely(base->online); } +#ifdef CONFIG_HIGH_RES_TIMERS +DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key); + +static void hrtimer_hres_workfn(struct work_struct *work) +{ + 
static_branch_enable(&hrtimer_highres_enabled_key); +} + +static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn); + +static inline void hrtimer_schedule_hres_work(void) +{ + if (!hrtimer_highres_enabled()) + schedule_work(&hrtimer_hres_work); +} +#else +static inline void hrtimer_schedule_hres_work(void) { } +#endif + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -649,7 +668,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) } /* - * Is the high resolution mode active ? + * Is the high resolution mode active in the CPU base. This cannot use the + * static key as the CPUs are switched to high resolution mode + * asynchronously. */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { @@ -750,6 +771,7 @@ static void hrtimer_switch_to_hres(void) tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); + hrtimer_schedule_hres_work(); } #else @@ -947,11 +969,10 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, */ void clock_was_set(unsigned int bases) { - struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; - if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active()) + if (!hrtimer_highres_enabled() && !tick_nohz_is_active()) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -962,6 +983,7 @@ void clock_was_set(unsigned int bases) /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { + struct hrtimer_cpu_base *cpu_base; unsigned long flags; cpu_base = &per_cpu(hrtimer_bases, cpu); -- cgit v1.2.3 From c3a92213eb3dd8ea6f664d16a08eda800e34eaad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:35:47 +0100 Subject: sched: Use hrtimer_highres_enabled() Use the static branch based variant and thereby avoid following three pointers. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.203610956@kernel.org --- include/linux/hrtimer.h | 6 ------ kernel/sched/sched.h | 37 +++++++++---------------------------- 2 files changed, 9 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c9ca105ba009..b5003856fd60 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -146,12 +146,6 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } -static inline int hrtimer_is_hres_active(struct hrtimer *timer) -{ - return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? - timer->base->cpu_base->hres_active : 0; -} - #ifdef CONFIG_HIGH_RES_TIMERS extern unsigned int hrtimer_resolution; struct clock_event_device; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 73bc20c47631..0aa089dfaaa4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3019,25 +3019,19 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; * - enabled by features * - hrtimer is actually high res */ -static inline int hrtick_enabled(struct rq *rq) +static inline bool hrtick_enabled(struct rq *rq) { - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); + return cpu_active(cpu_of(rq)) && hrtimer_highres_enabled(); } -static inline int hrtick_enabled_fair(struct rq *rq) +static inline bool hrtick_enabled_fair(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; - return hrtick_enabled(rq); + return sched_feat(HRTICK) && hrtick_enabled(rq); } -static inline int hrtick_enabled_dl(struct rq *rq) +static inline bool hrtick_enabled_dl(struct rq *rq) { - if (!sched_feat(HRTICK_DL)) - return 0; - return hrtick_enabled(rq); + return sched_feat(HRTICK_DL) && hrtick_enabled(rq); } extern void hrtick_start(struct rq *rq, u64 delay); @@ -3047,22 +3041,9 @@ static 
inline bool hrtick_active(struct rq *rq) } #else /* !CONFIG_SCHED_HRTICK: */ - -static inline int hrtick_enabled_fair(struct rq *rq) -{ - return 0; -} - -static inline int hrtick_enabled_dl(struct rq *rq) -{ - return 0; -} - -static inline int hrtick_enabled(struct rq *rq) -{ - return 0; -} - +static inline bool hrtick_enabled_fair(struct rq *rq) { return false; } +static inline bool hrtick_enabled_dl(struct rq *rq) { return false; } +static inline bool hrtick_enabled(struct rq *rq) { return false; } #endif /* !CONFIG_SCHED_HRTICK */ #ifndef arch_scale_freq_tick -- cgit v1.2.3 From b7dd64778aa3f89de9afa1e81171cfe110ddc525 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:36:01 +0100 Subject: hrtimer: Provide LAZY_REARM mode The hrtick timer is frequently rearmed before expiry and most of the time the new expiry is past the armed one. As this happens on every context switch it becomes expensive with scheduling heavy work loads especially in virtual machines as the "hardware" reprogramming implies a VM exit. Add a lazy rearm mode flag which skips the reprogramming if: 1) The timer was the first expiring timer before the rearm 2) The new expiry time is farther out than the armed time This avoids a massive amount of reprogramming operations of the hrtick timer for the price of eventually taking the already armed interrupt for nothing. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.408524456@kernel.org --- include/linux/hrtimer.h | 8 ++++++++ include/linux/hrtimer_types.h | 3 +++ kernel/time/hrtimer.c | 17 ++++++++++++++++- 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index b5003856fd60..c924bb2498db 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -31,6 +31,13 @@ * soft irq context * HRTIMER_MODE_HARD - Timer callback function will be executed in * hard irq context even on PREEMPT_RT. + * HRTIMER_MODE_LAZY_REARM - Avoid reprogramming if the timer was the + * first expiring timer and is moved into the + * future. Special mode for the HRTICK timer to + * avoid extensive reprogramming of the hardware, + * which is expensive in virtual machines. Risks + * a pointless expiry, but that's better than + * reprogramming on every context switch, */ enum hrtimer_mode { HRTIMER_MODE_ABS = 0x00, @@ -38,6 +45,7 @@ enum hrtimer_mode { HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, HRTIMER_MODE_HARD = 0x08, + HRTIMER_MODE_LAZY_REARM = 0x10, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 8fbbb6bdf7a1..64381c64cdbd 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -33,6 +33,8 @@ enum hrtimer_restart { * @is_soft: Set if hrtimer will be expired in soft interrupt context. * @is_hard: Set if hrtimer will be expired in hard interrupt context * even on RT. 
+ * @is_lazy: Set if the timer is frequently rearmed to avoid updates + * of the clock event device * * The hrtimer structure must be initialized by hrtimer_setup() */ @@ -45,6 +47,7 @@ struct hrtimer { u8 is_rel; u8 is_soft; u8 is_hard; + u8 is_lazy; }; #endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 67917ce696d4..e54f8b59f6b4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1152,7 +1152,7 @@ static void __remove_hrtimer(struct hrtimer *timer, * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ - if (reprogram && timer == cpu_base->next_timer) + if (reprogram && timer == cpu_base->next_timer && !timer->is_lazy) hrtimer_force_reprogram(cpu_base, 1); } @@ -1321,6 +1321,20 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, return 0; } + /* + * Special case for the HRTICK timer. It is frequently rearmed and most + * of the time moves the expiry into the future. That's expensive in + * virtual machines and it's better to take the pointless already armed + * interrupt than reprogramming the hardware on every context switch. + * + * If the new expiry is before the armed time, then reprogramming is + * required. + */ + if (timer->is_lazy) { + if (new_base->cpu_base->expires_next <= hrtimer_get_expires(timer)) + return 0; + } + /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. 
The only real use case for this is the hrtimer based broadcast device.
*/ - if (dev->features & CLOCK_EVT_FEAT_KTIME) + /* ktime_t based reprogramming for the broadcast hrtimer device */ + if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER)) return dev->set_next_ktime(expires, dev); delta = ktime_to_ns(ktime_sub(expires, ktime_get())); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index a88b72b0f35e..51f6a1032c83 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -78,7 +78,6 @@ static struct clock_event_device ce_broadcast_hrtimer = { .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | - CLOCK_EVT_FEAT_KTIME | CLOCK_EVT_FEAT_HRTIMER, .rating = 0, .bound_on = -1, -- cgit v1.2.3 From 2e27beeb66e43f3b84aef5a07e486a5d50695c06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:20 +0100 Subject: timekeeping: Allow inlining clocksource::read() On some architectures clocksource::read() boils down to a single instruction, so the indirect function call is just a massive overhead especially with speculative execution mitigations in effect. Allow architectures to enable conditional inlining of that read to avoid that by: - providing a static branch to switch to the inlined variant - disabling the branch before clocksource changes - enabling the branch after a clocksource change, when the clocksource indicates in a feature flag that it is the one which provides the inlined variant This is intentionally not a static call as that would only remove the indirect call, but not the rest of the overhead. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.675151545@kernel.org --- include/linux/clocksource.h | 2 ++ kernel/time/Kconfig | 3 ++ kernel/time/timekeeping.c | 74 +++++++++++++++++++++++++++++++++------------ 3 files changed, 60 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 65b7c41471c3..54366d5c4d19 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -149,6 +149,8 @@ struct clocksource { #define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80 #define CLOCK_SOURCE_RESELECT 0x100 #define CLOCK_SOURCE_VERIFY_PERCPU 0x200 +#define CLOCK_SOURCE_CAN_INLINE_READ 0x400 + /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 7c6a52f7836c..07b048ba0cca 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -17,6 +17,9 @@ config ARCH_CLOCKSOURCE_DATA config ARCH_CLOCKSOURCE_INIT bool +config ARCH_WANTS_CLOCKSOURCE_READ_INLINE + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91fa2003351c..63aa31f02ebc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3,34 +3,30 @@ * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. 
*/ -#include -#include -#include +#include +#include +#include +#include #include -#include -#include -#include +#include #include -#include -#include +#include +#include #include +#include +#include +#include #include -#include -#include +#include #include #include -#include -#include -#include -#include -#include -#include +#include #include #include "tick-internal.h" -#include "ntp_internal.h" #include "timekeeping_internal.h" +#include "ntp_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_CLOCK_WAS_SET (1 << 1) @@ -275,6 +271,11 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } +#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE +#include + +static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined); + /* * tk_clock_read - atomic clocksource read() helper * @@ -288,13 +289,36 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ -static inline u64 tk_clock_read(const struct tk_read_base *tkr) +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); + if (static_branch_likely(&clocksource_read_inlined)) + return arch_inlined_clocksource_read(clock); + return clock->read(clock); } +static inline void clocksource_disable_inline_read(void) +{ + static_branch_disable(&clocksource_read_inlined); +} + +static inline void clocksource_enable_inline_read(void) +{ + static_branch_enable(&clocksource_read_inlined); +} +#else +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + return clock->read(clock); +} +static inline void clocksource_disable_inline_read(void) { } +static inline void clocksource_enable_inline_read(void) { } +#endif + /** * tk_setup_internals - Set up internals to use clocksource clock. 
That requires converting the absolute expiry time provided by the caller (hrtimers, NOHZ code) to a relative expiry time by reading and subtracting the current time.
The clockevent::set_next_event() callback must then read the counter again to convert the relative expiry back into an absolute one. 2) The conversion factors from nanoseconds to counter clock cycles are set up when the clockevent is registered. When NTP applies corrections then the clockevent conversion factors can deviate from the clocksource conversion substantially which either results in timers firing late or in the worst case early. The early expiry then needs to do a reprogram with a short delta.
+ * @cs_ns_to_cyc_maxns: Maximum nanoseconds to cycles conversion range
+ * Aim for a one hour maximum delta and use KHz to handle
*/ @@ -720,6 +734,36 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } +static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc) +{ + struct tk_read_base *tkrs = &tks->tkr_mono; + struct tk_read_base *tkrc = &tkc->tkr_mono; + unsigned int shift; + + if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) || + !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) + return; + + if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift) + return; + /* + * The conversion math is simple: + * + * CS::MULT (1 << NS_TO_CYC_SHIFT) + * --------------- = ---------------------- + * (1 << CS:SHIFT) NS_TO_CYC_MULT + * + * Ergo: + * + * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT + * + * NS_TO_CYC_SHIFT has been set up in tk_setup_internals() + */ + shift = tkrs->shift + tks->cs_ns_to_cyc_shift; + tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult); + tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult); +} + /* * Restore the shadow timekeeper from the real timekeeper. 
*/ @@ -754,6 +798,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; if (tk->id == TIMEKEEPER_CORE) { + tk_update_ns_to_cyc(tk, &tkd->timekeeper); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); @@ -808,6 +853,71 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_update_coarse_nsecs(tk); } +/* + * ktime_expiry_to_cycles - Convert a expiry time to clocksource cycles + * @id: Clocksource ID which is required for validity + * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted + * @cycles: Pointer to storage for corresponding absolute cycles value + * + * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value + * based on the correlated clocksource of the clockevent device by using + * the base nanoseconds and cycles values of the last timekeeper update and + * converting the delta between @expires_ns and base nanoseconds to cycles. + * + * This only works for clockevent devices which are using a less than or + * equal comparator against the clocksource. + * + * Utilizing this avoids two clocksource reads for such devices, the + * ktime_get() in clockevents_program_event() to calculate the delta expiry + * value and the readout in the device::set_next_event() callback to + * convert the delta back to a absolute comparator value. + * + * Returns: True if @id matches the current clocksource ID, false otherwise + */ +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct tk_read_base *tkrm = &tk->tkr_mono; + ktime_t base_ns, delta_ns, max_ns; + u64 base_cycles, delta_cycles; + unsigned int seq; + u32 mult, shift; + + /* + * Racy check to avoid the seqcount overhead when ID does not match. 
If + * the relevant clocksource is installed concurrently, then this will + * just delay the switch over to this mechanism until the next event is + * programmed. If the ID is not matching the clock events code will use + * the regular relative set_next_event() callback as before. + */ + if (data_race(tk->cs_id) != id) + return false; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + if (tk->cs_id != id) + return false; + + base_cycles = tkrm->cycle_last; + base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift); + + mult = tk->cs_ns_to_cyc_mult; + shift = tk->cs_ns_to_cyc_shift; + max_ns = tk->cs_ns_to_cyc_maxns; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + /* Prevent negative deltas and multiplication overflows */ + delta_ns = min(expires_ns - base_ns, max_ns); + delta_ns = max(delta_ns, 0); + + /* Convert to cycles */ + delta_cycles = ((u64)delta_ns * mult) >> shift; + *cycles = base_cycles + delta_cycles; + return true; +} + /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. 
The timekeeping core provides a function to convert an absolute CLOCK_MONOTONIC based expiry time to an absolute clock cycles time which can be directly fed into the comparator. That spares two time reads in the next event programming path, one to convert the absolute nanoseconds time to a delta value and the other to convert the delta value back to an absolute time value suitable for the comparator.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.010425428@kernel.org --- include/linux/clockchips.h | 7 +++++-- kernel/time/Kconfig | 4 ++++ kernel/time/clockevents.c | 44 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 48 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 5e8f7819f6a6..92d90220c0d4 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -43,8 +43,9 @@ enum clock_event_state { /* * Clock event features */ -# define CLOCK_EVT_FEAT_PERIODIC 0x000001 -# define CLOCK_EVT_FEAT_ONESHOT 0x000002 +# define CLOCK_EVT_FEAT_PERIODIC 0x000001 +# define CLOCK_EVT_FEAT_ONESHOT 0x000002 +# define CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED 0x000004 /* * x86(64) specific (mis)features: @@ -100,6 +101,7 @@ struct clock_event_device { void (*event_handler)(struct clock_event_device *); int (*set_next_event)(unsigned long evt, struct clock_event_device *); int (*set_next_ktime)(ktime_t expires, struct clock_event_device *); + void (*set_next_coupled)(u64 cycles, struct clock_event_device *); ktime_t next_event; u64 max_delta_ns; u64 min_delta_ns; @@ -107,6 +109,7 @@ struct clock_event_device { u32 shift; enum clock_event_state state_use_accessors; unsigned int features; + enum clocksource_ids cs_id; unsigned long retries; int (*set_state_periodic)(struct clock_event_device *); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b51bc5625129..e1968ab8b37f 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -50,6 +50,10 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST config GENERIC_CLOCKEVENTS_COUPLED bool +config GENERIC_CLOCKEVENTS_COUPLED_INLINE + select GENERIC_CLOCKEVENTS_COUPLED + bool + # Generic update of CMOS clock config GENERIC_CMOS_UPDATE bool diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5abaeef08e6a..83712aa1d385 100644 --- a/kernel/time/clockevents.c 
+arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { }
clockevents_program_min_delta(dev) : -ETIME; @@ -330,8 +364,8 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, delta = min(delta, (int64_t) dev->max_delta_ns); delta = max(delta, (int64_t) dev->min_delta_ns); - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - rc = dev->set_next_event((unsigned long) clc, dev); + cycles = ((u64)delta * dev->mult) >> dev->shift; + rc = dev->set_next_event((unsigned long) cycles, dev); return (rc && force) ? clockevents_program_min_delta(dev) : rc; } -- cgit v1.2.3 From f2e388a019e4cf83a15883a3d1f1384298e9a6aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:59 +0100 Subject: hrtimer: Reduce trace noise in hrtimer_start() hrtimer_start() when invoked with an already armed timer traces like: -.. [032] d.h2. 5.002263: hrtimer_cancel: hrtimer= .... -.. [032] d.h1. 5.002263: hrtimer_start: hrtimer= .... Which is incorrect as the timer doesn't get canceled. Just the expiry time changes. The internal dequeue operation which is required for that is not really interesting for trace analysis. But it makes it tedious to keep real cancellations and the above case apart. Remove the cancel tracing in hrtimer_start() and add a 'was_armed' indicator to the hrtimer start tracepoint, which clearly indicates what the state of the hrtimer is when hrtimer_start() is invoked: -.. [032] d.h1. 6.200103: hrtimer_start: hrtimer= .... was_armed=0 -.. [032] d.h1. 6.200558: hrtimer_start: hrtimer= .... 
was_armed=1 Fixes: c6a2a1770245 ("hrtimer: Add tracepoint for hrtimers") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.208491877@kernel.org --- include/trace/events/timer.h | 11 +++++++---- kernel/time/hrtimer.c | 43 ++++++++++++++++++++----------------------- 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 1641ae3e6ca0..ab9a9386f7b6 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -218,12 +218,13 @@ TRACE_EVENT(hrtimer_setup, * hrtimer_start - called when the hrtimer is started * @hrtimer: pointer to struct hrtimer * @mode: the hrtimers mode + * @was_armed: Was armed when hrtimer_start*() was invoked */ TRACE_EVENT(hrtimer_start, - TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode), + TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode, bool was_armed), - TP_ARGS(hrtimer, mode), + TP_ARGS(hrtimer, mode, was_armed), TP_STRUCT__entry( __field( void *, hrtimer ) @@ -231,6 +232,7 @@ TRACE_EVENT(hrtimer_start, __field( s64, expires ) __field( s64, softexpires ) __field( enum hrtimer_mode, mode ) + __field( bool, was_armed ) ), TP_fast_assign( @@ -239,13 +241,14 @@ TRACE_EVENT(hrtimer_start, __entry->expires = hrtimer_get_expires(hrtimer); __entry->softexpires = hrtimer_get_softexpires(hrtimer); __entry->mode = mode; + __entry->was_armed = was_armed; ), TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu " - "mode=%s", __entry->hrtimer, __entry->function, + "mode=%s was_armed=%d", __entry->hrtimer, __entry->function, (unsigned long long) __entry->expires, (unsigned long long) __entry->softexpires, - decode_hrtimer_mode(__entry->mode)) + decode_hrtimer_mode(__entry->mode), __entry->was_armed) ); /** diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa63e0b20455..6e4ac8dea312 100644 --- a/kernel/time/hrtimer.c +++ 
b/kernel/time/hrtimer.c @@ -529,17 +529,10 @@ static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid trace_hrtimer_setup(timer, clockid, mode); } -static inline void debug_activate(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed) { debug_hrtimer_activate(timer, mode); - trace_hrtimer_start(timer, mode); -} - -static inline void debug_deactivate(struct hrtimer *timer) -{ - debug_hrtimer_deactivate(timer); - trace_hrtimer_cancel(timer); + trace_hrtimer_start(timer, mode, was_armed); } static struct hrtimer_clock_base * @@ -1137,9 +1130,9 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * Returns true when the new timer is the leftmost timer in the tree. */ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - enum hrtimer_mode mode) + enum hrtimer_mode mode, bool was_armed) { - debug_activate(timer, mode); + debug_activate(timer, mode, was_armed); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; @@ -1199,6 +1192,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, if (state & HRTIMER_STATE_ENQUEUED) { bool reprogram; + debug_hrtimer_deactivate(timer); + /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current @@ -1207,7 +1202,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. 
*/ - debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* @@ -1274,15 +1268,15 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, { struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; - bool force_local, first; + bool force_local, first, was_armed; /* * If the timer is on the local cpu base and is the first expiring * timer then this might end up reprogramming the hardware twice - * (on removal and on enqueue). To avoid that by prevent the - * reprogram on removal, keep the timer local to the current CPU - * and enforce reprogramming after it is queued no matter whether - * it is the new first expiring timer again or not. + * (on removal and on enqueue). To avoid that prevent the reprogram + * on removal, keep the timer local to the current CPU and enforce + * reprogramming after it is queued no matter whether it is the new + * first expiring timer again or not. */ force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; @@ -1304,7 +1298,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * avoids programming the underlying clock event twice (once at * removal and once after enqueue). 
*/ - remove_hrtimer(timer, base, true, force_local); + was_armed = remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); @@ -1321,7 +1315,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, new_base = base; } - first = enqueue_hrtimer(timer, new_base, mode); + first = enqueue_hrtimer(timer, new_base, mode, was_armed); /* * If the hrtimer interrupt is running, then it will reevaluate the @@ -1439,8 +1433,11 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); - if (!hrtimer_callback_running(timer)) + if (!hrtimer_callback_running(timer)) { ret = remove_hrtimer(timer, base, false, false); + if (ret) + trace_hrtimer_cancel(timer); + } unlock_hrtimer_base(timer, &flags); @@ -1877,7 +1874,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) - enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false); /* * Separate the ->running assignment from the ->state assignment. @@ -2356,7 +2353,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); - debug_deactivate(timer); + debug_hrtimer_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the @@ -2373,7 +2370,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * sort out already expired timers and reprogram the * event device. 
*/ - enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true); } } -- cgit v1.2.3 From 7d27eafe54659d19cef10dab4520cbcdfb17b0e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:18 +0100 Subject: hrtimer: Replace the bitfield in hrtimer_cpu_base Use bool for the various flags as that creates better code in the hot path. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.475262618@kernel.org --- include/linux/hrtimer_defs.h | 10 +++++----- kernel/time/hrtimer.c | 25 +++++++++++++------------ 2 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 02b010df6570..f9fbf9a48f59 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -83,11 +83,11 @@ struct hrtimer_cpu_base { unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; - unsigned int hres_active : 1, - in_hrtirq : 1, - hang_detected : 1, - softirq_activated : 1, - online : 1; + bool hres_active; + bool in_hrtirq; + bool hang_detected; + bool softirq_activated; + bool online; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e6f02e980371..3b80a4453ee6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -741,7 +741,7 @@ static void hrtimer_switch_to_hres(void) pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } - base->hres_active = 1; + base->hres_active = true; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); @@ -1854,7 +1854,7 @@ static __latent_entropy void hrtimer_run_softirq(void) now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); - cpu_base->softirq_activated = 0; + cpu_base->softirq_activated = false; 
hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1881,7 +1881,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: - cpu_base->in_hrtirq = 1; + cpu_base->in_hrtirq = true; /* * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue * timers while __hrtimer_run_queues() is expiring the clock bases. @@ -1892,7 +1892,7 @@ retry: if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } @@ -1905,12 +1905,12 @@ retry: * against it. */ cpu_base->expires_next = expires_next; - cpu_base->in_hrtirq = 0; + cpu_base->in_hrtirq = false; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { - cpu_base->hang_detected = 0; + cpu_base->hang_detected = false; return; } @@ -1939,7 +1939,7 @@ retry: * time away. 
*/ cpu_base->nr_hangs++; - cpu_base->hang_detected = 1; + cpu_base->hang_detected = true; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); @@ -1987,7 +1987,7 @@ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } @@ -2239,13 +2239,14 @@ int hrtimers_cpu_starting(unsigned int cpu) /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; - cpu_base->hres_active = 0; - cpu_base->hang_detected = 0; + cpu_base->hres_active = false; + cpu_base->hang_detected = false; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->online = 1; + cpu_base->softirq_activated = false; + cpu_base->online = true; return 0; } @@ -2303,7 +2304,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); - old_base->online = 0; + old_base->online = false; raw_spin_unlock(&old_base->lock); return 0; -- cgit v1.2.3 From 22f011be7aaa77ca8f502b9dd07b7334f9965d18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:23 +0100 Subject: hrtimer: Convert state and properties to boolean All 'u8' flags are true booleans, so make it entirely clear that these can only contain true or false. This is especially true for hrtimer::state, which has a historical leftover of using the state with bitwise operations. That was used in the early hrtimer implementation with several bits, but then converted to a boolean state. But that conversion missed to replace the bit OR and bit check operations all over the place, which creates suboptimal code. 
As of today 'state' is a misnomer because its only purpose is to reflect whether the timer is enqueued into the RB-tree or not. Rename it to 'is_queued' and make all operations on it boolean. This reduces text size from 8926 to 8732 bytes. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.542427240@kernel.org --- include/linux/hrtimer.h | 31 ++--------------------- include/linux/hrtimer_types.h | 12 ++++----- kernel/time/hrtimer.c | 58 +++++++++++++++++++++++++++++-------------- kernel/time/timer_list.c | 2 +- 4 files changed, 49 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c924bb2498db..4ad4a454b4c5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -63,33 +63,6 @@ enum hrtimer_mode { HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; -/* - * Values to track state of the timer - * - * Possible states: - * - * 0x00 inactive - * 0x01 enqueued into rbtree - * - * The callback state is not part of the timer->state because clearing it would - * mean touching the timer after the callback, this makes it impossible to free - * the timer from the callback function. - * - * Therefore we track the callback state in: - * - * timer->base->cpu_base->running == timer - * - * On SMP it is possible to have a "callback function running and enqueued" - * status. It happens for example when a posix timer expired and the callback - * queued a signal. Between dropping the lock which protects the posix timer - * and reacquiring the base lock of the hrtimer, another CPU can deliver the - * signal and rearm the timer. - * - * All state transitions are protected by cpu_base->lock.
- */ -#define HRTIMER_STATE_INACTIVE 0x00 -#define HRTIMER_STATE_ENQUEUED 0x01 - /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure @@ -300,8 +273,8 @@ extern bool hrtimer_active(const struct hrtimer *timer); */ static inline bool hrtimer_is_queued(struct hrtimer *timer) { - /* The READ_ONCE pairs with the update functions of timer->state */ - return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); + /* The READ_ONCE pairs with the update functions of timer->is_queued */ + return READ_ONCE(timer->is_queued); } /* diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 64381c64cdbd..0e22bc91d00f 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -28,7 +28,7 @@ enum hrtimer_restart { * was armed. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) - * @state: state information (See bit values above) + * @is_queued: Indicates whether a timer is enqueued or not * @is_rel: Set if the timer was armed relative * @is_soft: Set if hrtimer will be expired in soft interrupt context. 
* @is_hard: Set if hrtimer will be expired in hard interrupt context @@ -43,11 +43,11 @@ struct hrtimer { ktime_t _softexpires; enum hrtimer_restart (*__private function)(struct hrtimer *); struct hrtimer_clock_base *base; - u8 state; - u8 is_rel; - u8 is_soft; - u8 is_hard; - u8 is_lazy; + bool is_queued; + bool is_rel; + bool is_soft; + bool is_hard; + bool is_lazy; }; #endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3b80a4453ee6..6bab3b7eb0de 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -49,6 +49,28 @@ #include "tick-internal.h" +/* + * Constants to set the queued state of the timer (INACTIVE, ENQUEUED) + * + * The callback state is kept separate in the CPU base because having it in + * the timer would required touching the timer after the callback, which + * makes it impossible to free the timer from the callback function. + * + * Therefore we track the callback state in: + * + * timer->base->cpu_base->running == timer + * + * On SMP it is possible to have a "callback function running and enqueued" + * status. It happens for example when a posix timer expired and the callback + * queued a signal. Between dropping the lock which protects the posix timer + * and reacquiring the base lock of the hrtimer, another CPU can deliver the + * signal and rearm the timer. + * + * All state transitions are protected by cpu_base->lock. + */ +#define HRTIMER_STATE_INACTIVE false +#define HRTIMER_STATE_ENQUEUED true + /* * The resolution of the clocks. 
The resolution value is returned in * the clock_getres() system call to give application programmers an @@ -1038,7 +1060,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta < 0) return 0; - if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + if (WARN_ON(timer->is_queued)) return 0; if (interval < hrtimer_resolution) @@ -1082,7 +1104,7 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -1096,18 +1118,18 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - u8 newstate, bool reprogram) + bool newstate, bool reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; lockdep_assert_held(&cpu_base->lock); - /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, newstate); - if (!(state & HRTIMER_STATE_ENQUEUED)) + if (!timer->is_queued) return; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->is_queued, newstate); + if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); @@ -1127,11 +1149,11 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { - u8 state = timer->state; + bool queued_state = timer->is_queued; lockdep_assert_held(&base->cpu_base->lock); - if (state & HRTIMER_STATE_ENQUEUED) { + if (queued_state) { bool reprogram; debug_hrtimer_deactivate(timer); @@ -1153,11 +1175,11 @@ static inline 
bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_ba * and a moment later when it's requeued). */ if (!restart) - state = HRTIMER_STATE_INACTIVE; + queued_state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; - __remove_hrtimer(timer, base, state, reprogram); + __remove_hrtimer(timer, base, queued_state, reprogram); return true; } return false; @@ -1704,7 +1726,7 @@ bool hrtimer_active(const struct hrtimer *timer) base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) + if (timer->is_queued || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); @@ -1721,7 +1743,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * - * On the read side we ensure we observe timer->state and cpu_base->running + * On the read side we ensure we observe timer->is_queued and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. @@ -1744,11 +1766,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_cloc base->running = timer; /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); @@ -1787,15 +1809,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_cloc * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. 
*/ - if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) + if (restart == HRTIMER_RESTART && !timer->is_queued) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false); /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 488e47e96e93..19e61826b7de 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,7 +47,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); - SEQ_printf(m, ", S:%02x", timer->state); + SEQ_printf(m, ", S:%02x", timer->is_queued); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), -- cgit v1.2.3 From 8e10f6b81afbf60e48bb4a71676ede4c7e374e79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:43 +0100 Subject: hrtimer: Add hrtimer_rearm tracepoint Analyzing the reprogramming of the clock event device is essential to debug the behaviour of the hrtimer subsystem especially with the upcoming deferred rearming scheme. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.803669745@kernel.org --- include/trace/events/timer.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index ab9a9386f7b6..a54613f59e55 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -324,6 +324,30 @@ DEFINE_EVENT(hrtimer_class, hrtimer_cancel, TP_ARGS(hrtimer) ); +/** + * hrtimer_rearm - Invoked when the clockevent device is rearmed + * @next_event: The next expiry time (CLOCK_MONOTONIC) + */ +TRACE_EVENT(hrtimer_rearm, + + TP_PROTO(ktime_t next_event, bool deferred), + + TP_ARGS(next_event, deferred), + + TP_STRUCT__entry( + __field( s64, next_event ) + __field( bool, deferred ) + ), + + TP_fast_assign( + __entry->next_event = next_event; + __entry->deferred = deferred; + ), + + TP_printk("next_event=%llu deferred=%d", + (unsigned long long) __entry->next_event, __entry->deferred) +); + /** * itimer_state - called when itimer is started or canceled * @which: name of the interval timer -- cgit v1.2.3 From 9e07a9c980eaa93fd1bba722d31eeb4bf0cbbfb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:53 +0100 Subject: hrtimer: Rename hrtimer_cpu_base::in_hrtirq to deferred_rearm The upcoming deferred rearming scheme has the same effect as the deferred rearming when the hrtimer interrupt is executing. So it can reuse the in_hrtirq flag, but when it gets deferred beyond the hrtimer interrupt path, then the name does not make sense anymore. Rename it to deferred_rearm upfront to keep the actual functional change separate from the mechanical rename churn. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.935623347@kernel.org --- include/linux/hrtimer_defs.h | 4 ++-- kernel/time/hrtimer.c | 28 +++++++++------------------- 2 files changed, 11 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index f9fbf9a48f59..2c3bdbd562d2 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -53,7 +53,7 @@ enum hrtimer_base_type { * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events * @hres_active: State of high resolution mode - * @in_hrtirq: hrtimer_interrupt() is currently executing + * @deferred_rearm: A deferred rearm is pending * @hang_detected: The last hrtimer interrupt detected a hang * @softirq_activated: displays, if the softirq is raised - update of softirq * related settings is not required then. @@ -84,7 +84,7 @@ struct hrtimer_cpu_base { unsigned int active_bases; unsigned int clock_was_set_seq; bool hres_active; - bool in_hrtirq; + bool deferred_rearm; bool hang_detected; bool softirq_activated; bool online; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 2e05a1885d24..6f05d2569286 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -883,11 +883,8 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (expires >= cpu_base->expires_next) return; - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. 
- */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) return; cpu_base->next_timer = timer; @@ -921,12 +918,8 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int act if (seq == cpu_base->clock_was_set_seq) return false; - /* - * If the remote CPU is currently handling an hrtimer interrupt, it - * will reevaluate the first expiring timer of all clock bases - * before reprogramming. Nothing to do here. - */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending the remote CPU will take care of it */ + if (cpu_base->deferred_rearm) return false; /* @@ -1334,11 +1327,8 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del first = enqueue_hrtimer(timer, base, mode, was_armed); } - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. - */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) return false; if (!was_first || cpu_base != this_cpu_base) { @@ -1947,14 +1937,14 @@ static __latent_entropy void hrtimer_run_softirq(void) /* * Very similar to hrtimer_force_reprogram(), except it deals with - * in_hrtirq and hang_detected. + * deferred_rearm and hang_detected. 
*/ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now) { ktime_t expires_next = hrtimer_update_next_event(cpu_base); cpu_base->expires_next = expires_next; - cpu_base->in_hrtirq = false; + cpu_base->deferred_rearm = false; if (unlikely(cpu_base->hang_detected)) { /* @@ -1985,7 +1975,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: - cpu_base->in_hrtirq = true; + cpu_base->deferred_rearm = true; /* * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue * timers while __hrtimer_run_queues() is expiring the clock bases. -- cgit v1.2.3 From a43b4856bc039675165a50d9ef5f41b28520f0f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:37:58 +0100 Subject: hrtimer: Prepare stubs for deferred rearming The hrtimer interrupt expires timers and at the end of the interrupt it rearms the clockevent device for the next expiring timer. That's obviously correct, but in the case that an expired timer sets NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is enabled then schedule() will modify the hrtick timer, which causes another reprogramming of the hardware. That can be avoided by deferring the rearming to the return from interrupt path and if the return results in an immediate schedule() invocation then it can be deferred until the end of schedule(). To make this correct the affected code parts need to be made aware of this. Provide empty stubs for the deferred rearming mechanism, so that the relevant code changes for entry, softirq and scheduler can be split up into separate changes independent of the actual enablement in the hrtimer code.
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.000891171@kernel.org --- include/linux/hrtimer.h | 1 + include/linux/hrtimer_rearm.h | 21 +++++++++++++++++++++ kernel/time/Kconfig | 4 ++++ 3 files changed, 26 insertions(+) create mode 100644 include/linux/hrtimer_rearm.h (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 4ad4a454b4c5..c087b7142330 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -13,6 +13,7 @@ #define _LINUX_HRTIMER_H #include +#include #include #include #include diff --git a/include/linux/hrtimer_rearm.h b/include/linux/hrtimer_rearm.h new file mode 100644 index 000000000000..6293076c03a6 --- /dev/null +++ b/include/linux/hrtimer_rearm.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _LINUX_HRTIMER_REARM_H +#define _LINUX_HRTIMER_REARM_H + +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +static __always_inline void __hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +static __always_inline bool +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +#else /* CONFIG_HRTIMER_REARM_DEFERRED */ +static __always_inline void __hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +static __always_inline bool +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ + +#endif diff --git a/kernel/time/Kconfig 
b/kernel/time/Kconfig index e1968ab8b37f..b95bfee3f592 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -58,6 +58,10 @@ config GENERIC_CLOCKEVENTS_COUPLED_INLINE config GENERIC_CMOS_UPDATE bool +# Deferred rearming of the hrtimer interrupt +config HRTIMER_REARM_DEFERRED + def_bool n + # Select to handle posix CPU timers from task_work # and not from the timer interrupt context config HAVE_POSIX_CPU_TIMERS_TASK_WORK -- cgit v1.2.3 From 0e98eb14814ef669e07ca6effaa03df2e57ef956 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:38:03 +0100 Subject: entry: Prepare for deferred hrtimer rearming The hrtimer interrupt expires timers and at the end of the interrupt it rearms the clockevent device for the next expiring timer. That's obviously correct, but in the case that an expired timer sets NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is enabled then schedule() will modify the hrtick timer, which causes another reprogramming of the hardware. That can be avoided by deferring the rearming to the return from interrupt path and if the return results in an immediate schedule() invocation then it can be deferred until the end of schedule(), which avoids multiple rearms and re-evaluation of the timer wheel. As this is only relevant for interrupt to user return split the work masks up and hand them in as arguments from the relevant exit to user functions, which allows the compiler to optimize the deferred handling out for the syscall exit to user case. Add the rearm checks to the appropriate places in the exit to user loop and the interrupt return to kernel path, so that the rearming is always guaranteed. In the return to user space path this is handled in the same way as TIF_RSEQ to avoid extra instructions in the fast path, which are truly hurtful for device interrupt heavy workloads as the extra instructions and conditionals while benign at first sight accumulate quickly into measurable regressions.
The return from syscall path is completely unaffected due to the above mentioned split so syscall heavy workloads wont have any extra burden. For now this is just placing empty stubs at the right places which are all optimized out by the compiler until the actual functionality is in place. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.066469985@kernel.org --- include/linux/irq-entry-common.h | 25 +++++++++++++++++++------ include/linux/rseq_entry.h | 16 +++++++++++++--- kernel/entry/common.c | 4 +++- 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d26d1b1bcbfb..b976946b3cdb 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -3,6 +3,7 @@ #define __LINUX_IRQENTRYCOMMON_H #include +#include #include #include #include @@ -33,6 +34,14 @@ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ ARCH_EXIT_TO_USER_MODE_WORK) +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK) +# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK | _TIF_HRTIMER_REARM) +#else +# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK) +# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK) +#endif + /** * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs * @regs: Pointer to currents pt_regs @@ -203,6 +212,7 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work /** * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack + * @work_mask: Which TIF bits need to be evaluated * * 1) check that interrupts are disabled * 2) call tick_nohz_user_enter_prepare() @@ -212,7 +222,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work 
* * Don't invoke directly, use the syscall/irqentry_ prefixed variants below */ -static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs, + const unsigned long work_mask) { unsigned long ti_work; @@ -222,8 +233,10 @@ static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) tick_nohz_user_enter_prepare(); ti_work = read_thread_flags(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) - ti_work = exit_to_user_mode_loop(regs, ti_work); + if (unlikely(ti_work & work_mask)) { + if (!hrtimer_rearm_deferred_user_irq(&ti_work, work_mask)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + } arch_exit_to_user_mode_prepare(regs, ti_work); } @@ -239,7 +252,7 @@ static __always_inline void __exit_to_user_mode_validate(void) /* Temporary workaround to keep ARM64 alive */ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK); rseq_exit_to_user_mode_legacy(); __exit_to_user_mode_validate(); } @@ -253,7 +266,7 @@ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *reg */ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_SYSCALL); rseq_syscall_exit_to_user_mode(); __exit_to_user_mode_validate(); } @@ -267,7 +280,7 @@ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *re */ static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_IRQ); rseq_irqentry_exit_to_user_mode(); __exit_to_user_mode_validate(); } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index cbc4a791618b..17956e119e81 100644 --- 
a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -40,6 +40,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats); #endif /* !CONFIG_RSEQ_STATS */ #ifdef CONFIG_RSEQ +#include #include #include #include @@ -110,7 +111,7 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t) t->rseq.slice.state.granted = false; } -static __always_inline bool rseq_grant_slice_extension(bool work_pending) +static __always_inline bool __rseq_grant_slice_extension(bool work_pending) { struct task_struct *curr = current; struct rseq_slice_ctrl usr_ctrl; @@ -215,11 +216,20 @@ efault: return false; } +static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) +{ + if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) { + hrtimer_rearm_deferred_tif(ti_work); + return true; + } + return false; +} + #else /* CONFIG_RSEQ_SLICE_EXTENSION */ static inline bool rseq_slice_extension_enabled(void) { return false; } static inline bool rseq_arm_slice_extension_timer(void) { return false; } static inline void rseq_slice_clear_grant(struct task_struct *t) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); @@ -778,7 +788,7 @@ static inline void rseq_syscall_exit_to_user_mode(void) { } static inline void rseq_irqentry_exit_to_user_mode(void) { } static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ */ #endif /* _LINUX_RSEQ_ENTRY_H */ diff --git 
a/kernel/entry/common.c b/kernel/entry/common.c index 9ef63e414791..9e1a6afb07f2 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -50,7 +50,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re local_irq_enable_exit_to_user(ti_work); if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) { - if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY)) + if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY)) schedule(); } @@ -225,6 +225,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) */ if (state.exit_rcu) { instrumentation_begin(); + hrtimer_rearm_deferred(); /* Tell the tracer that IRET will enable interrupts */ trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); @@ -238,6 +239,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) if (IS_ENABLED(CONFIG_PREEMPTION)) irqentry_exit_cond_resched(); + hrtimer_rearm_deferred(); /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); -- cgit v1.2.3 From 15dd3a9488557d3e6ebcecacab79f4e56b69ab54 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:38:18 +0100 Subject: hrtimer: Push reprogramming timers into the interrupt return path Currently hrtimer_interrupt() runs expired timers, which can re-arm themselves, after which it computes the next expiration time and re-programs the hardware. However, things like HRTICK, a highres timer driving preemption, cannot re-arm itself at the point of running, since the next task has not been determined yet. The schedule() in the interrupt return path will switch to the next task, which then causes a new hrtimer to be programmed. This then results in reprogramming the hardware at least twice, once after running the timers, and once upon selecting the new task. Notably, *both* events happen in the interrupt. 
By pushing the hrtimer reprogram all the way into the interrupt return path, it runs after schedule() picks the new task and the double reprogram can be avoided. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.273488269@kernel.org --- include/asm-generic/thread_info_tif.h | 5 ++- include/linux/hrtimer_rearm.h | 72 ++++++++++++++++++++++++++++++++--- kernel/time/Kconfig | 4 +- kernel/time/hrtimer.c | 38 +++++++++++++++--- 4 files changed, 107 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h index da1610a78f92..528e6fc7efe9 100644 --- a/include/asm-generic/thread_info_tif.h +++ b/include/asm-generic/thread_info_tif.h @@ -41,11 +41,14 @@ #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) #ifdef HAVE_TIF_RESTORE_SIGMASK -# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() */ +# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #endif #define TIF_RSEQ 11 // Run RSEQ fast path #define _TIF_RSEQ BIT(TIF_RSEQ) +#define TIF_HRTIMER_REARM 12 // re-arm the timer +#define _TIF_HRTIMER_REARM BIT(TIF_HRTIMER_REARM) + #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ diff --git a/include/linux/hrtimer_rearm.h b/include/linux/hrtimer_rearm.h index 6293076c03a6..a6f2e5d5e1c7 100644 --- a/include/linux/hrtimer_rearm.h +++ b/include/linux/hrtimer_rearm.h @@ -3,12 +3,74 @@ #define _LINUX_HRTIMER_REARM_H #ifdef CONFIG_HRTIMER_REARM_DEFERRED -static __always_inline void __hrtimer_rearm_deferred(void) { } -static __always_inline void hrtimer_rearm_deferred(void) { } -static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +#include + +void __hrtimer_rearm_deferred(void); + +/* + * This is purely CPU local, so check the TIF bit first to avoid the overhead of + * the atomic 
test_and_clear_bit() operation for the common case where the bit + * is not set. + */ +static __always_inline bool hrtimer_test_and_clear_rearm_deferred_tif(unsigned long tif_work) +{ + lockdep_assert_irqs_disabled(); + + if (unlikely(tif_work & _TIF_HRTIMER_REARM)) { + clear_thread_flag(TIF_HRTIMER_REARM); + return true; + } + return false; +} + +#define TIF_REARM_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_HRTIMER_REARM) + +/* Invoked from the exit to user before invoking exit_to_user_mode_loop() */ static __always_inline bool -hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } -static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) +{ + /* Help the compiler to optimize the function out for syscall returns */ + if (!(tif_mask & _TIF_HRTIMER_REARM)) + return false; + /* + * Rearm the timer if none of the resched flags is set before going into + * the loop which re-enables interrupts. + */ + if (unlikely((*tif_work & TIF_REARM_MASK) == _TIF_HRTIMER_REARM)) { + clear_thread_flag(TIF_HRTIMER_REARM); + __hrtimer_rearm_deferred(); + /* Don't go into the loop if HRTIMER_REARM was the only flag */ + *tif_work &= ~TIF_HRTIMER_REARM; + return !*tif_work; + } + return false; +} + +/* Invoked from the time slice extension decision function */ +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) +{ + if (hrtimer_test_and_clear_rearm_deferred_tif(tif_work)) + __hrtimer_rearm_deferred(); +} + +/* + * This is to be called on all irqentry_exit() paths that will enable + * interrupts. + */ +static __always_inline void hrtimer_rearm_deferred(void) +{ + hrtimer_rearm_deferred_tif(read_thread_flags()); +} + +/* + * Invoked from the scheduler on entry to __schedule() so it can defer + * rearming after the load balancing callbacks which might change hrtick. 
+ */ +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) +{ + return hrtimer_test_and_clear_rearm_deferred_tif(read_thread_flags()); +} + #else /* CONFIG_HRTIMER_REARM_DEFERRED */ static __always_inline void __hrtimer_rearm_deferred(void) { } static __always_inline void hrtimer_rearm_deferred(void) { } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b95bfee3f592..6d6aace0a693 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -60,7 +60,9 @@ config GENERIC_CMOS_UPDATE # Deferred rearming of the hrtimer interrupt config HRTIMER_REARM_DEFERRED - def_bool n + def_bool y + depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS + depends on HIGH_RES_TIMERS && SCHED_HRTICK # Select to handle posix CPU timers from task_work # and not from the timer interrupt context diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 6f05d2569286..2e5f0e292efb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1939,10 +1939,9 @@ static __latent_entropy void hrtimer_run_softirq(void) * Very similar to hrtimer_force_reprogram(), except it deals with * deferred_rearm and hang_detected. 
*/ -static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now) +static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, + ktime_t expires_next, bool deferred) { - ktime_t expires_next = hrtimer_update_next_event(cpu_base); - cpu_base->expires_next = expires_next; cpu_base->deferred_rearm = false; @@ -1954,9 +1953,37 @@ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); cpu_base->hang_detected = false; } - hrtimer_rearm_event(expires_next, false); + hrtimer_rearm_event(expires_next, deferred); +} + +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +void __hrtimer_rearm_deferred(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t now, expires_next; + + if (!cpu_base->deferred_rearm) + return; + + guard(raw_spinlock)(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); + expires_next = hrtimer_update_next_event(cpu_base); + hrtimer_rearm(cpu_base, now, expires_next, true); } +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +{ + set_thread_flag(TIF_HRTIMER_REARM); +} +#else /* CONFIG_HRTIMER_REARM_DEFERRED */ +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +{ + hrtimer_rearm(cpu_base, now, expires_next, false); +} +#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ + /* * High resolution timer interrupt * Called with interrupts disabled @@ -2014,9 +2041,10 @@ retry: cpu_base->hang_detected = true; } - hrtimer_rearm(cpu_base, now); + hrtimer_interrupt_rearm(cpu_base, now, expires_next); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } + #endif /* !CONFIG_HIGH_RES_TIMERS */ /* -- cgit v1.2.3 From b95c4442b02162904e9012e670b602ebeb3c6c1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:23 +0100 Subject: hrtimer: Avoid re-evaluation when nothing changed Most times 
there is no change between hrtimer_interrupt() deferring the rearm and the invocation of hrtimer_rearm_deferred(). In those cases it's a pointless exercise to re-evaluate the next expiring timer. Cache the required data and use it if nothing changed. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.338569372@kernel.org --- include/linux/hrtimer_defs.h | 53 ++++++++++++++++++++++---------------------- kernel/time/hrtimer.c | 45 +++++++++++++++++++++++++------------ 2 files changed, 58 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 2c3bdbd562d2..b6846efec210 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -47,32 +47,31 @@ enum hrtimer_base_type { /** * struct hrtimer_cpu_base - the per cpu clock bases - * @lock: lock protecting the base and associated clock bases - * and timers - * @cpu: cpu number - * @active_bases: Bitfield to mark bases with active timers - * @clock_was_set_seq: Sequence counter of clock was set events - * @hres_active: State of high resolution mode - * @deferred_rearm: A deferred rearm is pending - * @hang_detected: The last hrtimer interrupt detected a hang - * @softirq_activated: displays, if the softirq is raised - update of softirq - * related settings is not required then. - * @nr_events: Total number of hrtimer interrupt events - * @nr_retries: Total number of hrtimer interrupt retries - * @nr_hangs: Total number of hrtimer interrupt hangs - * @max_hang_time: Maximum time spent in hrtimer_interrupt - * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are - * expired - * @online: CPU is online from an hrtimers point of view - * @timer_waiters: A hrtimer_cancel() invocation waits for the timer - * callback to finish. 
- * @expires_next: absolute time of the next event, is required for remote - hrtimer enqueue; it is the total first expiry time (hard - and soft hrtimer are taken into account) - * @next_timer: Pointer to the first expiring timer - * @softirq_expires_next: Time to check, if soft queues needs also to be expired - * @softirq_next_timer: Pointer to the first expiring softirq based timer - * @clock_base: array of clock bases for this cpu + * @lock: lock protecting the base and associated clock bases and timers + * @cpu: cpu number + * @active_bases: Bitfield to mark bases with active timers + * @clock_was_set_seq: Sequence counter of clock was set events + * @hres_active: State of high resolution mode + * @deferred_rearm: A deferred rearm is pending + * @deferred_needs_update: The deferred rearm must re-evaluate the first timer + * @hang_detected: The last hrtimer interrupt detected a hang + * @softirq_activated: displays, if the softirq is raised - update of softirq + * related settings is not required then. + * @nr_events: Total number of hrtimer interrupt events + * @nr_retries: Total number of hrtimer interrupt retries + * @nr_hangs: Total number of hrtimer interrupt hangs + * @max_hang_time: Maximum time spent in hrtimer_interrupt + * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are expired + * @online: CPU is online from an hrtimers point of view + * @timer_waiters: A hrtimer_cancel() waits for the timer callback to finish. 
+ * @expires_next: Absolute time of the next event, is required for remote + * hrtimer enqueue; it is the total first expiry time (hard + * and soft hrtimer are taken into account) + * @next_timer: Pointer to the first expiring timer + * @softirq_expires_next: Time to check, if soft queues needs also to be expired + * @softirq_next_timer: Pointer to the first expiring softirq based timer + * @deferred_expires_next: Cached expires next value for deferred rearm + * @clock_base: Array of clock bases for this cpu * * Note: next_timer is just an optimization for __remove_hrtimer(). * Do not dereference the pointer because it is not reliable on @@ -85,6 +84,7 @@ struct hrtimer_cpu_base { unsigned int clock_was_set_seq; bool hres_active; bool deferred_rearm; + bool deferred_needs_update; bool hang_detected; bool softirq_activated; bool online; @@ -102,6 +102,7 @@ struct hrtimer_cpu_base { struct hrtimer *next_timer; ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; + ktime_t deferred_expires_next; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; call_single_data_t csd; } ____cacheline_aligned; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 2e5f0e292efb..e9592cb1e39a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -919,8 +919,10 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int act return false; /* If a deferred rearm is pending the remote CPU will take care of it */ - if (cpu_base->deferred_rearm) + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; return false; + } /* * Walk the affected clock bases and check whether the first expiring @@ -1141,7 +1143,12 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b * a local timer is removed to be immediately restarted. That's handled * at the call site. 
*/ - if (reprogram && timer == cpu_base->next_timer && !timer->is_lazy) + if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy) + return; + + if (cpu_base->deferred_rearm) + cpu_base->deferred_needs_update = true; + else hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); } @@ -1328,8 +1335,10 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del } /* If a deferred rearm is pending skip reprogramming the device */ - if (cpu_base->deferred_rearm) + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; return false; + } if (!was_first || cpu_base != this_cpu_base) { /* @@ -1939,8 +1948,7 @@ static __latent_entropy void hrtimer_run_softirq(void) * Very similar to hrtimer_force_reprogram(), except it deals with * deferred_rearm and hang_detected. */ -static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, - ktime_t expires_next, bool deferred) +static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred) { cpu_base->expires_next = expires_next; cpu_base->deferred_rearm = false; @@ -1950,7 +1958,7 @@ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, * Give the system a chance to do something else than looping * on hrtimer interrupts. 
*/ - expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + expires_next = ktime_add_ns(ktime_get(), 100 * NSEC_PER_MSEC); cpu_base->hang_detected = false; } hrtimer_rearm_event(expires_next, deferred); @@ -1960,27 +1968,36 @@ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, void __hrtimer_rearm_deferred(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t now, expires_next; + ktime_t expires_next; if (!cpu_base->deferred_rearm) return; guard(raw_spinlock)(&cpu_base->lock); - now = hrtimer_update_base(cpu_base); - expires_next = hrtimer_update_next_event(cpu_base); - hrtimer_rearm(cpu_base, now, expires_next, true); + if (cpu_base->deferred_needs_update) { + hrtimer_update_base(cpu_base); + expires_next = hrtimer_update_next_event(cpu_base); + } else { + /* No timer added/removed. Use the cached value */ + expires_next = cpu_base->deferred_expires_next; + } + hrtimer_rearm(cpu_base, expires_next, true); } static __always_inline void -hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) { + /* hrtimer_interrupt() just re-evaluated the first expiring timer */ + cpu_base->deferred_needs_update = false; + /* Cache the expiry time */ + cpu_base->deferred_expires_next = expires_next; set_thread_flag(TIF_HRTIMER_REARM); } #else /* CONFIG_HRTIMER_REARM_DEFERRED */ static __always_inline void -hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) { - hrtimer_rearm(cpu_base, now, expires_next, false); + hrtimer_rearm(cpu_base, expires_next, false); } #endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ @@ -2041,7 +2058,7 @@ retry: cpu_base->hang_detected = true; } - hrtimer_interrupt_rearm(cpu_base, now, expires_next); + hrtimer_interrupt_rearm(cpu_base, expires_next); 
raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } -- cgit v1.2.3 From eddffab8282e388dddf032f3295fcec87eb08095 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:28 +0100 Subject: hrtimer: Keep track of first expiring timer per clock base Evaluating the next expiry time of all clock bases is cache line expensive as the expiry time of the first expiring timer is not cached in the base and requires to access the timer itself, which is definitely in a different cache line. It's way more efficient to keep track of the expiry time on enqueue and dequeue operations as the relevant data is already in the cache at that point. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.404839710@kernel.org --- include/linux/hrtimer_defs.h | 2 ++ kernel/time/hrtimer.c | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index b6846efec210..fb38df4c0b64 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -19,6 +19,7 @@ * timer to a base on another cpu. 
* @clockid: clock id for per_cpu support * @seq: seqcount around __run_hrtimer + * @expires_next: Absolute time of the next event in this clock base * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers * @offset: offset of this clock to the monotonic base @@ -28,6 +29,7 @@ struct hrtimer_clock_base { unsigned int index; clockid_t clockid; seqcount_raw_spinlock_t seq; + ktime_t expires_next; struct hrtimer *running; struct timerqueue_head active; ktime_t offset; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e9592cb1e39a..d70899a9ddc1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1107,7 +1107,18 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); - return timerqueue_add(&base->active, &timer->node); + if (!timerqueue_add(&base->active, &timer->node)) + return false; + + base->expires_next = hrtimer_get_expires(timer); + return true; +} + +static inline void base_update_next_timer(struct hrtimer_clock_base *base) +{ + struct timerqueue_node *next = timerqueue_getnext(&base->active); + + base->expires_next = next ? 
next->expires : KTIME_MAX; } /* @@ -1122,6 +1133,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b bool newstate, bool reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; + bool was_first; lockdep_assert_held(&cpu_base->lock); @@ -1131,9 +1143,17 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, newstate); + was_first = &timer->node == timerqueue_getnext(&base->active); + if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); + /* Nothing to update if this was not the first timer in the base */ + if (!was_first) + return; + + base_update_next_timer(base); + /* * If reprogram is false don't update cpu_base->next_timer and do not * touch the clock event device. @@ -1182,9 +1202,12 @@ static inline bool remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base, const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns) { + bool was_first = false; + /* Remove it from the timer queue if active */ if (timer->is_queued) { debug_hrtimer_deactivate(timer); + was_first = &timer->node == timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); } @@ -1197,8 +1220,16 @@ remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *b /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); - /* Returns true if this is the first expiring timer */ - return timerqueue_add(&base->active, &timer->node); + /* If it's the first expiring timer now or again, update base */ + if (timerqueue_add(&base->active, &timer->node)) { + base->expires_next = expires; + return true; + } + + if (was_first) + base_update_next_timer(base); + + return false; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, -- cgit v1.2.3 From 
671047943dce5af24e023bca3c5cc244d7565f5a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:47 +0100 Subject: rbtree: Provide rbtree with links Some RB tree users require quick access to the next and the previous node, e.g. to check whether a modification of the node results in a change of the nodes position in the tree. If the node position does not change, then the modification can happen in place without going through a full enqueue requeue cycle. An upcoming use case for this are the timer queues of the hrtimer subsystem as they can optimize for timers which are frequently rearmed while enqueued. This can be obviously achieved with rb_next() and rb_prev(), but those turned out to be quite expensive for hotpath operations depending on the tree depth. Add a linked RB tree variant where add() and erase() maintain the links between the nodes. Like the cached variant it provides a pointer to the left most node in the root. It intentionally does not use a [h]list head as there is no real need for true list operations as the list is strictly coupled to the tree and cannot be manipulated independently. It sets the nodes previous pointer to NULL for the left most node and the next pointer to NULL for the right most node. This allows a quick check especially for the left most node without consulting the list head address, which creates better code. Aside of the rb_leftmost cached pointer this could trivially provide a rb_rightmost pointer as well, but there is no usage for that (yet). 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.668401024@kernel.org --- include/linux/rbtree.h | 81 +++++++++++++++++++++++++++++++++++++++----- include/linux/rbtree_types.h | 16 +++++++++ lib/rbtree.c | 17 ++++++++++ 3 files changed, 105 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 4091e978aef2..48acdc3889dd 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -35,10 +35,15 @@ #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) +#define RB_EMPTY_LINKED_NODE(lnode) RB_EMPTY_NODE(&(lnode)->node) +#define RB_CLEAR_LINKED_NODE(lnode) ({ \ + RB_CLEAR_NODE(&(lnode)->node); \ + (lnode)->prev = (lnode)->next = NULL; \ +}) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); - +extern bool rb_erase_linked(struct rb_node_linked *, struct rb_root_linked *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); @@ -213,15 +218,10 @@ rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, return leftmost ? 
node : NULL; } -/** - * rb_add() - insert @node into @tree - * @node: node to insert - * @tree: tree to insert @node into - * @less: operator defining the (partial) node order - */ static __always_inline void -rb_add(struct rb_node *node, struct rb_root *tree, - bool (*less)(struct rb_node *, const struct rb_node *)) +__rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *), + void (*linkop)(struct rb_node *, struct rb_node *, struct rb_node **)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; @@ -234,10 +234,73 @@ rb_add(struct rb_node *node, struct rb_root *tree, link = &parent->rb_right; } + linkop(node, parent, link); rb_link_node(node, parent, link); rb_insert_color(node, tree); } +#define __node_2_linked_node(_n) \ + rb_entry((_n), struct rb_node_linked, node) + +static inline void +rb_link_linked_node(struct rb_node *node, struct rb_node *parent, struct rb_node **link) +{ + if (!parent) + return; + + struct rb_node_linked *nnew = __node_2_linked_node(node); + struct rb_node_linked *npar = __node_2_linked_node(parent); + + if (link == &parent->rb_left) { + nnew->prev = npar->prev; + nnew->next = npar; + npar->prev = nnew; + if (nnew->prev) + nnew->prev->next = nnew; + } else { + nnew->next = npar->next; + nnew->prev = npar; + npar->next = nnew; + if (nnew->next) + nnew->next->prev = nnew; + } +} + +/** + * rb_add_linked() - insert @node into the leftmost linked tree @tree + * @node: node to insert + * @tree: linked tree to insert @node into + * @less: operator defining the (partial) node order + * + * Returns @true when @node is the new leftmost, @false otherwise. 
+ */ +static __always_inline bool +rb_add_linked(struct rb_node_linked *node, struct rb_root_linked *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + __rb_add(&node->node, &tree->rb_root, less, rb_link_linked_node); + if (!node->prev) + tree->rb_leftmost = node; + return !node->prev; +} + +/* Empty linkop function which is optimized away by the compiler */ +static __always_inline void +rb_link_noop(struct rb_node *n, struct rb_node *p, struct rb_node **l) { } + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + __rb_add(node, tree, less, rb_link_noop); +} + /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert diff --git a/include/linux/rbtree_types.h b/include/linux/rbtree_types.h index 45b6ecde3665..3c7ae53e8139 100644 --- a/include/linux/rbtree_types.h +++ b/include/linux/rbtree_types.h @@ -9,6 +9,12 @@ struct rb_node { } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ +struct rb_node_linked { + struct rb_node node; + struct rb_node_linked *prev; + struct rb_node_linked *next; +}; + struct rb_root { struct rb_node *rb_node; }; @@ -28,7 +34,17 @@ struct rb_root_cached { struct rb_node *rb_leftmost; }; +/* + * Leftmost tree with links. This would allow a trivial rb_rightmost update, + * but that has been omitted due to the lack of users. 
+ */ +struct rb_root_linked { + struct rb_root rb_root; + struct rb_node_linked *rb_leftmost; +}; + #define RB_ROOT (struct rb_root) { NULL, } #define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } +#define RB_ROOT_LINKED (struct rb_root_linked) { {NULL, }, NULL } #endif diff --git a/lib/rbtree.c b/lib/rbtree.c index 18d42bcf4ec9..5790d6ecba4e 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -446,6 +446,23 @@ void rb_erase(struct rb_node *node, struct rb_root *root) } EXPORT_SYMBOL(rb_erase); +bool rb_erase_linked(struct rb_node_linked *node, struct rb_root_linked *root) +{ + if (node->prev) + node->prev->next = node->next; + else + root->rb_leftmost = node->next; + + if (node->next) + node->next->prev = node->prev; + + rb_erase(&node->node, &root->rb_root); + RB_CLEAR_LINKED_NODE(node); + + return !!root->rb_leftmost; +} +EXPORT_SYMBOL_GPL(rb_erase_linked); + /* * Augmented rbtree manipulation functions. * -- cgit v1.2.3 From 1339eeb73d6b99cf3aa9981f3f91d6ac4a49c72e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:52 +0100 Subject: timerqueue: Provide linked timerqueue The hrtimer subsystem wants to peek ahead to the next and previous timer to evaluate whether a to be rearmed timer can stay at the same position in the RB tree with the new expiry time. The linked RB tree provides the infrastructure for this as it maintains links to the previous and next nodes for each entry in the tree. Provide timerqueue wrappers around that. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.734827095@kernel.org --- include/linux/timerqueue.h | 56 ++++++++++++++++++++++++++++++++++------ include/linux/timerqueue_types.h | 15 ++++++++--- lib/timerqueue.c | 14 ++++++++++ 3 files changed, 74 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index d306d9dd2207..7d0aaa766580 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -5,12 +5,11 @@ #include #include -extern bool timerqueue_add(struct timerqueue_head *head, - struct timerqueue_node *node); -extern bool timerqueue_del(struct timerqueue_head *head, - struct timerqueue_node *node); -extern struct timerqueue_node *timerqueue_iterate_next( - struct timerqueue_node *node); +bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node); +bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node); +struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node); + +bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node); /** * timerqueue_getnext - Returns the timer with the earliest expiration time @@ -19,8 +18,7 @@ extern struct timerqueue_node *timerqueue_iterate_next( * * Returns a pointer to the timer node that has the earliest expiration time. 
*/ -static inline -struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) +static inline struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) { struct rb_node *leftmost = rb_first_cached(&head->rb_root); @@ -41,4 +39,46 @@ static inline void timerqueue_init_head(struct timerqueue_head *head) { head->rb_root = RB_ROOT_CACHED; } + +/* Timer queues with linked nodes */ + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_first(struct timerqueue_linked_head *head) +{ + return rb_entry_safe(head->rb_root.rb_leftmost, struct timerqueue_linked_node, node); +} + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_next(struct timerqueue_linked_node *node) +{ + return rb_entry_safe(node->node.next, struct timerqueue_linked_node, node); +} + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_prev(struct timerqueue_linked_node *node) +{ + return rb_entry_safe(node->node.prev, struct timerqueue_linked_node, node); +} + +static __always_inline +bool timerqueue_linked_del(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) +{ + return rb_erase_linked(&node->node, &head->rb_root); +} + +static __always_inline void timerqueue_linked_init(struct timerqueue_linked_node *node) +{ + RB_CLEAR_LINKED_NODE(&node->node); +} + +static __always_inline bool timerqueue_linked_node_queued(struct timerqueue_linked_node *node) +{ + return !RB_EMPTY_LINKED_NODE(&node->node); +} + +static __always_inline void timerqueue_linked_init_head(struct timerqueue_linked_head *head) +{ + head->rb_root = RB_ROOT_LINKED; +} + #endif /* _LINUX_TIMERQUEUE_H */ diff --git a/include/linux/timerqueue_types.h b/include/linux/timerqueue_types.h index dc298d0923e3..be2218b147c4 100644 --- a/include/linux/timerqueue_types.h +++ b/include/linux/timerqueue_types.h @@ -6,12 +6,21 @@ #include struct timerqueue_node { - struct rb_node node; - ktime_t expires; + struct rb_node node; + ktime_t 
expires; }; struct timerqueue_head { - struct rb_root_cached rb_root; + struct rb_root_cached rb_root; +}; + +struct timerqueue_linked_node { + struct rb_node_linked node; + ktime_t expires; +}; + +struct timerqueue_linked_head { + struct rb_root_linked rb_root; }; #endif /* _LINUX_TIMERQUEUE_TYPES_H */ diff --git a/lib/timerqueue.c b/lib/timerqueue.c index cdb9c7658478..e2a1e08cb4bd 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -82,3 +82,17 @@ struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) return container_of(next, struct timerqueue_node, node); } EXPORT_SYMBOL_GPL(timerqueue_iterate_next); + +#define __node_2_tq_linked(_n) \ + container_of(rb_entry((_n), struct rb_node_linked, node), struct timerqueue_linked_node, node) + +static __always_inline bool __tq_linked_less(struct rb_node *a, const struct rb_node *b) +{ + return __node_2_tq_linked(a)->expires < __node_2_tq_linked(b)->expires; +} + +bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) +{ + return rb_add_linked(&node->node, &head->rb_root, __tq_linked_less); +} +EXPORT_SYMBOL_GPL(timerqueue_linked_add); -- cgit v1.2.3 From b7418e6e9b87b849af4df93d527ff83498d1e4c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:57 +0100 Subject: hrtimer: Use linked timerqueue To prepare for optimizing the rearming of enqueued timers, switch to the linked timerqueue. That allows to check whether the new expiry time changes the position of the timer in the RB tree or not, by checking the new expiry time against the previous and the next timers expiry. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.806643179@kernel.org --- include/linux/hrtimer_defs.h | 16 ++++++++-------- include/linux/hrtimer_types.h | 8 ++++---- kernel/time/hrtimer.c | 34 +++++++++++++++++----------------- kernel/time/timer_list.c | 10 ++++------ 4 files changed, 33 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index fb38df4c0b64..0f851b2432c3 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -25,14 +25,14 @@ * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { - struct hrtimer_cpu_base *cpu_base; - unsigned int index; - clockid_t clockid; - seqcount_raw_spinlock_t seq; - ktime_t expires_next; - struct hrtimer *running; - struct timerqueue_head active; - ktime_t offset; + struct hrtimer_cpu_base *cpu_base; + unsigned int index; + clockid_t clockid; + seqcount_raw_spinlock_t seq; + ktime_t expires_next; + struct hrtimer *running; + struct timerqueue_linked_head active; + ktime_t offset; } __hrtimer_clock_base_align; enum hrtimer_base_type { diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 0e22bc91d00f..b5dacc8271a4 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -17,7 +17,7 @@ enum hrtimer_restart { /** * struct hrtimer - the basic hrtimer structure - * @node: timerqueue node, which also manages node.expires, + * @node: Linked timerqueue node, which also manages node.expires, * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. 
Is setup by adding @@ -39,15 +39,15 @@ enum hrtimer_restart { * The hrtimer structure must be initialized by hrtimer_setup() */ struct hrtimer { - struct timerqueue_node node; - ktime_t _softexpires; - enum hrtimer_restart (*__private function)(struct hrtimer *); + struct timerqueue_linked_node node; struct hrtimer_clock_base *base; bool is_queued; bool is_rel; bool is_soft; bool is_hard; bool is_lazy; + ktime_t _softexpires; + enum hrtimer_restart (*__private function)(struct hrtimer *); }; #endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d1e58482e0a9..5e45982363ce 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -557,10 +557,10 @@ static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_bas * If the excluded timer is the first on this base evaluate the * next timer. */ - struct timerqueue_node *node = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active); if (unlikely(&exclude->node == node)) { - node = timerqueue_iterate_next(node); + node = timerqueue_linked_next(node); if (!node) continue; expires = ktime_sub(node->expires, base->offset); @@ -576,7 +576,7 @@ static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_bas static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base) { - struct timerqueue_node *next = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); return container_of(next, struct hrtimer, node); } @@ -938,9 +938,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int act active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *next; + struct timerqueue_linked_node *next; - next = timerqueue_getnext(&base->active); + next = timerqueue_linked_first(&base->active); expires = ktime_sub(next->expires, 
base->offset); if (expires < cpu_base->expires_next) return true; @@ -1112,7 +1112,7 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); - if (!timerqueue_add(&base->active, &timer->node)) + if (!timerqueue_linked_add(&base->active, &timer->node)) return false; base->expires_next = hrtimer_get_expires(timer); @@ -1121,7 +1121,7 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba static inline void base_update_next_timer(struct hrtimer_clock_base *base) { - struct timerqueue_node *next = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); base->expires_next = next ? next->expires : KTIME_MAX; } @@ -1148,9 +1148,9 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, newstate); - was_first = &timer->node == timerqueue_getnext(&base->active); + was_first = !timerqueue_linked_prev(&timer->node); - if (!timerqueue_del(&base->active, &timer->node)) + if (!timerqueue_linked_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* Nothing to update if this was not the first timer in the base */ @@ -1212,8 +1212,8 @@ remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *b /* Remove it from the timer queue if active */ if (timer->is_queued) { debug_hrtimer_deactivate(timer); - was_first = &timer->node == timerqueue_getnext(&base->active); - timerqueue_del(&base->active, &timer->node); + was_first = !timerqueue_linked_prev(&timer->node); + timerqueue_linked_del(&base->active, &timer->node); } /* Set the new expiry time */ @@ -1226,7 +1226,7 @@ remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *b WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); 
/* If it's the first expiring timer now or again, update base */ - if (timerqueue_add(&base->active, &timer->node)) { + if (timerqueue_linked_add(&base->active, &timer->node)) { base->expires_next = expires; return true; } @@ -1758,7 +1758,7 @@ static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(st timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM); timer->base = &cpu_base->clock_base[base]; - timerqueue_init(&timer->node); + timerqueue_linked_init(&timer->node); if (WARN_ON_ONCE(!fn)) ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; @@ -1923,7 +1923,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_cloc static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base) { - struct timerqueue_node *next = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); return next ? container_of(next, struct hrtimer, node) : NULL; } @@ -2369,7 +2369,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); - timerqueue_init_head(&clock_b->active); + timerqueue_linked_init_head(&clock_b->active); } cpu_base->cpu = cpu; @@ -2399,10 +2399,10 @@ int hrtimers_cpu_starting(unsigned int cpu) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { - struct timerqueue_node *node; + struct timerqueue_linked_node *node; struct hrtimer *timer; - while ((node = timerqueue_getnext(&old_base->active))) { + while ((node = timerqueue_linked_first(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 19e61826b7de..e2e14fd1b466 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ 
-56,13 +56,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } -static void -print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, - u64 now) +static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + struct timerqueue_linked_node *curr; struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct timerqueue_node *curr; unsigned long flags; next_one: @@ -72,13 +70,13 @@ next_one: raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = timerqueue_getnext(&base->active); + curr = timerqueue_linked_first(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = timerqueue_iterate_next(curr); + curr = timerqueue_linked_next(curr); i++; } -- cgit v1.2.3 From 38e18d825f7281fdc16d3241df5115ce6eaeaf79 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 25 Feb 2026 10:32:41 -0800 Subject: locking: Fix rwlock and spinlock lock context annotations Fix two incorrect rwlock_t lock context annotations. Add the raw_spinlock_t lock context annotations that are missing. 
Fixes: f16a802d402d ("locking/rwlock, spinlock: Support Clang's context analysis") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver Link: https://patch.msgid.link/20260225183244.4035378-2-bvanassche@acm.org --- include/linux/rwlock.h | 4 ++-- include/linux/rwlock_api_smp.h | 6 ++++-- include/linux/spinlock.h | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 3390d21c95dd..21ceefc4a49f 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -30,10 +30,10 @@ do { \ #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_read_lock(rwlock_t *lock) __acquires_shared(lock); - extern int do_raw_read_trylock(rwlock_t *lock); + extern int do_raw_read_trylock(rwlock_t *lock) __cond_acquires_shared(true, lock); extern void do_raw_read_unlock(rwlock_t *lock) __releases_shared(lock); extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock); - extern int do_raw_write_trylock(rwlock_t *lock); +extern int do_raw_write_trylock(rwlock_t *lock) __cond_acquires(true, lock); extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock); #else # define do_raw_read_lock(rwlock) do {__acquire_shared(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0) diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 61a852609eab..9e02a5f28cd1 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -23,7 +23,7 @@ void __lockfunc _raw_write_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_read_lock_irq(rwlock_t *lock) __acquires_shared(lock); void __lockfunc _raw_write_lock_irq(rwlock_t *lock) __acquires(lock); unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) - __acquires(lock); + __acquires_shared(lock); unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) __acquires(lock); int __lockfunc _raw_read_trylock(rwlock_t *lock) __cond_acquires_shared(true, 
lock); @@ -36,7 +36,7 @@ void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) __releases_shared(lock); void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) __releases(lock); void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) - __releases(lock); + __releases_shared(lock); void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) __releases(lock); @@ -116,6 +116,7 @@ _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) #endif static inline int __raw_read_trylock(rwlock_t *lock) + __cond_acquires_shared(true, lock) { preempt_disable(); if (do_raw_read_trylock(lock)) { @@ -127,6 +128,7 @@ static inline int __raw_read_trylock(rwlock_t *lock) } static inline int __raw_write_trylock(rwlock_t *lock) + __cond_acquires(true, lock) { preempt_disable(); if (do_raw_write_trylock(lock)) { diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index e1e2f144af9b..241277cd34cf 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -178,7 +178,7 @@ do { \ #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); - extern int do_raw_spin_trylock(raw_spinlock_t *lock); + extern int do_raw_spin_trylock(raw_spinlock_t *lock) __cond_acquires(true, lock); extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); #else static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) @@ -189,6 +189,7 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) } static inline int do_raw_spin_trylock(raw_spinlock_t *lock) + __cond_acquires(true, lock) { int ret = arch_spin_trylock(&(lock)->raw_lock); -- cgit v1.2.3 From 39be7b21af24d1d2ed3b18caac57dd219fef226e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 25 Feb 2026 10:32:42 -0800 Subject: signal: Fix the lock_task_sighand() annotation lock_task_sighand() may return NULL. Make this clear in its lock context annotation. 
Fixes: 04e49d926f43 ("sched: Enable context analysis for core.c and fair.c") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver Link: https://patch.msgid.link/20260225183244.4035378-3-bvanassche@acm.org --- include/linux/sched/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a22248aebcf9..a4835a7de07e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -739,7 +739,7 @@ static inline int thread_group_empty(struct task_struct *p) extern struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) - __acquires(&task->sighand->siglock); + __cond_acquires(nonnull, &task->sighand->siglock); static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) -- cgit v1.2.3 From 3dcef70e41ab13483803c536ddea8d5f1803ee25 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 25 Feb 2026 10:32:43 -0800 Subject: ww-mutex: Fix the ww_acquire_ctx function annotations The ww_acquire_done() call is optional. Reflect this in the annotations of ww_acquire_done(). Fixes: 47907461e4f6 ("locking/ww_mutex: Support Clang's context analysis") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Acked-by: Maarten Lankhorst Acked-by: Marco Elver Link: https://patch.msgid.link/20260225183244.4035378-4-bvanassche@acm.org --- include/linux/ww_mutex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 85b1fff02fde..0c95ead5a297 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -181,7 +181,7 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, * data structures. 
*/ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx) - __releases(ctx) __acquires_shared(ctx) __no_context_analysis + __must_hold(ctx) { #ifdef DEBUG_WW_MUTEXES lockdep_assert_held(ctx); @@ -199,7 +199,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx) * mutexes have been released with ww_mutex_unlock. */ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) - __releases_shared(ctx) __no_context_analysis + __releases(ctx) __no_context_analysis { #ifdef CONFIG_DEBUG_LOCK_ALLOC mutex_release(&ctx->first_lock_dep_map, _THIS_IP_); -- cgit v1.2.3 From 4a91a33f15c634fb3477d122bdf1eef098d77ee3 Mon Sep 17 00:00:00 2001 From: Mallesh Koujalagi Date: Fri, 27 Feb 2026 14:24:01 +0530 Subject: workqueue: Update documentation as per system_percpu_wq naming Update documentation to use "per-CPU workqueue" instead of "global workqueue" to match the system_wq to system_percpu_wq rename. The workqueue behavior remains unchanged; this just aligns terminology with the clearer naming. Fixes: a2be943b46b4 ("workqueue: replace use of system_wq with system_percpu_wq") Signed-off-by: Mallesh Koujalagi Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a4749f56398f..fc5744402a66 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -712,14 +712,14 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work) } /** - * schedule_work - put work task in global workqueue + * schedule_work - put work task in per-CPU workqueue * @work: job to be done * - * Returns %false if @work was already on the kernel-global workqueue and + * Returns %false if @work was already on the system per-CPU workqueue and * %true otherwise. 
* - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global + * This puts a job in the system per-CPU workqueue if it was not already + * queued and leaves it in the same position on the system per-CPU * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the @@ -796,12 +796,12 @@ extern void __warn_flushing_systemwide_wq(void) }) /** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * schedule_delayed_work_on - queue work in per-CPU workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * - * After waiting for a given time this puts a job in the kernel-global + * After waiting for a given time this puts a job in the system per-CPU * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, @@ -811,11 +811,11 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, } /** - * schedule_delayed_work - put work task in global workqueue after delay + * schedule_delayed_work - put work task in per-CPU workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * - * After waiting for a given time this puts a job in the kernel-global + * After waiting for a given time this puts a job in the system per-CPU * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, -- cgit v1.2.3 From e1092d5e15e6a9b168bf830af9a26d7ea17cd57d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 24 Feb 2026 12:10:44 +0100 Subject: PCI/PTM: Do not enable PTM automatically for Root and Switch Upstream Ports Currently we enable PTM automatically for Root and Switch Upstream Ports if the advertised capabilities support the relevant role. However, there are a few issues with this. 
First of all, if there is no Endpoint that actually needs the PTM functionality, this is just wasting link bandwidth. There are just a couple of drivers calling pci_ptm_enable() in the tree. Secondly, we do the enablement in pci_ptm_init() that is called pretty early for the Switch Upstream Port before Downstream Ports are even enumerated. Since the Upstream Port configuration affects the whole Switch, enabling it this early might cause PTM requests to be sent. We actually do see effects of this: pcieport 0000:00:07.1: pciehp: Slot(6-1): Card present pcieport 0000:00:07.1: pciehp: Slot(6-1): Link Up pci 0000:2c:00.0: [8086:5786] type 01 class 0x060400 PCIe Switch Upstream Port ... pci 0000:2c:00.0: PTM enabled, 4ns granularity At this point we have only enumerated the Switch Upstream Port and now PTM got enabled which immediately triggers a flood of errors: pcieport 0000:00:07.1: AER: Multiple Uncorrectable (Non-Fatal) error message received from 0000:00:07.1 pcieport 0000:00:07.1: PCIe Bus Error: severity=Uncorrectable (Non-Fatal), type=Transaction Layer, (Receiver ID) pcieport 0000:00:07.1: device [8086:d44f] error status/mask=00200000/00000000 pcieport 0000:00:07.1: [21] ACSViol (First) pcieport 0000:00:07.1: AER: TLP Header: 0x34000000 0x00000052 0x00000000 0x00000000 pcieport 0000:00:07.1: AER: device recovery successful pcieport 0000:00:07.1: AER: Uncorrectable (Non-Fatal) error message received from 0000:00:07.1 In the above TLP Header the Requester ID is 0 which causes an error as we have ACS Source Validation enabled. Change the PTM enablement to happen at the time pci_enable_ptm() is called. It will try to enable PTM first for upstream devices before enabling for the Endpoint itself. For disable path we need to keep count of how many times PTM has been enabled and disable it only on the last, so change the dev->ptm_enabled to a counter (and rename it to dev->ptm_enable_cnt analogous to dev->pci_enable_cnt). 
Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260224111044.3487873-6-mika.westerberg@linux.intel.com --- drivers/pci/pcie/ptm.c | 68 +++++++++++++++++++++++++++++--------------------- include/linux/pci.h | 2 +- 2 files changed, 40 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 2c848ae4f15f..a41ffd1914de 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -52,6 +52,7 @@ void pci_ptm_init(struct pci_dev *dev) return; dev->ptm_cap = ptm; + atomic_set(&dev->ptm_enable_cnt, 0); pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_PTM, sizeof(u32)); pci_read_config_dword(dev, ptm + PCI_PTM_CAP, &cap); @@ -85,10 +86,6 @@ void pci_ptm_init(struct pci_dev *dev) dev->ptm_responder = 1; if (cap & PCI_PTM_CAP_REQ) dev->ptm_requester = 1; - - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || - pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM) - pci_enable_ptm(dev); } void pci_save_ptm_state(struct pci_dev *dev) @@ -129,26 +126,11 @@ void pci_restore_ptm_state(struct pci_dev *dev) static int __pci_enable_ptm(struct pci_dev *dev) { u16 ptm = dev->ptm_cap; - struct pci_dev *ups; u32 ctrl; if (!ptm) return -EINVAL; - /* - * A device uses local PTM Messages to request time information - * from a PTM Root that's farther upstream. Every device along the - * path must support PTM and have it enabled so it can handle the - * messages. Therefore, if this device is not a PTM Root, the - * upstream link partner must have PTM enabled before we can enable - * PTM. - */ - if (!dev->ptm_root) { - ups = pci_upstream_ptm(dev); - if (!ups || !ups->ptm_enabled) - return -EINVAL; - } - switch (pci_pcie_type(dev)) { case PCI_EXP_TYPE_ROOT_PORT: if (!dev->ptm_root) @@ -193,11 +175,35 @@ int pci_enable_ptm(struct pci_dev *dev) int rc; char clock_desc[8]; + /* + * A device uses local PTM Messages to request time information + * from a PTM Root that's farther upstream. 
Every device along + * the path must support PTM and have it enabled so it can + * handle the messages. Therefore, if this device is not a PTM + * Root, the upstream link partner must have PTM enabled before + * we can enable PTM. + */ + if (!dev->ptm_root) { + struct pci_dev *parent; + + parent = pci_upstream_ptm(dev); + if (!parent) + return -EINVAL; + /* Enable PTM for the parent */ + rc = pci_enable_ptm(parent); + if (rc) + return rc; + } + + /* Already enabled? */ + if (atomic_inc_return(&dev->ptm_enable_cnt) > 1) + return 0; + rc = __pci_enable_ptm(dev); - if (rc) + if (rc) { + atomic_dec(&dev->ptm_enable_cnt); return rc; - - dev->ptm_enabled = 1; + } switch (dev->ptm_granularity) { case 0: @@ -239,27 +245,31 @@ static void __pci_disable_ptm(struct pci_dev *dev) */ void pci_disable_ptm(struct pci_dev *dev) { - if (dev->ptm_enabled) { + struct pci_dev *parent; + + if (atomic_dec_and_test(&dev->ptm_enable_cnt)) __pci_disable_ptm(dev); - dev->ptm_enabled = 0; - } + + parent = pci_upstream_ptm(dev); + if (parent) + pci_disable_ptm(parent); } EXPORT_SYMBOL(pci_disable_ptm); /* - * Disable PTM, but preserve dev->ptm_enabled so we silently re-enable it on + * Disable PTM, but preserve dev->ptm_enable_cnt so we silently re-enable it on * resume if necessary. 
*/ void pci_suspend_ptm(struct pci_dev *dev) { - if (dev->ptm_enabled) + if (atomic_read(&dev->ptm_enable_cnt)) __pci_disable_ptm(dev); } /* If PTM was enabled before suspend, re-enable it when resuming */ void pci_resume_ptm(struct pci_dev *dev) { - if (dev->ptm_enabled) + if (atomic_read(&dev->ptm_enable_cnt)) __pci_enable_ptm(dev); } @@ -268,7 +278,7 @@ bool pcie_ptm_enabled(struct pci_dev *dev) if (!dev) return false; - return dev->ptm_enabled; + return atomic_read(&dev->ptm_enable_cnt); } EXPORT_SYMBOL(pcie_ptm_enabled); diff --git a/include/linux/pci.h b/include/linux/pci.h index 8aaa72dcb164..c620d4b6c52e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -518,7 +518,7 @@ struct pci_dev { unsigned int ptm_root:1; unsigned int ptm_responder:1; unsigned int ptm_requester:1; - unsigned int ptm_enabled:1; + atomic_t ptm_enable_cnt; u8 ptm_granularity; #endif #ifdef CONFIG_PCI_MSI -- cgit v1.2.3 From 658fa7b1c47a857af484c5c5dff8d0164b7c7bfb Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Fri, 6 Feb 2026 19:56:52 +0530 Subject: ACPI: CPPC: Add cppc_get_perf() API to read performance controls Add cppc_get_perf() function to read values of performance control registers including desired_perf, min_perf, max_perf, energy_perf, and auto_sel. This provides a read interface to complement the existing cppc_set_perf() write interface for performance control registers. Note that auto_sel is read by cppc_get_perf() but not written by cppc_set_perf() to avoid unintended mode changes during performance updates. It can be updated with existing dedicated cppc_set_auto_sel() API. Use cppc_get_perf() in cppc_cpufreq_get_cpu_data() to initialize perf_ctrls with current hardware register values during cpufreq policy initialization. Signed-off-by: Sumit Gupta Reviewed-by: Pierre Gondois Reviewed-by: Lifeng Zheng Link: https://patch.msgid.link/20260206142658.72583-2-sumitg@nvidia.com Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/cppc_acpi.c | 80 ++++++++++++++++++++++++++++++++++++++++++ drivers/cpufreq/cppc_cpufreq.c | 6 ++++ include/acpi/cppc_acpi.h | 5 +++ 3 files changed, 91 insertions(+) (limited to 'include') diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index f0e513e9ed5d..5122e99bddb0 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1738,6 +1738,86 @@ int cppc_set_enable(int cpu, bool enable) } EXPORT_SYMBOL_GPL(cppc_set_enable); +/** + * cppc_get_perf - Get a CPU's performance controls. + * @cpu: CPU for which to get performance controls. + * @perf_ctrls: ptr to cppc_perf_ctrls. See cppc_acpi.h + * + * Return: 0 for success with perf_ctrls, -ERRNO otherwise. + */ +int cppc_get_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +{ + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); + struct cpc_register_resource *desired_perf_reg, + *min_perf_reg, *max_perf_reg, + *energy_perf_reg, *auto_sel_reg; + u64 desired_perf = 0, min = 0, max = 0, energy_perf = 0, auto_sel = 0; + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); + struct cppc_pcc_data *pcc_ss_data = NULL; + int ret = 0, regs_in_pcc = 0; + + if (!cpc_desc) { + pr_debug("No CPC descriptor for CPU:%d\n", cpu); + return -ENODEV; + } + + if (!perf_ctrls) { + pr_debug("Invalid perf_ctrls pointer\n"); + return -EINVAL; + } + + desired_perf_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; + min_perf_reg = &cpc_desc->cpc_regs[MIN_PERF]; + max_perf_reg = &cpc_desc->cpc_regs[MAX_PERF]; + energy_perf_reg = &cpc_desc->cpc_regs[ENERGY_PERF]; + auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; + + /* Are any of the regs PCC ?*/ + if (CPC_IN_PCC(desired_perf_reg) || CPC_IN_PCC(min_perf_reg) || + CPC_IN_PCC(max_perf_reg) || CPC_IN_PCC(energy_perf_reg) || + CPC_IN_PCC(auto_sel_reg)) { + if (pcc_ss_id < 0) { + pr_debug("Invalid pcc_ss_id for CPU:%d\n", cpu); + return -ENODEV; + } + pcc_ss_data = pcc_data[pcc_ss_id]; + regs_in_pcc = 1; + down_write(&pcc_ss_data->pcc_lock); + 
/* Ring doorbell once to update PCC subspace */ + if (send_pcc_cmd(pcc_ss_id, CMD_READ) < 0) { + ret = -EIO; + goto out_err; + } + } + + /* Read optional elements if present */ + if (CPC_SUPPORTED(max_perf_reg)) + cpc_read(cpu, max_perf_reg, &max); + perf_ctrls->max_perf = max; + + if (CPC_SUPPORTED(min_perf_reg)) + cpc_read(cpu, min_perf_reg, &min); + perf_ctrls->min_perf = min; + + if (CPC_SUPPORTED(desired_perf_reg)) + cpc_read(cpu, desired_perf_reg, &desired_perf); + perf_ctrls->desired_perf = desired_perf; + + if (CPC_SUPPORTED(energy_perf_reg)) + cpc_read(cpu, energy_perf_reg, &energy_perf); + perf_ctrls->energy_perf = energy_perf; + + if (CPC_SUPPORTED(auto_sel_reg)) + cpc_read(cpu, auto_sel_reg, &auto_sel); + perf_ctrls->auto_sel = (bool)auto_sel; + +out_err: + if (regs_in_pcc) + up_write(&pcc_ss_data->pcc_lock); + return ret; +} +EXPORT_SYMBOL_GPL(cppc_get_perf); + /** * cppc_set_perf - Set a CPU's performance controls. * @cpu: CPU for which to set performance controls. diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 011f35cb47b9..a61a24e0dcae 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -594,6 +594,12 @@ static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) goto free_mask; } + ret = cppc_get_perf(cpu, &cpu_data->perf_ctrls); + if (ret) { + pr_debug("Err reading CPU%d perf ctrls: ret:%d\n", cpu, ret); + goto free_mask; + } + return cpu_data; free_mask: diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 4d644f03098e..3fc796c0d902 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -151,6 +151,7 @@ extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf); extern int cppc_get_highest_perf(int cpunum, u64 *highest_perf); extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); +extern int cppc_get_perf(int cpu, struct cppc_perf_ctrls 
*perf_ctrls); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_set_enable(int cpu, bool enable); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); @@ -193,6 +194,10 @@ static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ { return -EOPNOTSUPP; } +static inline int cppc_get_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +{ + return -EOPNOTSUPP; +} static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) { return -EOPNOTSUPP; -- cgit v1.2.3 From 13c45a26635fa51a68911aa57e6778bdad18b103 Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Fri, 6 Feb 2026 19:56:57 +0530 Subject: ACPI: CPPC: add APIs and sysfs interface for perf_limited Add sysfs interface to read/write the Performance Limited register. The Performance Limited register indicates to the OS that an unpredictable event (like thermal throttling) has limited processor performance. It contains two sticky bits set by the platform: - Bit 0 (Desired_Excursion): Set when delivered performance is constrained below desired performance. Not used when Autonomous Selection is enabled. - Bit 1 (Minimum_Excursion): Set when delivered performance is constrained below minimum performance. These bits remain set until OSPM explicitly clears them. The write operation accepts a bitmask of bits to clear: - Write 0x1 to clear bit 0 - Write 0x2 to clear bit 1 - Write 0x3 to clear both bits This enables users to detect if platform throttling impacted a workload. Users clear the register before execution, run the workload, then check afterward - if set, hardware throttling occurred during that time window. The interface is exposed as: /sys/devices/system/cpu/cpuX/cpufreq/perf_limited Signed-off-by: Sumit Gupta Reviewed-by: Pierre Gondois Reviewed-by: Lifeng Zheng Link: https://patch.msgid.link/20260206142658.72583-7-sumitg@nvidia.com Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/cppc_acpi.c | 56 ++++++++++++++++++++++++++++++++++++++++++ drivers/cpufreq/cppc_cpufreq.c | 5 ++++ include/acpi/cppc_acpi.h | 15 +++++++++++ 3 files changed, 76 insertions(+) (limited to 'include') diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 94a7ffa8be3c..53a6ffd995a1 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1978,6 +1978,62 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) } EXPORT_SYMBOL_GPL(cppc_set_perf); +/** + * cppc_get_perf_limited - Get the Performance Limited register value. + * @cpu: CPU from which to get Performance Limited register. + * @perf_limited: Pointer to store the Performance Limited value. + * + * The returned value contains sticky status bits indicating platform-imposed + * performance limitations. + * + * Return: 0 for success, -EIO on failure, -EOPNOTSUPP if not supported. + */ +int cppc_get_perf_limited(int cpu, u64 *perf_limited) +{ + return cppc_get_reg_val(cpu, PERF_LIMITED, perf_limited); +} +EXPORT_SYMBOL_GPL(cppc_get_perf_limited); + +/** + * cppc_set_perf_limited() - Clear bits in the Performance Limited register. + * @cpu: CPU on which to write register. + * @bits_to_clear: Bitmask of bits to clear in the perf_limited register. + * + * The Performance Limited register contains two sticky bits set by platform: + * - Bit 0 (Desired_Excursion): Set when delivered performance is constrained + * below desired performance. Not used when Autonomous Selection is enabled. + * - Bit 1 (Minimum_Excursion): Set when delivered performance is constrained + * below minimum performance. + * + * These bits are sticky and remain set until OSPM explicitly clears them. + * This function only allows clearing bits (the platform sets them). + * + * Return: 0 for success, -EINVAL for invalid bits, -EIO on register + * access failure, -EOPNOTSUPP if not supported. 
+ */ +int cppc_set_perf_limited(int cpu, u64 bits_to_clear) +{ + u64 current_val, new_val; + int ret; + + /* Only bits 0 and 1 are valid */ + if (bits_to_clear & ~CPPC_PERF_LIMITED_MASK) + return -EINVAL; + + if (!bits_to_clear) + return 0; + + ret = cppc_get_perf_limited(cpu, ¤t_val); + if (ret) + return ret; + + /* Clear the specified bits */ + new_val = current_val & ~bits_to_clear; + + return cppc_set_reg_val(cpu, PERF_LIMITED, new_val); +} +EXPORT_SYMBOL_GPL(cppc_set_perf_limited); + /** * cppc_get_transition_latency - returns frequency transition latency in ns * @cpu_num: CPU number for per_cpu(). diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 8a8cf76828ee..94d489a4c90d 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -985,16 +985,21 @@ store_energy_performance_preference_val(struct cpufreq_policy *policy, return count; } +CPPC_CPUFREQ_ATTR_RW_U64(perf_limited, cppc_get_perf_limited, + cppc_set_perf_limited) + cpufreq_freq_attr_ro(freqdomain_cpus); cpufreq_freq_attr_rw(auto_select); cpufreq_freq_attr_rw(auto_act_window); cpufreq_freq_attr_rw(energy_performance_preference_val); +cpufreq_freq_attr_rw(perf_limited); static struct freq_attr *cppc_cpufreq_attr[] = { &freqdomain_cpus, &auto_select, &auto_act_window, &energy_performance_preference_val, + &perf_limited, NULL, }; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 3fc796c0d902..f7afa20b8ad9 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -42,6 +42,11 @@ #define CPPC_EPP_PERFORMANCE_PREF 0x00 #define CPPC_EPP_ENERGY_EFFICIENCY_PREF 0xFF +#define CPPC_PERF_LIMITED_DESIRED_EXCURSION BIT(0) +#define CPPC_PERF_LIMITED_MINIMUM_EXCURSION BIT(1) +#define CPPC_PERF_LIMITED_MASK (CPPC_PERF_LIMITED_DESIRED_EXCURSION | \ + CPPC_PERF_LIMITED_MINIMUM_EXCURSION) + /* Each register has the folowing format. 
*/ struct cpc_reg { u8 descriptor; @@ -174,6 +179,8 @@ extern int cppc_get_auto_act_window(int cpu, u64 *auto_act_window); extern int cppc_set_auto_act_window(int cpu, u64 auto_act_window); extern int cppc_get_auto_sel(int cpu, bool *enable); extern int cppc_set_auto_sel(int cpu, bool enable); +extern int cppc_get_perf_limited(int cpu, u64 *perf_limited); +extern int cppc_set_perf_limited(int cpu, u64 bits_to_clear); extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf); extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); extern int amd_detect_prefcore(bool *detected); @@ -270,6 +277,14 @@ static inline int cppc_set_auto_sel(int cpu, bool enable) { return -EOPNOTSUPP; } +static inline int cppc_get_perf_limited(int cpu, u64 *perf_limited) +{ + return -EOPNOTSUPP; +} +static inline int cppc_set_perf_limited(int cpu, u64 bits_to_clear) +{ + return -EOPNOTSUPP; +} static inline int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) { return -ENODEV; -- cgit v1.2.3 From 11c0663a595801b6e6f7a937adec8532706ef486 Mon Sep 17 00:00:00 2001 From: Jens Emil Schulz Østergaard Date: Thu, 26 Feb 2026 09:24:19 +0100 Subject: net: phy: micrel: Add support for lan9645x internal phy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LAN9645X is a family of switch chips with 5 internal copper phys. The internal PHY is based on parts of LAN8832. This is a low-power, single port triple-speed (10BASE-T/100BASE-TX/1000BASE-T) ethernet physical layer transceiver (PHY) that supports transmission and reception of data on standard CAT-5, as well as CAT-5e and CAT-6 Unshielded Twisted Pair (UTP) cables. Add support for the internal PHY of the lan9645x chip family. 
Reviewed-by: Steen Hegelund Reviewed-by: Daniel Machon Signed-off-by: Jens Emil Schulz Østergaard Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20260226-phy_micrel_add_support_for_lan9645x_internal_phy-v3-1-1fe82379962b@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 152 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/micrel_phy.h | 1 + 2 files changed, 153 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index c6b011a9d636..2aa1dedd21b8 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -6523,6 +6523,142 @@ static void lan8842_get_phy_stats(struct phy_device *phydev, stats->tx_errors = priv->phy_stats.tx_errors; } +#define LAN9645X_CTRL_REG 0x1f +#define LAN9645X_CTRL_REG_SW_SOFT_RST BIT(1) + +#define LAN9645X_DAC_ICAS_AMP_POWER_DOWN 0x47 +#define LAN9645X_BTRX_QBIAS_POWER_DOWN 0x46 +#define LAN9645X_TX_LOW_I_CH_CD_POWER_MGMT 0x45 +#define LAN9645X_TX_LOW_I_CH_B_POWER_MGMT 0x44 +#define LAN9645X_TX_LOW_I_CH_A_POWER_MGMT 0x43 + +static const struct lanphy_reg_data force_dac_tx_errata[] = { + /* Force channel A/B/C/D TX on */ + { LAN8814_PAGE_POWER_REGS, + LAN9645X_DAC_ICAS_AMP_POWER_DOWN, + 0 }, + /* Force channel A/B/C/D QBias on */ + { LAN8814_PAGE_POWER_REGS, + LAN9645X_BTRX_QBIAS_POWER_DOWN, + 0xaa }, + /* Tx low I on channel C/D overwrite */ + { LAN8814_PAGE_POWER_REGS, + LAN9645X_TX_LOW_I_CH_CD_POWER_MGMT, + 0xbfff }, + /* Channel B low I overwrite */ + { LAN8814_PAGE_POWER_REGS, + LAN9645X_TX_LOW_I_CH_B_POWER_MGMT, + 0xabbf }, + /* Channel A low I overwrite */ + { LAN8814_PAGE_POWER_REGS, + LAN9645X_TX_LOW_I_CH_A_POWER_MGMT, + 0xbd3f }, +}; + +static int lan9645x_config_init(struct phy_device *phydev) +{ + int ret; + + /* Apply erratas from previous generations. 
*/ + ret = lan8842_erratas(phydev); + if (ret < 0) + return ret; + + /* Apply errata for an issue where bringing a port down, can cause a few + * CRC errors for traffic flowing through adjacent ports. + */ + return lanphy_write_reg_data(phydev, force_dac_tx_errata, + ARRAY_SIZE(force_dac_tx_errata)); +} + +static int lan9645x_suspend(struct phy_device *phydev) +{ + int ret, val; + + /* Force link down before software power down (SPD), by doing software + * soft reset. This resets the PHY, but keeps all register configuration + * intact. The bit self clears. + * + * This is needed as a workaround for an issue where performing SPD on a + * port can bring adjacent ports down, when there is traffic flowing + * through the ports. + */ + ret = phy_set_bits(phydev, LAN9645X_CTRL_REG, + LAN9645X_CTRL_REG_SW_SOFT_RST); + if (ret) + return ret; + + ret = phy_read_poll_timeout(phydev, LAN9645X_CTRL_REG, val, + !(val & LAN9645X_CTRL_REG_SW_SOFT_RST), + 3000, 100000, true); + if (ret) + return ret; + + return genphy_suspend(phydev); +} + +static int lan9645x_config_intr(struct phy_device *phydev) +{ + int err; + + /* enable / disable interrupts */ + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { + /* This is an internal PHY of lan9645x and is not possible to + * change the polarity of irq sources in the OIC (CPU_INTR) + * found in lan9645x. Therefore change the polarity of the + * interrupt in the PHY from being active low instead of active + * high. + */ + err = phy_write(phydev, LAN8804_CONTROL, + LAN8804_CONTROL_INTR_POLARITY); + if (err) + return err; + + /* By default interrupt buffer is open-drain in which case the + * interrupt can be active only low. Therefore change the + * interrupt buffer to be push-pull to be able to change + * interrupt polarity. 
+ */ + err = phy_write(phydev, LAN8804_OUTPUT_CONTROL, + LAN8804_OUTPUT_CONTROL_INTR_BUFFER); + if (err) + return err; + + err = lan8814_ack_interrupt(phydev); + if (err) + return err; + + err = phy_write(phydev, LAN8814_INTC, + LAN8814_INT_LINK | LAN8814_INT_FLF); + } else { + err = phy_write(phydev, LAN8814_INTC, 0); + if (err) + return err; + + err = lan8814_ack_interrupt(phydev); + } + + return err; +} + +static irqreturn_t lan9645x_handle_interrupt(struct phy_device *phydev) +{ + int status; + + status = phy_read(phydev, LAN8814_INTS); + if (status < 0) { + phy_error(phydev); + return IRQ_NONE; + } + + if (status & (LAN8814_INT_LINK | LAN8814_INT_FLF)) { + phy_trigger_machine(phydev); + return IRQ_HANDLED; + } + + return IRQ_NONE; +} + static struct phy_driver ksphy_driver[] = { { PHY_ID_MATCH_MODEL(PHY_ID_KS8737), @@ -6761,6 +6897,21 @@ static struct phy_driver ksphy_driver[] = { .set_tunable = lan8842_set_tunable, .cable_test_start = lan8814_cable_test_start, .cable_test_get_status = ksz886x_cable_test_get_status, +}, { + PHY_ID_MATCH_MODEL(PHY_ID_LAN9645X), + .name = "Microchip LAN9645X Gigabit PHY", + .config_init = lan9645x_config_init, + .driver_data = &ksz9021_type, + .probe = kszphy_probe, + .soft_reset = genphy_soft_reset, + .suspend = lan9645x_suspend, + .resume = genphy_resume, + .config_intr = lan9645x_config_intr, + .handle_interrupt = lan9645x_handle_interrupt, + .get_tunable = lan8842_get_tunable, + .set_tunable = lan8842_set_tunable, + .get_phy_stats = lan8842_get_phy_stats, + .update_stats = lan8842_update_stats, }, { PHY_ID_MATCH_MODEL(PHY_ID_KSZ9131), .name = "Microchip KSZ9131 Gigabit PHY", @@ -6859,6 +7010,7 @@ static const struct mdio_device_id __maybe_unused micrel_tbl[] = { { PHY_ID_MATCH_MODEL(PHY_ID_LAN8804) }, { PHY_ID_MATCH_MODEL(PHY_ID_LAN8841) }, { PHY_ID_MATCH_MODEL(PHY_ID_LAN8842) }, + { PHY_ID_MATCH_MODEL(PHY_ID_LAN9645X) }, { } }; diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 
ca691641788b..9c6f9817383f 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -33,6 +33,7 @@ #define PHY_ID_LAN8804 0x00221670 #define PHY_ID_LAN8841 0x00221650 #define PHY_ID_LAN8842 0x002216C0 +#define PHY_ID_LAN9645X 0x002216D0 #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 -- cgit v1.2.3 From 5151ec54f5861ad71e08190a7ce2173df788d36a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Feb 2026 02:12:15 +0000 Subject: net: use try_cmpxchg() in lock_sock_nested() Add a fast path in lock_sock_nested(), to avoid acquiring the socket spinlock only to set @owned to one: spin_lock_bh(&sk->sk_lock.slock); if (unlikely(sock_owned_by_user_nocheck(sk))) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock_bh(&sk->sk_lock.slock); On x86_64 compiler generates something quite efficient: 00000000000077c0 : 77c0: f3 0f 1e fa endbr64 77c4: e8 00 00 00 00 call __fentry__ 77c9: b9 01 00 00 00 mov $0x1,%ecx 77ce: 31 c0 xor %eax,%eax 77d0: f0 48 0f b1 8f 48 01 00 00 lock cmpxchg %rcx,0x148(%rdi) 77d9: 75 06 jne slow_path 77db: 2e e9 00 00 00 00 cs jmp __x86_return_thunk-0x4 slow_path: ... Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Link: https://patch.msgid.link/20260226021215.1764237-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 9 +++++++-- net/core/sock.c | 13 +++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index e7099d3d0416..a7a8b31e9877 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -81,8 +81,13 @@ * mini-semaphore synchronizes multiple users amongst themselves. 
*/ typedef struct { - spinlock_t slock; - int owned; + union { + struct slock_owned { + int owned; + spinlock_t slock; + }; + long combined; + }; wait_queue_head_t wq; /* * We express the mutex-alike socket_lock semantics diff --git a/net/core/sock.c b/net/core/sock.c index 2032be9a03b5..9d841975a7a1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3780,6 +3780,19 @@ void noinline lock_sock_nested(struct sock *sk, int subclass) mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); might_sleep(); +#ifdef CONFIG_64BIT + if (sizeof(struct slock_owned) == sizeof(long)) { + socket_lock_t tmp, old; + + tmp.slock = __SPIN_LOCK_UNLOCKED(tmp.slock); + tmp.owned = 1; + old.slock = __SPIN_LOCK_UNLOCKED(old.slock); + old.owned = 0; + if (likely(try_cmpxchg(&sk->sk_lock.combined, + &old.combined, tmp.combined))) + return; + } +#endif spin_lock_bh(&sk->sk_lock.slock); if (unlikely(sock_owned_by_user_nocheck(sk))) __lock_sock(sk); -- cgit v1.2.3 From 6466441a5ecd1c1168264e4c322bae455579b156 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Feb 2026 04:12:13 +0000 Subject: net: inline skb_add_rx_frag_netmem() This critical helper (via skb_add_rx_frag()) is mostly used from drivers rx fast path. 
It is time to inline it, this actually saves space in vmlinux: size vmlinux.old vmlinux text data bss dec hex filename 37350766 23092977 4846992 65290735 3e441ef vmlinux.old 37350600 23092977 4846992 65290569 3e44149 vmlinux Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260226041213.1892561-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 13 +++++++++++-- net/core/skbuff.c | 11 ----------- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index daa4e4944ce3..9cc98f850f1d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2682,8 +2682,17 @@ static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, shinfo->nr_frags = i + 1; } -void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, - int off, int size, unsigned int truesize); +static inline void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, + netmem_ref netmem, int off, + int size, unsigned int truesize) +{ + DEBUG_NET_WARN_ON_ONCE(size > truesize); + + skb_fill_netmem_desc(skb, i, netmem, off, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} static inline void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0e217041958a..513cbfed19bc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -891,17 +891,6 @@ skb_fail: } EXPORT_SYMBOL(napi_alloc_skb); -void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, - int off, int size, unsigned int truesize) -{ - DEBUG_NET_WARN_ON_ONCE(size > truesize); - - skb_fill_netmem_desc(skb, i, netmem, off, size); - skb->len += size; - skb->data_len += size; - skb->truesize += truesize; -} -EXPORT_SYMBOL(skb_add_rx_frag_netmem); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize) -- cgit 
v1.2.3 From 2164242c50084bd5b359b7d554d3a124e2c19074 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 26 Feb 2026 14:10:04 -0800 Subject: NFC: fix header file kernel-doc warnings Repair some of the comments: - use the correct enum names - don't use "/**" for a non-kernel-doc comment to fix these warnings: Warning: include/uapi/linux/nfc.h:127 Excess enum value '@NFC_EVENT_DEVICE_DEACTIVATED' description in 'nfc_commands' Warning: include/uapi/linux/nfc.h:204 Excess enum value '@NFC_ATTR_APDU' description in 'nfc_attrs' Warning: include/uapi/linux/nfc.h:302 expecting prototype for Pseudo(). Prototype was for NFC_RAW_HEADER_SIZE() instead Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260226221004.1037909-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/nfc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 2f5b4be25261..82805eee4357 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -55,7 +55,7 @@ * (it sends %NFC_ATTR_DEVICE_INDEX) * @NFC_EVENT_TM_ACTIVATED: event emitted when the adapter is activated in * target mode. - * @NFC_EVENT_DEVICE_DEACTIVATED: event emitted when the adapter is deactivated + * @NFC_EVENT_TM_DEACTIVATED: event emitted when the adapter is deactivated * from target mode. 
 * @NFC_CMD_LLC_GET_PARAMS: request LTO, RW, and MIUX parameters for a device * @NFC_CMD_LLC_SET_PARAMS: set one or more of LTO, RW, and MIUX parameters for @@ -156,7 +156,7 @@ enum nfc_commands { * @NFC_ATTR_SE_INDEX: Secure element index * @NFC_ATTR_SE_TYPE: Secure element type (UICC or EMBEDDED) * @NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS: Firmware download operation status - * @NFC_ATTR_APDU: Secure element APDU + * @NFC_ATTR_SE_APDU: Secure element APDU * @NFC_ATTR_TARGET_ISO15693_DSFID: ISO 15693 Data Storage Format Identifier * @NFC_ATTR_TARGET_ISO15693_UID: ISO 15693 Unique Identifier * @NFC_ATTR_SE_PARAMS: Parameters data from an evt_transaction @@ -291,7 +291,7 @@ struct sockaddr_nfc_llcp { #define NFC_HEADER_SIZE 1 -/** +/* * Pseudo-header info for raw socket packets * First byte is the adapter index * Second byte contains flags -- cgit v1.2.3 From 84446536f63d471ab16b2faa25eeab1df21ace0a Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Tue, 24 Feb 2026 21:56:19 +0100 Subject: ALSA: control: Verify put() result when in debug mode The put() operation is expected to return: 1) 0 on success if no changes were made 2) 1 on success if changes were made 3) error code otherwise Currently 2) is usually ignored when writing control-operations. While forcing compliance is not an option right now, make it easier for developers to adhere to the expectations and notice problems by logging them when CONFIG_SND_CTL_DEBUG is enabled. Due to large size of struct snd_ctl_elem_value, 'value_buf' is provided as a reusable buffer for kctl->put() verification. This prevents exhausting the stack when verifying the operation. From user perspective, patch introduces a new trace/events category 'snd_ctl' containing a single 'snd_ctl_put' event type. Log sample: amixer-1086 [003] ..... 8.035939: snd_ctl_put: success: expected=0, actual=0 for ctl numid=1, iface=MIXER, name='Master Playback Volume', index=0, device=0, subdevice=0, card=0 amixer-1087 [003] ..... 
8.938721: snd_ctl_put: success: expected=1, actual=1 for ctl numid=1, iface=MIXER, name='Master Playback Volume', index=0, device=0, subdevice=0, card=0 amixer-1088 [003] ..... 9.631470: snd_ctl_put: success: expected=1, actual=1 for ctl numid=1, iface=MIXER, name='Master Playback Volume', index=0, device=0, subdevice=0, card=0 amixer-1089 [000] ..... 9.636786: snd_ctl_put: fail: expected=1, actual=0 for ctl numid=5, iface=MIXER, name='Loopback Mute', index=0, device=0, subdevice=0, card=0 Signed-off-by: Cezary Rojewski Reviewed-by: Mark Brown Reviewed-by: Jaroslav Kysela Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260224205619.584795-1-cezary.rojewski@intel.com --- include/sound/core.h | 3 ++ sound/core/Makefile | 1 + sound/core/control.c | 76 +++++++++++++++++++++++++++++++++++++++++++++- sound/core/control_trace.h | 55 +++++++++++++++++++++++++++++++++ sound/core/init.c | 8 +++++ 5 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 sound/core/control_trace.h (limited to 'include') diff --git a/include/sound/core.h b/include/sound/core.h index 64327e971122..4093ec82a0a1 100644 --- a/include/sound/core.h +++ b/include/sound/core.h @@ -133,6 +133,9 @@ struct snd_card { #ifdef CONFIG_SND_DEBUG struct dentry *debugfs_root; /* debugfs root for card */ #endif +#ifdef CONFIG_SND_CTL_DEBUG + struct snd_ctl_elem_value *value_buf; /* buffer for kctl->put() verification */ +#endif #ifdef CONFIG_PM unsigned int power_state; /* power state */ diff --git a/sound/core/Makefile b/sound/core/Makefile index 31a0623cc89d..fdd3bb6e81a9 100644 --- a/sound/core/Makefile +++ b/sound/core/Makefile @@ -23,6 +23,7 @@ snd-pcm-$(CONFIG_SND_PCM_IEC958) += pcm_iec958.o # for trace-points CFLAGS_pcm_lib.o := -I$(src) CFLAGS_pcm_native.o := -I$(src) +CFLAGS_control.o := -I$(src) snd-pcm-dmaengine-y := pcm_dmaengine.o diff --git a/sound/core/control.c b/sound/core/control.c index 934e84e93838..374e703d15a9 100644 --- a/sound/core/control.c +++ 
b/sound/core/control.c @@ -19,6 +19,13 @@ #include #include +#ifdef CONFIG_SND_CTL_DEBUG +#define CREATE_TRACE_POINTS +#include "control_trace.h" +#else +#define trace_snd_ctl_put(card, kctl, iname, expected, actual) +#endif + // Max allocation size for user controls. static int max_user_ctl_alloc_size = 8 * 1024 * 1024; module_param_named(max_user_ctl_alloc_size, max_user_ctl_alloc_size, int, 0444); @@ -1264,6 +1271,72 @@ static int snd_ctl_elem_read_user(struct snd_card *card, return result; } +#if IS_ENABLED(CONFIG_SND_CTL_DEBUG) + +static const char *const snd_ctl_elem_iface_names[] = { + [SNDRV_CTL_ELEM_IFACE_CARD] = "CARD", + [SNDRV_CTL_ELEM_IFACE_HWDEP] = "HWDEP", + [SNDRV_CTL_ELEM_IFACE_MIXER] = "MIXER", + [SNDRV_CTL_ELEM_IFACE_PCM] = "PCM", + [SNDRV_CTL_ELEM_IFACE_RAWMIDI] = "RAWMIDI", + [SNDRV_CTL_ELEM_IFACE_TIMER] = "TIMER", + [SNDRV_CTL_ELEM_IFACE_SEQUENCER] = "SEQUENCER", +}; + +static int snd_ctl_put_verify(struct snd_card *card, struct snd_kcontrol *kctl, + struct snd_ctl_elem_value *control) +{ + struct snd_ctl_elem_value *original = card->value_buf; + struct snd_ctl_elem_info info; + const char *iname; + int ret, retcmp; + + memset(original, 0, sizeof(*original)); + memset(&info, 0, sizeof(info)); + + ret = kctl->info(kctl, &info); + if (ret) + return ret; + + ret = kctl->get(kctl, original); + if (ret) + return ret; + + ret = kctl->put(kctl, control); + if (ret < 0) + return ret; + + /* Sanitize the new value (control->value) before comparing. */ + fill_remaining_elem_value(control, &info, 0); + + /* With known state for both new and original, do the comparison. 
*/ + retcmp = memcmp(&original->value, &control->value, sizeof(original->value)); + if (retcmp) + retcmp = 1; + + iname = snd_ctl_elem_iface_names[kctl->id.iface]; + trace_snd_ctl_put(&kctl->id, iname, card->number, ret, retcmp); + + return ret; +} + +static int snd_ctl_put(struct snd_card *card, struct snd_kcontrol *kctl, + struct snd_ctl_elem_value *control, unsigned int access) +{ + if ((access & SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK) || + (access & SNDRV_CTL_ELEM_ACCESS_VOLATILE)) + return kctl->put(kctl, control); + + return snd_ctl_put_verify(card, kctl, control); +} +#else +static inline int snd_ctl_put(struct snd_card *card, struct snd_kcontrol *kctl, + struct snd_ctl_elem_value *control, unsigned int access) +{ + return kctl->put(kctl, control); +} +#endif + static int snd_ctl_elem_write(struct snd_card *card, struct snd_ctl_file *file, struct snd_ctl_elem_value *control) { @@ -1300,7 +1373,8 @@ static int snd_ctl_elem_write(struct snd_card *card, struct snd_ctl_file *file, false); } if (!result) - result = kctl->put(kctl, control); + result = snd_ctl_put(card, kctl, control, vd->access); + if (result < 0) { up_write(&card->controls_rwsem); return result; diff --git a/sound/core/control_trace.h b/sound/core/control_trace.h new file mode 100644 index 000000000000..d30e654b0860 --- /dev/null +++ b/sound/core/control_trace.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM snd_ctl + +#if !defined(_TRACE_SND_CTL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SND_CTL_H + +#include +#include + +TRACE_EVENT(snd_ctl_put, + + TP_PROTO(struct snd_ctl_elem_id *id, const char *iname, unsigned int card, + int expected, int actual), + + TP_ARGS(id, iname, card, expected, actual), + + TP_STRUCT__entry( + __field(unsigned int, numid) + __string(iname, iname) + __string(kname, id->name) + __field(unsigned int, index) + __field(unsigned int, device) + __field(unsigned int, subdevice) + __field(unsigned int, card) + 
__field(int, expected) + __field(int, actual) + ), + + TP_fast_assign( + __entry->numid = id->numid; + __assign_str(iname); + __assign_str(kname); + __entry->index = id->index; + __entry->device = id->device; + __entry->subdevice = id->subdevice; + __entry->card = card; + __entry->expected = expected; + __entry->actual = actual; + ), + + TP_printk("%s: expected=%d, actual=%d for ctl numid=%d, iface=%s, name='%s', index=%d, device=%d, subdevice=%d, card=%d\n", + __entry->expected == __entry->actual ? "success" : "fail", + __entry->expected, __entry->actual, __entry->numid, + __get_str(iname), __get_str(kname), __entry->index, + __entry->device, __entry->subdevice, __entry->card) +); + +#endif /* _TRACE_SND_CTL_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE control_trace +#include diff --git a/sound/core/init.c b/sound/core/init.c index 2f1bd9cbdbed..0c316189e947 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -362,6 +362,11 @@ static int snd_card_init(struct snd_card *card, struct device *parent, #ifdef CONFIG_SND_DEBUG card->debugfs_root = debugfs_create_dir(dev_name(&card->card_dev), sound_debugfs_root); +#endif +#ifdef CONFIG_SND_CTL_DEBUG + card->value_buf = kmalloc(sizeof(*card->value_buf), GFP_KERNEL); + if (!card->value_buf) + return -ENOMEM; #endif return 0; @@ -587,6 +592,9 @@ static int snd_card_do_free(struct snd_card *card) snd_device_free_all(card); if (card->private_free) card->private_free(card); +#ifdef CONFIG_SND_CTL_DEBUG + kfree(card->value_buf); +#endif if (snd_info_card_free(card) < 0) { dev_warn(card->dev, "unable to free card info\n"); /* Not fatal error */ -- cgit v1.2.3 From 8021729acf21f4bf3c43866b8919b68968028478 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:12:57 -0800 Subject: iio: tsl2772: fix all kernel-doc warnings Use the correct kernel-doc notation for struct members to eliminate kernel-doc warnings: Warning: 
include/linux/platform_data/tsl2772.h:88 struct member 'prox_diode' not described in 'tsl2772_settings' Warning: include/linux/platform_data/tsl2772.h:88 struct member 'prox_power' not described in 'tsl2772_settings' Signed-off-by: Randy Dunlap Reviewed-by: Andy Shevchenko Signed-off-by: Jonathan Cameron --- include/linux/platform_data/tsl2772.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/platform_data/tsl2772.h b/include/linux/platform_data/tsl2772.h index f8ade15a35e2..f042e82b39c3 100644 --- a/include/linux/platform_data/tsl2772.h +++ b/include/linux/platform_data/tsl2772.h @@ -61,9 +61,9 @@ struct tsl2772_lux { * @prox_pulse_count: Number if proximity emitter pulses. * @prox_max_samples_cal: The number of samples that are taken when performing * a proximity calibration. - * @prox_diode Which diode(s) to use for driving the external + * @prox_diode: Which diode(s) to use for driving the external * LED(s) for proximity sensing. - * @prox_power The amount of power to use for the external LED(s). + * @prox_power: The amount of power to use for the external LED(s). 
*/ struct tsl2772_settings { int als_time; -- cgit v1.2.3 From 6e5913328102f818303b01e854df37fa9f251a47 Mon Sep 17 00:00:00 2001 From: Raghav Sharma Date: Mon, 2 Feb 2026 16:05:53 +0530 Subject: dt-bindings: clock: exynosautov920: add G3D clock definitions Add device tree clock binding definitions for CMU_G3D Signed-off-by: Raghav Sharma Link: https://patch.msgid.link/20260202103555.2089376-2-raghav.s@samsung.com Signed-off-by: Krzysztof Kozlowski --- .../clock/samsung,exynosautov920-clock.yaml | 21 +++++++++++++++++++++ include/dt-bindings/clock/samsung,exynosautov920.h | 6 ++++++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml index 1318720193b3..6b1fc61a2ff9 100644 --- a/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml +++ b/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml @@ -35,6 +35,7 @@ properties: - samsung,exynosautov920-cmu-cpucl0 - samsung,exynosautov920-cmu-cpucl1 - samsung,exynosautov920-cmu-cpucl2 + - samsung,exynosautov920-cmu-g3d - samsung,exynosautov920-cmu-hsi0 - samsung,exynosautov920-cmu-hsi1 - samsung,exynosautov920-cmu-hsi2 @@ -287,6 +288,26 @@ allOf: - const: oscclk - const: noc + - if: + properties: + compatible: + contains: + const: samsung,exynosautov920-cmu-g3d + + then: + properties: + clocks: + items: + - description: External reference clock (38.4 MHz) + - description: CMU_G3D SWITCH clock (from CMU_TOP) + - description: CMU_G3D NOCP clock (from CMU_TOP) + + clock-names: + items: + - const: oscclk + - const: switch + - const: nocp + required: - compatible - "#clock-cells" diff --git a/include/dt-bindings/clock/samsung,exynosautov920.h b/include/dt-bindings/clock/samsung,exynosautov920.h index 06dec27a8c77..f2628c220b22 100644 --- a/include/dt-bindings/clock/samsung,exynosautov920.h +++ 
b/include/dt-bindings/clock/samsung,exynosautov920.h @@ -309,4 +309,10 @@ #define CLK_MOUT_MFD_NOC_USER 1 #define CLK_DOUT_MFD_NOCP 2 +/* CMU_G3D */ +#define FOUT_PLL_G3D 1 +#define CLK_MOUT_G3D_NOC 2 +#define CLK_MOUT_G3D_SWITCH_USER 3 +#define CLK_MOUT_G3D_NOCP_USER 4 + #endif /* _DT_BINDINGS_CLOCK_EXYNOSAUTOV920_H */ -- cgit v1.2.3 From a2be37eedb52ea26938fa4cc9de1ff84963c57ad Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 24 Feb 2026 11:42:04 +0100 Subject: firmware: exynos-acpm: Drop fake 'const' on handle pointer All the functions operating on the 'handle' pointer are claiming it is a pointer to const thus they should not modify the handle. In fact that's a false statement, because first thing these functions do is drop the cast to const with container_of: struct acpm_info *acpm = handle_to_acpm_info(handle); And with such cast the handle is easily writable with simple: acpm->handle.ops.pmic_ops.read_reg = NULL; The code is not correct logically, either, because functions like acpm_get_by_node() and acpm_handle_put() are meant to modify the handle reference counting, thus they must modify the handle. Modification here happens anyway, even if the reference counting is stored in the container which the handle is part of. The code does not have actual visible bug, but incorrect 'const' annotations could lead to incorrect compiler decisions. 
Fixes: a88927b534ba ("firmware: add Exynos ACPM protocol driver") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20260224104203.42950-2-krzysztof.kozlowski@oss.qualcomm.com Signed-off-by: Krzysztof Kozlowski --- drivers/clk/samsung/clk-acpm.c | 4 +-- drivers/firmware/samsung/exynos-acpm-dvfs.c | 4 +-- drivers/firmware/samsung/exynos-acpm-dvfs.h | 4 +-- drivers/firmware/samsung/exynos-acpm-pmic.c | 10 +++--- drivers/firmware/samsung/exynos-acpm-pmic.h | 10 +++--- drivers/firmware/samsung/exynos-acpm.c | 16 +++++---- drivers/firmware/samsung/exynos-acpm.h | 2 +- drivers/mfd/sec-acpm.c | 10 +++--- .../linux/firmware/samsung/exynos-acpm-protocol.h | 40 +++++++++------------- 9 files changed, 48 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/drivers/clk/samsung/clk-acpm.c b/drivers/clk/samsung/clk-acpm.c index b90809ce3f88..d8944160793a 100644 --- a/drivers/clk/samsung/clk-acpm.c +++ b/drivers/clk/samsung/clk-acpm.c @@ -20,7 +20,7 @@ struct acpm_clk { u32 id; struct clk_hw hw; unsigned int mbox_chan_id; - const struct acpm_handle *handle; + struct acpm_handle *handle; }; struct acpm_clk_variant { @@ -113,7 +113,7 @@ static int acpm_clk_register(struct device *dev, struct acpm_clk *aclk, static int acpm_clk_probe(struct platform_device *pdev) { - const struct acpm_handle *acpm_handle; + struct acpm_handle *acpm_handle; struct clk_hw_onecell_data *clk_data; struct clk_hw **hws; struct device *dev = &pdev->dev; diff --git a/drivers/firmware/samsung/exynos-acpm-dvfs.c b/drivers/firmware/samsung/exynos-acpm-dvfs.c index 17e7be7757b3..06bdf62dea1f 100644 --- a/drivers/firmware/samsung/exynos-acpm-dvfs.c +++ b/drivers/firmware/samsung/exynos-acpm-dvfs.c @@ -43,7 +43,7 @@ static void acpm_dvfs_init_set_rate_cmd(u32 cmd[4], unsigned int clk_id, cmd[3] = ktime_to_ms(ktime_get()); } -int acpm_dvfs_set_rate(const struct acpm_handle *handle, +int acpm_dvfs_set_rate(struct acpm_handle *handle, unsigned int 
acpm_chan_id, unsigned int clk_id, unsigned long rate) { @@ -63,7 +63,7 @@ static void acpm_dvfs_init_get_rate_cmd(u32 cmd[4], unsigned int clk_id) cmd[3] = ktime_to_ms(ktime_get()); } -unsigned long acpm_dvfs_get_rate(const struct acpm_handle *handle, +unsigned long acpm_dvfs_get_rate(struct acpm_handle *handle, unsigned int acpm_chan_id, unsigned int clk_id) { struct acpm_xfer xfer; diff --git a/drivers/firmware/samsung/exynos-acpm-dvfs.h b/drivers/firmware/samsung/exynos-acpm-dvfs.h index 9f2778e649c9..b37b15426102 100644 --- a/drivers/firmware/samsung/exynos-acpm-dvfs.h +++ b/drivers/firmware/samsung/exynos-acpm-dvfs.h @@ -11,10 +11,10 @@ struct acpm_handle; -int acpm_dvfs_set_rate(const struct acpm_handle *handle, +int acpm_dvfs_set_rate(struct acpm_handle *handle, unsigned int acpm_chan_id, unsigned int id, unsigned long rate); -unsigned long acpm_dvfs_get_rate(const struct acpm_handle *handle, +unsigned long acpm_dvfs_get_rate(struct acpm_handle *handle, unsigned int acpm_chan_id, unsigned int clk_id); diff --git a/drivers/firmware/samsung/exynos-acpm-pmic.c b/drivers/firmware/samsung/exynos-acpm-pmic.c index 26a9024d8ed8..0c50993cc9a8 100644 --- a/drivers/firmware/samsung/exynos-acpm-pmic.c +++ b/drivers/firmware/samsung/exynos-acpm-pmic.c @@ -77,7 +77,7 @@ static void acpm_pmic_init_read_cmd(u32 cmd[4], u8 type, u8 reg, u8 chan) cmd[3] = ktime_to_ms(ktime_get()); } -int acpm_pmic_read_reg(const struct acpm_handle *handle, +int acpm_pmic_read_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 *buf) { @@ -107,7 +107,7 @@ static void acpm_pmic_init_bulk_read_cmd(u32 cmd[4], u8 type, u8 reg, u8 chan, FIELD_PREP(ACPM_PMIC_VALUE, count); } -int acpm_pmic_bulk_read(const struct acpm_handle *handle, +int acpm_pmic_bulk_read(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 count, u8 *buf) { @@ -150,7 +150,7 @@ static void acpm_pmic_init_write_cmd(u32 cmd[4], u8 type, u8 reg, u8 chan, cmd[3] = 
ktime_to_ms(ktime_get()); } -int acpm_pmic_write_reg(const struct acpm_handle *handle, +int acpm_pmic_write_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 value) { @@ -187,7 +187,7 @@ static void acpm_pmic_init_bulk_write_cmd(u32 cmd[4], u8 type, u8 reg, u8 chan, } } -int acpm_pmic_bulk_write(const struct acpm_handle *handle, +int acpm_pmic_bulk_write(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 count, const u8 *buf) { @@ -220,7 +220,7 @@ static void acpm_pmic_init_update_cmd(u32 cmd[4], u8 type, u8 reg, u8 chan, cmd[3] = ktime_to_ms(ktime_get()); } -int acpm_pmic_update_reg(const struct acpm_handle *handle, +int acpm_pmic_update_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 value, u8 mask) { diff --git a/drivers/firmware/samsung/exynos-acpm-pmic.h b/drivers/firmware/samsung/exynos-acpm-pmic.h index 078421888a14..88ae9aada2ae 100644 --- a/drivers/firmware/samsung/exynos-acpm-pmic.h +++ b/drivers/firmware/samsung/exynos-acpm-pmic.h @@ -11,19 +11,19 @@ struct acpm_handle; -int acpm_pmic_read_reg(const struct acpm_handle *handle, +int acpm_pmic_read_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 *buf); -int acpm_pmic_bulk_read(const struct acpm_handle *handle, +int acpm_pmic_bulk_read(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 count, u8 *buf); -int acpm_pmic_write_reg(const struct acpm_handle *handle, +int acpm_pmic_write_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 value); -int acpm_pmic_bulk_write(const struct acpm_handle *handle, +int acpm_pmic_bulk_write(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 count, const u8 *buf); -int acpm_pmic_update_reg(const struct acpm_handle *handle, +int acpm_pmic_update_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 
chan, u8 value, u8 mask); #endif /* __EXYNOS_ACPM_PMIC_H__ */ diff --git a/drivers/firmware/samsung/exynos-acpm.c b/drivers/firmware/samsung/exynos-acpm.c index c616ac951a0d..16c46ed60837 100644 --- a/drivers/firmware/samsung/exynos-acpm.c +++ b/drivers/firmware/samsung/exynos-acpm.c @@ -410,7 +410,7 @@ static int acpm_wait_for_message_response(struct acpm_chan *achan, * * Return: 0 on success, -errno otherwise. */ -int acpm_do_xfer(const struct acpm_handle *handle, const struct acpm_xfer *xfer) +int acpm_do_xfer(struct acpm_handle *handle, const struct acpm_xfer *xfer) { struct acpm_info *acpm = handle_to_acpm_info(handle); struct exynos_mbox_msg msg; @@ -674,7 +674,7 @@ static int acpm_probe(struct platform_device *pdev) * acpm_handle_put() - release the handle acquired by acpm_get_by_phandle. * @handle: Handle acquired by acpm_get_by_phandle. */ -static void acpm_handle_put(const struct acpm_handle *handle) +static void acpm_handle_put(struct acpm_handle *handle) { struct acpm_info *acpm = handle_to_acpm_info(handle); struct device *dev = acpm->dev; @@ -700,9 +700,11 @@ static void devm_acpm_release(struct device *dev, void *res) * @np: ACPM device tree node. * * Return: pointer to handle on success, ERR_PTR(-errno) otherwise. + * + * Note: handle CANNOT be pointer to const */ -static const struct acpm_handle *acpm_get_by_node(struct device *dev, - struct device_node *np) +static struct acpm_handle *acpm_get_by_node(struct device *dev, + struct device_node *np) { struct platform_device *pdev; struct device_link *link; @@ -743,10 +745,10 @@ static const struct acpm_handle *acpm_get_by_node(struct device *dev, * * Return: pointer to handle on success, ERR_PTR(-errno) otherwise. 
*/ -const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, - struct device_node *np) +struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np) { - const struct acpm_handle **ptr, *handle; + struct acpm_handle **ptr, *handle; ptr = devres_alloc(devm_acpm_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) diff --git a/drivers/firmware/samsung/exynos-acpm.h b/drivers/firmware/samsung/exynos-acpm.h index 8392fcb91f45..5df8354dc96c 100644 --- a/drivers/firmware/samsung/exynos-acpm.h +++ b/drivers/firmware/samsung/exynos-acpm.h @@ -17,7 +17,7 @@ struct acpm_xfer { struct acpm_handle; -int acpm_do_xfer(const struct acpm_handle *handle, +int acpm_do_xfer(struct acpm_handle *handle, const struct acpm_xfer *xfer); #endif /* __EXYNOS_ACPM_H__ */ diff --git a/drivers/mfd/sec-acpm.c b/drivers/mfd/sec-acpm.c index 537ea65685bf..0e23b9d9f7ee 100644 --- a/drivers/mfd/sec-acpm.c +++ b/drivers/mfd/sec-acpm.c @@ -367,7 +367,7 @@ static const struct regmap_config s2mpg11_regmap_config_meter = { }; struct sec_pmic_acpm_shared_bus_context { - const struct acpm_handle *acpm; + struct acpm_handle *acpm; unsigned int acpm_chan_id; u8 speedy_channel; }; @@ -390,7 +390,7 @@ static int sec_pmic_acpm_bus_write(void *context, const void *data, size_t count) { struct sec_pmic_acpm_bus_context *ctx = context; - const struct acpm_handle *acpm = ctx->shared->acpm; + struct acpm_handle *acpm = ctx->shared->acpm; const struct acpm_pmic_ops *pmic_ops = &acpm->ops.pmic_ops; size_t val_count = count - BITS_TO_BYTES(ACPM_ADDR_BITS); const u8 *d = data; @@ -410,7 +410,7 @@ static int sec_pmic_acpm_bus_read(void *context, const void *reg_buf, size_t reg void *val_buf, size_t val_size) { struct sec_pmic_acpm_bus_context *ctx = context; - const struct acpm_handle *acpm = ctx->shared->acpm; + struct acpm_handle *acpm = ctx->shared->acpm; const struct acpm_pmic_ops *pmic_ops = &acpm->ops.pmic_ops; const u8 *r = reg_buf; u8 reg; @@ -429,7 +429,7 @@ static int 
sec_pmic_acpm_bus_reg_update_bits(void *context, unsigned int reg, un unsigned int val) { struct sec_pmic_acpm_bus_context *ctx = context; - const struct acpm_handle *acpm = ctx->shared->acpm; + struct acpm_handle *acpm = ctx->shared->acpm; const struct acpm_pmic_ops *pmic_ops = &acpm->ops.pmic_ops; return pmic_ops->update_reg(acpm, ctx->shared->acpm_chan_id, ctx->type, reg & 0xff, @@ -480,7 +480,7 @@ static int sec_pmic_acpm_probe(struct platform_device *pdev) struct regmap *regmap_common, *regmap_pmic, *regmap; const struct sec_pmic_acpm_platform_data *pdata; struct sec_pmic_acpm_shared_bus_context *shared_ctx; - const struct acpm_handle *acpm; + struct acpm_handle *acpm; struct device *dev = &pdev->dev; int ret, irq; diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index 2091da965a5a..13f17dc4443b 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -14,30 +14,24 @@ struct acpm_handle; struct device_node; struct acpm_dvfs_ops { - int (*set_rate)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, unsigned int clk_id, - unsigned long rate); - unsigned long (*get_rate)(const struct acpm_handle *handle, + int (*set_rate)(struct acpm_handle *handle, unsigned int acpm_chan_id, + unsigned int clk_id, unsigned long rate); + unsigned long (*get_rate)(struct acpm_handle *handle, unsigned int acpm_chan_id, unsigned int clk_id); }; struct acpm_pmic_ops { - int (*read_reg)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 *buf); - int (*bulk_read)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 count, u8 *buf); - int (*write_reg)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 value); - int (*bulk_write)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 
reg, u8 chan, - u8 count, const u8 *buf); - int (*update_reg)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 value, u8 mask); + int (*read_reg)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 *buf); + int (*bulk_read)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 count, u8 *buf); + int (*write_reg)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 value); + int (*bulk_write)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 count, const u8 *buf); + int (*update_reg)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 value, u8 mask); }; struct acpm_ops { @@ -56,12 +50,12 @@ struct acpm_handle { struct device; #if IS_ENABLED(CONFIG_EXYNOS_ACPM_PROTOCOL) -const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, - struct device_node *np); +struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np); #else -static inline const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, - struct device_node *np) +static inline struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np) { return NULL; } -- cgit v1.2.3 From ff2998f29f390d963299103f0b247cc79106ced5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 26 Feb 2026 14:44:12 +0100 Subject: net: sched: introduce qdisc-specific drop reason tracing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create new enum qdisc_drop_reason and trace_qdisc_drop tracepoint for qdisc layer drop diagnostics with direct qdisc context visibility. The new tracepoint includes qdisc handle, parent, kind (name), and device information. Existing SKB_DROP_REASON_QDISC_DROP is retained for backwards compatibility via kfree_skb_reason(). 
Convert qdiscs with drop reasons to use the new infrastructure. Change CAKE's cobalt_should_drop() return type from enum skb_drop_reason to enum qdisc_drop_reason to fix implicit enum conversion warnings. Use QDISC_DROP_UNSPEC as the 'not dropped' sentinel instead of SKB_NOT_DROPPED_YET. Both have the same compiled value (0), so the comparison logic remains semantically equivalent. Signed-off-by: Jesper Dangaard Brouer Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/177211345275.3011628.1974310302645218067.stgit@firesoul Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 42 ++----------------- include/net/dropreason-qdisc.h | 94 ++++++++++++++++++++++++++++++++++++++++++ include/net/dropreason.h | 6 +++ include/net/sch_generic.h | 43 +++++++++++++------ include/trace/events/qdisc.h | 51 +++++++++++++++++++++++ net/core/dev.c | 8 ++-- net/sched/sch_cake.c | 26 ++++++------ net/sched/sch_codel.c | 5 +-- net/sched/sch_dualpi2.c | 8 ++-- net/sched/sch_fq.c | 7 ++-- net/sched/sch_fq_codel.c | 4 +- net/sched/sch_fq_pie.c | 4 +- net/sched/sch_generic.c | 29 ++++++++++++- net/sched/sch_gred.c | 4 +- net/sched/sch_pie.c | 4 +- net/sched/sch_red.c | 4 +- net/sched/sch_sfb.c | 4 +- 17 files changed, 249 insertions(+), 94 deletions(-) create mode 100644 include/net/dropreason-qdisc.h (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index a7b7abd66e21..3d8d284e05c8 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -68,12 +68,6 @@ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ FN(QDISC_BURST_DROP) \ - FN(QDISC_OVERLIMIT) \ - FN(QDISC_CONGESTED) \ - FN(CAKE_FLOOD) \ - FN(FQ_BAND_LIMIT) \ - FN(FQ_HORIZON_LIMIT) \ - FN(FQ_FLOW_LIMIT) \ FN(CPU_BACKLOG) \ FN(XDP) \ FN(TC_INGRESS) \ @@ -371,8 +365,10 @@ enum skb_drop_reason { /** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */ SKB_DROP_REASON_SECURITY_HOOK, /** - * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when 
packet outputting ( - * failed to enqueue to current qdisc) + * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc during enqueue or + * dequeue. More specific drop reasons are available via the + * qdisc:qdisc_drop tracepoint, which also provides qdisc handle + * and name for identifying the source. */ SKB_DROP_REASON_QDISC_DROP, /** @@ -380,36 +376,6 @@ enum skb_drop_reason { * limit is hit. */ SKB_DROP_REASON_QDISC_BURST_DROP, - /** - * @SKB_DROP_REASON_QDISC_OVERLIMIT: dropped by qdisc when a qdisc - * instance exceeds its total buffer size limit. - */ - SKB_DROP_REASON_QDISC_OVERLIMIT, - /** - * @SKB_DROP_REASON_QDISC_CONGESTED: dropped by a qdisc AQM algorithm - * due to congestion. - */ - SKB_DROP_REASON_QDISC_CONGESTED, - /** - * @SKB_DROP_REASON_CAKE_FLOOD: dropped by the flood protection part of - * CAKE qdisc AQM algorithm (BLUE). - */ - SKB_DROP_REASON_CAKE_FLOOD, - /** - * @SKB_DROP_REASON_FQ_BAND_LIMIT: dropped by fq qdisc when per band - * limit is reached. - */ - SKB_DROP_REASON_FQ_BAND_LIMIT, - /** - * @SKB_DROP_REASON_FQ_HORIZON_LIMIT: dropped by fq qdisc when packet - * timestamp is too far in the future. - */ - SKB_DROP_REASON_FQ_HORIZON_LIMIT, - /** - * @SKB_DROP_REASON_FQ_FLOW_LIMIT: dropped by fq qdisc when a flow - * exceeds its limits. - */ - SKB_DROP_REASON_FQ_FLOW_LIMIT, /** * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU * backlog queue. 
This can be caused by backlog queue full (see diff --git a/include/net/dropreason-qdisc.h b/include/net/dropreason-qdisc.h new file mode 100644 index 000000000000..80a2d557e5f7 --- /dev/null +++ b/include/net/dropreason-qdisc.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_DROPREASON_QDISC_H +#define _LINUX_DROPREASON_QDISC_H +#include + +#define DEFINE_QDISC_DROP_REASON(FN, FNe) \ + FN(UNSPEC) \ + FN(GENERIC) \ + FN(OVERLIMIT) \ + FN(CONGESTED) \ + FN(CAKE_FLOOD) \ + FN(FQ_BAND_LIMIT) \ + FN(FQ_HORIZON_LIMIT) \ + FN(FQ_FLOW_LIMIT) \ + FNe(MAX) + +#undef FN +#undef FNe +#define FN(reason) QDISC_DROP_##reason, +#define FNe(reason) QDISC_DROP_##reason + +/** + * enum qdisc_drop_reason - reason why a qdisc dropped a packet + * + * Qdisc-specific drop reasons for packet drops that occur within the + * traffic control (TC) queueing discipline layer. These reasons provide + * detailed diagnostics about why packets were dropped by various qdisc + * algorithms, enabling fine-grained monitoring and troubleshooting of + * queue behavior. + */ +enum qdisc_drop_reason { + /** + * @QDISC_DROP_UNSPEC: unspecified/invalid qdisc drop reason. + * Value 0 serves as analogous to SKB_NOT_DROPPED_YET for enum skb_drop_reason. + * Used for catching zero-initialized drop_reason fields. + */ + QDISC_DROP_UNSPEC = 0, + /** + * @__QDISC_DROP_REASON: subsystem base value for qdisc drop reasons + */ + __QDISC_DROP_REASON = SKB_DROP_REASON_SUBSYS_QDISC << + SKB_DROP_REASON_SUBSYS_SHIFT, + /** + * @QDISC_DROP_GENERIC: generic/default qdisc drop, used when no + * more specific reason applies + */ + QDISC_DROP_GENERIC, + /** + * @QDISC_DROP_OVERLIMIT: packet dropped because the qdisc queue + * length exceeded its configured limit (sch->limit). This typically + * indicates the queue is full and cannot accept more packets. 
+ */ + QDISC_DROP_OVERLIMIT, + /** + * @QDISC_DROP_CONGESTED: packet dropped due to active congestion + * control algorithms (e.g., CoDel, PIE, RED) detecting network + * congestion. The qdisc proactively dropped the packet to signal + * congestion to the sender and prevent bufferbloat. + */ + QDISC_DROP_CONGESTED, + /** + * @QDISC_DROP_CAKE_FLOOD: CAKE qdisc dropped packet due to flood + * protection mechanism (BLUE algorithm). This indicates potential + * DoS/flood attack or unresponsive flow behavior. + */ + QDISC_DROP_CAKE_FLOOD, + /** + * @QDISC_DROP_FQ_BAND_LIMIT: FQ (Fair Queue) dropped packet because + * the priority band's packet limit was reached. Each priority band + * in FQ has its own limit. + */ + QDISC_DROP_FQ_BAND_LIMIT, + /** + * @QDISC_DROP_FQ_HORIZON_LIMIT: FQ dropped packet because its + * timestamp is too far in the future (beyond the configured horizon). + */ + QDISC_DROP_FQ_HORIZON_LIMIT, + /** + * @QDISC_DROP_FQ_FLOW_LIMIT: FQ dropped packet because an individual + * flow exceeded its per-flow packet limit. 
+ */ + QDISC_DROP_FQ_FLOW_LIMIT, + /** + * @QDISC_DROP_MAX: the maximum of qdisc drop reasons, which + * shouldn't be used as a real 'reason' - only for tracing code gen + */ + QDISC_DROP_MAX, +}; + +#undef FN +#undef FNe + +#endif diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 7d3b1a2a6fec..1df60645fb27 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -23,6 +23,12 @@ enum skb_drop_reason_subsys { */ SKB_DROP_REASON_SUBSYS_OPENVSWITCH, + /** + * @SKB_DROP_REASON_SUBSYS_QDISC: TC qdisc drop reasons, + * see include/net/dropreason-qdisc.h + */ + SKB_DROP_REASON_SUBSYS_QDISC, + /** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */ SKB_DROP_REASON_SUBSYS_NUM }; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c3a7268b567e..31c25a6d6acc 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -20,12 +20,15 @@ #include #include #include +#include struct Qdisc_ops; struct qdisc_walker; struct tcf_walker; struct module; struct bpf_flow_keys; +struct Qdisc; +struct netdev_queue; struct qdisc_rate_table { struct tc_ratespec rate; @@ -1106,36 +1109,50 @@ static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) return cb; } +/* TC classifier accessors - use enum skb_drop_reason */ static inline enum skb_drop_reason tcf_get_drop_reason(const struct sk_buff *skb) { - return tc_skb_cb(skb)->drop_reason; + return (enum skb_drop_reason)tc_skb_cb(skb)->drop_reason; } static inline void tcf_set_drop_reason(const struct sk_buff *skb, enum skb_drop_reason reason) { - tc_skb_cb(skb)->drop_reason = reason; + tc_skb_cb(skb)->drop_reason = (enum qdisc_drop_reason)reason; } -static inline void tcf_kfree_skb_list(struct sk_buff *skb) +/* Qdisc accessors - use enum qdisc_drop_reason */ +static inline enum qdisc_drop_reason +tcf_get_qdisc_drop_reason(const struct sk_buff *skb) { - while (unlikely(skb)) { - struct sk_buff *next = skb->next; + return tc_skb_cb(skb)->drop_reason; +} - 
prefetch(next); - kfree_skb_reason(skb, tcf_get_drop_reason(skb)); - skb = next; - } +static inline void tcf_set_qdisc_drop_reason(const struct sk_buff *skb, + enum qdisc_drop_reason reason) +{ + tc_skb_cb(skb)->drop_reason = reason; +} + +void __tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q, + struct netdev_queue *txq, struct net_device *dev); + +static inline void tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q, + struct netdev_queue *txq, + struct net_device *dev) +{ + if (unlikely(skb)) + __tcf_kfree_skb_list(skb, q, txq, dev); } static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb, - enum skb_drop_reason reason) + enum qdisc_drop_reason reason) { DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS)); DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK); - tcf_set_drop_reason(skb, reason); + tcf_set_qdisc_drop_reason(skb, reason); skb->next = q->to_free; q->to_free = skb; } @@ -1312,9 +1329,9 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free, - enum skb_drop_reason reason) + enum qdisc_drop_reason reason) { - tcf_set_drop_reason(skb, reason); + tcf_set_qdisc_drop_reason(skb, reason); return qdisc_drop(skb, sch, to_free); } diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h index ff33f41a9db7..d8a5c2677470 100644 --- a/include/trace/events/qdisc.h +++ b/include/trace/events/qdisc.h @@ -74,6 +74,57 @@ TRACE_EVENT(qdisc_enqueue, __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr) ); +#undef FN +#undef FNe +#define FN(reason) TRACE_DEFINE_ENUM(QDISC_DROP_##reason); +#define FNe(reason) TRACE_DEFINE_ENUM(QDISC_DROP_##reason); +DEFINE_QDISC_DROP_REASON(FN, FNe) + +#undef FN +#undef FNe +#define FN(reason) { QDISC_DROP_##reason, #reason }, +#define FNe(reason) { QDISC_DROP_##reason, #reason } + +TRACE_EVENT(qdisc_drop, + + TP_PROTO(struct Qdisc *qdisc, const struct 
netdev_queue *txq, + struct net_device *dev, struct sk_buff *skb, + enum qdisc_drop_reason reason), + + TP_ARGS(qdisc, txq, dev, skb, reason), + + TP_STRUCT__entry( + __field(struct Qdisc *, qdisc) + __field(const struct netdev_queue *, txq) + __field(void *, skbaddr) + __field(int, ifindex) + __field(u32, handle) + __field(u32, parent) + __field(enum qdisc_drop_reason, reason) + __string(kind, qdisc->ops->id) + ), + + TP_fast_assign( + __entry->qdisc = qdisc; + __entry->txq = txq; + __entry->skbaddr = skb; + __entry->ifindex = dev ? dev->ifindex : 0; + __entry->handle = qdisc->handle; + __entry->parent = qdisc->parent; + __entry->reason = reason; + __assign_str(kind); + ), + + TP_printk("drop ifindex=%d kind=%s handle=0x%X parent=0x%X skbaddr=%p reason=%s", + __entry->ifindex, __get_str(kind), __entry->handle, + __entry->parent, __entry->skbaddr, + __print_symbolic(__entry->reason, + DEFINE_QDISC_DROP_REASON(FN, FNe))) +); + +#undef FN +#undef FNe + TRACE_EVENT(qdisc_reset, TP_PROTO(struct Qdisc *q), diff --git a/net/core/dev.c b/net/core/dev.c index c1a9f7fdcffa..1cf3ad840697 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4166,7 +4166,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); - tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); + tcf_set_qdisc_drop_reason(skb, QDISC_DROP_GENERIC); if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && @@ -4274,8 +4274,8 @@ unlock: spin_unlock(root_lock); free_skbs: - tcf_kfree_skb_list(to_free); - tcf_kfree_skb_list(to_free2); + tcf_kfree_skb_list(to_free, q, txq, dev); + tcf_kfree_skb_list(to_free2, q, txq, dev); return rc; } @@ -5811,7 +5811,7 @@ static __latent_entropy void net_tx_action(void) to_free = qdisc_run(q); if (root_lock) spin_unlock(root_lock); - tcf_kfree_skb_list(to_free); + tcf_kfree_skb_list(to_free, q, NULL, qdisc_dev(q)); } rcu_read_unlock(); diff --git a/net/sched/sch_cake.c 
b/net/sched/sch_cake.c index a01f14b1c216..ca00fba7e451 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -497,13 +497,13 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars, /* Call this with a freshly dequeued packet for possible congestion marking. * Returns true as an instruction to drop the packet, false for delivery. */ -static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, - struct cobalt_params *p, - ktime_t now, - struct sk_buff *skb, - u32 bulk_flows) +static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now, + struct sk_buff *skb, + u32 bulk_flows) { - enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; + enum qdisc_drop_reason reason = QDISC_DROP_UNSPEC; bool next_due, over_target; ktime_t schedule; u64 sojourn; @@ -548,7 +548,7 @@ static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, if (next_due && vars->dropping) { /* Use ECN mark if possible, otherwise drop */ if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) - reason = SKB_DROP_REASON_QDISC_CONGESTED; + reason = QDISC_DROP_CONGESTED; vars->count++; if (!vars->count) @@ -571,14 +571,14 @@ static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, } /* Simple BLUE implementation. Lack of ECN is deliberate. 
*/ - if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && + if (vars->p_drop && reason == QDISC_DROP_UNSPEC && get_random_u32() < vars->p_drop) - reason = SKB_DROP_REASON_CAKE_FLOOD; + reason = QDISC_DROP_CAKE_FLOOD; /* Overload the drop_next field as an activity timeout */ if (!vars->count) vars->drop_next = ktime_add_ns(now, p->interval); - else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) + else if (ktime_to_ns(schedule) > 0 && reason == QDISC_DROP_UNSPEC) vars->drop_next = now; return reason; @@ -1604,7 +1604,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) if (q->config->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); - qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); + qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT); sch->q.qlen--; cake_heapify(q, 0); @@ -2004,7 +2004,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; - enum skb_drop_reason reason; + enum qdisc_drop_reason reason; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; @@ -2225,7 +2225,7 @@ retry: !!(q->config->rate_flags & CAKE_FLAG_INGRESS))); /* Last packet in queue may be marked, shouldn't be dropped */ - if (reason == SKB_NOT_DROPPED_YET || !flow->head) + if (reason == QDISC_DROP_UNSPEC || !flow->head) break; /* drop this packet, get another one */ diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index c6551578f1cf..dc2be90666ff 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, QDISC_DROP_CONGESTED); qdisc_qstats_drop(sch); } @@ -86,8 +86,7 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, } q = qdisc_priv(sch); 
q->drop_overlimit++; - return qdisc_drop_reason(skb, sch, to_free, - SKB_DROP_REASON_QDISC_OVERLIMIT); + return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT); } static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 6d7e6389758d..020cc20c6b56 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -393,13 +393,11 @@ static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch, qdisc_qstats_overlimit(sch); if (skb_in_l_queue(skb)) qdisc_qstats_overlimit(q->l_queue); - return qdisc_drop_reason(skb, sch, to_free, - SKB_DROP_REASON_QDISC_OVERLIMIT); + return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT); } if (q->drop_early && must_drop(sch, q, skb)) { - qdisc_drop_reason(skb, sch, to_free, - SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_CONGESTED); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -593,7 +591,7 @@ static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) while ((skb = dequeue_packet(sch, q, &credit_change, now))) { if (!q->drop_early && must_drop(sch, q, skb)) { drop_and_retry(q, skb, sch, - SKB_DROP_REASON_QDISC_CONGESTED); + SKB_DROP_REASON_QDISC_DROP); continue; } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 80235e85f844..81322187bbe2 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -539,7 +539,7 @@ static bool fq_packet_beyond_horizon(const struct sk_buff *skb, return unlikely((s64)skb->tstamp > (s64)(now + q->horizon)); } -#define FQDR(reason) SKB_DROP_REASON_FQ_##reason +#define FQDR(reason) QDISC_DROP_FQ_##reason static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) @@ -552,8 +552,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX); if (unlikely(q->band_pkt_count[band] >= sch->limit)) { q->stat_band_drops[band]++; - return 
qdisc_drop_reason(skb, sch, to_free, - FQDR(BAND_LIMIT)); + return qdisc_drop_reason(skb, sch, to_free, FQDR(BAND_LIMIT)); } now = ktime_get_ns(); @@ -579,7 +578,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(f->qlen >= q->flow_plimit)) { q->stat_flows_plimit++; return qdisc_drop_reason(skb, sch, to_free, - FQDR(FLOW_LIMIT)); + QDISC_DROP_FQ_FLOW_LIMIT); } if (fq_flow_is_detached(f)) { diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 8181b52dd9a8..2a3d758f67ab 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -168,7 +168,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, skb = dequeue_head(flow); len += qdisc_pkt_len(skb); mem += get_codel_cb(skb)->mem_usage; - tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); + tcf_set_qdisc_drop_reason(skb, QDISC_DROP_OVERLIMIT); __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); @@ -275,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, QDISC_DROP_CONGESTED); qdisc_qstats_drop(sch); } diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index d8ac3519e937..154c70f489f2 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -130,7 +130,7 @@ static inline void flow_queue_add(struct fq_pie_flow *flow, static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; + enum qdisc_drop_reason reason = QDISC_DROP_OVERLIMIT; struct fq_pie_sched_data *q = qdisc_priv(sch); struct fq_pie_flow *sel_flow; int ret; @@ -162,7 +162,7 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->overmemory++; } - reason = SKB_DROP_REASON_QDISC_CONGESTED; + reason = QDISC_DROP_CONGESTED; if (!pie_drop_early(sch, &q->p_params, 
&sel_flow->vars, sel_flow->backlog, skb->len)) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 98ffe64de51f..556e0d800316 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -25,11 +25,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include @@ -37,6 +37,31 @@ const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; EXPORT_SYMBOL(default_qdisc_ops); +void __tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q, + struct netdev_queue *txq, struct net_device *dev) +{ + while (skb) { + u32 reason = tc_skb_cb(skb)->drop_reason; + struct sk_buff *next = skb->next; + enum skb_drop_reason skb_reason; + + prefetch(next); + /* TC classifier and qdisc share drop_reason storage. + * Check subsystem mask to identify qdisc drop reasons, + * else pass through skb_drop_reason set by TC classifier. + */ + if ((reason & SKB_DROP_REASON_SUBSYS_MASK) == __QDISC_DROP_REASON) { + trace_qdisc_drop(q, txq, dev, skb, (enum qdisc_drop_reason)reason); + skb_reason = SKB_DROP_REASON_QDISC_DROP; + } else { + skb_reason = (enum skb_drop_reason)reason; + } + kfree_skb_reason(skb, skb_reason); + skb = next; + } +} +EXPORT_SYMBOL(__tcf_kfree_skb_list); + static void qdisc_maybe_clear_missed(struct Qdisc *q, const struct netdev_queue *txq) { @@ -741,7 +766,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, err = skb_array_produce(q, skb); if (unlikely(err)) { - tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); + tcf_set_qdisc_drop_reason(skb, QDISC_DROP_OVERLIMIT); if (qdisc_is_percpu_stats(qdisc)) return qdisc_drop_cpu(skb, qdisc, to_free); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 6706faba95b9..36d0cafac206 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -251,10 +251,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->stats.pdrop++; drop: - return qdisc_drop_reason(skb, sch, to_free, 
SKB_DROP_REASON_QDISC_OVERLIMIT); + return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT); congestion_drop: - qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_CONGESTED); return NET_XMIT_CN; } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 0a377313b6a9..16f3f629cb8e 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; + enum qdisc_drop_reason reason = QDISC_DROP_OVERLIMIT; struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; @@ -94,7 +94,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto out; } - reason = SKB_DROP_REASON_QDISC_CONGESTED; + reason = QDISC_DROP_CONGESTED; if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, skb->len)) { diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 479c42d11083..c8d3d09f15e3 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -70,7 +70,7 @@ static int red_use_nodrop(struct red_sched_data *q) static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_CONGESTED; + enum qdisc_drop_reason reason = QDISC_DROP_CONGESTED; struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; unsigned int len; @@ -108,7 +108,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, break; case RED_HARD_MARK: - reason = SKB_DROP_REASON_QDISC_OVERLIMIT; + reason = QDISC_DROP_OVERLIMIT; qdisc_qstats_overlimit(sch); if (red_use_harddrop(q) || !red_use_ecn(q)) { q->stats.forced_drop++; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index d2835f1168e1..013738662128 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -280,7 +280,7 @@ 
static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; + enum qdisc_drop_reason reason = QDISC_DROP_OVERLIMIT; struct sfb_sched_data *q = qdisc_priv(sch); unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; @@ -381,7 +381,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } r = get_random_u16() & SFB_MAX_PROB; - reason = SKB_DROP_REASON_QDISC_CONGESTED; + reason = QDISC_DROP_CONGESTED; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { -- cgit v1.2.3 From 3e28f8ad478f165260deba751858afac46cffd2f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 26 Feb 2026 14:44:19 +0100 Subject: net: sched: sfq: convert to qdisc drop reasons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert SFQ to use the new qdisc-specific drop reason infrastructure. This patch demonstrates how to convert a flow-based qdisc to use the new enum qdisc_drop_reason. As part of this conversion: - Add QDISC_DROP_MAXFLOWS for flow table exhaustion - Rename FQ_FLOW_LIMIT to generic FLOW_LIMIT, now shared by FQ and SFQ - Use QDISC_DROP_OVERLIMIT for sfq_drop() when overall limit exceeded - Use QDISC_DROP_FLOW_LIMIT for per-flow depth limit exceeded The FLOW_LIMIT reason is now a common drop reason for per-flow limits, applicable to both FQ and SFQ qdiscs. 
Signed-off-by: Jesper Dangaard Brouer Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/177211345946.3011628.12770616071857185664.stgit@firesoul Signed-off-by: Jakub Kicinski --- include/net/dropreason-qdisc.h | 18 ++++++++++++++---- net/sched/sch_fq.c | 2 +- net/sched/sch_sfq.c | 8 ++++---- 3 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-qdisc.h b/include/net/dropreason-qdisc.h index 80a2d557e5f7..02a9f580411b 100644 --- a/include/net/dropreason-qdisc.h +++ b/include/net/dropreason-qdisc.h @@ -9,10 +9,11 @@ FN(GENERIC) \ FN(OVERLIMIT) \ FN(CONGESTED) \ + FN(MAXFLOWS) \ FN(CAKE_FLOOD) \ FN(FQ_BAND_LIMIT) \ FN(FQ_HORIZON_LIMIT) \ - FN(FQ_FLOW_LIMIT) \ + FN(FLOW_LIMIT) \ FNe(MAX) #undef FN @@ -59,6 +60,13 @@ enum qdisc_drop_reason { * congestion to the sender and prevent bufferbloat. */ QDISC_DROP_CONGESTED, + /** + * @QDISC_DROP_MAXFLOWS: packet dropped because the qdisc's flow + * tracking table is full and no free slots are available to allocate + * for a new flow. This indicates flow table exhaustion in flow-based + * qdiscs that maintain per-flow state (e.g., SFQ). + */ + QDISC_DROP_MAXFLOWS, /** * @QDISC_DROP_CAKE_FLOOD: CAKE qdisc dropped packet due to flood * protection mechanism (BLUE algorithm). This indicates potential @@ -77,10 +85,12 @@ enum qdisc_drop_reason { */ QDISC_DROP_FQ_HORIZON_LIMIT, /** - * @QDISC_DROP_FQ_FLOW_LIMIT: FQ dropped packet because an individual - * flow exceeded its per-flow packet limit. + * @QDISC_DROP_FLOW_LIMIT: packet dropped because an individual flow + * exceeded its per-flow packet/depth limit. Used by FQ and SFQ qdiscs + * to enforce per-flow fairness and prevent a single flow from + * monopolizing queue resources. 
*/ - QDISC_DROP_FQ_FLOW_LIMIT, + QDISC_DROP_FLOW_LIMIT, /** * @QDISC_DROP_MAX: the maximum of qdisc drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 81322187bbe2..eb5ae2b15cc0 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -578,7 +578,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(f->qlen >= q->flow_plimit)) { q->stat_flows_plimit++; return qdisc_drop_reason(skb, sch, to_free, - QDISC_DROP_FQ_FLOW_LIMIT); + QDISC_DROP_FLOW_LIMIT); } if (fq_flow_is_detached(f)) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 503d7d3ca081..c3f3181dba54 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -302,7 +302,7 @@ drop: sfq_dec(q, x); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT); return len; } @@ -363,7 +363,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS); q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; @@ -420,14 +420,14 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) if (slot->qlen >= q->maxdepth) { congestion_drop: if (!sfq_headdrop(q)) - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_FLOW_LIMIT); /* We know we have at least one packet in queue */ head = slot_dequeue_head(slot); delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); sch->qstats.backlog -= delta; slot->backlog -= delta; - qdisc_drop(head, sch, to_free); + qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT); slot_queue_add(slot, skb); qdisc_tree_reduce_backlog(sch, 0, delta); -- cgit v1.2.3 From 
f30d9073ec1909a3b06a9cee57215bed3458da80 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 26 Feb 2026 14:44:29 +0100 Subject: net: sched: rename QDISC_DROP_FQ_* to generic names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename FQ-specific drop reasons to generic names: - QDISC_DROP_FQ_BAND_LIMIT -> QDISC_DROP_BAND_LIMIT - QDISC_DROP_FQ_HORIZON_LIMIT -> QDISC_DROP_HORIZON_LIMIT This follows the principle that drop reasons should describe the drop mechanism rather than being tied to a specific qdisc implementation. These concepts (priority band limits, timestamp horizon) could apply to other qdiscs as well. Remove the local macro define FQDR() and instead use the full QDISC_DROP_* name to make it easier to navigate code. Signed-off-by: Jesper Dangaard Brouer Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/177211346902.3011628.12523261489552097455.stgit@firesoul Signed-off-by: Jakub Kicinski --- include/net/dropreason-qdisc.h | 19 ++++++++++--------- net/sched/sch_fq.c | 7 ++----- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-qdisc.h b/include/net/dropreason-qdisc.h index 02a9f580411b..a167302e79e5 100644 --- a/include/net/dropreason-qdisc.h +++ b/include/net/dropreason-qdisc.h @@ -11,8 +11,8 @@ FN(CONGESTED) \ FN(MAXFLOWS) \ FN(CAKE_FLOOD) \ - FN(FQ_BAND_LIMIT) \ - FN(FQ_HORIZON_LIMIT) \ + FN(BAND_LIMIT) \ + FN(HORIZON_LIMIT) \ FN(FLOW_LIMIT) \ FNe(MAX) @@ -74,16 +74,17 @@ enum qdisc_drop_reason { */ QDISC_DROP_CAKE_FLOOD, /** - * @QDISC_DROP_FQ_BAND_LIMIT: FQ (Fair Queue) dropped packet because - * the priority band's packet limit was reached. Each priority band - * in FQ has its own limit. + * @QDISC_DROP_BAND_LIMIT: packet dropped because the priority band's + * limit was reached. Used by qdiscs with priority bands that have + * per-band packet limits (e.g., FQ). 
*/ - QDISC_DROP_FQ_BAND_LIMIT, + QDISC_DROP_BAND_LIMIT, /** - * @QDISC_DROP_FQ_HORIZON_LIMIT: FQ dropped packet because its - * timestamp is too far in the future (beyond the configured horizon). + * @QDISC_DROP_HORIZON_LIMIT: packet dropped because its timestamp + * is too far in the future (beyond the configured horizon). + * Used by qdiscs with time-based scheduling (e.g., FQ). */ - QDISC_DROP_FQ_HORIZON_LIMIT, + QDISC_DROP_HORIZON_LIMIT, /** * @QDISC_DROP_FLOW_LIMIT: packet dropped because an individual flow * exceeded its per-flow packet/depth limit. Used by FQ and SFQ qdiscs diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index eb5ae2b15cc0..9a550f832d78 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -539,8 +539,6 @@ static bool fq_packet_beyond_horizon(const struct sk_buff *skb, return unlikely((s64)skb->tstamp > (s64)(now + q->horizon)); } -#define FQDR(reason) QDISC_DROP_FQ_##reason - static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -552,7 +550,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX); if (unlikely(q->band_pkt_count[band] >= sch->limit)) { q->stat_band_drops[band]++; - return qdisc_drop_reason(skb, sch, to_free, FQDR(BAND_LIMIT)); + return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_BAND_LIMIT); } now = ktime_get_ns(); @@ -564,7 +562,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (q->horizon_drop) { q->stat_horizon_drops++; return qdisc_drop_reason(skb, sch, to_free, - FQDR(HORIZON_LIMIT)); + QDISC_DROP_HORIZON_LIMIT); } q->stat_horizon_caps++; skb->tstamp = now + q->horizon; @@ -603,7 +601,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_SUCCESS; } -#undef FQDR static void fq_check_throttled(struct fq_sched_data *q, u64 now) { -- cgit v1.2.3 From 9d3e7f9718987338d9cfbd64292aab6a739d9d32 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: 
Thu, 26 Feb 2026 14:44:35 +0100 Subject: net: sched: rename QDISC_DROP_CAKE_FLOOD to QDISC_DROP_FLOOD_PROTECTION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename QDISC_DROP_CAKE_FLOOD to QDISC_DROP_FLOOD_PROTECTION to use a generic name without embedding the qdisc name. This follows the principle that drop reasons should describe the drop mechanism rather than being tied to a specific qdisc implementation. The flood protection drop reason is used by qdiscs implementing probabilistic drop algorithms (like BLUE) that detect unresponsive flows indicating potential DoS or flood attacks. CAKE uses this via its Cobalt AQM component. Signed-off-by: Jesper Dangaard Brouer Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/177211347537.3011628.13759059534638729639.stgit@firesoul Signed-off-by: Jakub Kicinski --- include/net/dropreason-qdisc.h | 11 ++++++----- net/sched/sch_cake.c | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-qdisc.h b/include/net/dropreason-qdisc.h index a167302e79e5..84f19a51382c 100644 --- a/include/net/dropreason-qdisc.h +++ b/include/net/dropreason-qdisc.h @@ -10,7 +10,7 @@ FN(OVERLIMIT) \ FN(CONGESTED) \ FN(MAXFLOWS) \ - FN(CAKE_FLOOD) \ + FN(FLOOD_PROTECTION) \ FN(BAND_LIMIT) \ FN(HORIZON_LIMIT) \ FN(FLOW_LIMIT) \ @@ -68,11 +68,12 @@ enum qdisc_drop_reason { */ QDISC_DROP_MAXFLOWS, /** - * @QDISC_DROP_CAKE_FLOOD: CAKE qdisc dropped packet due to flood - * protection mechanism (BLUE algorithm). This indicates potential - * DoS/flood attack or unresponsive flow behavior. + * @QDISC_DROP_FLOOD_PROTECTION: packet dropped by flood protection + * mechanism detecting unresponsive flows (potential DoS/flood). + * Used by qdiscs implementing probabilistic drop algorithms like + * BLUE (e.g., CAKE's Cobalt AQM). 
*/ - QDISC_DROP_CAKE_FLOOD, + QDISC_DROP_FLOOD_PROTECTION, /** * @QDISC_DROP_BAND_LIMIT: packet dropped because the priority band's * limit was reached. Used by qdiscs with priority bands that have diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index ca00fba7e451..a64d27476231 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -573,7 +573,7 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, /* Simple BLUE implementation. Lack of ECN is deliberate. */ if (vars->p_drop && reason == QDISC_DROP_UNSPEC && get_random_u32() < vars->p_drop) - reason = QDISC_DROP_CAKE_FLOOD; + reason = QDISC_DROP_FLOOD_PROTECTION; /* Overload the drop_next field as an activity timeout */ if (!vars->count) -- cgit v1.2.3 From 67713dff6398315461db56fdf208e7fd7e37078e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 26 Feb 2026 14:45:19 +0100 Subject: net: sched: sch_dualpi2: use qdisc_dequeue_drop() for dequeue drops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DualPI2 drops packets during dequeue but was using kfree_skb_reason() directly, bypassing trace_qdisc_drop. Convert to qdisc_dequeue_drop() and add QDISC_DROP_L4S_STEP_NON_ECN to the qdisc drop reason enum. 
- Set TCQ_F_DEQUEUE_DROPS flag in dualpi2_init() - Use enum qdisc_drop_reason in drop_and_retry() - Replace kfree_skb_reason() with qdisc_dequeue_drop() Signed-off-by: Jesper Dangaard Brouer Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/177211351978.3011628.11267023360997620069.stgit@firesoul Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 ------ include/net/dropreason-qdisc.h | 8 ++++++++ net/sched/sch_dualpi2.c | 12 ++++++------ 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 3d8d284e05c8..5c8c2eb3d2c5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -121,7 +121,6 @@ FN(CANFD_RX_INVALID_FRAME) \ FN(CANXL_RX_INVALID_FRAME) \ FN(PFMEMALLOC) \ - FN(DUALPI2_STEP_DROP) \ FN(PSP_INPUT) \ FN(PSP_OUTPUT) \ FNe(MAX) @@ -579,11 +578,6 @@ enum skb_drop_reason { * reached a path or socket not eligible for use of memory reserves */ SKB_DROP_REASON_PFMEMALLOC, - /** - * @SKB_DROP_REASON_DUALPI2_STEP_DROP: dropped by the step drop - * threshold of DualPI2 qdisc. - */ - SKB_DROP_REASON_DUALPI2_STEP_DROP, /** @SKB_DROP_REASON_PSP_INPUT: PSP input checks failed */ SKB_DROP_REASON_PSP_INPUT, /** @SKB_DROP_REASON_PSP_OUTPUT: PSP output checks failed */ diff --git a/include/net/dropreason-qdisc.h b/include/net/dropreason-qdisc.h index 84f19a51382c..fb151cd31751 100644 --- a/include/net/dropreason-qdisc.h +++ b/include/net/dropreason-qdisc.h @@ -14,6 +14,7 @@ FN(BAND_LIMIT) \ FN(HORIZON_LIMIT) \ FN(FLOW_LIMIT) \ + FN(L4S_STEP_NON_ECN) \ FNe(MAX) #undef FN @@ -93,6 +94,13 @@ enum qdisc_drop_reason { * monopolizing queue resources. */ QDISC_DROP_FLOW_LIMIT, + /** + * @QDISC_DROP_L4S_STEP_NON_ECN: DualPI2 qdisc dropped a non-ECN-capable + * packet because the L4S queue delay exceeded the step threshold. + * Since the packet cannot be ECN-marked, it must be dropped to signal + * congestion. 
See RFC 9332 for the DualQ Coupled AQM step mechanism. + */ + QDISC_DROP_L4S_STEP_NON_ECN, /** * @QDISC_DROP_MAX: the maximum of qdisc drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 020cc20c6b56..fe6f5e889625 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -571,11 +571,11 @@ static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb, } static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb, - struct Qdisc *sch, enum skb_drop_reason reason) + struct Qdisc *sch, enum qdisc_drop_reason reason) { ++q->deferred_drops_cnt; q->deferred_drops_len += qdisc_pkt_len(skb); - kfree_skb_reason(skb, reason); + qdisc_dequeue_drop(sch, skb, reason); qdisc_qstats_drop(sch); } @@ -590,15 +590,13 @@ static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) while ((skb = dequeue_packet(sch, q, &credit_change, now))) { if (!q->drop_early && must_drop(sch, q, skb)) { - drop_and_retry(q, skb, sch, - SKB_DROP_REASON_QDISC_DROP); + drop_and_retry(q, skb, sch, QDISC_DROP_CONGESTED); continue; } if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) { qdisc_qstats_drop(q->l_queue); - drop_and_retry(q, skb, sch, - SKB_DROP_REASON_DUALPI2_STEP_DROP); + drop_and_retry(q, skb, sch, QDISC_DROP_L4S_STEP_NON_ECN); continue; } @@ -915,6 +913,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, struct dualpi2_sched_data *q = qdisc_priv(sch); int err; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, 1), extack); if (!q->l_queue) -- cgit v1.2.3 From 94c125bafa00042daf6d63b4fdd78384abc121fc Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Mon, 9 Feb 2026 13:21:51 -0800 Subject: scsi: core: Add 'serial' sysfs attribute for SCSI/SATA Add a 'serial' sysfs attribute for SCSI and SATA devices. 
This attribute exposes the Unit Serial Number, which is derived from the Device Identification Vital Product Data (VPD) page 0x80. Whitespace is stripped from the retrieved serial number to handle the different alignment (right-aligned for SCSI, potentially left-aligned for SATA). As noted in SAT-5 10.5.3, "Although SPC-5 defines the PRODUCT SERIAL NUMBER field as right-aligned, ACS-5 does not require its SERIAL NUMBER field to be right-aligned. Therefore, right-alignment of the PRODUCT SERIAL NUMBER field for the translation is not assured." This attribute is used by tools such as lsblk to display the serial number of block devices. [mkp: length adjustment] Signed-off-by: Igor Pylypiv Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260209212151.342151-1-ipylypiv@google.com Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/scsi/scsi_sysfs.c | 16 ++++++++++++++++ include/scsi/scsi_device.h | 1 + 3 files changed, 64 insertions(+) (limited to 'include') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d3a8cd4166f9..6e8c7a42603e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -3460,6 +3461,52 @@ int scsi_vpd_lun_id(struct scsi_device *sdev, char *id, size_t id_len) } EXPORT_SYMBOL(scsi_vpd_lun_id); +/** + * scsi_vpd_lun_serial - return a unique device serial number + * @sdev: SCSI device + * @sn: buffer for the serial number + * @sn_size: size of the buffer + * + * Copies the device serial number into @sn based on the information in + * the VPD page 0x80 of the device. The string will be null terminated + * and have leading and trailing whitespace stripped. + * + * Returns the length of the serial number or error on failure. 
+ */ +int scsi_vpd_lun_serial(struct scsi_device *sdev, char *sn, size_t sn_size) +{ + const struct scsi_vpd *vpd_pg80; + const unsigned char *d; + int len; + + guard(rcu)(); + vpd_pg80 = rcu_dereference(sdev->vpd_pg80); + if (!vpd_pg80) + return -ENXIO; + + len = vpd_pg80->len - 4; + d = vpd_pg80->data + 4; + + /* Skip leading spaces */ + while (len > 0 && isspace(*d)) { + len--; + d++; + } + + /* Skip trailing spaces */ + while (len > 0 && isspace(d[len - 1])) + len--; + + if (sn_size < len + 1) + return -EINVAL; + + memcpy(sn, d, len); + sn[len] = '\0'; + + return len; +} +EXPORT_SYMBOL(scsi_vpd_lun_serial); + /** * scsi_vpd_tpg_id - return a target port group identifier * @sdev: SCSI device diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 6b8c5c05f294..dfc3559e7e04 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -1051,6 +1051,21 @@ sdev_show_wwid(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(wwid, S_IRUGO, sdev_show_wwid, NULL); +static ssize_t +sdev_show_serial(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + ssize_t ret; + + ret = scsi_vpd_lun_serial(sdev, buf, PAGE_SIZE - 1); + if (ret < 0) + return ret; + + buf[ret] = '\n'; + return ret + 1; +} +static DEVICE_ATTR(serial, S_IRUGO, sdev_show_serial, NULL); + #define BLIST_FLAG_NAME(name) \ [const_ilog2((__force __u64)BLIST_##name)] = #name static const char *const sdev_bflags_name[] = { @@ -1295,6 +1310,7 @@ static struct attribute *scsi_sdev_attrs[] = { &dev_attr_device_busy.attr, &dev_attr_vendor.attr, &dev_attr_model.attr, + &dev_attr_serial.attr, &dev_attr_rev.attr, &dev_attr_rescan.attr, &dev_attr_delete.attr, diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index d32f5841f4f8..9c2a7bbe5891 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -571,6 +571,7 @@ void scsi_put_internal_cmd(struct scsi_cmnd *scmd); extern 
void sdev_disable_disk_events(struct scsi_device *sdev); extern void sdev_enable_disk_events(struct scsi_device *sdev); extern int scsi_vpd_lun_id(struct scsi_device *, char *, size_t); +extern int scsi_vpd_lun_serial(struct scsi_device *, char *, size_t); extern int scsi_vpd_tpg_id(struct scsi_device *, int *); #ifdef CONFIG_PM -- cgit v1.2.3 From 06933066d88a3093953b062922c016a67d2cdbf8 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Sun, 22 Feb 2026 17:27:01 -0600 Subject: scsi: target: Add support for completing commands from backend context To complete a command several drivers just drop their reference and add it to list to be processed by a driver specific thread. So there's no need to go from backend context to the LIO thread then to the driver's thread. When avoiding the LIO thread, IOPS can increase from 20-30% for workloads like: fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=8K \ --ioengine=libaio --iodepth=128 --numjobs=$jobs where increasing jobs increases the performance improvement (this is using NVMe drives with LIO's submit_type=1 to directly submit). Add the infrastructure so drivers and userspace can control how to complete a command like is done for the submission path. In this commit there is no behavior change and we continue to defer to the LIO workqueue thread. In the subsequent commits we will allow drivers to report what they support and allow userspace to control the behavior. Signed-off-by: Mike Christie Link: https://patch.msgid.link/20260222232946.7637-2-michael.christie@oracle.com Signed-off-by: Martin K. 
Petersen --- drivers/target/target_core_device.c | 1 + drivers/target/target_core_transport.c | 60 +++++++++++++++++++++++++++------- include/target/target_core_base.h | 10 ++++++ include/target/target_core_fabric.h | 12 +++++-- 4 files changed, 69 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 74c6383f9eed..883a866e96ab 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -813,6 +813,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name) DA_UNMAP_ZEROES_DATA_DEFAULT; dev->dev_attrib.max_write_same_len = DA_MAX_WRITE_SAME_LEN; dev->dev_attrib.submit_type = TARGET_FABRIC_DEFAULT_SUBMIT; + dev->dev_attrib.submit_type = TARGET_QUEUE_COMPL; /* Skip allocating lun_stats since we can't export them. */ xcopy_lun = &dev->xcopy_lun; diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index a7330c4fedde..34249fb80c67 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -902,13 +902,59 @@ static bool target_cmd_interrupted(struct se_cmd *cmd) return false; } +static void target_complete(struct se_cmd *cmd, int success) +{ + struct se_wwn *wwn = cmd->se_sess->se_tpg->se_tpg_wwn; + struct se_dev_attrib *da; + u8 compl_type; + int cpu; + + if (!wwn) { + cpu = cmd->cpuid; + goto queue_work; + } + + da = &cmd->se_dev->dev_attrib; + if (da->complete_type == TARGET_FABRIC_DEFAULT_COMPL) + compl_type = wwn->wwn_tf->tf_ops->default_compl_type; + else if (da->complete_type == TARGET_DIRECT_SUBMIT && + wwn->wwn_tf->tf_ops->direct_compl_supp) + compl_type = TARGET_DIRECT_COMPL; + else + compl_type = TARGET_QUEUE_COMPL; + + if (compl_type == TARGET_DIRECT_COMPL) { + /* + * Failure handling and processing secondary stages of + * complex commands can be too heavy to handle from the + * fabric driver so always defer. 
+ */ + if (success && !cmd->transport_complete_callback) { + target_complete_ok_work(&cmd->work); + return; + } + + compl_type = TARGET_QUEUE_COMPL; + } + +queue_work: + INIT_WORK(&cmd->work, success ? target_complete_ok_work : + target_complete_failure_work); + + if (!wwn || wwn->cmd_compl_affinity == SE_COMPL_AFFINITY_CPUID) + cpu = cmd->cpuid; + else + cpu = wwn->cmd_compl_affinity; + + queue_work_on(cpu, target_completion_wq, &cmd->work); +} + /* May be called from interrupt context so must not sleep. */ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status, sense_reason_t sense_reason) { - struct se_wwn *wwn = cmd->se_sess->se_tpg->se_tpg_wwn; - int success, cpu; unsigned long flags; + int success; if (target_cmd_interrupted(cmd)) return; @@ -933,15 +979,7 @@ void target_complete_cmd_with_sense(struct se_cmd *cmd, u8 scsi_status, cmd->transport_state |= (CMD_T_COMPLETE | CMD_T_ACTIVE); spin_unlock_irqrestore(&cmd->t_state_lock, flags); - INIT_WORK(&cmd->work, success ? 
target_complete_ok_work : - target_complete_failure_work); - - if (!wwn || wwn->cmd_compl_affinity == SE_COMPL_AFFINITY_CPUID) - cpu = cmd->cpuid; - else - cpu = wwn->cmd_compl_affinity; - - queue_work_on(cpu, target_completion_wq, &cmd->work); + target_complete(cmd, success); } EXPORT_SYMBOL(target_complete_cmd_with_sense); diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index b62d5fcce950..9a0e9f9e1ec4 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -111,6 +111,15 @@ /* Peripheral Device Text Identification Information */ #define PD_TEXT_ID_INFO_LEN 256 +enum target_compl_type { + /* Use the fabric driver's default completion type */ + TARGET_FABRIC_DEFAULT_COMPL, + /* Complete from the backend calling context */ + TARGET_DIRECT_COMPL, + /* Defer completion to the LIO workqueue */ + TARGET_QUEUE_COMPL, +}; + enum target_submit_type { /* Use the fabric driver's default submission type */ TARGET_FABRIC_DEFAULT_SUBMIT, @@ -741,6 +750,7 @@ struct se_dev_attrib { u32 atomic_granularity; u32 atomic_max_with_boundary; u32 atomic_max_boundary; + u8 complete_type; u8 submit_type; struct se_device *da_dev; struct config_group da_group; diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h index 3378ff9ee271..e9039e73d058 100644 --- a/include/target/target_core_fabric.h +++ b/include/target/target_core_fabric.h @@ -118,15 +118,21 @@ struct target_core_fabric_ops { * its entirety before a command is aborted. */ unsigned int write_pending_must_be_called:1; + /* + * Set this if the driver does not require calling queue_data_in + * queue_status and check_stop_free from a worker thread when + * completing successful commands. + */ + unsigned int direct_compl_supp:1; /* * Set this if the driver supports submitting commands to the backend * from target_submit/target_submit_cmd. */ unsigned int direct_submit_supp:1; - /* - * Set this to a target_submit_type value. 
- */ + /* Set this to a target_submit_type value. */ u8 default_submit_type; + /* Set this to the target_compl_type value. */ + u8 default_compl_type; }; int target_register_template(const struct target_core_fabric_ops *fo); -- cgit v1.2.3 From 2a76a626670b2ef391da37f457e8e51f168432a6 Mon Sep 17 00:00:00 2001 From: Taha Ed-Dafili <0rayn.dev@gmail.com> Date: Thu, 26 Feb 2026 15:11:03 +0000 Subject: iio: core: Add IIO_EV_INFO_SCALE to event info Implement support for IIO_EV_INFO_SCALE in the internal enum iio_event_info to allow proper ABI compliance. This allows drivers (like the ADXL345) to expose event scale attributes using the standard IIO ABI rather than manual device attributes. Signed-off-by: Taha Ed-Dafili <0rayn.dev@gmail.com> Reviewed-by: David Lechner Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-event.c | 1 + include/linux/iio/types.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index 4149efcd5539..a0d6fcf2a9c9 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -256,6 +256,7 @@ static const char * const iio_ev_info_text[] = { [IIO_EV_INFO_TAP2_MIN_DELAY] = "tap2_min_delay", [IIO_EV_INFO_RUNNING_PERIOD] = "runningperiod", [IIO_EV_INFO_RUNNING_COUNT] = "runningcount", + [IIO_EV_INFO_SCALE] = "scale", }; static enum iio_event_direction iio_ev_attr_dir(struct iio_dev_attr *attr) diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 34eebad12d2c..4e3099defc1d 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -21,6 +21,7 @@ enum iio_event_info { IIO_EV_INFO_TAP2_MIN_DELAY, IIO_EV_INFO_RUNNING_PERIOD, IIO_EV_INFO_RUNNING_COUNT, + IIO_EV_INFO_SCALE, }; #define IIO_VAL_INT 1 -- cgit v1.2.3 From d3b693a13b39bce16e284e1c737874966b3a96de Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:47:43 -0800 Subject: spi: spi-mem: clean up kernel-doc in spi-mem.h 
Eliminate all kernel-doc warnings in spi-mem.h: - add missing struct member descriptions - don't use "struct" for function descrptions Warning: include/linux/spi/spi-mem.h:202 struct member 'cmd' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:202 struct member 'addr' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:202 struct member 'dummy' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:202 struct member 'data' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:286 Incorrect use of kernel-doc format: * struct spi_mem_get_drvdata() - get driver private data attached to a SPI mem Warning: include/linux/spi/spi-mem.h:298 Incorrect use of kernel-doc format: * struct spi_controller_mem_ops - SPI memory operations Warning: include/linux/spi/spi-mem.h:362 expecting prototype for struct spi_mem_set_drvdata. Prototype was for struct spi_controller_mem_ops instead Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301014743.3133167-1-rdunlap@infradead.org Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 5774e554c0f0..37f709784350 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -130,11 +130,13 @@ enum spi_mem_data_dir { /** * struct spi_mem_op - describes a SPI memory operation + * @cmd: the complete command * @cmd.nbytes: number of opcode bytes (only 1 or 2 are valid). The opcode is * sent MSB-first. * @cmd.buswidth: number of IO lines used to transmit the command * @cmd.opcode: operation opcode * @cmd.dtr: whether the command opcode should be sent in DTR mode or not + * @addr: the address attributes * @addr.nbytes: number of address bytes to send. 
Can be zero if the operation * does not need to send an address * @addr.buswidth: number of IO lines used to transmit the address cycles @@ -143,10 +145,12 @@ enum spi_mem_data_dir { * Note that only @addr.nbytes are taken into account in this * address value, so users should make sure the value fits in the * assigned number of bytes. + * @dummy: data for dummy operation * @dummy.nbytes: number of dummy bytes to send after an opcode or address. Can * be zero if the operation does not require dummy bytes * @dummy.buswidth: number of IO lanes used to transmit the dummy bytes * @dummy.dtr: whether the dummy bytes should be sent in DTR mode or not + * @data: the data attributes * @data.buswidth: number of IO lanes used to send/receive the data * @data.dtr: whether the data should be sent in DTR mode or not * @data.ecc: whether error correction is required or not @@ -273,7 +277,7 @@ struct spi_mem { }; /** - * struct spi_mem_set_drvdata() - attach driver private data to a SPI mem + * spi_mem_set_drvdata() - attach driver private data to a SPI mem * device * @mem: memory device * @data: data to attach to the memory device @@ -284,7 +288,7 @@ static inline void spi_mem_set_drvdata(struct spi_mem *mem, void *data) } /** - * struct spi_mem_get_drvdata() - get driver private data attached to a SPI mem + * spi_mem_get_drvdata() - get driver private data attached to a SPI mem * device * @mem: memory device * -- cgit v1.2.3 From 73942a6ea26bd7e02b7c260b8b7aa942397be894 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Tue, 24 Feb 2026 16:18:05 +0000 Subject: firmware: cs_dsp: Add API to hibernate the DSP For some parts, the DSP is kept running when in low power mode (hibernation), leaving the firmware ALSA controls enabled, but the registers are inaccessible. Attempts to access volatile firmware controls whilst in this state would produce errors in the kernel log due to a regmap_raw_read() into DSP registers whilst the regmap is in cache_only. 
To remove this error log, add a hibernating flag to indicate that the controls are inaccessible, so we no longer try to read or write to the registers whilst the regmap is in cache_only. This would still produce an error when trying to read or write to these controls, but this would be a different error (-EPERM instead of -EBUSY), and would not produce a spurious error log in the kernel. Upon wake from hibernation, the control caches are re-synced to the hardware, if the DSP is running. Signed-off-by: Stefan Binding Link: https://patch.msgid.link/20260224161821.93365-2-sbinding@opensource.cirrus.com Reviewed-by: Richard Fitzgerald Signed-off-by: Mark Brown --- drivers/firmware/cirrus/cs_dsp.c | 49 +++++++++++++++++++++++++++++++--- include/linux/firmware/cirrus/cs_dsp.h | 3 +++ 2 files changed, 48 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c index b4f1c01e3b5b..f9d8a883900d 100644 --- a/drivers/firmware/cirrus/cs_dsp.c +++ b/drivers/firmware/cirrus/cs_dsp.c @@ -515,6 +515,7 @@ void cs_dsp_init_debugfs(struct cs_dsp *dsp, struct dentry *debugfs_root) debugfs_create_bool("booted", 0444, root, &dsp->booted); debugfs_create_bool("running", 0444, root, &dsp->running); + debugfs_create_bool("hibernating", 0444, root, &dsp->hibernating); debugfs_create_x32("fw_id", 0444, root, &dsp->fw_id); debugfs_create_x32("fw_version", 0444, root, &dsp->fw_id_version); @@ -703,7 +704,7 @@ int cs_dsp_coeff_write_acked_control(struct cs_dsp_coeff_ctl *ctl, unsigned int lockdep_assert_held(&dsp->pwr_lock); - if (!dsp->running) + if (!dsp->running || dsp->hibernating) return -EPERM; ret = cs_dsp_coeff_base_reg(ctl, ®, 0); @@ -827,7 +828,7 @@ int cs_dsp_coeff_write_ctrl(struct cs_dsp_coeff_ctl *ctl, } ctl->set = 1; - if (ctl->enabled && ctl->dsp->running) + if (ctl->enabled && ctl->dsp->running && !ctl->dsp->hibernating) ret = cs_dsp_coeff_write_ctrl_raw(ctl, off, buf, len); if (ret < 0) @@ -920,12 
+921,12 @@ int cs_dsp_coeff_read_ctrl(struct cs_dsp_coeff_ctl *ctl, return -EINVAL; if (ctl->flags & WMFW_CTL_FLAG_VOLATILE) { - if (ctl->enabled && ctl->dsp->running) + if (ctl->enabled && ctl->dsp->running && !ctl->dsp->hibernating) return cs_dsp_coeff_read_ctrl_raw(ctl, off, buf, len); else return -EPERM; } else { - if (!ctl->flags && ctl->enabled && ctl->dsp->running) + if (!ctl->flags && ctl->enabled && ctl->dsp->running && !ctl->dsp->hibernating) ret = cs_dsp_coeff_read_ctrl_raw(ctl, 0, ctl->cache, ctl->len); if (buf != ctl->cache) @@ -1108,6 +1109,44 @@ err_ctl: return ret; } + +/** + * cs_dsp_hibernate() - Disable or enable all controls for a DSP + * @dsp: pointer to DSP structure + * @hibernate: whether to set controls to cache only mode + * + * When @hibernate is true, the DSP is entering hibernation mode where the + * regmap is inaccessible, and all controls become cache only. + * When @hibernate is false, the DSP has exited hibernation mode. If the DSP + * is running, all controls are re-synced to the DSP. 
+ * + */ +void cs_dsp_hibernate(struct cs_dsp *dsp, bool hibernate) +{ + mutex_lock(&dsp->pwr_lock); + + if (!dsp->running) { + cs_dsp_dbg(dsp, "Cannot hibernate, DSP not running\n"); + goto out; + } + + if (dsp->hibernating == hibernate) + goto out; + + cs_dsp_dbg(dsp, "Set hibernating to %d\n", hibernate); + dsp->hibernating = hibernate; + + if (!dsp->hibernating && dsp->running) { + int ret = cs_dsp_coeff_sync_controls(dsp); + + if (ret) + cs_dsp_err(dsp, "Error syncing controls: %d\n", ret); + } +out: + mutex_unlock(&dsp->pwr_lock); +} +EXPORT_SYMBOL_NS_GPL(cs_dsp_hibernate, "FW_CS_DSP"); + struct cs_dsp_coeff_parsed_alg { int id; const u8 *name; @@ -2498,6 +2537,7 @@ int cs_dsp_adsp1_power_up(struct cs_dsp *dsp, goto err_ena; dsp->booted = true; + dsp->hibernating = false; /* Start the core running */ regmap_update_bits(dsp->regmap, dsp->base + ADSP1_CONTROL_30, @@ -2776,6 +2816,7 @@ int cs_dsp_power_up(struct cs_dsp *dsp, dsp->ops->disable_core(dsp); dsp->booted = true; + dsp->hibernating = false; mutex_unlock(&dsp->pwr_lock); diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 0ec1cdc5585d..4e3baa557068 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -179,6 +179,7 @@ struct cs_dsp { bool booted; bool running; + bool hibernating; struct list_head ctl_list; @@ -354,4 +355,6 @@ int cs_dsp_chunk_write(struct cs_dsp_chunk *ch, int nbits, u32 val); int cs_dsp_chunk_flush(struct cs_dsp_chunk *ch); int cs_dsp_chunk_read(struct cs_dsp_chunk *ch, int nbits); +void cs_dsp_hibernate(struct cs_dsp *dsp, bool hibernating); + #endif -- cgit v1.2.3 From 98eb42c7de6b0185c914df4cca61b49ff76821ee Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 25 Feb 2026 21:50:02 +0800 Subject: ASoC: add snd_soc_lookup_component_by_name helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper to help user to get the component by name. 
Signed-off-by: Bard Liao Reviewed-by: Charles Keepax Reviewed-by: Péter Ujfalusi Link: https://patch.msgid.link/20260225135004.2322987-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/sound/soc.h | 1 + sound/soc/soc-core.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 7d8376c8e1be..1a486153dc76 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -465,6 +465,7 @@ struct snd_soc_component *snd_soc_lookup_component_nolocked(struct device *dev, const char *driver_name); struct snd_soc_component *snd_soc_lookup_component(struct device *dev, const char *driver_name); +struct snd_soc_component *snd_soc_lookup_component_by_name(const char *component_name); int soc_new_pcm(struct snd_soc_pcm_runtime *rtd); #ifdef CONFIG_SND_SOC_COMPRESS diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index d0fffef65daf..f41ca5109a6a 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -404,6 +404,19 @@ struct snd_soc_component *snd_soc_lookup_component(struct device *dev, } EXPORT_SYMBOL_GPL(snd_soc_lookup_component); +struct snd_soc_component *snd_soc_lookup_component_by_name(const char *component_name) +{ + struct snd_soc_component *component; + + guard(mutex)(&client_mutex); + for_each_component(component) + if (strstr(component->name, component_name)) + return component; + + return NULL; +} +EXPORT_SYMBOL_GPL(snd_soc_lookup_component_by_name); + struct snd_soc_pcm_runtime *snd_soc_get_pcm_runtime(struct snd_soc_card *card, struct snd_soc_dai_link *dai_link) -- cgit v1.2.3 From d69cb039ab1930706428566caf5a714d0cb3ed3d Mon Sep 17 00:00:00 2001 From: Janusz Dziedzic Date: Fri, 6 Feb 2026 18:15:49 +0100 Subject: wifi: cfg80211: set and report chandef CAC ongoing Allow to track and check CAC state from user mode by simple check phy channels eg. using iw phy1 channels command. This is done for regular CAC and background CAC. 
It is important for background CAC since we can start it from any app (eg. iw or hostapd). Signed-off-by: Janusz Dziedzic Link: https://patch.msgid.link/20260206171830.553879-3-janusz.dziedzic@gmail.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 6 ++++++ net/wireless/chan.c | 27 +++++++++++++++++++++++++++ net/wireless/core.h | 4 ++++ net/wireless/mlme.c | 7 +++++++ net/wireless/nl80211.c | 7 +++++++ 6 files changed, 54 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fc01de19c798..e00045c150e7 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -190,6 +190,8 @@ enum ieee80211_channel_flags { * on this channel. * @dfs_state_entered: timestamp (jiffies) when the dfs state was entered. * @dfs_cac_ms: DFS CAC time in milliseconds, this is valid for DFS channels. + * @cac_start_time: timestamp (CLOCK_BOOTTIME, nanoseconds) when CAC was + * started on this channel. Zero when CAC is not in progress. * @psd: power spectral density (in dBm) */ struct ieee80211_channel { @@ -207,6 +209,7 @@ struct ieee80211_channel { enum nl80211_dfs_state dfs_state; unsigned long dfs_state_entered; unsigned int dfs_cac_ms; + u64 cac_start_time; s8 psd; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b63f71850906..c75aa039f096 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4480,6 +4480,10 @@ enum nl80211_wmm_rule { * as a non-primary subchannel. Only applicable to S1G channels. * @NL80211_FREQUENCY_ATTR_NO_UHR: UHR operation is not allowed on this channel * in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_CAC_START_TIME: Channel Availability Check (CAC) + * start time (CLOCK_BOOTTIME, nanoseconds). Only present when CAC is + * currently in progress on this channel. 
+ * @NL80211_FREQUENCY_ATTR_PAD: attribute used for padding for 64-bit alignment * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4530,6 +4534,8 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_16MHZ, NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY, NL80211_FREQUENCY_ATTR_NO_UHR, + NL80211_FREQUENCY_ATTR_CAC_START_TIME, + NL80211_FREQUENCY_ATTR_PAD, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 68221b1ab45e..dfe319565280 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -642,6 +642,33 @@ void cfg80211_set_dfs_state(struct wiphy *wiphy, } } +void cfg80211_set_cac_state(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + bool cac_ongoing) +{ + struct ieee80211_channel *c; + int width; + u64 cac_time; + + if (WARN_ON(!cfg80211_chandef_valid(chandef))) + return; + + width = cfg80211_chandef_get_width(chandef); + if (width < 0) + return; + + /* Get the same timestamp for all subchannels */ + cac_time = cac_ongoing ? 
ktime_get_boottime_ns() : 0; + + for_each_subchan(chandef, freq, cf) { + c = ieee80211_get_channel_khz(wiphy, freq); + if (!c) + continue; + + c->cac_start_time = cac_time; + } +} + static bool cfg80211_dfs_permissive_check_wdev(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, diff --git a/net/wireless/core.h b/net/wireless/core.h index 6ac57b7b2615..6cace846d7a3 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -481,6 +481,10 @@ void cfg80211_set_dfs_state(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, enum nl80211_dfs_state dfs_state); +void cfg80211_set_cac_state(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + bool cac_ongoing); + void cfg80211_dfs_channels_update_work(struct work_struct *work); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 212178d04efa..283ea4c7c61e 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -1162,9 +1162,11 @@ void cfg80211_cac_event(struct net_device *netdev, fallthrough; case NL80211_RADAR_CAC_ABORTED: wdev->links[link_id].cac_started = false; + cfg80211_set_cac_state(wiphy, chandef, false); break; case NL80211_RADAR_CAC_STARTED: wdev->links[link_id].cac_started = true; + cfg80211_set_cac_state(wiphy, chandef, true); break; default: WARN_ON(1); @@ -1192,15 +1194,18 @@ __cfg80211_background_cac_event(struct cfg80211_registered_device *rdev, switch (event) { case NL80211_RADAR_CAC_FINISHED: cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); + cfg80211_set_cac_state(wiphy, chandef, false); memcpy(&rdev->cac_done_chandef, chandef, sizeof(*chandef)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); break; case NL80211_RADAR_CAC_ABORTED: + cfg80211_set_cac_state(wiphy, chandef, false); if (!cancel_delayed_work(&rdev->background_cac_done_wk)) return; break; case NL80211_RADAR_CAC_STARTED: + cfg80211_set_cac_state(wiphy, 
chandef, true); break; default: return; @@ -1307,6 +1312,8 @@ void cfg80211_stop_radar_detection(struct wireless_dev *wdev) chandef = *wdev_chandef(wdev, link_id); rdev_end_cac(rdev, wdev->netdev, link_id); + wdev->links[link_id].cac_started = false; + cfg80211_set_cac_state(wiphy, &chandef, false); nl80211_radar_notify(rdev, &chandef, NL80211_RADAR_CAC_ABORTED, wdev->netdev, GFP_KERNEL); } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b94231c8441c..7e288d3ce5ae 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1333,6 +1333,12 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_UHR) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHR)) goto nla_put_failure; + if (chan->cac_start_time && + nla_put_u64_64bit(msg, + NL80211_FREQUENCY_ATTR_CAC_START_TIME, + chan->cac_start_time, + NL80211_FREQUENCY_ATTR_PAD)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -11353,6 +11359,7 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, wdev->links[link_id].cac_started = true; wdev->links[link_id].cac_start_time = jiffies; wdev->links[link_id].cac_time_ms = cac_time_ms; + cfg80211_set_cac_state(wiphy, &chandef, true); return 0; } -- cgit v1.2.3 From 6a584e336cefb230e2d981a464f4d85562eb750c Mon Sep 17 00:00:00 2001 From: Hari Chandrakanthan Date: Mon, 16 Feb 2026 08:50:26 +0530 Subject: wifi: cfg80211: add support to handle incumbent signal detected event from mac80211/driver When any incumbent signal is detected by an AP/mesh interface operating in 6 GHz band, FCC mandates the AP/mesh to vacate the channels affected by it [1]. Add a new API cfg80211_incumbent_signal_notify() that can be used by mac80211 or drivers to notify the higher layers about the signal interference event with the interference bitmap in which each bit denotes the affected 20 MHz in the operating channel. 
Add support for the new nl80211 event and nl80211 attribute as well to notify userspace on the details about the interference event. Userspace is expected to process it and take further action - vacate the channel, or reduce the bandwidth. [1] - https://apps.fcc.gov/kdb/GetAttachment.html?id=nXQiRC%2B4mfiA54Zha%2BrW4Q%3D%3D&desc=987594%20D02%20U-NII%206%20GHz%20EMC%20Measurement%20v03&tracking_number=277034 Signed-off-by: Hari Chandrakanthan Signed-off-by: Amith A Link: https://patch.msgid.link/20260216032027.2310956-2-amith.a@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 23 +++++++++++++++++++++++ include/uapi/linux/nl80211.h | 19 +++++++++++++++++++ net/wireless/nl80211.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 19 +++++++++++++++++++ 4 files changed, 101 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e00045c150e7..cea77bf90cfe 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -10475,4 +10475,27 @@ cfg80211_s1g_get_primary_sibling(struct wiphy *wiphy, return ieee80211_get_channel_khz(wiphy, sibling_1mhz_khz); } + +/** + * cfg80211_incumbent_signal_notify - Notify userspace of incumbent signal detection + * @wiphy: the wiphy to use + * @chandef: channel definition in which the interference was detected + * @signal_interference_bitmap: bitmap indicating interference across 20 MHz segments + * @gfp: allocation context for message creation and multicast; pass GFP_ATOMIC + * if called from atomic context (e.g. firmware event handler), otherwise + * GFP_KERNEL + * + * Use this function to notify userspace when an incumbent signal is detected on + * the operating channel in the 6 GHz band. The notification includes the + * current channel definition and a bitmap representing interference across + * the operating bandwidth. 
Each bit in the bitmap corresponds to a 20 MHz + * segment, with the lowest bit representing the lowest frequency segment. + * Punctured sub-channels are included in the bitmap structure but are always + * set to zero since interference detection is not performed on them. + */ +void cfg80211_incumbent_signal_notify(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + u32 signal_interference_bitmap, + gfp_t gfp); + #endif /* __NET_CFG80211_H */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c75aa039f096..fe2c8c8d6dd6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1361,6 +1361,12 @@ * user space that the NAN new cluster has been joined. The cluster ID is * indicated by %NL80211_ATTR_MAC. * + * @NL80211_CMD_INCUMBENT_SIGNAL_DETECT: Once any incumbent signal is detected + * on the operating channel in 6 GHz band, userspace is notified with the + * signal interference bitmap using + * %NL80211_ATTR_INCUMBENT_SIGNAL_INTERFERENCE_BITMAP. The current channel + * definition is also sent. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1624,6 +1630,8 @@ enum nl80211_commands { NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, NL80211_CMD_NAN_CLUSTER_JOINED, + NL80211_CMD_INCUMBENT_SIGNAL_DETECT, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2984,6 +2992,15 @@ enum nl80211_commands { * this feature during association. This is a flag attribute. * Currently only supported in mac80211 drivers. * + * @NL80211_ATTR_INCUMBENT_SIGNAL_INTERFERENCE_BITMAP: u32 attribute specifying + * the signal interference bitmap detected on the operating bandwidth for + * %NL80211_CMD_INCUMBENT_SIGNAL_DETECT. Each bit represents a 20 MHz + * segment, lowest bit corresponds to the lowest 20 MHz segment, in the + * operating bandwidth where the interference is detected. 
Punctured + * sub-channels are included in the bitmap structure; however, since + * interference detection is not performed on these sub-channels, their + * corresponding bits are consistently set to zero. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3557,6 +3574,8 @@ enum nl80211_attrs { NL80211_ATTR_UHR_CAPABILITY, NL80211_ATTR_DISABLE_UHR, + NL80211_ATTR_INCUMBENT_SIGNAL_INTERFERENCE_BITMAP, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b619f99c221e..0da055dad159 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -21127,6 +21127,46 @@ void cfg80211_ch_switch_notify(struct net_device *dev, } EXPORT_SYMBOL(cfg80211_ch_switch_notify); +void cfg80211_incumbent_signal_notify(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + u32 signal_interference_bitmap, + gfp_t gfp) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_incumbent_signal_notify(wiphy, chandef, signal_interference_bitmap); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_INCUMBENT_SIGNAL_DETECT); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx)) + goto nla_put_failure; + + if (nl80211_send_chandef(msg, chandef)) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_INCUMBENT_SIGNAL_INTERFERENCE_BITMAP, + signal_interference_bitmap)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, gfp); + return; + +nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_incumbent_signal_notify); + void cfg80211_ch_switch_started_notify(struct net_device 
*dev, struct cfg80211_chan_def *chandef, unsigned int link_id, u8 count, diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 643ccf4f0227..352a57d8b968 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4225,6 +4225,25 @@ TRACE_EVENT(cfg80211_nan_cluster_joined, WDEV_PR_ARG, __entry->cluster_id, __entry->new_cluster ? " [new]" : "") ); + +TRACE_EVENT(cfg80211_incumbent_signal_notify, + TP_PROTO(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + u32 signal_interference_bitmap), + TP_ARGS(wiphy, chandef, signal_interference_bitmap), + TP_STRUCT__entry( + WIPHY_ENTRY + CHAN_DEF_ENTRY + __field(u32, signal_interference_bitmap) + ), + TP_fast_assign( + WIPHY_ASSIGN; + CHAN_DEF_ASSIGN(chandef); + __entry->signal_interference_bitmap = signal_interference_bitmap; + ), + TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", signal_interference_bitmap=0x%x", + WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->signal_interference_bitmap) +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 033fe322f5852d5144a85978e880e01b1787fd0d Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Thu, 19 Feb 2026 11:47:13 +0200 Subject: wifi: nl80211/cfg80211: support stations of non-netdev interfaces Currently, a station can only be added to a netdev interface, mainly because there was no need for a station of a non-netdev interface. But for NAN, we will have stations that belong to the NL80211_IFTYPE_NAN interface. Prepare for adding/changing/deleting a station that belongs to a non-netdev interface. This doesn't actually allow such stations - this will be done in a different patch. 
Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260219114327.65c9cc96f814.Ic02066b88bb8ad6b21e15cbea8d720280008c83b@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 9 +- drivers/net/wireless/ath/ath6kl/main.c | 4 +- drivers/net/wireless/ath/wil6210/cfg80211.c | 20 ++-- drivers/net/wireless/ath/wil6210/main.c | 3 +- drivers/net/wireless/ath/wil6210/wmi.c | 5 +- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 23 +++-- drivers/net/wireless/marvell/libertas/cfg.c | 2 +- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 24 ++--- drivers/net/wireless/marvell/mwifiex/uap_event.c | 7 +- drivers/net/wireless/microchip/wilc1000/cfg80211.c | 26 +++--- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 26 +++--- drivers/net/wireless/quantenna/qtnfmac/event.c | 6 +- drivers/net/wireless/virtual/virt_wifi.c | 12 ++- drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 24 ++--- include/net/cfg80211.h | 24 ++--- net/mac80211/cfg.c | 21 +++-- net/mac80211/sta_info.c | 4 +- net/wireless/nl80211.c | 84 ++++++++++------- net/wireless/rdev-ops.h | 30 +++--- net/wireless/trace.h | 104 ++++++++++++++------- net/wireless/util.c | 2 +- net/wireless/wext-compat.c | 6 +- 22 files changed, 264 insertions(+), 202 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 88f0197fc041..eecba2201b10 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -1775,9 +1775,10 @@ static bool is_rate_ht40(s32 rate, u8 *mcs, bool *sgi) return false; } -static int ath6kl_get_station(struct wiphy *wiphy, struct net_device *dev, +static int ath6kl_get_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { + struct net_device *dev = wdev->netdev; struct ath6kl *ar = ath6kl_priv(dev); struct ath6kl_vif *vif = netdev_priv(dev); long left; @@ -2992,9 +2993,10 @@ static int 
ath6kl_stop_ap(struct wiphy *wiphy, struct net_device *dev, static const u8 bcast_addr[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; -static int ath6kl_del_station(struct wiphy *wiphy, struct net_device *dev, +static int ath6kl_del_station(struct wiphy *wiphy, struct wireless_dev *wdev, struct station_del_parameters *params) { + struct net_device *dev = wdev->netdev; struct ath6kl *ar = ath6kl_priv(dev); struct ath6kl_vif *vif = netdev_priv(dev); const u8 *addr = params->mac ? params->mac : bcast_addr; @@ -3003,10 +3005,11 @@ static int ath6kl_del_station(struct wiphy *wiphy, struct net_device *dev, addr, WLAN_REASON_PREV_AUTH_NOT_VALID); } -static int ath6kl_change_station(struct wiphy *wiphy, struct net_device *dev, +static int ath6kl_change_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { + struct net_device *dev = wdev->netdev; struct ath6kl *ar = ath6kl_priv(dev); struct ath6kl_vif *vif = netdev_priv(dev); int err; diff --git a/drivers/net/wireless/ath/ath6kl/main.c b/drivers/net/wireless/ath/ath6kl/main.c index 85d6ad53cf94..8afc6589fc51 100644 --- a/drivers/net/wireless/ath/ath6kl/main.c +++ b/drivers/net/wireless/ath/ath6kl/main.c @@ -494,7 +494,7 @@ void ath6kl_connect_ap_mode_sta(struct ath6kl_vif *vif, u16 aid, u8 *mac_addr, sinfo->assoc_req_ies = ies; sinfo->assoc_req_ies_len = ies_len; - cfg80211_new_sta(vif->ndev, mac_addr, sinfo, GFP_KERNEL); + cfg80211_new_sta(&vif->wdev, mac_addr, sinfo, GFP_KERNEL); netif_wake_queue(vif->ndev); @@ -1011,7 +1011,7 @@ void ath6kl_disconnect_event(struct ath6kl_vif *vif, u8 reason, u8 *bssid, if (!is_broadcast_ether_addr(bssid)) { /* send event to application */ - cfg80211_del_sta(vif->ndev, bssid, GFP_KERNEL); + cfg80211_del_sta(&vif->wdev, bssid, GFP_KERNEL); } if (memcmp(vif->ndev->dev_addr, bssid, ETH_ALEN) == 0) { diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index c74f5e66166d..2d8660ccc6f3 
100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -533,11 +533,11 @@ int wil_cid_fill_sinfo(struct wil6210_vif *vif, int cid, } static int wil_cfg80211_get_station(struct wiphy *wiphy, - struct net_device *ndev, + struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { - struct wil6210_vif *vif = ndev_to_vif(ndev); struct wil6210_priv *wil = wiphy_to_wil(wiphy); + struct wil6210_vif *vif = wdev_to_vif(wil, wdev); int rc; int cid = wil_find_cid(wil, vif->mid, mac); @@ -573,11 +573,11 @@ int wil_find_cid_by_idx(struct wil6210_priv *wil, u8 mid, int idx) } static int wil_cfg80211_dump_station(struct wiphy *wiphy, - struct net_device *dev, int idx, + struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo) { - struct wil6210_vif *vif = ndev_to_vif(dev); struct wil6210_priv *wil = wiphy_to_wil(wiphy); + struct wil6210_vif *vif = wdev_to_vif(wil, wdev); int rc; int cid = wil_find_cid_by_idx(wil, vif->mid, idx); @@ -2225,12 +2225,12 @@ static int wil_cfg80211_stop_ap(struct wiphy *wiphy, } static int wil_cfg80211_add_station(struct wiphy *wiphy, - struct net_device *dev, + struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { - struct wil6210_vif *vif = ndev_to_vif(dev); struct wil6210_priv *wil = wiphy_to_wil(wiphy); + struct wil6210_vif *vif = wdev_to_vif(wil, wdev); wil_dbg_misc(wil, "add station %pM aid %d mid %d mask 0x%x set 0x%x\n", mac, params->aid, vif->mid, @@ -2250,11 +2250,11 @@ static int wil_cfg80211_add_station(struct wiphy *wiphy, } static int wil_cfg80211_del_station(struct wiphy *wiphy, - struct net_device *dev, + struct wireless_dev *wdev, struct station_del_parameters *params) { - struct wil6210_vif *vif = ndev_to_vif(dev); struct wil6210_priv *wil = wiphy_to_wil(wiphy); + struct wil6210_vif *vif = wdev_to_vif(wil, wdev); wil_dbg_misc(wil, "del_station: %pM, reason=%d mid=%d\n", params->mac, params->reason_code, vif->mid); @@ -2267,12 
+2267,12 @@ static int wil_cfg80211_del_station(struct wiphy *wiphy, } static int wil_cfg80211_change_station(struct wiphy *wiphy, - struct net_device *dev, + struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { - struct wil6210_vif *vif = ndev_to_vif(dev); struct wil6210_priv *wil = wiphy_to_wil(wiphy); + struct wil6210_vif *vif = wdev_to_vif(wil, wdev); int authorize; int cid, i; struct wil_ring_tx_data *txdata = NULL; diff --git a/drivers/net/wireless/ath/wil6210/main.c b/drivers/net/wireless/ath/wil6210/main.c index 44c24c6c8360..d5aec72ecdce 100644 --- a/drivers/net/wireless/ath/wil6210/main.c +++ b/drivers/net/wireless/ath/wil6210/main.c @@ -245,7 +245,6 @@ __acquires(&sta->tid_rx_lock) __releases(&sta->tid_rx_lock) { uint i; struct wil6210_priv *wil = vif_to_wil(vif); - struct net_device *ndev = vif_to_ndev(vif); struct wireless_dev *wdev = vif_to_wdev(vif); struct wil_sta_info *sta = &wil->sta[cid]; int min_ring_id = wil_get_min_tx_ring_id(wil); @@ -265,7 +264,7 @@ __acquires(&sta->tid_rx_lock) __releases(&sta->tid_rx_lock) case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: /* AP-like interface */ - cfg80211_del_sta(ndev, sta->addr, GFP_KERNEL); + cfg80211_del_sta(wdev, sta->addr, GFP_KERNEL); break; default: break; diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index 05b040c684e8..479b2418ca34 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -1076,7 +1076,7 @@ static void wmi_evt_connect(struct wil6210_vif *vif, int id, void *d, int len) if (rc) { if (disable_ap_sme) /* notify new_sta has failed */ - cfg80211_del_sta(ndev, evt->bssid, GFP_KERNEL); + cfg80211_del_sta(wdev, evt->bssid, GFP_KERNEL); goto out; } @@ -1093,7 +1093,8 @@ static void wmi_evt_connect(struct wil6210_vif *vif, int id, void *d, int len) sinfo->assoc_req_ies_len = assoc_req_ielen; } - cfg80211_new_sta(ndev, evt->bssid, sinfo, GFP_KERNEL); + 
cfg80211_new_sta(ndev->ieee80211_ptr, evt->bssid, sinfo, + GFP_KERNEL); kfree(sinfo); } else { diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index cea02b33b798..f7e17994e59a 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -3132,11 +3132,11 @@ brcmf_cfg80211_get_station_ibss(struct brcmf_if *ifp, } static s32 -brcmf_cfg80211_get_station(struct wiphy *wiphy, struct net_device *ndev, +brcmf_cfg80211_get_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); - struct brcmf_if *ifp = netdev_priv(ndev); + struct brcmf_if *ifp = netdev_priv(wdev->netdev); struct brcmf_pub *drvr = cfg->pub; struct brcmf_scb_val_le scb_val; s32 err = 0; @@ -3255,11 +3255,11 @@ done: } static int -brcmf_cfg80211_dump_station(struct wiphy *wiphy, struct net_device *ndev, +brcmf_cfg80211_dump_station(struct wiphy *wiphy, struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); - struct brcmf_if *ifp = netdev_priv(ndev); + struct brcmf_if *ifp = netdev_priv(wdev->netdev); struct brcmf_pub *drvr = cfg->pub; s32 err; @@ -3284,7 +3284,8 @@ brcmf_cfg80211_dump_station(struct wiphy *wiphy, struct net_device *ndev, } if (idx < le32_to_cpu(cfg->assoclist.count)) { memcpy(mac, cfg->assoclist.mac[idx], ETH_ALEN); - return brcmf_cfg80211_get_station(wiphy, ndev, mac, sinfo); + return brcmf_cfg80211_get_station(wiphy, wdev, + mac, sinfo); } return -ENOENT; } @@ -5452,12 +5453,13 @@ brcmf_cfg80211_change_beacon(struct wiphy *wiphy, struct net_device *ndev, } static int -brcmf_cfg80211_del_station(struct wiphy *wiphy, struct net_device *ndev, +brcmf_cfg80211_del_station(struct wiphy *wiphy, struct wireless_dev *wdev, struct station_del_parameters *params) { 
struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct brcmf_pub *drvr = cfg->pub; struct brcmf_scb_val_le scbval; + struct net_device *ndev = wdev->netdev; struct brcmf_if *ifp = netdev_priv(ndev); s32 err; @@ -5484,12 +5486,12 @@ brcmf_cfg80211_del_station(struct wiphy *wiphy, struct net_device *ndev, } static int -brcmf_cfg80211_change_station(struct wiphy *wiphy, struct net_device *ndev, +brcmf_cfg80211_change_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct brcmf_pub *drvr = cfg->pub; - struct brcmf_if *ifp = netdev_priv(ndev); + struct brcmf_if *ifp = netdev_priv(wdev->netdev); s32 err; brcmf_dbg(TRACE, "Enter, MAC %pM, mask 0x%04x set 0x%04x\n", mac, @@ -6548,13 +6550,14 @@ brcmf_notify_connect_status_ap(struct brcmf_cfg80211_info *cfg, sinfo->assoc_req_ies_len = e->datalen; generation++; sinfo->generation = generation; - cfg80211_new_sta(ndev, e->addr, sinfo, GFP_KERNEL); + cfg80211_new_sta(ndev->ieee80211_ptr, e->addr, sinfo, + GFP_KERNEL); kfree(sinfo); } else if ((event == BRCMF_E_DISASSOC_IND) || (event == BRCMF_E_DEAUTH_IND) || (event == BRCMF_E_DEAUTH)) { - cfg80211_del_sta(ndev, e->addr, GFP_KERNEL); + cfg80211_del_sta(ndev->ieee80211_ptr, e->addr, GFP_KERNEL); } return 0; } diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c index 98517888dba7..56a82b26a1e9 100644 --- a/drivers/net/wireless/marvell/libertas/cfg.c +++ b/drivers/net/wireless/marvell/libertas/cfg.c @@ -1607,7 +1607,7 @@ static int lbs_cfg_del_key(struct wiphy *wiphy, struct net_device *netdev, * Get station */ -static int lbs_cfg_get_station(struct wiphy *wiphy, struct net_device *dev, +static int lbs_cfg_get_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { struct lbs_private *priv = wiphy_priv(wiphy); diff --git 
a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index eb28fe718e71..71e71a5af453 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -1554,10 +1554,10 @@ mwifiex_dump_station_info(struct mwifiex_private *priv, * requested station information, if available. */ static int -mwifiex_cfg80211_get_station(struct wiphy *wiphy, struct net_device *dev, +mwifiex_cfg80211_get_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { - struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); + struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); if (!priv->media_connected) return -ENOENT; @@ -1571,10 +1571,10 @@ mwifiex_cfg80211_get_station(struct wiphy *wiphy, struct net_device *dev, * CFG802.11 operation handler to dump station information. */ static int -mwifiex_cfg80211_dump_station(struct wiphy *wiphy, struct net_device *dev, +mwifiex_cfg80211_dump_station(struct wiphy *wiphy, struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo) { - struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); + struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); struct mwifiex_sta_node *node; int i; @@ -1901,10 +1901,11 @@ static int mwifiex_cfg80211_change_beacon(struct wiphy *wiphy, * associated stations list, no action is taken. 
*/ static int -mwifiex_cfg80211_del_station(struct wiphy *wiphy, struct net_device *dev, +mwifiex_cfg80211_del_station(struct wiphy *wiphy, struct wireless_dev *wdev, struct station_del_parameters *params) { - struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); + struct mwifiex_private *priv = + mwifiex_netdev_get_priv(wdev->netdev); struct mwifiex_sta_node *sta_node; u8 deauth_mac[ETH_ALEN]; @@ -3993,7 +3994,8 @@ mwifiex_cfg80211_uap_add_station(struct mwifiex_private *priv, const u8 *mac, if (!sinfo) return -ENOMEM; - cfg80211_new_sta(priv->netdev, mac, sinfo, GFP_KERNEL); + cfg80211_new_sta(priv->netdev->ieee80211_ptr, mac, sinfo, + GFP_KERNEL); kfree(sinfo); } @@ -4001,10 +4003,10 @@ mwifiex_cfg80211_uap_add_station(struct mwifiex_private *priv, const u8 *mac, } static int -mwifiex_cfg80211_add_station(struct wiphy *wiphy, struct net_device *dev, +mwifiex_cfg80211_add_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { - struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); + struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); if (priv->adapter->host_mlme_enabled && (GET_BSS_ROLE(priv) == MWIFIEX_BSS_ROLE_UAP)) @@ -4240,12 +4242,12 @@ mwifiex_cfg80211_start_radar_detection(struct wiphy *wiphy, } static int -mwifiex_cfg80211_change_station(struct wiphy *wiphy, struct net_device *dev, +mwifiex_cfg80211_change_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { int ret; - struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); + struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); if (priv->adapter->host_mlme_enabled && (GET_BSS_ROLE(priv) == MWIFIEX_BSS_ROLE_UAP)) diff --git a/drivers/net/wireless/marvell/mwifiex/uap_event.c b/drivers/net/wireless/marvell/mwifiex/uap_event.c index 703104fd1fbe..679fdae0f001 100644 --- a/drivers/net/wireless/marvell/mwifiex/uap_event.c +++ 
b/drivers/net/wireless/marvell/mwifiex/uap_event.c @@ -130,8 +130,8 @@ int mwifiex_process_uap_event(struct mwifiex_private *priv) le16_to_cpu(event->len) - (u16)len; } } - cfg80211_new_sta(priv->netdev, event->sta_addr, sinfo, - GFP_KERNEL); + cfg80211_new_sta(priv->netdev->ieee80211_ptr, event->sta_addr, + sinfo, GFP_KERNEL); node = mwifiex_add_sta_entry(priv, event->sta_addr); if (!node) { @@ -162,7 +162,8 @@ int mwifiex_process_uap_event(struct mwifiex_private *priv) case EVENT_UAP_STA_DEAUTH: deauth_mac = adapter->event_body + MWIFIEX_UAP_EVENT_EXTRA_HEADER; - cfg80211_del_sta(priv->netdev, deauth_mac, GFP_KERNEL); + cfg80211_del_sta(priv->netdev->ieee80211_ptr, deauth_mac, + GFP_KERNEL); if (priv->ap_11n_enabled) { mwifiex_11n_del_rx_reorder_tbl_by_ta(priv, deauth_mac); diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index 1561a601c7f2..21ef341e002b 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -733,9 +733,10 @@ static int set_default_mgmt_key(struct wiphy *wiphy, struct net_device *netdev, return wilc_set_default_mgmt_key_index(vif, key_index); } -static int get_station(struct wiphy *wiphy, struct net_device *dev, +static int get_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { + struct net_device *dev = wdev->netdev; struct wilc_vif *vif = netdev_priv(dev); struct wilc_priv *priv = &vif->priv; struct wilc *wilc = vif->wilc; @@ -1312,10 +1313,10 @@ static int set_cqm_rssi_config(struct wiphy *wiphy, struct net_device *dev, return 0; } -static int dump_station(struct wiphy *wiphy, struct net_device *dev, +static int dump_station(struct wiphy *wiphy, struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo) { - struct wilc_vif *vif = netdev_priv(dev); + struct wilc_vif *vif = netdev_priv(wdev->netdev); int ret; if (idx != 0) @@ -1450,11 +1451,11 @@ 
static int stop_ap(struct wiphy *wiphy, struct net_device *dev, return ret; } -static int add_station(struct wiphy *wiphy, struct net_device *dev, +static int add_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { int ret = 0; - struct wilc_vif *vif = netdev_priv(dev); + struct wilc_vif *vif = netdev_priv(wdev->netdev); struct wilc_priv *priv = &vif->priv; if (vif->iftype == WILC_AP_MODE || vif->iftype == WILC_GO_MODE) { @@ -1463,18 +1464,18 @@ static int add_station(struct wiphy *wiphy, struct net_device *dev, ret = wilc_add_station(vif, mac, params); if (ret) - netdev_err(dev, "Host add station fail\n"); + netdev_err(wdev->netdev, "Host add station fail\n"); } return ret; } -static int del_station(struct wiphy *wiphy, struct net_device *dev, +static int del_station(struct wiphy *wiphy, struct wireless_dev *wdev, struct station_del_parameters *params) { const u8 *mac = params->mac; int ret = 0; - struct wilc_vif *vif = netdev_priv(dev); + struct wilc_vif *vif = netdev_priv(wdev->netdev); struct wilc_priv *priv = &vif->priv; struct sta_info *info; @@ -1488,20 +1489,19 @@ static int del_station(struct wiphy *wiphy, struct net_device *dev, ret = wilc_del_station(vif, mac); if (ret) - netdev_err(dev, "Host delete station fail\n"); + netdev_err(wdev->netdev, "Host delete station fail\n"); return ret; } -static int change_station(struct wiphy *wiphy, struct net_device *dev, +static int change_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { int ret = 0; - struct wilc_vif *vif = netdev_priv(dev); - + struct wilc_vif *vif = netdev_priv(wdev->netdev); if (vif->iftype == WILC_AP_MODE || vif->iftype == WILC_GO_MODE) { ret = wilc_edit_station(vif, mac, params); if (ret) - netdev_err(dev, "Host edit station fail\n"); + netdev_err(wdev->netdev, "Host edit station fail\n"); } return ret; } diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c 
b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index f1188368e66b..340240847a2f 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -483,26 +483,26 @@ qtnf_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, } static int -qtnf_get_station(struct wiphy *wiphy, struct net_device *dev, +qtnf_get_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { - struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); + struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); sinfo->generation = vif->generation; return qtnf_cmd_get_sta_info(vif, mac, sinfo); } static int -qtnf_dump_station(struct wiphy *wiphy, struct net_device *dev, +qtnf_dump_station(struct wiphy *wiphy, struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo) { - struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); + struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); const struct qtnf_sta_node *sta_node; int ret; - switch (vif->wdev.iftype) { + switch (wdev->iftype) { case NL80211_IFTYPE_STATION: - if (idx != 0 || !vif->wdev.connected) + if (idx != 0 || !wdev->connected) return -ENOENT; ether_addr_copy(mac, vif->bssid); @@ -520,9 +520,9 @@ qtnf_dump_station(struct wiphy *wiphy, struct net_device *dev, ret = qtnf_cmd_get_sta_info(vif, mac, sinfo); - if (vif->wdev.iftype == NL80211_IFTYPE_AP) { + if (wdev->iftype == NL80211_IFTYPE_AP) { if (ret == -ENOENT) { - cfg80211_del_sta(vif->netdev, mac, GFP_KERNEL); + cfg80211_del_sta(&vif->wdev, mac, GFP_KERNEL); sinfo->filled = 0; } } @@ -602,10 +602,10 @@ qtnf_set_default_mgmt_key(struct wiphy *wiphy, struct net_device *dev, } static int -qtnf_change_station(struct wiphy *wiphy, struct net_device *dev, +qtnf_change_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { - struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); + struct qtnf_vif *vif = 
qtnf_netdev_get_priv(wdev->netdev); int ret; ret = qtnf_cmd_send_change_sta(vif, mac, params); @@ -617,14 +617,14 @@ qtnf_change_station(struct wiphy *wiphy, struct net_device *dev, } static int -qtnf_del_station(struct wiphy *wiphy, struct net_device *dev, +qtnf_del_station(struct wiphy *wiphy, struct wireless_dev *wdev, struct station_del_parameters *params) { - struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); + struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); int ret; if (params->mac && - (vif->wdev.iftype == NL80211_IFTYPE_AP) && + (wdev->iftype == NL80211_IFTYPE_AP) && !is_broadcast_ether_addr(params->mac) && !qtnf_sta_list_lookup(&vif->sta_list, params->mac)) return 0; diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c index 2551d74ed56e..fb5a56b6dd05 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/event.c +++ b/drivers/net/wireless/quantenna/qtnfmac/event.c @@ -90,8 +90,8 @@ qtnf_event_handle_sta_assoc(struct qtnf_wmac *mac, struct qtnf_vif *vif, goto out; } - cfg80211_new_sta(vif->netdev, sta_assoc->sta_addr, sinfo, - GFP_KERNEL); + cfg80211_new_sta(vif->netdev->ieee80211_ptr, sta_assoc->sta_addr, + sinfo, GFP_KERNEL); out: kfree(sinfo); @@ -126,7 +126,7 @@ qtnf_event_handle_sta_deauth(struct qtnf_wmac *mac, struct qtnf_vif *vif, sta_addr, reason); if (qtnf_sta_list_del(vif, sta_addr)) - cfg80211_del_sta(vif->netdev, sta_deauth->sta_addr, + cfg80211_del_sta(&vif->wdev, sta_deauth->sta_addr, GFP_KERNEL); return 0; diff --git a/drivers/net/wireless/virtual/virt_wifi.c b/drivers/net/wireless/virtual/virt_wifi.c index 885dc7243e8d..bc349c763578 100644 --- a/drivers/net/wireless/virtual/virt_wifi.c +++ b/drivers/net/wireless/virtual/virt_wifi.c @@ -320,9 +320,11 @@ static int virt_wifi_disconnect(struct wiphy *wiphy, struct net_device *netdev, } /* Called with the rtnl lock held. 
*/ -static int virt_wifi_get_station(struct wiphy *wiphy, struct net_device *dev, - const u8 *mac, struct station_info *sinfo) +static int virt_wifi_get_station(struct wiphy *wiphy, + struct wireless_dev *wdev, const u8 *mac, + struct station_info *sinfo) { + struct net_device *dev = wdev->netdev; struct virt_wifi_netdev_priv *priv = netdev_priv(dev); wiphy_debug(wiphy, "get_station\n"); @@ -345,10 +347,10 @@ static int virt_wifi_get_station(struct wiphy *wiphy, struct net_device *dev, } /* Called with the rtnl lock held. */ -static int virt_wifi_dump_station(struct wiphy *wiphy, struct net_device *dev, +static int virt_wifi_dump_station(struct wiphy *wiphy, struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo) { - struct virt_wifi_netdev_priv *priv = netdev_priv(dev); + struct virt_wifi_netdev_priv *priv = netdev_priv(wdev->netdev); wiphy_debug(wiphy, "dump_station\n"); @@ -356,7 +358,7 @@ static int virt_wifi_dump_station(struct wiphy *wiphy, struct net_device *dev, return -ENOENT; ether_addr_copy(mac, fake_router_bssid); - return virt_wifi_get_station(wiphy, dev, fake_router_bssid, sinfo); + return virt_wifi_get_station(wiphy, wdev, fake_router_bssid, sinfo); } static const struct cfg80211_ops virt_wifi_cfg80211_ops = { diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index 7cb0c6f22bf3..83422c5c8c44 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -960,11 +960,12 @@ static int cfg80211_rtw_set_default_key(struct wiphy *wiphy, } static int cfg80211_rtw_get_station(struct wiphy *wiphy, - struct net_device *ndev, + struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { int ret = 0; + struct net_device *ndev = wdev_to_ndev(wdev); struct adapter *padapter = rtw_netdev_priv(ndev); struct mlme_priv *pmlmepriv = &padapter->mlmepriv; struct sta_info *psta = NULL; @@ -1912,7 +1913,7 @@ static int 
cfg80211_rtw_flush_pmksa(struct wiphy *wiphy, void rtw_cfg80211_indicate_sta_assoc(struct adapter *padapter, u8 *pmgmt_frame, uint frame_len) { - struct net_device *ndev = padapter->pnetdev; + struct wireless_dev *wdev = padapter->rtw_wdev; { struct station_info sinfo = {}; @@ -1926,15 +1927,15 @@ void rtw_cfg80211_indicate_sta_assoc(struct adapter *padapter, u8 *pmgmt_frame, sinfo.filled = 0; sinfo.assoc_req_ies = pmgmt_frame + WLAN_HDR_A3_LEN + ie_offset; sinfo.assoc_req_ies_len = frame_len - WLAN_HDR_A3_LEN - ie_offset; - cfg80211_new_sta(ndev, GetAddr2Ptr(pmgmt_frame), &sinfo, GFP_ATOMIC); + cfg80211_new_sta(wdev, GetAddr2Ptr(pmgmt_frame), &sinfo, GFP_ATOMIC); } } void rtw_cfg80211_indicate_sta_disassoc(struct adapter *padapter, unsigned char *da, unsigned short reason) { - struct net_device *ndev = padapter->pnetdev; + struct wireless_dev *wdev = padapter->rtw_wdev; - cfg80211_del_sta(ndev, da, GFP_ATOMIC); + cfg80211_del_sta(wdev, da, GFP_ATOMIC); } static u8 rtw_get_chan_type(struct adapter *adapter) @@ -2323,21 +2324,22 @@ static int cfg80211_rtw_stop_ap(struct wiphy *wiphy, struct net_device *ndev, } static int cfg80211_rtw_add_station(struct wiphy *wiphy, - struct net_device *ndev, + struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { return 0; } -static int cfg80211_rtw_del_station(struct wiphy *wiphy, struct net_device *ndev, +static int cfg80211_rtw_del_station(struct wiphy *wiphy, + struct wireless_dev *wdev, struct station_del_parameters *params) { int ret = 0; struct list_head *phead, *plist, *tmp; u8 updated = false; struct sta_info *psta = NULL; - struct adapter *padapter = rtw_netdev_priv(ndev); + struct adapter *padapter = rtw_netdev_priv(wdev->netdev); struct mlme_priv *pmlmepriv = &(padapter->mlmepriv); struct sta_priv *pstapriv = &padapter->stapriv; const u8 *mac = params->mac; @@ -2388,7 +2390,7 @@ static int cfg80211_rtw_del_station(struct wiphy *wiphy, struct net_device *ndev } static int 
cfg80211_rtw_change_station(struct wiphy *wiphy, - struct net_device *ndev, + struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { @@ -2416,12 +2418,12 @@ static struct sta_info *rtw_sta_info_get_by_idx(const int idx, struct sta_priv * } static int cfg80211_rtw_dump_station(struct wiphy *wiphy, - struct net_device *ndev, + struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo) { int ret = 0; - struct adapter *padapter = rtw_netdev_priv(ndev); + struct adapter *padapter = rtw_netdev_priv(wdev_to_ndev(wdev)); struct sta_info *psta = NULL; struct sta_priv *pstapriv = &padapter->stapriv; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cea77bf90cfe..c21354647da0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4954,17 +4954,17 @@ struct cfg80211_ops { unsigned int link_id); - int (*add_station)(struct wiphy *wiphy, struct net_device *dev, + int (*add_station)(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params); - int (*del_station)(struct wiphy *wiphy, struct net_device *dev, + int (*del_station)(struct wiphy *wiphy, struct wireless_dev *wdev, struct station_del_parameters *params); - int (*change_station)(struct wiphy *wiphy, struct net_device *dev, + int (*change_station)(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params); - int (*get_station)(struct wiphy *wiphy, struct net_device *dev, + int (*get_station)(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo); - int (*dump_station)(struct wiphy *wiphy, struct net_device *dev, + int (*dump_station)(struct wiphy *wiphy, struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo); int (*add_mpath)(struct wiphy *wiphy, struct net_device *dev, @@ -8965,35 +8965,35 @@ static inline void cfg80211_sinfo_release_content(struct station_info *sinfo) /** * cfg80211_new_sta - notify userspace about station 
* - * @dev: the netdev + * @wdev: the wireless device * @mac_addr: the station's address * @sinfo: the station information * @gfp: allocation flags */ -void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, +void cfg80211_new_sta(struct wireless_dev *wdev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp); /** * cfg80211_del_sta_sinfo - notify userspace about deletion of a station - * @dev: the netdev + * @wdev: the wireless device * @mac_addr: the station's address. For MLD station, MLD address is used. * @sinfo: the station information/statistics * @gfp: allocation flags */ -void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, +void cfg80211_del_sta_sinfo(struct wireless_dev *wdev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp); /** * cfg80211_del_sta - notify userspace about deletion of a station * - * @dev: the netdev + * @wdev: the wireless device * @mac_addr: the station's address. For MLD station, MLD address is used. * @gfp: allocation flags */ -static inline void cfg80211_del_sta(struct net_device *dev, +static inline void cfg80211_del_sta(struct wireless_dev *wdev, const u8 *mac_addr, gfp_t gfp) { - cfg80211_del_sta_sinfo(dev, mac_addr, NULL, gfp); + cfg80211_del_sta_sinfo(wdev, mac_addr, NULL, gfp); } /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index df5153772592..aa3b86644e8f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1000,10 +1000,10 @@ void sta_set_rate_info_tx(struct sta_info *sta, rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; } -static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev, +static int ieee80211_dump_station(struct wiphy *wiphy, struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret = -ENOENT; @@ -1035,10 
+1035,11 @@ static int ieee80211_dump_survey(struct wiphy *wiphy, struct net_device *dev, return drv_get_survey(local, idx, survey); } -static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, +static int ieee80211_get_station(struct wiphy *wiphy, + struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret = -ENOENT; @@ -2363,7 +2364,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, return 0; } -static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, +static int ieee80211_add_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { @@ -2381,7 +2382,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, sdata->vif.type != NL80211_IFTYPE_AP) return -EINVAL; } else - sdata = IEEE80211_DEV_TO_SUB_IF(dev); + sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (ether_addr_equal(mac, sdata->vif.addr)) return -EINVAL; @@ -2435,12 +2436,12 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, return sta_info_insert(sta); } -static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, +static int ieee80211_del_station(struct wiphy *wiphy, struct wireless_dev *wdev, struct station_del_parameters *params) { struct ieee80211_sub_if_data *sdata; - sdata = IEEE80211_DEV_TO_SUB_IF(dev); + sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (params->mac) return sta_info_destroy_addr_bss(sdata, params->mac); @@ -2450,10 +2451,10 @@ static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_change_station(struct wiphy *wiphy, - struct net_device *dev, const u8 *mac, + struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params) { - 
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; struct ieee80211_sub_if_data *vlansdata; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 6dc22f1593be..4259e9c13ed7 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -974,7 +974,7 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) } sinfo->generation = local->sta_generation; - cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); + cfg80211_new_sta(&sdata->wdev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr); @@ -1557,7 +1557,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc) sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr); - cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); + cfg80211_del_sta_sinfo(&sdata->wdev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); ieee80211_sta_debugfs_remove(sta); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3486475dc119..f54b3cca6975 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7507,7 +7507,7 @@ nla_put_failure: static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg80211_registered_device *rdev, - struct net_device *dev, + struct wireless_dev *wdev, const u8 *mac_addr, struct station_info *sinfo, bool link_stats) { @@ -7523,7 +7523,10 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, return -1; } - if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || + if ((wdev->netdev && + nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex)) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr) || nla_put_u32(msg, NL80211_ATTR_GENERATION, 
sinfo->generation)) goto nla_put_failure; @@ -8002,7 +8005,7 @@ static int nl80211_dump_station(struct sk_buff *skb, sinfo_alloc = true; } - err = rdev_dump_station(rdev, wdev->netdev, sta_idx, + err = rdev_dump_station(rdev, wdev, sta_idx, mac_addr, &sinfo); if (err == -ENOENT) break; @@ -8020,7 +8023,7 @@ static int nl80211_dump_station(struct sk_buff *skb, if (nl80211_send_station(skb, NL80211_CMD_NEW_STATION, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - rdev, wdev->netdev, mac_addr, + rdev, wdev, mac_addr, &sinfo, false) < 0) goto out; @@ -8041,7 +8044,7 @@ static int nl80211_dump_station(struct sk_buff *skb, static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; - struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = info->user_ptr[1]; struct station_info sinfo; struct sk_buff *msg; u8 *mac_addr = NULL; @@ -8049,6 +8052,9 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) memset(&sinfo, 0, sizeof(sinfo)); + if (!wdev->netdev) + return -EINVAL; + if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -8065,7 +8071,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) } } - err = rdev_get_station(rdev, dev, mac_addr, &sinfo); + err = rdev_get_station(rdev, wdev, mac_addr, &sinfo); if (err) { cfg80211_sinfo_release_content(&sinfo); return err; @@ -8082,7 +8088,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, info->snd_portid, info->snd_seq, 0, - rdev, dev, mac_addr, &sinfo, false) < 0) { + rdev, wdev, mac_addr, &sinfo, false) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -8444,13 +8450,17 @@ static int nl80211_parse_sta_txpower_setting(struct genl_info *info, static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; - struct 
net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = info->user_ptr[1]; + struct net_device *dev = wdev->netdev; struct station_parameters params; u8 *mac_addr; int err; memset(&params, 0, sizeof(params)); + if (!dev) + return -EINVAL; + if (!rdev->ops->change_station) return -EOPNOTSUPP; @@ -8523,7 +8533,7 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) nla_len(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]); } - if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params)) + if (parse_station_flags(info, wdev->iftype, &params)) return -EINVAL; if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) @@ -8583,7 +8593,7 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(params.vlan)) return PTR_ERR(params.vlan); - switch (dev->ieee80211_ptr->iftype) { + switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_GO: @@ -8598,7 +8608,7 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) } /* driver will call cfg80211_check_station_change() */ - err = rdev_change_station(rdev, dev, mac_addr, &params); + err = rdev_change_station(rdev, wdev, mac_addr, &params); out_put_vlan: dev_put(params.vlan); @@ -8610,8 +8620,8 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; - struct net_device *dev = info->user_ptr[1]; - struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wireless_dev *wdev = info->user_ptr[1]; + struct net_device *dev = wdev->netdev; struct station_parameters params; u8 *mac_addr = NULL; u32 auth_assoc = BIT(NL80211_STA_FLAG_AUTHENTICATED) | @@ -8619,6 +8629,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) memset(&params, 0, sizeof(params)); + if (!dev) + return -EINVAL; + if (!rdev->ops->add_station) return -EOPNOTSUPP; @@ -8668,7 +8681,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct 
genl_info *info) * and is NOT supported for AP interface */ params.support_p2p_ps = - dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO; + wdev->iftype == NL80211_IFTYPE_P2P_GO; } if (info->attrs[NL80211_ATTR_PEER_AID]) @@ -8774,7 +8787,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (err) return err; - if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params)) + if (parse_station_flags(info, wdev->iftype, &params)) return -EINVAL; /* HT/VHT requires QoS, but if we don't have that just ignore HT/VHT @@ -8802,7 +8815,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) /* When you run into this, adjust the code below for the new flag */ BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 8); - switch (dev->ieee80211_ptr->iftype) { + switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_GO: @@ -8911,7 +8924,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.epp_peer = nla_get_flag(info->attrs[NL80211_ATTR_EPP_PEER]); - err = rdev_add_station(rdev, dev, mac_addr, &params); + err = rdev_add_station(rdev, wdev, mac_addr, &params); out: dev_put(params.vlan); return err; @@ -8920,13 +8933,16 @@ out: static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; - struct net_device *dev = info->user_ptr[1]; - struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wireless_dev *wdev = info->user_ptr[1]; + struct net_device *dev = wdev->netdev; struct station_del_parameters params; int link_id = nl80211_link_id_or_invalid(info->attrs); memset(&params, 0, sizeof(params)); + if (!dev) + return -EINVAL; + if (info->attrs[NL80211_ATTR_MAC]) params.mac = nla_data(info->attrs[NL80211_ATTR_MAC]); @@ -8982,7 +8998,7 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) params.link_id = link_id; - return rdev_del_station(rdev, dev, &params); + return 
rdev_del_station(rdev, wdev, &params); } static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq, @@ -14241,7 +14257,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, mac_addr = wdev->links[0].client.current_bss->pub.bssid; - err = rdev_get_station(rdev, dev, mac_addr, &sinfo); + err = rdev_get_station(rdev, wdev, mac_addr, &sinfo); if (err) return err; @@ -17342,7 +17358,7 @@ static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info) !ether_addr_equal(buf + ETH_ALEN, dev->dev_addr)) return -EINVAL; - err = rdev_get_station(rdev, dev, dest, &sinfo); + err = rdev_get_station(rdev, wdev, dest, &sinfo); if (err) return err; @@ -18424,21 +18440,21 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_station, .dumpit = nl80211_dump_station, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV), }, { .cmd = NL80211_CMD_SET_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_NEW_STATION, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_station, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_DEL_STATION, @@ -18449,7 +18465,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { * whether MAC address is passed or not. If MAC address is * passed, then even during MLO, link ID is not required. 
*/ - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_GET_MPATH, @@ -20380,21 +20396,21 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie, } EXPORT_SYMBOL(cfg80211_tx_mgmt_expired); -void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, +void cfg80211_new_sta(struct wireless_dev *wdev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp) { - struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; - trace_cfg80211_new_sta(dev, mac_addr, sinfo); + trace_cfg80211_new_sta(wdev, mac_addr, sinfo); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo, false) < 0) { + rdev, wdev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } @@ -20404,10 +20420,10 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, } EXPORT_SYMBOL(cfg80211_new_sta); -void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, +void cfg80211_del_sta_sinfo(struct wireless_dev *wdev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp) { - struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; struct station_info empty_sinfo = {}; @@ -20415,7 +20431,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, if (!sinfo) sinfo = &empty_sinfo; - trace_cfg80211_del_sta(dev, mac_addr); + trace_cfg80211_del_sta(wdev, mac_addr); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) { @@ -20424,7 +20440,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, } if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo, false) < 0) { + rdev, wdev, 
mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index ac6884bacf3f..a8f1e7ddc0c0 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -193,56 +193,56 @@ static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev, } static inline int rdev_add_station(struct cfg80211_registered_device *rdev, - struct net_device *dev, u8 *mac, + struct wireless_dev *wdev, u8 *mac, struct station_parameters *params) { int ret; - trace_rdev_add_station(&rdev->wiphy, dev, mac, params); - ret = rdev->ops->add_station(&rdev->wiphy, dev, mac, params); + trace_rdev_add_station(&rdev->wiphy, wdev, mac, params); + ret = rdev->ops->add_station(&rdev->wiphy, wdev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_station(struct cfg80211_registered_device *rdev, - struct net_device *dev, + struct wireless_dev *wdev, struct station_del_parameters *params) { int ret; - trace_rdev_del_station(&rdev->wiphy, dev, params); - ret = rdev->ops->del_station(&rdev->wiphy, dev, params); + trace_rdev_del_station(&rdev->wiphy, wdev, params); + ret = rdev->ops->del_station(&rdev->wiphy, wdev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_station(struct cfg80211_registered_device *rdev, - struct net_device *dev, u8 *mac, + struct wireless_dev *wdev, u8 *mac, struct station_parameters *params) { int ret; - trace_rdev_change_station(&rdev->wiphy, dev, mac, params); - ret = rdev->ops->change_station(&rdev->wiphy, dev, mac, params); + trace_rdev_change_station(&rdev->wiphy, wdev, mac, params); + ret = rdev->ops->change_station(&rdev->wiphy, wdev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_station(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *mac, + struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { int ret; - 
trace_rdev_get_station(&rdev->wiphy, dev, mac); - ret = rdev->ops->get_station(&rdev->wiphy, dev, mac, sinfo); + trace_rdev_get_station(&rdev->wiphy, wdev, mac); + ret = rdev->ops->get_station(&rdev->wiphy, wdev, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_dump_station(struct cfg80211_registered_device *rdev, - struct net_device *dev, int idx, u8 *mac, + struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo) { int ret; - trace_rdev_dump_station(&rdev->wiphy, dev, idx, mac); - ret = rdev->ops->dump_station(&rdev->wiphy, dev, idx, mac, sinfo); + trace_rdev_dump_station(&rdev->wiphy, wdev, idx, mac); + ret = rdev->ops->dump_station(&rdev->wiphy, wdev, idx, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 352a57d8b968..8ab78a899f57 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -856,12 +856,12 @@ TRACE_EVENT(rdev_end_cac, ); DECLARE_EVENT_CLASS(station_add_change, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u8 *mac, struct station_parameters *params), - TP_ARGS(wiphy, netdev, mac, params), + TP_ARGS(wiphy, wdev, mac, params), TP_STRUCT__entry( WIPHY_ENTRY - NETDEV_ENTRY + WDEV_ENTRY MAC_ENTRY(sta_mac) __field(u32, sta_flags_mask) __field(u32, sta_flags_set) @@ -888,7 +888,7 @@ DECLARE_EVENT_CLASS(station_add_change, ), TP_fast_assign( WIPHY_ASSIGN; - NETDEV_ASSIGN; + WDEV_ASSIGN; MAC_ASSIGN(sta_mac, mac); __entry->sta_flags_mask = params->sta_flags_mask; __entry->sta_flags_set = params->sta_flags_set; @@ -936,11 +936,11 @@ DECLARE_EVENT_CLASS(station_add_change, __entry->opmode_notif_used = params->link_sta_params.opmode_notif_used; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", station mac: %pM" ", station flags mask: 
0x%x, station flags set: 0x%x, " "station modify mask: 0x%x, listen interval: %d, aid: %u, " "plink action: %u, plink state: %u, uapsd queues: %u, vlan:%s", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac, + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->sta_mac, __entry->sta_flags_mask, __entry->sta_flags_set, __entry->sta_modify_mask, __entry->listen_interval, __entry->aid, __entry->plink_action, __entry->plink_state, @@ -948,15 +948,15 @@ DECLARE_EVENT_CLASS(station_add_change, ); DEFINE_EVENT(station_add_change, rdev_add_station, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u8 *mac, struct station_parameters *params), - TP_ARGS(wiphy, netdev, mac, params) + TP_ARGS(wiphy, wdev, mac, params) ); DEFINE_EVENT(station_add_change, rdev_change_station, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u8 *mac, struct station_parameters *params), - TP_ARGS(wiphy, netdev, mac, params) + TP_ARGS(wiphy, wdev, mac, params) ); DECLARE_EVENT_CLASS(wiphy_netdev_mac_evt, @@ -977,12 +977,12 @@ DECLARE_EVENT_CLASS(wiphy_netdev_mac_evt, ); DECLARE_EVENT_CLASS(station_del, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, struct station_del_parameters *params), - TP_ARGS(wiphy, netdev, params), + TP_ARGS(wiphy, wdev, params), TP_STRUCT__entry( WIPHY_ENTRY - NETDEV_ENTRY + WDEV_ENTRY MAC_ENTRY(sta_mac) __field(u8, subtype) __field(u16, reason_code) @@ -990,28 +990,45 @@ DECLARE_EVENT_CLASS(station_del, ), TP_fast_assign( WIPHY_ASSIGN; - NETDEV_ASSIGN; + WDEV_ASSIGN; MAC_ASSIGN(sta_mac, params->mac); __entry->subtype = params->subtype; __entry->reason_code = params->reason_code; __entry->link_id = params->link_id; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", station mac: %pM" ", subtype: %u, 
reason_code: %u, link_id: %d", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac, + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->sta_mac, __entry->subtype, __entry->reason_code, __entry->link_id) ); DEFINE_EVENT(station_del, rdev_del_station, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, struct station_del_parameters *params), - TP_ARGS(wiphy, netdev, params) + TP_ARGS(wiphy, wdev, params) ); -DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_get_station, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac), - TP_ARGS(wiphy, netdev, mac) +DECLARE_EVENT_CLASS(wiphy_wdev_mac_evt, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac), + TP_ARGS(wiphy, wdev, mac), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + MAC_ENTRY(sta_mac) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + MAC_ASSIGN(sta_mac, mac); + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", mac: %pM", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->sta_mac) +); + +DEFINE_EVENT(wiphy_wdev_mac_evt, rdev_get_station, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac), + TP_ARGS(wiphy, wdev, mac) ); DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_del_mpath, @@ -1020,23 +1037,23 @@ DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_del_mpath, ); TRACE_EVENT(rdev_dump_station, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, int _idx, u8 *mac), - TP_ARGS(wiphy, netdev, _idx, mac), + TP_ARGS(wiphy, wdev, _idx, mac), TP_STRUCT__entry( WIPHY_ENTRY - NETDEV_ENTRY + WDEV_ENTRY MAC_ENTRY(sta_mac) __field(int, idx) ), TP_fast_assign( WIPHY_ASSIGN; - NETDEV_ASSIGN; + WDEV_ASSIGN; MAC_ASSIGN(sta_mac, mac); __entry->idx = _idx; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM, idx: %d", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac, + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", station mac: %pM, idx: %d", + WIPHY_PR_ARG, WDEV_PR_ARG, 
__entry->sta_mac, __entry->idx) ); @@ -3153,6 +3170,21 @@ DECLARE_EVENT_CLASS(cfg80211_netdev_mac_evt, NETDEV_PR_ARG, __entry->macaddr) ); +DECLARE_EVENT_CLASS(cfg80211_wdev_mac_evt, + TP_PROTO(struct wireless_dev *wdev, const u8 *macaddr), + TP_ARGS(wdev, macaddr), + TP_STRUCT__entry( + WDEV_ENTRY + MAC_ENTRY(macaddr) + ), + TP_fast_assign( + WDEV_ASSIGN; + MAC_ASSIGN(macaddr, macaddr); + ), + TP_printk(WDEV_PR_FMT ", mac: %pM", + WDEV_PR_ARG, __entry->macaddr) +); + DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate, TP_PROTO(struct net_device *netdev, const u8 *macaddr), TP_ARGS(netdev, macaddr) @@ -3342,26 +3374,26 @@ TRACE_EVENT(cfg80211_tx_mgmt_expired, ); TRACE_EVENT(cfg80211_new_sta, - TP_PROTO(struct net_device *netdev, const u8 *mac_addr, + TP_PROTO(struct wireless_dev *wdev, const u8 *mac_addr, struct station_info *sinfo), - TP_ARGS(netdev, mac_addr, sinfo), + TP_ARGS(wdev, mac_addr, sinfo), TP_STRUCT__entry( - NETDEV_ENTRY + WDEV_ENTRY MAC_ENTRY(mac_addr) SINFO_ENTRY ), TP_fast_assign( - NETDEV_ASSIGN; + WDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); SINFO_ASSIGN; ), - TP_printk(NETDEV_PR_FMT ", %pM", - NETDEV_PR_ARG, __entry->mac_addr) + TP_printk(WDEV_PR_FMT ", %pM", + WDEV_PR_ARG, __entry->mac_addr) ); -DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta, - TP_PROTO(struct net_device *netdev, const u8 *macaddr), - TP_ARGS(netdev, macaddr) +DEFINE_EVENT(cfg80211_wdev_mac_evt, cfg80211_del_sta, + TP_PROTO(struct wireless_dev *wdev, const u8 *macaddr), + TP_ARGS(wdev, macaddr) ); TRACE_EVENT(cfg80211_rx_mgmt, diff --git a/net/wireless/util.c b/net/wireless/util.c index b78530c3e3f8..702904048d5a 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2669,7 +2669,7 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, guard(wiphy)(&rdev->wiphy); - return rdev_get_station(rdev, dev, mac_addr, sinfo); + return rdev_get_station(rdev, wdev, mac_addr, sinfo); } EXPORT_SYMBOL(cfg80211_get_station); diff --git 
a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 5a70a0120343..98a4f4c7970d 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -7,7 +7,7 @@ * we directly assign the wireless handlers of wireless interfaces. * * Copyright 2008-2009 Johannes Berg - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2023, 2026 Intel Corporation */ #include @@ -1261,7 +1261,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev, return err; scoped_guard(wiphy, &rdev->wiphy) { - err = rdev_get_station(rdev, dev, addr, &sinfo); + err = rdev_get_station(rdev, wdev, addr, &sinfo); } if (err) return err; @@ -1305,7 +1305,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) memset(&sinfo, 0, sizeof(sinfo)); - ret = rdev_get_station(rdev, dev, bssid, &sinfo); + ret = rdev_get_station(rdev, wdev, bssid, &sinfo); wiphy_unlock(&rdev->wiphy); if (ret) -- cgit v1.2.3 From 0495b64132154dd04ed5d443bb35afd3769a13a6 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Fri, 20 Feb 2026 01:12:41 +0530 Subject: wifi: mac80211: fetch FILS discovery template by link ID Currently, the FILS discovery template is always fetched from the default link of a virtual interface in both Multi-Link Operation (MLO) and non-MLO cases. However, in the MLO case there is a need to fetch the FILS discovery template from a specific link instead of the default link. Hence, add support for fetching the FILS discovery template based on the link ID from the corresponding link data. 
Signed-off-by: Sriram R Co-developed-by: Raj Kumar Bhagat Signed-off-by: Raj Kumar Bhagat Link: https://patch.msgid.link/20260220-fils-prob-by-link-v1-1-a2746a853f75@oss.qualcomm.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 2 +- drivers/net/wireless/ath/ath12k/mac.c | 3 ++- drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7996/mcu.c | 3 ++- include/net/mac80211.h | 4 +++- net/mac80211/tx.c | 20 +++++++++++++------- 6 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index e4ee2ba1f669..dda77f87461e 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -3305,7 +3305,7 @@ static int ath11k_mac_fils_discovery(struct ath11k_vif *arvif, if (info->fils_discovery.max_interval) { interval = info->fils_discovery.max_interval; - tmpl = ieee80211_get_fils_discovery_tmpl(ar->hw, arvif->vif); + tmpl = ieee80211_get_fils_discovery_tmpl(ar->hw, arvif->vif, 0); if (tmpl) ret = ath11k_wmi_fils_discovery_tmpl(ar, arvif->vdev_id, tmpl); diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index c6b88909b6b7..af57ac10d517 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4311,7 +4311,8 @@ static int ath12k_mac_fils_discovery(struct ath12k_link_vif *arvif, if (info->fils_discovery.max_interval) { interval = info->fils_discovery.max_interval; - tmpl = ieee80211_get_fils_discovery_tmpl(hw, vif); + tmpl = ieee80211_get_fils_discovery_tmpl(hw, vif, + info->link_id); if (tmpl) ret = ath12k_wmi_fils_discovery_tmpl(ar, arvif->vdev_id, tmpl); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index 00bff4d3aab8..83ce06857a1e 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ 
b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -1977,7 +1977,7 @@ mt7915_mcu_add_inband_discov(struct mt7915_dev *dev, struct ieee80211_vif *vif, if (changed & BSS_CHANGED_FILS_DISCOVERY) { interval = vif->bss_conf.fils_discovery.max_interval; - skb = ieee80211_get_fils_discovery_tmpl(hw, vif); + skb = ieee80211_get_fils_discovery_tmpl(hw, vif, 0); } else if (changed & BSS_CHANGED_UNSOL_BCAST_PROBE_RESP && vif->bss_conf.unsol_bcast_probe_resp_interval) { interval = vif->bss_conf.unsol_bcast_probe_resp_interval; diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c index c0c042de477b..968afc2967a8 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c @@ -2863,7 +2863,8 @@ int mt7996_mcu_beacon_inband_discov(struct mt7996_dev *dev, if (changed & BSS_CHANGED_FILS_DISCOVERY && link_conf->fils_discovery.max_interval) { interval = link_conf->fils_discovery.max_interval; - skb = ieee80211_get_fils_discovery_tmpl(hw, vif); + skb = ieee80211_get_fils_discovery_tmpl(hw, vif, + link_conf->link_id); } else if (changed & BSS_CHANGED_UNSOL_BCAST_PROBE_RESP && link_conf->unsol_bcast_probe_resp_interval) { interval = link_conf->unsol_bcast_probe_resp_interval; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7f9d96939a4e..d36c14a86c8a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7766,13 +7766,15 @@ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO. * * The driver is responsible for freeing the returned skb. * * Return: FILS discovery template. %NULL on error. 
*/ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + unsigned int link_id); /** * ieee80211_get_unsol_bcast_probe_resp_tmpl - Get unsolicited broadcast diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8cdbd417d7be..77ad85a49924 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5837,21 +5837,28 @@ out: EXPORT_SYMBOL(ieee80211_proberesp_get); struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, - struct ieee80211_vif *vif) + struct ieee80211_vif *vif, + unsigned int link_id) { struct sk_buff *skb = NULL; struct fils_discovery_data *tmpl = NULL; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_link_data *link; if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; - rcu_read_lock(); - tmpl = rcu_dereference(sdata->deflink.u.ap.fils_discovery); - if (!tmpl) { - rcu_read_unlock(); + if (link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + return NULL; + + guard(rcu)(); + link = rcu_dereference(sdata->link[link_id]); + if (!link) + return NULL; + + tmpl = rcu_dereference(link->u.ap.fils_discovery); + if (!tmpl) return NULL; - } skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len); if (skb) { @@ -5859,7 +5866,6 @@ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, skb_put_data(skb, tmpl->data, tmpl->len); } - rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_fils_discovery_tmpl); -- cgit v1.2.3 From e098c26b3524b6a8087dfc8f664d7cc76d30ecc2 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Fri, 20 Feb 2026 01:12:42 +0530 Subject: wifi: mac80211: fetch unsolicited probe response template by link ID Currently, the unsolicited probe response template is always fetched from the default link of a virtual interface in both Multi-Link Operation (MLO) and non-MLO cases. 
However, in the MLO case there is a need to fetch the unsolicited probe response template from a specific link instead of the default link. Hence, add support for fetching the unsolicited probe response template based on the link ID from the corresponding link data. Signed-off-by: Sriram R Co-developed-by: Raj Kumar Bhagat Signed-off-by: Raj Kumar Bhagat Link: https://patch.msgid.link/20260220-fils-prob-by-link-v1-2-a2746a853f75@oss.qualcomm.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 2 +- drivers/net/wireless/ath/ath12k/mac.c | 3 ++- drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7996/mcu.c | 3 ++- include/net/mac80211.h | 4 +++- net/mac80211/tx.c | 20 +++++++++++++------- 6 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index dda77f87461e..ca08de6bbe85 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -3314,7 +3314,7 @@ static int ath11k_mac_fils_discovery(struct ath11k_vif *arvif, interval = info->unsol_bcast_probe_resp_interval; tmpl = ieee80211_get_unsol_bcast_probe_resp_tmpl(ar->hw, - arvif->vif); + arvif->vif, 0); if (tmpl) ret = ath11k_wmi_probe_resp_tmpl(ar, arvif->vdev_id, tmpl); diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index af57ac10d517..275263f77f44 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4320,7 +4320,8 @@ static int ath12k_mac_fils_discovery(struct ath12k_link_vif *arvif, unsol_bcast_probe_resp_enabled = 1; interval = info->unsol_bcast_probe_resp_interval; - tmpl = ieee80211_get_unsol_bcast_probe_resp_tmpl(hw, vif); + tmpl = ieee80211_get_unsol_bcast_probe_resp_tmpl(hw, vif, + info->link_id); if (tmpl) ret = ath12k_wmi_probe_resp_tmpl(ar, arvif->vdev_id, tmpl); diff --git 
a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index 83ce06857a1e..2d2f34aa465d 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -1981,7 +1981,7 @@ mt7915_mcu_add_inband_discov(struct mt7915_dev *dev, struct ieee80211_vif *vif, } else if (changed & BSS_CHANGED_UNSOL_BCAST_PROBE_RESP && vif->bss_conf.unsol_bcast_probe_resp_interval) { interval = vif->bss_conf.unsol_bcast_probe_resp_interval; - skb = ieee80211_get_unsol_bcast_probe_resp_tmpl(hw, vif); + skb = ieee80211_get_unsol_bcast_probe_resp_tmpl(hw, vif, 0); } if (!skb) { diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c index 968afc2967a8..b4422a4754cd 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c @@ -2868,7 +2868,8 @@ int mt7996_mcu_beacon_inband_discov(struct mt7996_dev *dev, } else if (changed & BSS_CHANGED_UNSOL_BCAST_PROBE_RESP && link_conf->unsol_bcast_probe_resp_interval) { interval = link_conf->unsol_bcast_probe_resp_interval; - skb = ieee80211_get_unsol_bcast_probe_resp_tmpl(hw, vif); + skb = ieee80211_get_unsol_bcast_probe_resp_tmpl(hw, vif, + link_conf->link_id); } if (!skb) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d36c14a86c8a..89027e94ba5c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7781,6 +7781,7 @@ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, * probe response template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO. * * The driver is responsible for freeing the returned skb. 
* @@ -7788,7 +7789,8 @@ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, */ struct sk_buff * ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + unsigned int link_id); /** * ieee80211_obss_color_collision_notify - notify userland about a BSS color diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 77ad85a49924..28dcdd7f0e05 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5872,21 +5872,28 @@ EXPORT_SYMBOL(ieee80211_get_fils_discovery_tmpl); struct sk_buff * ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, - struct ieee80211_vif *vif) + struct ieee80211_vif *vif, + unsigned int link_id) { struct sk_buff *skb = NULL; struct unsol_bcast_probe_resp_data *tmpl = NULL; struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_link_data *link; if (sdata->vif.type != NL80211_IFTYPE_AP) return NULL; - rcu_read_lock(); - tmpl = rcu_dereference(sdata->deflink.u.ap.unsol_bcast_probe_resp); - if (!tmpl) { - rcu_read_unlock(); + if (link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + return NULL; + + guard(rcu)(); + link = rcu_dereference(sdata->link[link_id]); + if (!link) + return NULL; + + tmpl = rcu_dereference(link->u.ap.unsol_bcast_probe_resp); + if (!tmpl) return NULL; - } skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + tmpl->len); if (skb) { @@ -5894,7 +5901,6 @@ ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, skb_put_data(skb, tmpl->data, tmpl->len); } - rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_get_unsol_bcast_probe_resp_tmpl); -- cgit v1.2.3 From a536be923191e2662369ee87e5d7beb50946c71c Mon Sep 17 00:00:00 2001 From: Sai Pratyusha Magam Date: Thu, 26 Feb 2026 09:59:59 +0530 Subject: wifi: mac80211: Fix AAD/Nonce computation for management frames with MLO Per IEEE Std 802.11be-2024, 12.5.2.3.3, if the MPDU is an individually addressed Data frame between an AP MLD and a non-AP MLD 
associated with the AP MLD, then A1/A2/A3 will be MLD MAC addresses. Otherwise, A1/A2/A3 will be over-the-air link MAC addresses. Currently, during AAD and Nonce computation for software-based encryption/decryption cases, mac80211 directly uses the addresses it receives in the skb frame header. However, after the first authentication, management frame addresses for non-AP MLD stations are translated to MLD addresses from over-the-air link addresses in software. This means that the skb header could contain translated MLD addresses, which when used as is, can lead to incorrect AAD/Nonce computation. In the following manner, ensure that the right set of addresses is used: In the receive path, stash the pre-translated link addresses in ieee80211_rx_data and use them for the AAD/Nonce computations when required. In the transmit path, offload the encryption for a CCMP/GCMP key to the hwsim driver that can then ensure that encryption and hence the AAD/Nonce computations are performed on the frame containing the right set of addresses, i.e., MLD addresses if unicast data frame and link addresses otherwise. To do so, register the set key handler in the hwsim driver so mac80211 is aware that it is the driver that would take care of encrypting the frame. 
Offload encryption for a CCMP/GCMP key, while keeping the encryption for WEP/TKIP and MMIE generation for an AES_CMAC or an AES_GMAC key still at the SW crypto in the MAC layer Co-developed-by: Rohan Dutta Signed-off-by: Rohan Dutta Signed-off-by: Sai Pratyusha Magam Link: https://patch.msgid.link/20260226042959.3766157-1-sai.magam@oss.qualcomm.com [only store and apply link_addrs for unicast non-data rather than storing always and applying for !unicast_data] Signed-off-by: Johannes Berg --- drivers/net/wireless/virtual/mac80211_hwsim.c | 46 +++++++++++++++++++++- include/net/mac80211.h | 7 ++++ net/mac80211/ieee80211_i.h | 2 + net/mac80211/rx.c | 5 +++ net/mac80211/tx.c | 32 ++++++++++++++++ net/mac80211/wpa.c | 55 +++++++++++++++++++++------ 6 files changed, 134 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index d2a43eec4641..d8bb34809965 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -1992,6 +1992,25 @@ mac80211_hwsim_select_tx_link(struct mac80211_hwsim_data *data, return NULL; } +static int mac80211_hwsim_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + struct ieee80211_key_conf *key) + { + switch (key->cipher) { + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + break; + default: + return 1; + } + + key->flags |= IEEE80211_KEY_FLAG_RESERVE_TAILROOM; + return 0; +} + static void mac80211_hwsim_tx(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb) @@ -2002,7 +2021,7 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; struct ieee80211_vif *vif = txi->control.vif; - bool ack; + bool ack, unicast_data; enum nl80211_chan_width confbw 
= NL80211_CHAN_WIDTH_20_NOHT; u32 _portid, i; @@ -2012,6 +2031,16 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, return; } + unicast_data = is_unicast_ether_addr(hdr->addr1) && + ieee80211_is_data(hdr->frame_control); + + if (unicast_data && ieee80211_encrypt_tx_skb(skb) < 0) { + ieee80211_free_txskb(hw, skb); + return; + } + /* re-assign hdr since skb data may have shifted after encryption */ + hdr = (void *)skb->data; + if (vif && vif->type == NL80211_IFTYPE_NAN && !data->tmp_chan) { /* For NAN Device simulation purposes, assume that NAN is always * on channel 6 or channel 149, unless a ROC is in progress (for @@ -2097,6 +2126,13 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, } } + if (!unicast_data && ieee80211_encrypt_tx_skb(skb) < 0) { + ieee80211_free_txskb(hw, skb); + return; + } + /* re-assign hdr since skb data may have shifted after encryption */ + hdr = (void *)skb->data; + if (WARN(!channel, "TX w/o channel - queue = %d\n", txi->hw_queue)) { ieee80211_free_txskb(hw, skb); return; @@ -4245,6 +4281,7 @@ static int mac80211_hwsim_set_radar_background(struct ieee80211_hw *hw, .stop_nan = mac80211_hwsim_stop_nan, \ .nan_change_conf = mac80211_hwsim_change_nan_config, \ .set_radar_background = mac80211_hwsim_set_radar_background, \ + .set_key = mac80211_hwsim_set_key, \ HWSIM_DEBUGFS_OPS #define HWSIM_NON_MLO_OPS \ @@ -5684,6 +5721,7 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, WIPHY_FLAG_AP_UAPSD | WIPHY_FLAG_SUPPORTS_5_10_MHZ | WIPHY_FLAG_HAS_CHANNEL_SWITCH; + hw->wiphy->flags |= WIPHY_FLAG_IBSS_RSN; hw->wiphy->features |= NL80211_FEATURE_ACTIVE_MONITOR | NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE | NL80211_FEATURE_STATIC_SMPS | @@ -5702,6 +5740,12 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT); wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_BSS_COLOR); + wiphy_ext_feature_set(hw->wiphy, + NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT); + 
wiphy_ext_feature_set(hw->wiphy, + NL80211_EXT_FEATURE_CAN_REPLACE_PTK0); + wiphy_ext_feature_set(hw->wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID); hw->wiphy->interface_modes = param->iftypes; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 89027e94ba5c..9f8251fb9832 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7966,4 +7966,11 @@ int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, * Return: %true iff the vif is a NAN interface and NAN is started */ bool ieee80211_vif_nan_started(struct ieee80211_vif *vif); + +/** + * ieee80211_encrypt_tx_skb - Encrypt the transmit skb + * @skb: the skb + * Return: 0 if success and non-zero on error + */ +int ieee80211_encrypt_tx_skb(struct sk_buff *skb); #endif /* MAC80211_H */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index e60b814dd89e..a4babf7624e5 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -256,6 +256,8 @@ struct ieee80211_rx_data { u8 pn[IEEE80211_CCMP_PN_LEN]; } ccm_gcm; }; + + u8 link_addrs[3 * ETH_ALEN]; }; struct ieee80211_csa_settings { diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 11d6c56c9d7e..6c4b549444c6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -5127,6 +5127,11 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, hdr = (struct ieee80211_hdr *)rx->skb->data; } + /* Store a copy of the pre-translated link addresses for SW crypto */ + if (unlikely(is_unicast_ether_addr(hdr->addr1) && + !ieee80211_is_data(hdr->frame_control))) + memcpy(rx->link_addrs, &hdr->addrs, 3 * ETH_ALEN); + if (unlikely(rx->sta && rx->sta->sta.mlo) && is_unicast_ether_addr(hdr->addr1) && !ieee80211_is_probe_resp(hdr->frame_control) && diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 28dcdd7f0e05..dd691ff549c3 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5315,6 +5315,38 @@ static int ieee80211_beacon_protect(struct sk_buff *skb, return 0; } +int 
ieee80211_encrypt_tx_skb(struct sk_buff *skb) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_sub_if_data *sdata; + struct sk_buff *check_skb; + struct ieee80211_tx_data tx; + ieee80211_tx_result res; + + if (!info->control.hw_key) + return 0; + + memset(&tx, 0, sizeof(tx)); + tx.key = container_of(info->control.hw_key, struct ieee80211_key, conf); + /* NULL it out now so we do full SW crypto */ + info->control.hw_key = NULL; + __skb_queue_head_init(&tx.skbs); + __skb_queue_tail(&tx.skbs, skb); + + sdata = IEEE80211_DEV_TO_SUB_IF(skb->dev); + tx.sdata = sdata; + tx.local = sdata->local; + res = ieee80211_tx_h_encrypt(&tx); + check_skb = __skb_dequeue(&tx.skbs); + /* we may crash after this, but it'd be a bug in crypto */ + WARN_ON(check_skb != skb); + if (WARN_ON_ONCE(res != TX_CONTINUE)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(ieee80211_encrypt_tx_skb); + static void ieee80211_beacon_get_finish(struct ieee80211_hw *hw, struct ieee80211_vif *vif, diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index fdf98c21d32c..64a57475ce50 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -315,7 +315,8 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) * Calculate AAD for CCMP/GCMP, returning qos_tid since we * need that in CCMP also for b_0. 
*/ -static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad, bool spp_amsdu) +static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad, bool spp_amsdu, + bool aad_nonce_computed) { struct ieee80211_hdr *hdr = (void *)skb->data; __le16 mask_fc; @@ -358,7 +359,8 @@ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad, bool spp_amsdu) * FC | A1 | A2 | A3 | SC | [A4] | [QC] */ put_unaligned_be16(len_a, &aad[0]); put_unaligned(mask_fc, (__le16 *)&aad[2]); - memcpy(&aad[4], &hdr->addrs, 3 * ETH_ALEN); + if (!aad_nonce_computed) + memcpy(&aad[4], &hdr->addrs, 3 * ETH_ALEN); /* Mask Seq#, leave Frag# */ aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f; @@ -377,10 +379,10 @@ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad, bool spp_amsdu) } static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad, - bool spp_amsdu) + bool spp_amsdu, bool aad_nonce_computed) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - u8 qos_tid = ccmp_gcmp_aad(skb, aad, spp_amsdu); + u8 qos_tid = ccmp_gcmp_aad(skb, aad, spp_amsdu, aad_nonce_computed); /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC * mode authentication are not allowed to collide, yet both are derived @@ -395,7 +397,8 @@ static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad, * Nonce Flags: Priority (b0..b3) | Management (b4) | Reserved (b5..b7) */ b_0[1] = qos_tid | (ieee80211_is_mgmt(hdr->frame_control) << 4); - memcpy(&b_0[2], hdr->addr2, ETH_ALEN); + if (!aad_nonce_computed) + memcpy(&b_0[2], hdr->addr2, ETH_ALEN); memcpy(&b_0[8], pn, IEEE80211_CCMP_PN_LEN); } @@ -488,7 +491,8 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb, pos += IEEE80211_CCMP_HDR_LEN; ccmp_special_blocks(skb, pn, b_0, aad, - key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); + key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU, + false); return ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len, skb_put(skb, mic_len)); } @@ -566,9 +570,22 @@ 
ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, if (!(status->flag & RX_FLAG_DECRYPTED)) { u8 aad[2 * AES_BLOCK_SIZE]; u8 b_0[AES_BLOCK_SIZE]; + bool aad_nonce_computed = false; + + if (is_unicast_ether_addr(hdr->addr1) && + !ieee80211_is_data(hdr->frame_control)) { + /* AAD computation */ + memcpy(&aad[4], rx->link_addrs, 3 * ETH_ALEN); + /* Nonce computation */ + ether_addr_copy(&b_0[2], + &rx->link_addrs[ETH_ALEN]); + aad_nonce_computed = true; + } + /* hardware didn't decrypt/verify MIC */ ccmp_special_blocks(skb, pn, b_0, aad, - key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); + key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU, + aad_nonce_computed); if (ieee80211_aes_ccm_decrypt( key->u.ccmp.tfm, b_0, aad, @@ -593,14 +610,15 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, } static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad, - bool spp_amsdu) + bool spp_amsdu, bool aad_nonce_computed) { struct ieee80211_hdr *hdr = (void *)skb->data; - memcpy(j_0, hdr->addr2, ETH_ALEN); + if (!aad_nonce_computed) + memcpy(j_0, hdr->addr2, ETH_ALEN); memcpy(&j_0[ETH_ALEN], pn, IEEE80211_GCMP_PN_LEN); - ccmp_gcmp_aad(skb, aad, spp_amsdu); + ccmp_gcmp_aad(skb, aad, spp_amsdu, aad_nonce_computed); } static inline void gcmp_pn2hdr(u8 *hdr, const u8 *pn, int key_id) @@ -690,7 +708,8 @@ static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) pos += IEEE80211_GCMP_HDR_LEN; gcmp_special_blocks(skb, pn, j_0, aad, - key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); + key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU, + false); return ieee80211_aes_gcm_encrypt(key->u.gcmp.tfm, j_0, aad, pos, len, skb_put(skb, IEEE80211_GCMP_MIC_LEN)); } @@ -763,9 +782,21 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) if (!(status->flag & RX_FLAG_DECRYPTED)) { u8 aad[2 * AES_BLOCK_SIZE]; u8 j_0[AES_BLOCK_SIZE]; + bool aad_nonce_computed = false; + + if (is_unicast_ether_addr(hdr->addr1) && + 
!ieee80211_is_data(hdr->frame_control)) { + /* AAD computation */ + memcpy(&aad[4], rx->link_addrs, 3 * ETH_ALEN); + /* Nonce computation */ + ether_addr_copy(&j_0[0], + &rx->link_addrs[ETH_ALEN]); + aad_nonce_computed = true; + } /* hardware didn't decrypt/verify MIC */ gcmp_special_blocks(skb, pn, j_0, aad, - key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); + key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU, + aad_nonce_computed); if (ieee80211_aes_gcm_decrypt( key->u.gcmp.tfm, j_0, aad, -- cgit v1.2.3 From bd77375097357b46af00db1316ceab5e82ccbc8b Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Fri, 27 Feb 2026 00:25:51 +0530 Subject: wifi: cfg80211: add support for IEEE 802.1X Authentication Protocol Add an extended feature flag NL80211_EXT_FEATURE_IEEE8021X_AUTH to allow a driver to indicate support for the IEEE 802.1X authentication protocol in non-AP STA mode, as defined in "IEEE P802.11bi/D4.0, 12.16.5". In case of SME in userspace, the Authentication frame body is prepared in userspace while the driver finalizes the Authentication frame once it receives the required fields and elements. The driver indicates support for IEEE 802.1X authentication using the extended feature flag so that userspace can initiate IEEE 802.1X authentication. When the feature flag is set, process IEEE 802.1X Authentication frames from userspace in non-AP STA mode. If the flag is not set, reject IEEE 802.1X Authentication frames. Define a new authentication type NL80211_AUTHTYPE_IEEE8021X for IEEE 802.1X authentication. 
Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20260226185553.1516290-4-kavita.kavita@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/uapi/linux/nl80211.h | 9 +++++++++ net/wireless/nl80211.c | 14 ++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0aa2fb8f88de..1bf806f85372 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1358,6 +1358,7 @@ struct ieee80211_tdls_data { #define WLAN_AUTH_FILS_SK 4 #define WLAN_AUTH_FILS_SK_PFS 5 #define WLAN_AUTH_FILS_PK 6 +#define WLAN_AUTH_IEEE8021X 8 #define WLAN_AUTH_EPPKE 9 #define WLAN_AUTH_LEAP 128 diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index fe2c8c8d6dd6..0b7a06c2b9f7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5491,6 +5491,8 @@ enum nl80211_bss_status { * @NL80211_AUTHTYPE_FILS_SK_PFS: Fast Initial Link Setup shared key with PFS * @NL80211_AUTHTYPE_FILS_PK: Fast Initial Link Setup public key * @NL80211_AUTHTYPE_EPPKE: Enhanced Privacy Protection Key Exchange + * @NL80211_AUTHTYPE_IEEE8021X: IEEE 802.1X authentication utilizing + * Authentication frames * @__NL80211_AUTHTYPE_NUM: internal * @NL80211_AUTHTYPE_MAX: maximum valid auth algorithm * @NL80211_AUTHTYPE_AUTOMATIC: determine automatically (if necessary by @@ -5507,6 +5509,7 @@ enum nl80211_auth_type { NL80211_AUTHTYPE_FILS_SK_PFS, NL80211_AUTHTYPE_FILS_PK, NL80211_AUTHTYPE_EPPKE, + NL80211_AUTHTYPE_IEEE8021X, /* keep last */ __NL80211_AUTHTYPE_NUM, @@ -6820,6 +6823,11 @@ enum nl80211_feature_flags { * frames in both non‑AP STA and AP mode as specified in * "IEEE P802.11bi/D3.0, 12.16.6". 
* + * @NL80211_EXT_FEATURE_IEEE8021X_AUTH: Driver supports IEEE 802.1X + * authentication utilizing Authentication frames with user space SME + * (NL80211_CMD_AUTHENTICATE) in non-AP STA mode, as specified in + * "IEEE P802.11bi/D4.0, 12.16.5". + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6898,6 +6906,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_BEACON_RATE_EHT, NL80211_EXT_FEATURE_EPPKE, NL80211_EXT_FEATURE_ASSOC_FRAME_ENCRYPTION, + NL80211_EXT_FEATURE_IEEE8021X_AUTH, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f54b3cca6975..de7956dbe0a0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6550,6 +6550,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_EPPKE) && auth_type == NL80211_AUTHTYPE_EPPKE) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_IEEE8021X_AUTH) && + auth_type == NL80211_AUTHTYPE_IEEE8021X) + return false; return true; case NL80211_CMD_CONNECT: if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && @@ -6571,6 +6575,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_EPPKE) && auth_type == NL80211_AUTHTYPE_EPPKE) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_IEEE8021X_AUTH) && + auth_type == NL80211_AUTHTYPE_IEEE8021X) + return false; return true; case NL80211_CMD_START_AP: if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -12103,7 +12111,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) auth_type == NL80211_AUTHTYPE_FILS_SK || auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || auth_type == NL80211_AUTHTYPE_FILS_PK || - auth_type == NL80211_AUTHTYPE_EPPKE) && + auth_type == NL80211_AUTHTYPE_EPPKE || + auth_type == NL80211_AUTHTYPE_IEEE8021X) && 
!info->attrs[NL80211_ATTR_AUTH_DATA]) return -EINVAL; @@ -12112,7 +12121,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) auth_type != NL80211_AUTHTYPE_FILS_SK && auth_type != NL80211_AUTHTYPE_FILS_SK_PFS && auth_type != NL80211_AUTHTYPE_FILS_PK && - auth_type != NL80211_AUTHTYPE_EPPKE) + auth_type != NL80211_AUTHTYPE_EPPKE && + auth_type != NL80211_AUTHTYPE_IEEE8021X) return -EINVAL; req.auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]); req.auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]); -- cgit v1.2.3 From 9347878b1513beee1a26bb249f5dc8326d450f75 Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Fri, 27 Feb 2026 00:25:52 +0530 Subject: wifi: mac80211: Add support for IEEE 802.1X authentication protocol in non-AP STA mode Add support for the IEEE 802.1X authentication protocol in non-AP STA mode, as specified in "IEEE P802.11bi/D4.0, 12.16.5". IEEE 802.1X authentication involves multiple Authentication frame exchanges, with the non-AP STA and AP alternating transaction sequence numbers. The number of Authentication frame exchanges depends on the EAP method in use. For IEEE 802.1X authentication, process only Authentication frames with the expected transaction sequence number. For IEEE 802.1X Authentication, Table 9-71 specifies that the Encapsulation Length field as specified in Clause 9.4.1.82 shall be present in all IEEE 802.1X Authentication frames. Drop the frame in the mac80211 if the Encapsulation Length field is missing. After receiving the final Authentication frame with status code WLAN_STATUS_8021X_AUTH_SUCCESS from the AP, mac80211 marks the state as authenticated, as it indicates the EAP handshake has completed successfully over the Authentication frames as specified in Clause 12.16.5. 
In the PMKSA caching case, only two Authentication frames are exchanged if the AP identifies a valid PMKSA, then as specified in Clause 12.16.8.3, the AP shall set the Status Code to WLAN_STATUS_SUCCESS in the final Authentication frame and must not include an encapsulated EAPOL PDU. This frame will be the final Authentication frame from the AP when PMKSA caching is enabled, and mac80211 marks the state as authenticated. In case of authentication success or failure, forward the Authentication frame to userspace(e.g. wpa_supplicant), and let userspace validate the Authentication frame from the AP as per the specification. Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20260226185553.1516290-5-kavita.kavita@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + net/mac80211/mlme.c | 78 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 73 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 1bf806f85372..3651b2e6c518 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1508,6 +1508,7 @@ enum ieee80211_statuscode { WLAN_STATUS_SAE_PK = 127, WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING = 133, WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED = 134, + WLAN_STATUS_8021X_AUTH_SUCCESS = 153, }; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 810bea1aacc5..7957eacc5ab7 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4920,7 +4920,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - u16 auth_alg, auth_transaction, status_code; + u16 auth_alg, auth_transaction, status_code, encap_len; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = AUTH_EVENT, @@ -4929,6 +4929,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, .subtype = IEEE80211_STYPE_AUTH, }; 
bool sae_need_confirm = false; + bool auth_fail = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -4945,6 +4946,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); status_code = le16_to_cpu(mgmt->u.auth.status_code); + /* + * IEEE 802.1X Authentication: + * Header + Authentication Algorithm Number(2 byte) + Authentication + * Transaction Sequence Number(2 byte) + Status Code(2 byte) + + * Encapsulation Length(2 byte). + */ + if (auth_alg == WLAN_AUTH_IEEE8021X && len < 24 + 8) + return; + info.link_id = ifmgd->auth_data->link_id; if (auth_alg != ifmgd->auth_data->algorithm || @@ -4960,7 +4970,24 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, goto notify_driver; } - if (status_code != WLAN_STATUS_SUCCESS) { + switch (auth_alg) { + case WLAN_AUTH_IEEE8021X: + if (status_code != WLAN_STATUS_SUCCESS && + status_code != WLAN_STATUS_8021X_AUTH_SUCCESS) + auth_fail = true; + + if (!auth_fail) { + /* Indicates length of encapsulated EAPOL PDU */ + encap_len = get_unaligned_le16(mgmt->u.auth.variable); + } + break; + default: + if (status_code != WLAN_STATUS_SUCCESS) + auth_fail = true; + break; + } + + if (auth_fail) { cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); if (auth_alg == WLAN_AUTH_SAE && @@ -4997,6 +5024,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, case WLAN_AUTH_FILS_SK_PFS: case WLAN_AUTH_FILS_PK: case WLAN_AUTH_EPPKE: + case WLAN_AUTH_IEEE8021X: break; case WLAN_AUTH_SHARED_KEY: if (ifmgd->auth_data->expected_transaction != 4) { @@ -5017,8 +5045,37 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, if (ifmgd->auth_data->algorithm != WLAN_AUTH_SAE || (auth_transaction == 2 && ifmgd->auth_data->expected_transaction == 2)) { - if (!ieee80211_mark_sta_auth(sdata)) - return; /* ignore frame -- wait for timeout */ + switch (ifmgd->auth_data->algorithm) { + case WLAN_AUTH_IEEE8021X: + 
/* + * IEEE 802.1X authentication: + * - When the full EAP handshake completes over the + * Authentication process, the responder sets the + * Status Code to WLAN_STATUS_8021X_AUTH_SUCCESS as + * specified in "IEEE P802.11bi/D4.0, 12.16.5". + * + * - In the PMKSA caching case, only two Authentication + * frames are exchanged if the responder (e.g., AP) + * identifies a valid PMKSA, then as specified in + * "IEEE P802.11bi/D4.0, 12.16.8.3", the responder + * shall set the Status Code to SUCCESS in the final + * Authentication frame and must not include an + * encapsulated EAPOL PDU. + * + * Both conditions are treated as successful + * authentication, so mark the state to Authenticated. + */ + if (status_code != WLAN_STATUS_8021X_AUTH_SUCCESS && + !(status_code == WLAN_STATUS_SUCCESS && + encap_len == 0)) + break; + fallthrough; + default: + if (!ieee80211_mark_sta_auth(sdata)) + return; /* ignore frame -- wait for timeout */ + + break; + } } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 1) { sae_need_confirm = true; @@ -8460,6 +8517,10 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) } else if (auth_data->algorithm == WLAN_AUTH_EPPKE) { trans = auth_data->trans; status = auth_data->status; + } else if (auth_data->algorithm == WLAN_AUTH_IEEE8021X) { + trans = auth_data->trans; + status = auth_data->status; + auth_data->expected_transaction = trans + 1; } if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) @@ -9117,7 +9178,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, } if (ifmgd->auth_data && - ifmgd->auth_data->algorithm == WLAN_AUTH_EPPKE) + (ifmgd->auth_data->algorithm == WLAN_AUTH_EPPKE || + ifmgd->auth_data->algorithm == WLAN_AUTH_IEEE8021X)) new_sta->sta.epp_peer = true; new_sta->sta.mlo = mlo; @@ -9377,6 +9439,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, case NL80211_AUTHTYPE_EPPKE: auth_alg = WLAN_AUTH_EPPKE; break; + case NL80211_AUTHTYPE_IEEE8021X: + 
auth_alg = WLAN_AUTH_IEEE8021X; + break; default: return -EOPNOTSUPP; } @@ -9402,7 +9467,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, if (req->auth_data_len >= 4) { if (req->auth_type == NL80211_AUTHTYPE_SAE || - req->auth_type == NL80211_AUTHTYPE_EPPKE) { + req->auth_type == NL80211_AUTHTYPE_EPPKE || + req->auth_type == NL80211_AUTHTYPE_IEEE8021X) { __le16 *pos = (__le16 *) req->auth_data; auth_data->trans = le16_to_cpu(pos[0]); -- cgit v1.2.3 From 94d865739249c0b68b0046ea22e55b93fdf420c6 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 2 Mar 2026 09:11:46 +0200 Subject: wifi: cfg80211: make cluster id an array cfg80211_nan_conf::cluster_id is currently a pointer, but there is no real reason to not have it an array. It makes things easier as there is no need to check the pointer validity each time. If a cluster ID wasn't provided by user space it will be randomized. Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260302091108.2b12e4ccf5bb.Ib16bf5cca55463d4c89e18099cf1dfe4de95d405@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mld/nan.c | 5 ++--- drivers/net/wireless/virtual/mac80211_hwsim.c | 2 +- include/net/cfg80211.h | 3 +-- net/mac80211/cfg.c | 12 ++---------- net/wireless/nl80211.c | 14 +++++++++++--- 5 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/intel/iwlwifi/mld/nan.c b/drivers/net/wireless/intel/iwlwifi/mld/nan.c index 2dbd3d58b0c6..4d8e85f2bd7c 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/nan.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/nan.c @@ -54,9 +54,8 @@ static int iwl_mld_nan_config(struct iwl_mld *mld, ether_addr_copy(cmd.nmi_addr, vif->addr); cmd.master_pref = conf->master_pref; - if (conf->cluster_id) - memcpy(cmd.cluster_id, conf->cluster_id + 4, - sizeof(cmd.cluster_id)); + memcpy(cmd.cluster_id, conf->cluster_id + 4, + sizeof(cmd.cluster_id)); cmd.scan_period = conf->scan_period < 255 ? 
conf->scan_period : 255; cmd.dwell_time = diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index c6871c6c771a..475918ee8132 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -4151,7 +4151,7 @@ static int mac80211_hwsim_start_nan(struct ieee80211_hw *hw, ns_to_ktime(until_dw * NSEC_PER_USEC), HRTIMER_MODE_REL_SOFT); - if (conf->cluster_id && !is_zero_ether_addr(conf->cluster_id) && + if (!is_zero_ether_addr(conf->cluster_id) && is_zero_ether_addr(hwsim_nan_cluster_id)) { memcpy(hwsim_nan_cluster_id, conf->cluster_id, ETH_ALEN); } else if (is_zero_ether_addr(hwsim_nan_cluster_id)) { diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c21354647da0..8a63dea500ad 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4023,7 +4023,6 @@ struct cfg80211_nan_band_config { * (i.e. BIT(NL80211_BAND_2GHZ)). * @cluster_id: cluster ID used for NAN synchronization. This is a MAC address * that can take a value from 50-6F-9A-01-00-00 to 50-6F-9A-01-FF-FF. - * If NULL, the device will pick a random Cluster ID. * @scan_period: period (in seconds) between NAN scans. * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. * @discovery_beacon_interval: interval (in TUs) for discovery beacons. 
@@ -4039,7 +4038,7 @@ struct cfg80211_nan_band_config { struct cfg80211_nan_conf { u8 master_pref; u8 bands; - const u8 *cluster_id; + u8 cluster_id[ETH_ALEN] __aligned(2); u16 scan_period; u16 scan_dwell_time; u8 discovery_beacon_interval; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index aa3b86644e8f..0c4979526c91 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -330,7 +330,6 @@ static void ieee80211_stop_p2p_device(struct wiphy *wiphy, static void ieee80211_nan_conf_free(struct cfg80211_nan_conf *conf) { - kfree(conf->cluster_id); kfree(conf->extra_nan_attrs); kfree(conf->vendor_elems); memset(conf, 0, sizeof(*conf)); @@ -372,9 +371,6 @@ static int ieee80211_nan_conf_copy(struct cfg80211_nan_conf *dst, memcpy(&dst->band_cfgs, &src->band_cfgs, sizeof(dst->band_cfgs)); - kfree(dst->cluster_id); - dst->cluster_id = NULL; - kfree(dst->extra_nan_attrs); dst->extra_nan_attrs = NULL; dst->extra_nan_attrs_len = 0; @@ -383,12 +379,8 @@ static int ieee80211_nan_conf_copy(struct cfg80211_nan_conf *dst, dst->vendor_elems = NULL; dst->vendor_elems_len = 0; - if (src->cluster_id) { - dst->cluster_id = kmemdup(src->cluster_id, ETH_ALEN, - GFP_KERNEL); - if (!dst->cluster_id) - goto no_mem; - } + if (is_zero_ether_addr(dst->cluster_id)) + ether_addr_copy(dst->cluster_id, src->cluster_id); if (src->extra_nan_attrs && src->extra_nan_attrs_len) { dst->extra_nan_attrs = kmemdup(src->extra_nan_attrs, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index de7956dbe0a0..26cf29c8867b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -15767,9 +15768,16 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy, return err; changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; - if (attrs[NL80211_NAN_CONF_CLUSTER_ID] && start) - conf->cluster_id = - nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); + if (attrs[NL80211_NAN_CONF_CLUSTER_ID] && start) { + 
ether_addr_copy(conf->cluster_id, + nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID])); + } else if (start) { + conf->cluster_id[0] = 0x50; + conf->cluster_id[1] = 0x6f; + conf->cluster_id[2] = 0x9a; + conf->cluster_id[3] = 0x01; + get_random_bytes(&conf->cluster_id[4], 2); + } if (attrs[NL80211_NAN_CONF_EXTRA_ATTRS]) { conf->extra_nan_attrs = -- cgit v1.2.3 From 25ab7b6f34c74ea555b4489b57f7219612991433 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:32:02 +0100 Subject: xattr: remove rbtree-based simple_xattr infrastructure Now that all consumers (shmem, kernfs, pidfs) have been converted to use the rhashtable-based simple_xattrs with pointer-based lazy allocation, remove the legacy rbtree code path. The rhashtable implementation provides O(1) average-case lookup with RCU-based lockless reads, replacing the O(log n) rbtree with reader-writer spinlock contention. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-6-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xattr.c | 387 +++++++++++++------------------------------------- include/linux/xattr.h | 12 +- 2 files changed, 103 insertions(+), 296 deletions(-) (limited to 'include') diff --git a/fs/xattr.c b/fs/xattr.c index eb45ae0fd17f..64803097e1dc 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1200,20 +1200,18 @@ void simple_xattr_free(struct simple_xattr *xattr) static void simple_xattr_rcu_free(struct rcu_head *head) { - struct simple_xattr *xattr; + struct simple_xattr *xattr = container_of(head, struct simple_xattr, rcu); - xattr = container_of(head, struct simple_xattr, rcu); simple_xattr_free(xattr); } /** - * simple_xattr_free_rcu - free an xattr object after an RCU grace period + * simple_xattr_free_rcu - free an xattr object with RCU delay * @xattr: the xattr object * - * Schedule RCU-deferred freeing of an xattr entry. 
This is used by - * rhashtable-based callers of simple_xattr_set() that replace or remove - * an existing entry while concurrent RCU readers may still be accessing - * it. + * Free the xattr object after an RCU grace period. This must be used when + * the xattr was removed from a data structure that concurrent RCU readers + * may still be traversing. Can handle @xattr being NULL. */ void simple_xattr_free_rcu(struct simple_xattr *xattr) { @@ -1254,43 +1252,6 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) return new_xattr; } -/** - * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry - * @key: xattr name - * @node: current node - * - * Compare the xattr name with the xattr name attached to @node in the rbtree. - * - * Return: Negative value if continuing left, positive if continuing right, 0 - * if the xattr attached to @node matches @key. - */ -static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node) -{ - const char *xattr_name = key; - const struct simple_xattr *xattr; - - xattr = rb_entry(node, struct simple_xattr, rb_node); - return strcmp(xattr->name, xattr_name); -} - -/** - * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes - * @new_node: new node - * @node: current node - * - * Compare the xattr attached to @new_node with the xattr attached to @node. - * - * Return: Negative value if continuing left, positive if continuing right, 0 - * if the xattr attached to @new_node matches the xattr attached to @node. 
- */ -static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, - const struct rb_node *node) -{ - struct simple_xattr *xattr; - xattr = rb_entry(new_node, struct simple_xattr, rb_node); - return rbtree_simple_xattr_cmp(xattr->name, node); -} - static u32 simple_xattr_hashfn(const void *data, u32 len, u32 seed) { const char *name = data; @@ -1336,41 +1297,19 @@ static const struct rhashtable_params simple_xattr_params = { int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { - struct simple_xattr *xattr = NULL; + struct simple_xattr *xattr; int ret = -ENODATA; - if (xattrs->use_rhashtable) { - guard(rcu)(); - xattr = rhashtable_lookup(&xattrs->ht, name, - simple_xattr_params); - if (xattr) { - ret = xattr->size; - if (buffer) { - if (size < xattr->size) - ret = -ERANGE; - else - memcpy(buffer, xattr->value, - xattr->size); - } - } - } else { - struct rb_node *rbp; - - read_lock(&xattrs->lock); - rbp = rb_find(name, &xattrs->rb_root, - rbtree_simple_xattr_cmp); - if (rbp) { - xattr = rb_entry(rbp, struct simple_xattr, rb_node); - ret = xattr->size; - if (buffer) { - if (size < xattr->size) - ret = -ERANGE; - else - memcpy(buffer, xattr->value, - xattr->size); - } + guard(rcu)(); + xattr = rhashtable_lookup(&xattrs->ht, name, simple_xattr_params); + if (xattr) { + ret = xattr->size; + if (buffer) { + if (size < xattr->size) + ret = -ERANGE; + else + memcpy(buffer, xattr->value, xattr->size); } - read_unlock(&xattrs->lock); } return ret; } @@ -1398,6 +1337,11 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * + * Note: Callers must externally serialize writes. All current callers hold + * the inode lock for write operations. 
The lookup->replace/remove sequence + * is not atomic with respect to the rhashtable's per-bucket locking, but + * is safe because writes are serialized by the caller. + * * Return: On success, the removed or replaced xattr is returned, to be freed * by the caller; or NULL if none. On failure a negative error code is returned. */ @@ -1406,7 +1350,7 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, size_t size, int flags) { struct simple_xattr *old_xattr = NULL; - int err = 0; + int err; CLASS(simple_xattr, new_xattr)(value, size); if (IS_ERR(new_xattr)) @@ -1418,119 +1362,52 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, return ERR_PTR(-ENOMEM); } - if (xattrs->use_rhashtable) { - /* - * Lookup is safe without RCU here since writes are - * serialized by the caller. - */ - old_xattr = rhashtable_lookup_fast(&xattrs->ht, name, - simple_xattr_params); - - if (old_xattr) { - /* Fail if XATTR_CREATE is requested and the xattr exists. */ - if (flags & XATTR_CREATE) - return ERR_PTR(-EEXIST); - - if (new_xattr) { - err = rhashtable_replace_fast(&xattrs->ht, - &old_xattr->hash_node, - &new_xattr->hash_node, - simple_xattr_params); - if (err) - return ERR_PTR(err); - } else { - err = rhashtable_remove_fast(&xattrs->ht, - &old_xattr->hash_node, - simple_xattr_params); - if (err) - return ERR_PTR(err); - } - } else { - /* Fail if XATTR_REPLACE is requested but no xattr is found. */ - if (flags & XATTR_REPLACE) - return ERR_PTR(-ENODATA); - - /* - * If XATTR_CREATE or no flags are specified together - * with a new value simply insert it. - */ - if (new_xattr) { - err = rhashtable_insert_fast(&xattrs->ht, - &new_xattr->hash_node, - simple_xattr_params); - if (err) - return ERR_PTR(err); - } - - /* - * If XATTR_CREATE or no flags are specified and - * neither an old or new xattr exist then we don't - * need to do anything. 
- */ - } - } else { - struct rb_node *parent = NULL, **rbp; - int ret; - - write_lock(&xattrs->lock); - rbp = &xattrs->rb_root.rb_node; - while (*rbp) { - parent = *rbp; - ret = rbtree_simple_xattr_cmp(name, *rbp); - if (ret < 0) - rbp = &(*rbp)->rb_left; - else if (ret > 0) - rbp = &(*rbp)->rb_right; - else - old_xattr = rb_entry(*rbp, struct simple_xattr, - rb_node); - if (old_xattr) - break; - } + /* Lookup is safe without RCU here since writes are serialized. */ + old_xattr = rhashtable_lookup_fast(&xattrs->ht, name, + simple_xattr_params); - if (old_xattr) { - /* Fail if XATTR_CREATE is requested and the xattr exists. */ - if (flags & XATTR_CREATE) { - err = -EEXIST; - goto out_unlock; - } + if (old_xattr) { + /* Fail if XATTR_CREATE is requested and the xattr exists. */ + if (flags & XATTR_CREATE) + return ERR_PTR(-EEXIST); - if (new_xattr) - rb_replace_node(&old_xattr->rb_node, - &new_xattr->rb_node, - &xattrs->rb_root); - else - rb_erase(&old_xattr->rb_node, - &xattrs->rb_root); + if (new_xattr) { + err = rhashtable_replace_fast(&xattrs->ht, + &old_xattr->hash_node, + &new_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); } else { - /* Fail if XATTR_REPLACE is requested but no xattr is found. */ - if (flags & XATTR_REPLACE) { - err = -ENODATA; - goto out_unlock; - } - - /* - * If XATTR_CREATE or no flags are specified together - * with a new value simply insert it. - */ - if (new_xattr) { - rb_link_node(&new_xattr->rb_node, parent, rbp); - rb_insert_color(&new_xattr->rb_node, - &xattrs->rb_root); - } + err = rhashtable_remove_fast(&xattrs->ht, + &old_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); + } + } else { + /* Fail if XATTR_REPLACE is requested but no xattr is found. */ + if (flags & XATTR_REPLACE) + return ERR_PTR(-ENODATA); - /* - * If XATTR_CREATE or no flags are specified and - * neither an old or new xattr exist then we don't - * need to do anything. 
- */ + /* + * If XATTR_CREATE or no flags are specified together with a + * new value simply insert it. + */ + if (new_xattr) { + err = rhashtable_insert_fast(&xattrs->ht, + &new_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); } -out_unlock: - write_unlock(&xattrs->lock); - if (err) - return ERR_PTR(err); + /* + * If XATTR_CREATE or no flags are specified and neither an + * old or new xattr exist then we don't need to do anything. + */ } + retain_and_null_ptr(new_xattr); return old_xattr; } @@ -1572,6 +1449,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); + struct rhashtable_iter iter; struct simple_xattr *xattr; ssize_t remaining_size = size; int err = 0; @@ -1595,77 +1473,34 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (!xattrs) return size - remaining_size; - if (xattrs->use_rhashtable) { - struct rhashtable_iter iter; - - rhashtable_walk_enter(&xattrs->ht, &iter); - rhashtable_walk_start(&iter); - - while ((xattr = rhashtable_walk_next(&iter)) != NULL) { - if (IS_ERR(xattr)) { - if (PTR_ERR(xattr) == -EAGAIN) - continue; - err = PTR_ERR(xattr); - break; - } - - /* skip "trusted." 
attributes for unprivileged callers */ - if (!trusted && xattr_is_trusted(xattr->name)) - continue; + rhashtable_walk_enter(&xattrs->ht, &iter); + rhashtable_walk_start(&iter); - /* skip MAC labels; these are provided by LSM above */ - if (xattr_is_maclabel(xattr->name)) + while ((xattr = rhashtable_walk_next(&iter)) != NULL) { + if (IS_ERR(xattr)) { + if (PTR_ERR(xattr) == -EAGAIN) continue; - - err = xattr_list_one(&buffer, &remaining_size, - xattr->name); - if (err) - break; + err = PTR_ERR(xattr); + break; } - rhashtable_walk_stop(&iter); - rhashtable_walk_exit(&iter); - } else { - struct rb_node *rbp; - - read_lock(&xattrs->lock); - for (rbp = rb_first(&xattrs->rb_root); rbp; - rbp = rb_next(rbp)) { - xattr = rb_entry(rbp, struct simple_xattr, rb_node); - - /* skip "trusted." attributes for unprivileged callers */ - if (!trusted && xattr_is_trusted(xattr->name)) - continue; + /* skip "trusted." attributes for unprivileged callers */ + if (!trusted && xattr_is_trusted(xattr->name)) + continue; - /* skip MAC labels; these are provided by LSM above */ - if (xattr_is_maclabel(xattr->name)) - continue; + /* skip MAC labels; these are provided by LSM above */ + if (xattr_is_maclabel(xattr->name)) + continue; - err = xattr_list_one(&buffer, &remaining_size, - xattr->name); - if (err) - break; - } - read_unlock(&xattrs->lock); + err = xattr_list_one(&buffer, &remaining_size, xattr->name); + if (err) + break; } - return err ? err : size - remaining_size; -} + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); -/** - * rbtree_simple_xattr_less - compare two xattr rbtree nodes - * @new_node: new node - * @node: current node - * - * Compare the xattr attached to @new_node with the xattr attached to @node. - * Note that this function technically tolerates duplicate entries. - * - * Return: True if insertion point in the rbtree is found. 
- */ -static bool rbtree_simple_xattr_less(struct rb_node *new_node, - const struct rb_node *node) -{ - return rbtree_simple_xattr_node_cmp(new_node, node) < 0; + return err ? err : size - remaining_size; } /** @@ -1676,33 +1511,29 @@ static bool rbtree_simple_xattr_less(struct rb_node *new_node, * Add an xattr object to @xattrs. This assumes no replacement or removal * of matching xattrs is wanted. Should only be called during inode * initialization when a few distinct initial xattrs are supposed to be set. + * + * Return: On success zero is returned. On failure a negative error code is + * returned. */ int simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { - if (xattrs->use_rhashtable) - return rhashtable_insert_fast(&xattrs->ht, - &new_xattr->hash_node, - simple_xattr_params); - - write_lock(&xattrs->lock); - rb_add(&new_xattr->rb_node, &xattrs->rb_root, - rbtree_simple_xattr_less); - write_unlock(&xattrs->lock); - return 0; + return rhashtable_insert_fast(&xattrs->ht, &new_xattr->hash_node, + simple_xattr_params); } /** * simple_xattrs_init - initialize new xattr header * @xattrs: header to initialize * - * Initialize relevant fields of a an xattr header. + * Initialize the rhashtable used to store xattr objects. + * + * Return: On success zero is returned. On failure a negative error code is + * returned. */ -void simple_xattrs_init(struct simple_xattrs *xattrs) +int simple_xattrs_init(struct simple_xattrs *xattrs) { - xattrs->use_rhashtable = false; - xattrs->rb_root = RB_ROOT; - rwlock_init(&xattrs->lock); + return rhashtable_init(&xattrs->ht, &simple_xattr_params); } /** @@ -1710,7 +1541,8 @@ void simple_xattrs_init(struct simple_xattrs *xattrs) * * Dynamically allocate a simple_xattrs header and initialize the * underlying rhashtable. This is intended for consumers that want - * rhashtable-based xattr storage. 
+ * to lazily allocate xattr storage only when the first xattr is set, + * avoiding the per-inode rhashtable overhead when no xattrs are used. * * Return: On success a new simple_xattrs is returned. On failure an * ERR_PTR is returned. @@ -1718,14 +1550,15 @@ void simple_xattrs_init(struct simple_xattrs *xattrs) struct simple_xattrs *simple_xattrs_alloc(void) { struct simple_xattrs *xattrs __free(kfree) = NULL; + int ret; xattrs = kzalloc(sizeof(*xattrs), GFP_KERNEL); if (!xattrs) return ERR_PTR(-ENOMEM); - xattrs->use_rhashtable = true; - if (rhashtable_init(&xattrs->ht, &simple_xattr_params)) - return ERR_PTR(-ENOMEM); + ret = simple_xattrs_init(xattrs); + if (ret) + return ERR_PTR(ret); return no_free_ptr(xattrs); } @@ -1784,28 +1617,10 @@ static void simple_xattr_ht_free(void *ptr, void *arg) */ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { + might_sleep(); + if (freed_space) *freed_space = 0; - - if (xattrs->use_rhashtable) { - rhashtable_free_and_destroy(&xattrs->ht, - simple_xattr_ht_free, freed_space); - } else { - struct rb_node *rbp; - - rbp = rb_first(&xattrs->rb_root); - while (rbp) { - struct simple_xattr *xattr; - struct rb_node *rbp_next; - - rbp_next = rb_next(rbp); - xattr = rb_entry(rbp, struct simple_xattr, rb_node); - rb_erase(&xattr->rb_node, &xattrs->rb_root); - if (freed_space) - *freed_space += simple_xattr_space(xattr->name, - xattr->size); - simple_xattr_free(xattr); - rbp = rbp_next; - } - } + rhashtable_free_and_destroy(&xattrs->ht, simple_xattr_ht_free, + freed_space); } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 6e619e185e90..3b5a5fd684eb 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -107,18 +107,10 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) } struct simple_xattrs { - bool use_rhashtable; - union { - struct { - struct rb_root rb_root; - rwlock_t lock; - }; - struct rhashtable ht; - }; + struct rhashtable ht; }; struct 
simple_xattr { - struct rb_node rb_node; struct rhash_head hash_node; struct rcu_head rcu; char *name; @@ -126,7 +118,7 @@ struct simple_xattr { char value[] __counted_by(size); }; -void simple_xattrs_init(struct simple_xattrs *xattrs); +int simple_xattrs_init(struct simple_xattrs *xattrs); struct simple_xattrs *simple_xattrs_alloc(void); struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, const void *value, int flags); -- cgit v1.2.3 From 4fbe9e78bb415dd632ff63a9f620af0be58ef820 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:32:05 +0100 Subject: xattr: move user limits for xattrs to generic infra Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-9-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/kernfs/inode.c | 75 ++------------------------------------------- fs/kernfs/kernfs-internal.h | 3 +- fs/xattr.c | 65 +++++++++++++++++++++++++++++++++++++++ include/linux/kernfs.h | 2 -- include/linux/xattr.h | 18 +++++++++++ 5 files changed, 87 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index dfc3315b5afc..1de10500842d 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -45,8 +45,7 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc) ret->ia_mtime = ret->ia_atime; ret->ia_ctime = ret->ia_atime; - atomic_set(&ret->nr_user_xattrs, 0); - atomic_set(&ret->user_xattr_size, 0); + simple_xattr_limits_init(&ret->xattr_limits); /* If someone raced us, recognize it. 
*/ if (!try_cmpxchg(&kn->iattr, &attr, ret)) @@ -355,69 +354,6 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, return kernfs_xattr_set(kn, name, value, size, flags); } -static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, - const char *full_name, - struct simple_xattrs *xattrs, - const void *value, size_t size, int flags) -{ - struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn); - atomic_t *sz = &attr->user_xattr_size; - atomic_t *nr = &attr->nr_user_xattrs; - struct simple_xattr *old_xattr; - int ret; - - if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { - ret = -ENOSPC; - goto dec_count_out; - } - - if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { - ret = -ENOSPC; - goto dec_size_out; - } - - old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); - if (!old_xattr) - return 0; - - if (IS_ERR(old_xattr)) { - ret = PTR_ERR(old_xattr); - goto dec_size_out; - } - - ret = 0; - size = old_xattr->size; - simple_xattr_free_rcu(old_xattr); -dec_size_out: - atomic_sub(size, sz); -dec_count_out: - atomic_dec(nr); - return ret; -} - -static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, - const char *full_name, - struct simple_xattrs *xattrs, - const void *value, size_t size, int flags) -{ - struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn); - atomic_t *sz = &attr->user_xattr_size; - atomic_t *nr = &attr->nr_user_xattrs; - struct simple_xattr *old_xattr; - - old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); - if (!old_xattr) - return 0; - - if (IS_ERR(old_xattr)) - return PTR_ERR(old_xattr); - - atomic_sub(old_xattr->size, sz); - atomic_dec(nr); - simple_xattr_free_rcu(old_xattr); - return 0; -} - static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, @@ -440,13 +376,8 @@ static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, if (IS_ERR_OR_NULL(xattrs)) return 
PTR_ERR(xattrs); - if (value) - return kernfs_vfs_user_xattr_add(kn, full_name, xattrs, - value, size, flags); - else - return kernfs_vfs_user_xattr_rm(kn, full_name, xattrs, - value, size, flags); - + return simple_xattr_set_limited(xattrs, &attrs->xattr_limits, + full_name, value, size, flags); } static const struct xattr_handler kernfs_trusted_xattr_handler = { diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 1324ed8c0661..1d3831e3a270 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -27,8 +27,7 @@ struct kernfs_iattrs { struct timespec64 ia_ctime; struct simple_xattrs *xattrs; - atomic_t nr_user_xattrs; - atomic_t user_xattr_size; + struct simple_xattr_limits xattr_limits; }; struct kernfs_root { diff --git a/fs/xattr.c b/fs/xattr.c index 328ed7558dfc..5e559b1c651f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1427,6 +1427,71 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, return old_xattr; } +static inline void simple_xattr_limits_dec(struct simple_xattr_limits *limits, + size_t size) +{ + atomic_sub(size, &limits->xattr_size); + atomic_dec(&limits->nr_xattrs); +} + +static inline int simple_xattr_limits_inc(struct simple_xattr_limits *limits, + size_t size) +{ + if (atomic_inc_return(&limits->nr_xattrs) > SIMPLE_XATTR_MAX_NR) { + atomic_dec(&limits->nr_xattrs); + return -ENOSPC; + } + + if (atomic_add_return(size, &limits->xattr_size) <= SIMPLE_XATTR_MAX_SIZE) + return 0; + + simple_xattr_limits_dec(limits, size); + return -ENOSPC; +} + +/** + * simple_xattr_set_limited - set an xattr with per-inode user.* limits + * @xattrs: the header of the xattr object + * @limits: per-inode limit counters for user.* xattrs + * @name: the name of the xattr to set or remove + * @value: the value to store (NULL to remove) + * @size: the size of @value + * @flags: XATTR_CREATE, XATTR_REPLACE, or 0 + * + * Like simple_xattr_set(), but enforces per-inode count and total value size + * limits for 
user.* xattrs. Uses speculative pre-increment of the atomic + * counters to avoid races without requiring external locks. + * + * Return: On success zero is returned. On failure a negative error code is + * returned. + */ +int simple_xattr_set_limited(struct simple_xattrs *xattrs, + struct simple_xattr_limits *limits, + const char *name, const void *value, + size_t size, int flags) +{ + struct simple_xattr *old_xattr; + int ret; + + if (value) { + ret = simple_xattr_limits_inc(limits, size); + if (ret) + return ret; + } + + old_xattr = simple_xattr_set(xattrs, name, value, size, flags); + if (IS_ERR(old_xattr)) { + if (value) + simple_xattr_limits_dec(limits, size); + return PTR_ERR(old_xattr); + } + if (old_xattr) { + simple_xattr_limits_dec(limits, old_xattr->size); + simple_xattr_free_rcu(old_xattr); + } + return 0; +} + static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index b5a5f32fdfd1..d8f57f0af5e4 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -99,8 +99,6 @@ enum kernfs_node_type { #define KERNFS_TYPE_MASK 0x000f #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK -#define KERNFS_MAX_USER_XATTRS 128 -#define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 3b5a5fd684eb..8b6601367eae 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -118,6 +118,20 @@ struct simple_xattr { char value[] __counted_by(size); }; +#define SIMPLE_XATTR_MAX_NR 128 +#define SIMPLE_XATTR_MAX_SIZE (128 << 10) + +struct simple_xattr_limits { + atomic_t nr_xattrs; /* current user.* xattr count */ + atomic_t xattr_size; /* current total user.* value bytes */ +}; + +static inline void simple_xattr_limits_init(struct simple_xattr_limits *limits) +{ + atomic_set(&limits->nr_xattrs, 0); + atomic_set(&limits->xattr_size, 0); +} + 
int simple_xattrs_init(struct simple_xattrs *xattrs); struct simple_xattrs *simple_xattrs_alloc(void); struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, @@ -132,6 +146,10 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); +int simple_xattr_set_limited(struct simple_xattrs *xattrs, + struct simple_xattr_limits *limits, + const char *name, const void *value, + size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); int simple_xattr_add(struct simple_xattrs *xattrs, -- cgit v1.2.3 From cc2f5e2aeb6c69556837e45756b3ddded98b3898 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:48:02 -0800 Subject: pinctrl: pinconf-generic: fix an enum name description Correct an enum name in a kernel-doc comment to avoid kernel-doc warnings: Warning: include/linux/pinctrl/pinconf-generic.h:161 Enum value 'PIN_CONFIG_SKEW_DELAY_OUTPUT_PS' not described in enum 'pin_config_param' Warning: include/linux/pinctrl/pinconf-generic.h:161 Excess enum value '@PIN_CONFIG_SKEW_DELAY_OUPUT_PS' description in 'pin_config_param' Signed-off-by: Randy Dunlap Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 89277808ea61..531dc3e9b3f7 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -115,7 +115,7 @@ struct pinctrl_map; * @PIN_CONFIG_SKEW_DELAY_INPUT_PS: if the pin has independent values for the * programmable skew rate (on inputs) and latch delay (on outputs), then * this parameter specifies the clock skew only. The argument is in ps. 
- * @PIN_CONFIG_SKEW_DELAY_OUPUT_PS: if the pin has independent values for the + * @PIN_CONFIG_SKEW_DELAY_OUTPUT_PS: if the pin has independent values for the * programmable skew rate (on inputs) and latch delay (on outputs), then * this parameter specifies the latch delay only. The argument is in ps. * @PIN_CONFIG_SLEEP_HARDWARE_STATE: indicate this is sleep related state. -- cgit v1.2.3 From 7c6084d7fa4e61dd7824c34529277a814c7b3836 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Wed, 7 Jan 2026 15:20:02 +0200 Subject: wifi: cfg80211: support key installation on non-netdev wdevs Currently key installation is only supported for netdev. For NAN, support most key operations (except setting default data key) on wdevs instead of netdevs, and adjust all the APIs and tracing to match. Since nothing currently sets NL80211_EXT_FEATURE_SECURE_NAN, this doesn't change anything (P2P Device already isn't allowed.) Signed-off-by: Avraham Stern Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260107150057.69a0cfad95fa.I00efdf3b2c11efab82ef6ece9f393382bcf33ba8@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 16 +++--- drivers/net/wireless/ath/wil6210/cfg80211.c | 13 +++-- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 18 +++---- drivers/net/wireless/marvell/libertas/cfg.c | 6 +-- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 12 ++--- drivers/net/wireless/microchip/wilc1000/cfg80211.c | 18 +++---- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 12 ++--- drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 9 ++-- include/net/cfg80211.h | 10 ++-- net/mac80211/cfg.c | 20 +++---- net/wireless/ibss.c | 4 +- net/wireless/nl80211.c | 46 +++++++++------- net/wireless/rdev-ops.h | 32 +++++------ net/wireless/sme.c | 4 +- net/wireless/trace.h | 62 +++++++++++----------- net/wireless/util.c | 2 +- net/wireless/wext-compat.c | 6 +-- 17 files changed, 148 insertions(+), 142 deletions(-) (limited to 
'include') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index eecba2201b10..739a24a6ad67 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -1123,13 +1123,13 @@ void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq, wiphy_unlock(vif->ar->wiphy); } -static int ath6kl_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, +static int ath6kl_cfg80211_add_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { - struct ath6kl *ar = ath6kl_priv(ndev); - struct ath6kl_vif *vif = netdev_priv(ndev); + struct ath6kl *ar = ath6kl_priv(wdev->netdev); + struct ath6kl_vif *vif = netdev_priv(wdev->netdev); struct ath6kl_key *key = NULL; int seq_len; u8 key_usage; @@ -1248,12 +1248,12 @@ static int ath6kl_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, (u8 *) mac_addr, SYNC_BOTH_WMIFLAG); } -static int ath6kl_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, +static int ath6kl_cfg80211_del_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { - struct ath6kl *ar = ath6kl_priv(ndev); - struct ath6kl_vif *vif = netdev_priv(ndev); + struct ath6kl *ar = ath6kl_priv(wdev->netdev); + struct ath6kl_vif *vif = netdev_priv(wdev->netdev); ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: index %d\n", __func__, key_index); @@ -1278,13 +1278,13 @@ static int ath6kl_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, return ath6kl_wmi_deletekey_cmd(ar->wmi, vif->fw_vif_idx, key_index); } -static int ath6kl_cfg80211_get_key(struct wiphy *wiphy, struct net_device *ndev, +static int ath6kl_cfg80211_get_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback) (void *cookie, struct key_params *)) { - 
struct ath6kl_vif *vif = netdev_priv(ndev); + struct ath6kl_vif *vif = netdev_priv(wdev->netdev); struct ath6kl_key *key = NULL; struct key_params params; diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 2d8660ccc6f3..3d6e5aad48b1 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -1619,15 +1619,14 @@ static void wil_del_rx_key(u8 key_index, enum wmi_key_usage key_usage, } static int wil_cfg80211_add_key(struct wiphy *wiphy, - struct net_device *ndev, int link_id, + struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { int rc; - struct wil6210_vif *vif = ndev_to_vif(ndev); struct wil6210_priv *wil = wiphy_to_wil(wiphy); - struct wireless_dev *wdev = vif_to_wdev(vif); + struct wil6210_vif *vif = wdev_to_vif(wil, wdev); enum wmi_key_usage key_usage = wil_detect_key_usage(wdev, pairwise); struct wil_sta_info *cs = wil_find_sta_by_key_usage(wil, vif->mid, key_usage, @@ -1695,13 +1694,12 @@ static int wil_cfg80211_add_key(struct wiphy *wiphy, } static int wil_cfg80211_del_key(struct wiphy *wiphy, - struct net_device *ndev, int link_id, + struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { - struct wil6210_vif *vif = ndev_to_vif(ndev); struct wil6210_priv *wil = wiphy_to_wil(wiphy); - struct wireless_dev *wdev = vif_to_wdev(vif); + struct wil6210_vif *vif = wdev_to_vif(wil, wdev); enum wmi_key_usage key_usage = wil_detect_key_usage(wdev, pairwise); struct wil_sta_info *cs = wil_find_sta_by_key_usage(wil, vif->mid, key_usage, @@ -2071,7 +2069,8 @@ void wil_cfg80211_ap_recovery(struct wil6210_priv *wil) key_params.key = vif->gtk; key_params.key_len = vif->gtk_len; key_params.seq_len = IEEE80211_GCMP_PN_LEN; - rc = wil_cfg80211_add_key(wiphy, ndev, -1, vif->gtk_index, + rc = wil_cfg80211_add_key(wiphy, vif_to_wdev(vif), -1, + vif->gtk_index, false, NULL, 
&key_params); if (rc) wil_err(wil, "vif %d recovery add key failed (%d)\n", diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index f7e17994e59a..0b55d445895f 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -2758,11 +2758,11 @@ done: } static s32 -brcmf_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, +brcmf_cfg80211_del_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr) { - struct brcmf_if *ifp = netdev_priv(ndev); + struct brcmf_if *ifp = netdev_priv(wdev->netdev); struct brcmf_wsec_key *key; s32 err; @@ -2796,12 +2796,12 @@ brcmf_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, } static s32 -brcmf_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, +brcmf_cfg80211_add_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr, struct key_params *params) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); - struct brcmf_if *ifp = netdev_priv(ndev); + struct brcmf_if *ifp = netdev_priv(wdev->netdev); struct brcmf_pub *drvr = cfg->pub; struct brcmf_wsec_key *key; s32 val; @@ -2822,7 +2822,7 @@ brcmf_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, } if (params->key_len == 0) - return brcmf_cfg80211_del_key(wiphy, ndev, -1, key_idx, + return brcmf_cfg80211_del_key(wiphy, wdev, -1, key_idx, pairwise, mac_addr); if (params->key_len > sizeof(key->data)) { @@ -2918,7 +2918,7 @@ done: } static s32 -brcmf_cfg80211_get_key(struct wiphy *wiphy, struct net_device *ndev, +brcmf_cfg80211_get_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, @@ -2926,7 +2926,7 @@ brcmf_cfg80211_get_key(struct wiphy *wiphy, struct net_device 
*ndev, { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct key_params params; - struct brcmf_if *ifp = netdev_priv(ndev); + struct brcmf_if *ifp = netdev_priv(wdev->netdev); struct brcmf_cfg80211_profile *profile = &ifp->vif->profile; struct brcmf_pub *drvr = cfg->pub; struct brcmf_cfg80211_security *sec; @@ -2976,10 +2976,10 @@ done: static s32 brcmf_cfg80211_config_default_mgmt_key(struct wiphy *wiphy, - struct net_device *ndev, int link_id, + struct wireless_dev *wdev, int link_id, u8 key_idx) { - struct brcmf_if *ifp = netdev_priv(ndev); + struct brcmf_if *ifp = netdev_priv(wdev->netdev); brcmf_dbg(TRACE, "Enter key_idx %d\n", key_idx); diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c index 56a82b26a1e9..72c92f72469d 100644 --- a/drivers/net/wireless/marvell/libertas/cfg.c +++ b/drivers/net/wireless/marvell/libertas/cfg.c @@ -1507,7 +1507,7 @@ static int lbs_cfg_set_default_key(struct wiphy *wiphy, } -static int lbs_cfg_add_key(struct wiphy *wiphy, struct net_device *netdev, +static int lbs_cfg_add_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 idx, bool pairwise, const u8 *mac_addr, struct key_params *params) { @@ -1516,7 +1516,7 @@ static int lbs_cfg_add_key(struct wiphy *wiphy, struct net_device *netdev, u16 key_type; int ret = 0; - if (netdev == priv->mesh_dev) + if (wdev->netdev == priv->mesh_dev) return -EOPNOTSUPP; lbs_deb_assoc("add_key: cipher 0x%x, mac_addr %pM\n", @@ -1568,7 +1568,7 @@ static int lbs_cfg_add_key(struct wiphy *wiphy, struct net_device *netdev, } -static int lbs_cfg_del_key(struct wiphy *wiphy, struct net_device *netdev, +static int lbs_cfg_del_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 71e71a5af453..c9a651bdf882 100644 --- 
a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -141,11 +141,11 @@ static void *mwifiex_cfg80211_get_adapter(struct wiphy *wiphy) * CFG802.11 operation handler to delete a network key. */ static int -mwifiex_cfg80211_del_key(struct wiphy *wiphy, struct net_device *netdev, +mwifiex_cfg80211_del_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { - struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); + struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); static const u8 bc_mac[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; const u8 *peer_mac = pairwise ? mac_addr : bc_mac; @@ -480,11 +480,11 @@ mwifiex_cfg80211_set_default_key(struct wiphy *wiphy, struct net_device *netdev, * CFG802.11 operation handler to add a network key. */ static int -mwifiex_cfg80211_add_key(struct wiphy *wiphy, struct net_device *netdev, +mwifiex_cfg80211_add_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { - struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); + struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); struct mwifiex_wep_key *wep_key; static const u8 bc_mac[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; const u8 *peer_mac = pairwise ? 
mac_addr : bc_mac; @@ -518,11 +518,11 @@ mwifiex_cfg80211_add_key(struct wiphy *wiphy, struct net_device *netdev, */ static int mwifiex_cfg80211_set_default_mgmt_key(struct wiphy *wiphy, - struct net_device *netdev, + struct wireless_dev *wdev, int link_id, u8 key_index) { - struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); + struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); struct mwifiex_ds_encrypt_key encrypt_key; wiphy_dbg(wiphy, "set default mgmt key, key index=%d\n", key_index); diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index 21ef341e002b..3a774cc44b26 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -534,7 +534,7 @@ static int wilc_wfi_cfg_copy_wpa_info(struct wilc_wfi_key *key_info, return 0; } -static int add_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, +static int add_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) @@ -544,7 +544,7 @@ static int add_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, const u8 *tx_mic = NULL; u8 mode = WILC_FW_SEC_NO; u8 op_mode; - struct wilc_vif *vif = netdev_priv(netdev); + struct wilc_vif *vif = netdev_priv(wdev->netdev); struct wilc_priv *priv = &vif->priv; struct wilc_wfi_key *key; @@ -632,19 +632,19 @@ static int add_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, break; default: - netdev_err(netdev, "%s: Unsupported cipher\n", __func__); + netdev_err(wdev->netdev, "%s: Unsupported cipher\n", __func__); ret = -ENOTSUPP; } return ret; } -static int del_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, +static int del_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { - struct wilc_vif *vif = netdev_priv(netdev); + 
struct wilc_vif *vif = netdev_priv(wdev->netdev); struct wilc_priv *priv = &vif->priv; if (!pairwise && (key_index == 4 || key_index == 5)) { @@ -680,12 +680,12 @@ static int del_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, return 0; } -static int get_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, +static int get_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params *)) { - struct wilc_vif *vif = netdev_priv(netdev); + struct wilc_vif *vif = netdev_priv(wdev->netdev); struct wilc_priv *priv = &vif->priv; struct key_params key_params; @@ -725,10 +725,10 @@ static int set_default_key(struct wiphy *wiphy, struct net_device *netdev, return 0; } -static int set_default_mgmt_key(struct wiphy *wiphy, struct net_device *netdev, +static int set_default_mgmt_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index) { - struct wilc_vif *vif = netdev_priv(netdev); + struct wilc_vif *vif = netdev_priv(wdev->netdev); return wilc_set_default_mgmt_key_index(vif, key_index); } diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 340240847a2f..9e44c85d2051 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -532,11 +532,11 @@ qtnf_dump_station(struct wiphy *wiphy, struct wireless_dev *wdev, return ret; } -static int qtnf_add_key(struct wiphy *wiphy, struct net_device *dev, +static int qtnf_add_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { - struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); + struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); int ret; ret = qtnf_cmd_send_add_key(vif, key_index, pairwise, mac_addr, params); @@ -548,11 +548,11 @@ static int 
qtnf_add_key(struct wiphy *wiphy, struct net_device *dev, return ret; } -static int qtnf_del_key(struct wiphy *wiphy, struct net_device *dev, +static int qtnf_del_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { - struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); + struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); int ret; ret = qtnf_cmd_send_del_key(vif, key_index, pairwise, mac_addr); @@ -587,10 +587,10 @@ static int qtnf_set_default_key(struct wiphy *wiphy, struct net_device *dev, } static int -qtnf_set_default_mgmt_key(struct wiphy *wiphy, struct net_device *dev, +qtnf_set_default_mgmt_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index) { - struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); + struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); int ret; ret = qtnf_cmd_send_set_default_mgmt_key(vif, key_index); diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index 83422c5c8c44..7c714ef73ea0 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -831,7 +831,7 @@ exit: return ret; } -static int cfg80211_rtw_add_key(struct wiphy *wiphy, struct net_device *ndev, +static int cfg80211_rtw_add_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { @@ -839,6 +839,7 @@ static int cfg80211_rtw_add_key(struct wiphy *wiphy, struct net_device *ndev, u32 param_len; struct ieee_param *param = NULL; int ret = 0; + struct net_device *ndev = wdev->netdev; struct adapter *padapter = rtw_netdev_priv(ndev); struct mlme_priv *pmlmepriv = &padapter->mlmepriv; @@ -909,7 +910,7 @@ addkey_end: return ret; } -static int cfg80211_rtw_get_key(struct wiphy *wiphy, struct net_device *ndev, +static int cfg80211_rtw_get_key(struct wiphy *wiphy, struct wireless_dev *wdev, 
int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, @@ -918,11 +919,11 @@ static int cfg80211_rtw_get_key(struct wiphy *wiphy, struct net_device *ndev, return 0; } -static int cfg80211_rtw_del_key(struct wiphy *wiphy, struct net_device *ndev, +static int cfg80211_rtw_del_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { - struct adapter *padapter = rtw_netdev_priv(ndev); + struct adapter *padapter = rtw_netdev_priv(wdev->netdev); struct security_priv *psecuritypriv = &padapter->securitypriv; if (key_index == psecuritypriv->dot11PrivacyKeyIndex) { diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8a63dea500ad..8cd870ece351 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4924,24 +4924,24 @@ struct cfg80211_ops { struct wireless_dev *wdev, unsigned int link_id); - int (*add_key)(struct wiphy *wiphy, struct net_device *netdev, + int (*add_key)(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params); - int (*get_key)(struct wiphy *wiphy, struct net_device *netdev, + int (*get_key)(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)); - int (*del_key)(struct wiphy *wiphy, struct net_device *netdev, + int (*del_key)(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr); int (*set_default_key)(struct wiphy *wiphy, struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast); int (*set_default_mgmt_key)(struct wiphy *wiphy, - struct net_device *netdev, int link_id, + struct wireless_dev *wdev, int link_id, u8 key_index); int (*set_default_beacon_key)(struct wiphy *wiphy, - struct net_device *netdev, + struct wireless_dev *wdev, int link_id, 
u8 key_index); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 0c4979526c91..ee64ac8e0f61 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -608,11 +608,11 @@ static int ieee80211_set_tx(struct ieee80211_sub_if_data *sdata, return ret; } -static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, +static int ieee80211_add_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr, struct key_params *params) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, false); struct ieee80211_local *local = sdata->local; @@ -790,11 +790,11 @@ ieee80211_lookup_key(struct ieee80211_sub_if_data *sdata, int link_id, return NULL; } -static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, +static int ieee80211_del_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; struct ieee80211_key *key; @@ -809,7 +809,7 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, return 0; } -static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, +static int ieee80211_get_key(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, @@ -825,7 +825,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, int err = -ENOENT; struct ieee80211_key_seq kseq = {}; - sdata = IEEE80211_DEV_TO_SUB_IF(dev); + sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); rcu_read_lock(); @@ -929,10 +929,10 @@ static int ieee80211_config_default_key(struct wiphy 
*wiphy, } static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, - struct net_device *dev, + struct wireless_dev *wdev, int link_id, u8 key_idx) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, true); @@ -945,10 +945,10 @@ static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, } static int ieee80211_config_default_beacon_key(struct wiphy *wiphy, - struct net_device *dev, + struct wireless_dev *wdev, int link_id, u8 key_idx) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, true); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index a7024af39b40..b1d748bdb504 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -3,7 +3,7 @@ * Some IBSS support code for cfg80211. 
* * Copyright 2009 Johannes Berg - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2026 Intel Corporation */ #include @@ -172,7 +172,7 @@ void cfg80211_clear_ibss(struct net_device *dev, bool nowext) */ if (rdev->ops->del_key) for (i = 0; i < 6; i++) - rdev_del_key(rdev, dev, -1, i, false, NULL); + rdev_del_key(rdev, wdev, -1, i, false, NULL); if (wdev->u.ibss.current_bss) { cfg80211_unhold_bss(wdev->u.ibss.current_bss); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 26cf29c8867b..2225f5d0b124 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4960,7 +4960,7 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; - struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = info->user_ptr[1]; u8 key_idx = 0; const u8 *mac_addr = NULL; bool pairwise; @@ -4971,7 +4971,6 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg; bool bigtk_support = false; int link_id = nl80211_link_id_or_invalid(info->attrs); - struct wireless_dev *wdev = dev->ieee80211_ptr; if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION)) @@ -5023,7 +5022,10 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) cookie.msg = msg; cookie.idx = key_idx; - if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || + if ((wdev->netdev && + nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex)) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || nla_put_u8(msg, NL80211_ATTR_KEY_IDX, key_idx)) goto nla_put_failure; if (mac_addr && @@ -5034,7 +5036,7 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) if (err) goto free_msg; - err = rdev_get_key(rdev, dev, link_id, key_idx, pairwise, mac_addr, + err = rdev_get_key(rdev, wdev, link_id, key_idx, pairwise, mac_addr, &cookie, get_key_callback); if 
(err) @@ -5058,9 +5060,8 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct key_parse key; int err; - struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = info->user_ptr[1]; int link_id = nl80211_link_id_or_invalid(info->attrs); - struct wireless_dev *wdev = dev->ieee80211_ptr; err = nl80211_parse_key(info, &key); if (err) @@ -5080,6 +5081,9 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->set_default_key) return -EOPNOTSUPP; + if (!wdev->netdev) + return -EINVAL; + err = nl80211_key_allowed(wdev); if (err) return err; @@ -5088,7 +5092,7 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) if (err) return err; - err = rdev_set_default_key(rdev, dev, link_id, key.idx, + err = rdev_set_default_key(rdev, wdev->netdev, link_id, key.idx, key.def_uni, key.def_multi); if (err) @@ -5113,7 +5117,7 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) if (err) return err; - err = rdev_set_default_mgmt_key(rdev, dev, link_id, key.idx); + err = rdev_set_default_mgmt_key(rdev, wdev, link_id, key.idx); if (err) return err; @@ -5136,7 +5140,8 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) if (err) return err; - return rdev_set_default_beacon_key(rdev, dev, link_id, key.idx); + return rdev_set_default_beacon_key(rdev, wdev, link_id, + key.idx); } else if (key.p.mode == NL80211_KEY_SET_TX && wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) { @@ -5152,7 +5157,7 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) if (err) return err; - return rdev_add_key(rdev, dev, link_id, key.idx, + return rdev_add_key(rdev, wdev, link_id, key.idx, NL80211_KEYTYPE_PAIRWISE, mac_addr, &key.p); } @@ -5164,11 +5169,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = 
info->user_ptr[0]; int err; - struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = info->user_ptr[1]; struct key_parse key; const u8 *mac_addr = NULL; int link_id = nl80211_link_id_or_invalid(info->attrs); - struct wireless_dev *wdev = dev->ieee80211_ptr; err = nl80211_parse_key(info, &key); if (err) @@ -5219,7 +5223,7 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) key.type == NL80211_KEYTYPE_PAIRWISE); if (!err) { - err = rdev_add_key(rdev, dev, link_id, key.idx, + err = rdev_add_key(rdev, wdev, link_id, key.idx, key.type == NL80211_KEYTYPE_PAIRWISE, mac_addr, &key.p); if (err) @@ -5233,11 +5237,10 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; - struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = info->user_ptr[1]; u8 *mac_addr = NULL; struct key_parse key; int link_id = nl80211_link_id_or_invalid(info->attrs); - struct wireless_dev *wdev = dev->ieee80211_ptr; err = nl80211_parse_key(info, &key); if (err) @@ -5276,7 +5279,7 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) key.type == NL80211_KEYTYPE_PAIRWISE); if (!err) - err = rdev_del_key(rdev, dev, link_id, key.idx, + err = rdev_del_key(rdev, wdev, link_id, key.idx, key.type == NL80211_KEYTYPE_PAIRWISE, mac_addr); @@ -18071,6 +18074,9 @@ nl80211_epcs_cfg(struct sk_buff *skb, struct genl_info *info) NL80211_FLAG_CLEAR_SKB) \ SELECTOR(__sel, WDEV_UP, \ NL80211_FLAG_NEED_WDEV_UP) \ + SELECTOR(__sel, WDEV_UP_CLEAR, \ + NL80211_FLAG_NEED_WDEV_UP | \ + NL80211_FLAG_CLEAR_SKB) \ SELECTOR(__sel, WDEV_UP_LINK, \ NL80211_FLAG_NEED_WDEV_UP | \ NL80211_FLAG_MLO_VALID_LINK_ID) \ @@ -18403,7 +18409,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = 
IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_SET_KEY, @@ -18411,7 +18417,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_set_key, .flags = GENL_UNS_ADMIN_PERM, /* cannot use NL80211_FLAG_MLO_VALID_LINK_ID, depends on key */ - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_CLEAR_SKB), }, { @@ -18419,7 +18425,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_CLEAR_SKB), }, { @@ -18427,7 +18433,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_key, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), }, { .cmd = NL80211_CMD_SET_BEACON, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index a8f1e7ddc0c0..2bad8b60b7c9 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018, 2021-2025 Intel Corporation + * Copyright (C) 2018, 2021-2026 Intel Corporation */ #ifndef __CFG80211_RDEV_OPS #define __CFG80211_RDEV_OPS @@ -77,42 +77,42 @@ rdev_change_virtual_intf(struct cfg80211_registered_device *rdev, } static inline int rdev_add_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, int link_id, + struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { int ret; - trace_rdev_add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, + 
trace_rdev_add_key(&rdev->wiphy, wdev, link_id, key_index, pairwise, mac_addr, params->mode); - ret = rdev->ops->add_key(&rdev->wiphy, netdev, link_id, key_index, + ret = rdev->ops->add_key(&rdev->wiphy, wdev, link_id, key_index, pairwise, mac_addr, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int -rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, +rdev_get_key(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)) { int ret; - trace_rdev_get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, + trace_rdev_get_key(&rdev->wiphy, wdev, link_id, key_index, pairwise, mac_addr); - ret = rdev->ops->get_key(&rdev->wiphy, netdev, link_id, key_index, + ret = rdev->ops->get_key(&rdev->wiphy, wdev, link_id, key_index, pairwise, mac_addr, cookie, callback); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, int link_id, + struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { int ret; - trace_rdev_del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, + trace_rdev_del_key(&rdev->wiphy, wdev, link_id, key_index, pairwise, mac_addr); - ret = rdev->ops->del_key(&rdev->wiphy, netdev, link_id, key_index, + ret = rdev->ops->del_key(&rdev->wiphy, wdev, link_id, key_index, pairwise, mac_addr); trace_rdev_return_int(&rdev->wiphy, ret); return ret; @@ -134,12 +134,12 @@ rdev_set_default_key(struct cfg80211_registered_device *rdev, static inline int rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, int link_id, u8 key_index) + struct wireless_dev *wdev, int link_id, u8 key_index) { int ret; - trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, link_id, + 
trace_rdev_set_default_mgmt_key(&rdev->wiphy, wdev, link_id, key_index); - ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev, link_id, + ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, wdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; @@ -147,14 +147,14 @@ rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, static inline int rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, int link_id, + struct wireless_dev *wdev, int link_id, u8 key_index) { int ret; - trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, link_id, + trace_rdev_set_default_beacon_key(&rdev->wiphy, wdev, link_id, key_index); - ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, link_id, + ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, wdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 5b21432450d5..86e2ccaa678c 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -5,7 +5,7 @@ * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg - * Copyright (C) 2009, 2020, 2022-2025 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020, 2022-2026 Intel Corporation. All rights reserved. 
* Copyright 2017 Intel Deutschland GmbH */ @@ -1386,7 +1386,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) - rdev_del_key(rdev, dev, -1, i, false, NULL); + rdev_del_key(rdev, wdev, -1, i, false, NULL); } rdev_set_qos_map(rdev, dev, NULL); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 8ab78a899f57..af23f4fca90a 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018, 2020-2025 Intel Corporation + * Copyright (C) 2018, 2020-2026 Intel Corporation */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cfg80211 @@ -546,12 +546,12 @@ TRACE_EVENT(rdev_change_virtual_intf, ); DECLARE_EVENT_CLASS(key_handle, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr), + TP_ARGS(wiphy, wdev, link_id, key_index, pairwise, mac_addr), TP_STRUCT__entry( WIPHY_ENTRY - NETDEV_ENTRY + WDEV_ENTRY MAC_ENTRY(mac_addr) __field(int, link_id) __field(u8, key_index) @@ -559,38 +559,38 @@ DECLARE_EVENT_CLASS(key_handle, ), TP_fast_assign( WIPHY_ASSIGN; - NETDEV_ASSIGN; + WDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); __entry->link_id = link_id; __entry->key_index = key_index; __entry->pairwise = pairwise; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %d, " "key_index: %u, pairwise: %s, mac addr: %pM", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id, __entry->key_index, BOOL_TO_STR(__entry->pairwise), __entry->mac_addr) ); DEFINE_EVENT(key_handle, rdev_get_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + 
TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr) + TP_ARGS(wiphy, wdev, link_id, key_index, pairwise, mac_addr) ); DEFINE_EVENT(key_handle, rdev_del_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr) + TP_ARGS(wiphy, wdev, link_id, key_index, pairwise, mac_addr) ); TRACE_EVENT(rdev_add_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, u8 mode), - TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr, mode), + TP_ARGS(wiphy, wdev, link_id, key_index, pairwise, mac_addr, mode), TP_STRUCT__entry( WIPHY_ENTRY - NETDEV_ENTRY + WDEV_ENTRY MAC_ENTRY(mac_addr) __field(int, link_id) __field(u8, key_index) @@ -599,17 +599,17 @@ TRACE_EVENT(rdev_add_key, ), TP_fast_assign( WIPHY_ASSIGN; - NETDEV_ASSIGN; + WDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); __entry->link_id = link_id; __entry->key_index = key_index; __entry->pairwise = pairwise; __entry->mode = mode; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %d, " "key_index: %u, mode: %u, pairwise: %s, " "mac addr: %pM", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id, __entry->key_index, __entry->mode, BOOL_TO_STR(__entry->pairwise), __entry->mac_addr) ); @@ -642,45 +642,45 @@ TRACE_EVENT(rdev_set_default_key, ); TRACE_EVENT(rdev_set_default_mgmt_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index), - 
TP_ARGS(wiphy, netdev, link_id, key_index), + TP_ARGS(wiphy, wdev, link_id, key_index), TP_STRUCT__entry( WIPHY_ENTRY - NETDEV_ENTRY + WDEV_ENTRY __field(int, link_id) __field(u8, key_index) ), TP_fast_assign( WIPHY_ASSIGN; - NETDEV_ASSIGN; + WDEV_ASSIGN; __entry->link_id = link_id; __entry->key_index = key_index; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " - "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, - __entry->link_id, __entry->key_index) + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %d, key index: %u", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id, + __entry->key_index) ); TRACE_EVENT(rdev_set_default_beacon_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index), - TP_ARGS(wiphy, netdev, link_id, key_index), + TP_ARGS(wiphy, wdev, link_id, key_index), TP_STRUCT__entry( WIPHY_ENTRY - NETDEV_ENTRY + WDEV_ENTRY __field(int, link_id) __field(u8, key_index) ), TP_fast_assign( WIPHY_ASSIGN; - NETDEV_ASSIGN; + WDEV_ASSIGN; __entry->link_id = link_id; __entry->key_index = key_index; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " - "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, - __entry->link_id, __entry->key_index) + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %d, key index: %u", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id, + __entry->key_index) ); TRACE_EVENT(rdev_start_ap, diff --git a/net/wireless/util.c b/net/wireless/util.c index 702904048d5a..0a0cea018fc5 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1095,7 +1095,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) for (i = 0; i < 4; i++) { if (!wdev->connect_keys->params[i].cipher) continue; - if (rdev_add_key(rdev, dev, -1, i, false, NULL, + if (rdev_add_key(rdev, wdev, -1, i, false, NULL, &wdev->connect_keys->params[i])) { netdev_err(dev, "failed to set key %d\n", i); continue; diff --git a/net/wireless/wext-compat.c 
b/net/wireless/wext-compat.c index 98a4f4c7970d..22d9d9bae8f5 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -457,7 +457,7 @@ static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) err = -ENOENT; else - err = rdev_del_key(rdev, dev, -1, idx, pairwise, + err = rdev_del_key(rdev, wdev, -1, idx, pairwise, addr); } wdev->wext.connect.privacy = false; @@ -496,7 +496,7 @@ static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) - err = rdev_add_key(rdev, dev, -1, idx, pairwise, addr, params); + err = rdev_add_key(rdev, wdev, -1, idx, pairwise, addr, params); else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 && params->cipher != WLAN_CIPHER_SUITE_WEP104) return -EINVAL; @@ -549,7 +549,7 @@ static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) - err = rdev_set_default_mgmt_key(rdev, dev, -1, idx); + err = rdev_set_default_mgmt_key(rdev, wdev, -1, idx); if (!err) wdev->wext.default_mgmt_key = idx; return err; -- cgit v1.2.3 From 94798081732abfb5748471d5c3cced6ff187fa36 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 13 Feb 2026 18:52:43 -0800 Subject: driver core: platform: add kerneldoc to struct platform_device_info Add kernel documentation for struct platform_device_info and its individual members. While at it remove an extra indent level from the structure definition. 
Signed-off-by: Dmitry Torokhov Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260214025246.2095239-2-dmitry.torokhov@gmail.com Signed-off-by: Danilo Krummrich --- include/linux/platform_device.h | 53 ++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 813da101b5bf..5f54217930e1 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -118,22 +118,53 @@ extern int platform_get_irq_byname_optional(struct platform_device *dev, const char *name); extern int platform_add_devices(struct platform_device **, int); +/** + * struct platform_device_info - set of parameters for creating a platform device + * @parent: parent device for the new platform device. + * @fwnode: firmware node associated with the device. + * @of_node_reused: indicates that device tree node associated with the device + * is shared with another device, typically its ancestor. Setting this to + * %true prevents the device from being matched via the OF match table, + * and stops the device core from automatically binding pinctrl + * configuration to avoid disrupting the other device. + * @name: name of the device. + * @id: instance ID of the device. Use %PLATFORM_DEVID_NONE if there is only + * one instance of the device, or %PLATFORM_DEVID_AUTO to let the + * kernel automatically assign a unique instance ID. + * @res: set of resources to attach to the device. + * @num_res: number of entries in @res. + * @data: device-specific data for this platform device. + * @size_data: size of device-specific data. + * @dma_mask: DMA mask for the device. + * @properties: a set of software properties for the device. If provided, + * a managed software node will be automatically created and + * assigned to the device. The properties array must be terminated + * with a sentinel entry. 
+ * + * This structure is used to hold information needed to create and register + * a platform device using platform_device_register_full(). + * + * platform_device_register_full() makes deep copies of @name, @res, @data and + * @properties, so the caller does not need to keep them after registration. + * If the registration is performed during initialization, these can be marked + * as __initconst. + */ struct platform_device_info { - struct device *parent; - struct fwnode_handle *fwnode; - bool of_node_reused; + struct device *parent; + struct fwnode_handle *fwnode; + bool of_node_reused; - const char *name; - int id; + const char *name; + int id; - const struct resource *res; - unsigned int num_res; + const struct resource *res; + unsigned int num_res; - const void *data; - size_t size_data; - u64 dma_mask; + const void *data; + size_t size_data; + u64 dma_mask; - const struct property_entry *properties; + const struct property_entry *properties; }; extern struct platform_device *platform_device_register_full( const struct platform_device_info *pdevinfo); -- cgit v1.2.3 From 0fc434bc2c45fceb9356f2138911db0f454b8ca6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 13 Feb 2026 18:52:44 -0800 Subject: driver core: platform: allow attaching software nodes when creating devices Extend platform_device_info structure with an optional pointer to a software node to be used as a secondary firmware node for the device being created. If software node has not been registered yet it will be automatically registered. This reduces boilerplate needed when switching legacy board code to static device properties/GPIO references. 
Signed-off-by: Dmitry Torokhov Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260214025246.2095239-3-dmitry.torokhov@gmail.com Signed-off-by: Danilo Krummrich --- drivers/base/platform.c | 9 ++++++++- include/linux/platform_device.h | 7 ++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index b45d41b018ca..ec467ccd05b3 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -850,6 +850,9 @@ struct platform_device *platform_device_register_full( int ret; struct platform_device *pdev; + if (pdevinfo->swnode && pdevinfo->properties) + return ERR_PTR(-EINVAL); + pdev = platform_device_alloc(pdevinfo->name, pdevinfo->id); if (!pdev) return ERR_PTR(-ENOMEM); @@ -875,7 +878,11 @@ struct platform_device *platform_device_register_full( if (ret) goto err; - if (pdevinfo->properties) { + if (pdevinfo->swnode) { + ret = device_add_software_node(&pdev->dev, pdevinfo->swnode); + if (ret) + goto err; + } else if (pdevinfo->properties) { ret = device_create_managed_software_node(&pdev->dev, pdevinfo->properties, NULL); if (ret) diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 5f54217930e1..754e4bf2771a 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -136,10 +136,14 @@ extern int platform_add_devices(struct platform_device **, int); * @data: device-specific data for this platform device. * @size_data: size of device-specific data. * @dma_mask: DMA mask for the device. + * @swnode: a secondary software node to be attached to the device. The node + * will be automatically registered and its lifetime tied to the platform + * device if it is not registered yet. * @properties: a set of software properties for the device. If provided, * a managed software node will be automatically created and * assigned to the device. The properties array must be terminated - * with a sentinel entry. 
+ * with a sentinel entry. Specifying both @properties and @swnode is not + * allowed. * * This structure is used to hold information needed to create and register * a platform device using platform_device_register_full(). @@ -164,6 +168,7 @@ struct platform_device_info { size_t size_data; u64 dma_mask; + const struct software_node *swnode; const struct property_entry *properties; }; extern struct platform_device *platform_device_register_full( -- cgit v1.2.3 From 75d627e5571a5ef313f6f553b7749b124c923c3d Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 9 Feb 2026 23:26:19 -0300 Subject: dt-bindings: clock: rockchip: Add RV1103B CRU support Add support for the Rockchip RV1103B Clock and Reset Unit (CRU). The RV1103B CRU is compatible with the existing RV1126B binding. Add the compatible string to the schema and introduce the corresponding clock ID definitions. Signed-off-by: Fabio Estevam Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20260210022620.172570-1-festevam@gmail.com Signed-off-by: Heiko Stuebner --- .../bindings/clock/rockchip,rv1126b-cru.yaml | 1 + include/dt-bindings/clock/rockchip,rv1103b-cru.h | 220 +++++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 include/dt-bindings/clock/rockchip,rv1103b-cru.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/rockchip,rv1126b-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rv1126b-cru.yaml index 04b0a5c51e4e..b6d3a04be8f1 100644 --- a/Documentation/devicetree/bindings/clock/rockchip,rv1126b-cru.yaml +++ b/Documentation/devicetree/bindings/clock/rockchip,rv1126b-cru.yaml @@ -17,6 +17,7 @@ description: properties: compatible: enum: + - rockchip,rv1103b-cru - rockchip,rv1126b-cru reg: diff --git a/include/dt-bindings/clock/rockchip,rv1103b-cru.h b/include/dt-bindings/clock/rockchip,rv1103b-cru.h new file mode 100644 index 000000000000..35afdee7e961 --- /dev/null +++ b/include/dt-bindings/clock/rockchip,rv1103b-cru.h @@ -0,0 +1,220 
@@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (c) 2024 Rockchip Electronics Co. Ltd. + * Author: Elaine Zhang + */ + +#ifndef _DT_BINDINGS_CLK_ROCKCHIP_RV1103B_H +#define _DT_BINDINGS_CLK_ROCKCHIP_RV1103B_H + +#define PLL_GPLL 0 +#define ARMCLK 1 +#define PLL_DPLL 2 +#define XIN_OSC0_HALF 3 +#define CLK_GPLL_DIV24 4 +#define CLK_GPLL_DIV12 5 +#define CLK_GPLL_DIV6 6 +#define CLK_GPLL_DIV4 7 +#define CLK_GPLL_DIV3 8 +#define CLK_GPLL_DIV2P5 9 +#define CLK_GPLL_DIV2 10 +#define CLK_UART0_SRC 11 +#define CLK_UART1_SRC 12 +#define CLK_UART2_SRC 13 +#define CLK_UART0_FRAC 14 +#define CLK_UART1_FRAC 15 +#define CLK_UART2_FRAC 16 +#define CLK_SAI_SRC 17 +#define CLK_SAI_FRAC 18 +#define LSCLK_NPU_SRC 19 +#define CLK_NPU_SRC 20 +#define ACLK_VEPU_SRC 21 +#define CLK_VEPU_SRC 22 +#define ACLK_VI_SRC 23 +#define CLK_ISP_SRC 24 +#define DCLK_VICAP 25 +#define CCLK_EMMC 26 +#define CCLK_SDMMC0 27 +#define SCLK_SFC_2X 28 +#define LSCLK_PERI_SRC 29 +#define ACLK_PERI_SRC 30 +#define HCLK_HPMCU 31 +#define SCLK_UART0 32 +#define SCLK_UART1 33 +#define SCLK_UART2 34 +#define CLK_I2C_PMU 35 +#define CLK_I2C_PERI 36 +#define CLK_SPI0 37 +#define CLK_PWM0_SRC 38 +#define CLK_PWM1 39 +#define CLK_PWM2 40 +#define DCLK_DECOM_SRC 41 +#define CCLK_SDMMC1 42 +#define CLK_CORE_CRYPTO 43 +#define CLK_PKA_CRYPTO 44 +#define CLK_CORE_RGA 45 +#define MCLK_SAI_SRC 46 +#define CLK_FREQ_PWM0_SRC 47 +#define CLK_COUNTER_PWM0_SRC 48 +#define PCLK_TOP_ROOT 49 +#define CLK_REF_MIPI0 50 +#define CLK_MIPI0_OUT2IO 51 +#define CLK_REF_MIPI1 52 +#define CLK_MIPI1_OUT2IO 53 +#define MCLK_SAI_OUT2IO 54 +#define ACLK_NPU_ROOT 55 +#define HCLK_RKNN 56 +#define ACLK_RKNN 57 +#define LSCLK_VEPU_ROOT 58 +#define HCLK_VEPU 59 +#define ACLK_VEPU 60 +#define CLK_CORE_VEPU 61 +#define PCLK_IOC_VCCIO3 62 +#define PCLK_ACODEC 63 +#define PCLK_USBPHY 64 +#define LSCLK_VI_100M 65 +#define LSCLK_VI_ROOT 66 +#define HCLK_ISP 67 +#define ACLK_ISP 68 +#define CLK_CORE_ISP 69 +#define ACLK_VICAP 
70 +#define HCLK_VICAP 71 +#define ISP0CLK_VICAP 72 +#define PCLK_CSI2HOST0 73 +#define PCLK_CSI2HOST1 74 +#define HCLK_EMMC 75 +#define HCLK_SFC 76 +#define HCLK_SFC_XIP 77 +#define HCLK_SDMMC0 78 +#define PCLK_CSIPHY 79 +#define PCLK_GPIO1 80 +#define DBCLK_GPIO1 81 +#define PCLK_IOC_VCCIO47 82 +#define LSCLK_DDR_ROOT 83 +#define CLK_TIMER_DDRMON 84 +#define LSCLK_PMU_ROOT 85 +#define PCLK_PMU 86 +#define XIN_RC_DIV 87 +#define CLK_32K 88 +#define PCLK_PMU_GPIO0 89 +#define DBCLK_PMU_GPIO0 90 +#define CLK_DDR_FAIL_SAFE 91 +#define PCLK_PMU_HP_TIMER 92 +#define CLK_PMU_32K_HP_TIMER 93 +#define PCLK_PWM0 94 +#define CLK_PWM0 95 +#define CLK_OSC_PWM0 96 +#define CLK_RC_PWM0 97 +#define CLK_FREQ_PWM0 98 +#define CLK_COUNTER_PWM0 99 +#define PCLK_I2C0 100 +#define CLK_I2C0 101 +#define PCLK_UART0 102 +#define PCLK_IOC_PMUIO0 103 +#define CLK_REFOUT 104 +#define CLK_PREROLL 105 +#define CLK_PREROLL_32K 106 +#define CLK_LPMCU_PMU 107 +#define PCLK_SPI2AHB 108 +#define HCLK_SPI2AHB 109 +#define SCLK_SPI2AHB 110 +#define PCLK_WDT_LPMCU 111 +#define TCLK_WDT_LPMCU 112 +#define HCLK_SFC_PMU1 113 +#define HCLK_SFC_XIP_PMU1 114 +#define SCLK_SFC_2X_PMU1 115 +#define CLK_LPMCU 116 +#define CLK_LPMCU_RTC 117 +#define PCLK_LPMCU_MAILBOX 118 +#define PCLK_IOC_PMUIO1 119 +#define PCLK_CRU_PMU1 120 +#define PCLK_PERI_ROOT 121 +#define PCLK_RTC_ROOT 122 +#define CLK_TIMER_ROOT 123 +#define PCLK_TIMER 124 +#define CLK_TIMER0 125 +#define CLK_TIMER1 126 +#define CLK_TIMER2 127 +#define CLK_TIMER3 128 +#define CLK_TIMER4 129 +#define CLK_TIMER5 130 +#define PCLK_STIMER 131 +#define CLK_STIMER0 132 +#define CLK_STIMER1 133 +#define PCLK_WDT_NS 134 +#define TCLK_WDT_NS 135 +#define PCLK_WDT_S 136 +#define TCLK_WDT_S 137 +#define PCLK_WDT_HPMCU 138 +#define TCLK_WDT_HPMCU 139 +#define PCLK_I2C1 140 +#define CLK_I2C1 141 +#define PCLK_I2C2 142 +#define CLK_I2C2 143 +#define PCLK_I2C3 144 +#define CLK_I2C3 145 +#define PCLK_I2C4 146 +#define CLK_I2C4 147 +#define PCLK_SPI0 148 +#define 
PCLK_PWM1 149 +#define CLK_OSC_PWM1 150 +#define PCLK_PWM2 151 +#define CLK_OSC_PWM2 152 +#define PCLK_UART2 153 +#define PCLK_UART1 154 +#define ACLK_RKDMA 155 +#define PCLK_TSADC 156 +#define CLK_TSADC 157 +#define CLK_TSADC_TSEN 158 +#define PCLK_SARADC 159 +#define CLK_SARADC 160 +#define PCLK_GPIO2 161 +#define DBCLK_GPIO2 162 +#define PCLK_IOC_VCCIO6 163 +#define ACLK_USBOTG 164 +#define CLK_REF_USBOTG 165 +#define HCLK_SDMMC1 166 +#define HCLK_SAI 167 +#define MCLK_SAI 168 +#define ACLK_CRYPTO 169 +#define HCLK_CRYPTO 170 +#define HCLK_RK_RNG_NS 171 +#define HCLK_RK_RNG_S 172 +#define PCLK_OTPC_NS 173 +#define CLK_OTPC_ROOT_NS 174 +#define CLK_SBPI_OTPC_NS 175 +#define CLK_USER_OTPC_NS 176 +#define PCLK_OTPC_S 177 +#define CLK_OTPC_ROOT_S 178 +#define CLK_SBPI_OTPC_S 179 +#define CLK_USER_OTPC_S 180 +#define CLK_OTPC_ARB 181 +#define PCLK_OTP_MASK 182 +#define HCLK_RGA 183 +#define ACLK_RGA 184 +#define ACLK_MAC 185 +#define PCLK_MAC 186 +#define CLK_MACPHY 187 +#define ACLK_SPINLOCK 188 +#define HCLK_CACHE 189 +#define PCLK_HPMCU_MAILBOX 190 +#define PCLK_HPMCU_INTMUX 191 +#define CLK_HPMCU 192 +#define CLK_HPMCU_RTC 193 +#define DCLK_DECOM 194 +#define ACLK_DECOM 195 +#define PCLK_DECOM 196 +#define ACLK_SYS_SRAM 197 +#define PCLK_DMA2DDR 198 +#define ACLK_DMA2DDR 199 +#define PCLK_DCF 200 +#define ACLK_DCF 201 +#define MCLK_ACODEC_TX 202 +#define SCLK_UART0_SRC 203 +#define SCLK_UART1_SRC 204 +#define SCLK_UART2_SRC 205 +#define XIN_RC_SRC 206 +#define CLK_UTMI_USBOTG 207 +#define CLK_REF_USBPHY 208 + +#endif // _DT_BINDINGS_CLK_ROCKCHIP_RV1103B_H -- cgit v1.2.3 From ce5df0b891edfa19620cd7e28bd69246c77ae78c Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 26 Feb 2026 15:52:07 +0200 Subject: IB/core: Introduce FRMR pools Add a generic Fast Registration Memory Region pools mechanism to allow drivers to optimize memory registration performance. 
Drivers that have the ability to reuse MRs or their underlying HW objects can take advantage of the mechanism to keep a 'handle' for those objects and use them upon user request. We assume that to achieve this goal a driver and its HW should implement a modify operation for the MRs that is able to at least clear and set the MRs and in more advanced implementations also support changing a subset of the MRs properties. The mechanism is built using an RB-tree consisting of pools, each pool represents a set of MR properties that are shared by all of the MRs residing in the pool and are unmodifiable by the vendor driver or HW. The exposed API from ib_core to the driver has 4 operations: Init and cleanup - handles data structs and locks for the pools. Push and pop - store and retrieve 'handle' for a memory registration or deregistration request. The FRMR pools mechanism implements the logic to search the RB-tree for a pool with matching properties and create a new one when needed and requires the driver to implement creation and destruction of a 'handle' when pool is empty or a handle is requested or is being destroyed. Later patch will introduce Netlink API to interact with the FRMR pools mechanism to allow users to both configure and track its usage. A vendor wishing to configure FRMR pool without exposing it or without exposing internal MR properties to users, should use the kernel_vendor_key field in the pools key. This can be useful in a few cases, e.g., when the FRMR handle has a vendor-specific un-modifiable property that the user registering the memory might not be aware of. 
Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260226-frmr_pools-v4-2-95360b54f15e@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/frmr_pools.c | 319 +++++++++++++++++++++++++++++++++++ drivers/infiniband/core/frmr_pools.h | 48 ++++++ include/rdma/frmr_pools.h | 37 ++++ include/rdma/ib_verbs.h | 8 + 5 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/core/frmr_pools.c create mode 100644 drivers/infiniband/core/frmr_pools.h create mode 100644 include/rdma/frmr_pools.h (limited to 'include') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index aa3febdc8322..dce798d8cfe6 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ nldev.o restrack.o counters.o ib_core_uverbs.o \ - trace.o lag.o iter.o + trace.o lag.o iter.o frmr_pools.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c new file mode 100644 index 000000000000..e08c8093a468 --- /dev/null +++ b/drivers/infiniband/core/frmr_pools.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include + +#include "frmr_pools.h" + +static int push_handle_to_queue_locked(struct frmr_queue *queue, u32 handle) +{ + u32 tmp = queue->ci % NUM_HANDLES_PER_PAGE; + struct frmr_handles_page *page; + + if (queue->ci >= queue->num_pages * NUM_HANDLES_PER_PAGE) { + page = kzalloc_obj(*page, GFP_ATOMIC); + if (!page) + return -ENOMEM; + queue->num_pages++; + list_add_tail(&page->list, &queue->pages_list); + } else { + page = list_last_entry(&queue->pages_list, + struct frmr_handles_page, list); + } + + page->handles[tmp] = handle; + queue->ci++; + return 0; +} + +static u32 pop_handle_from_queue_locked(struct frmr_queue *queue) +{ + u32 tmp = (queue->ci - 1) % NUM_HANDLES_PER_PAGE; + struct frmr_handles_page *page; + u32 handle; + + page = list_last_entry(&queue->pages_list, struct frmr_handles_page, + list); + handle = page->handles[tmp]; + queue->ci--; + + if (!tmp) { + list_del(&page->list); + queue->num_pages--; + kfree(page); + } + + return handle; +} + +static bool pop_frmr_handles_page(struct ib_frmr_pool *pool, + struct frmr_queue *queue, + struct frmr_handles_page **page, u32 *count) +{ + spin_lock(&pool->lock); + if (list_empty(&queue->pages_list)) { + spin_unlock(&pool->lock); + return false; + } + + *page = list_first_entry(&queue->pages_list, struct frmr_handles_page, + list); + list_del(&(*page)->list); + queue->num_pages--; + + /* If this is the last page, count may be less than + * NUM_HANDLES_PER_PAGE. 
+ */ + if (queue->ci >= NUM_HANDLES_PER_PAGE) + *count = NUM_HANDLES_PER_PAGE; + else + *count = queue->ci; + + queue->ci -= *count; + spin_unlock(&pool->lock); + return true; +} + +static void destroy_frmr_pool(struct ib_device *device, + struct ib_frmr_pool *pool) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct frmr_handles_page *page; + u32 count; + + while (pop_frmr_handles_page(pool, &pool->queue, &page, &count)) { + pools->pool_ops->destroy_frmrs(device, page->handles, count); + kfree(page); + } + + kfree(pool); +} + +/* + * Initialize the FRMR pools for a device. + * + * @device: The device to initialize the FRMR pools for. + * @pool_ops: The pool operations to use. + * + * Returns 0 on success, negative error code on failure. + */ +int ib_frmr_pools_init(struct ib_device *device, + const struct ib_frmr_pool_ops *pool_ops) +{ + struct ib_frmr_pools *pools; + + pools = kzalloc_obj(*pools); + if (!pools) + return -ENOMEM; + + pools->rb_root = RB_ROOT; + rwlock_init(&pools->rb_lock); + pools->pool_ops = pool_ops; + + device->frmr_pools = pools; + return 0; +} +EXPORT_SYMBOL(ib_frmr_pools_init); + +/* + * Clean up the FRMR pools for a device. + * + * @device: The device to clean up the FRMR pools for. + * + * Call cleanup only after all FRMR handles have been pushed back to the pool + * and no other FRMR operations are allowed to run in parallel. + * Ensuring this allows us to save synchronization overhead in pop and push + * operations. 
+ */ +void ib_frmr_pools_cleanup(struct ib_device *device) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct ib_frmr_pool *pool, *next; + + if (!pools) + return; + + rbtree_postorder_for_each_entry_safe(pool, next, &pools->rb_root, node) + destroy_frmr_pool(device, pool); + + kfree(pools); + device->frmr_pools = NULL; +} +EXPORT_SYMBOL(ib_frmr_pools_cleanup); + +static inline int compare_keys(struct ib_frmr_key *key1, + struct ib_frmr_key *key2) +{ + int res; + + res = cmp_int(key1->ats, key2->ats); + if (res) + return res; + + res = cmp_int(key1->access_flags, key2->access_flags); + if (res) + return res; + + res = cmp_int(key1->vendor_key, key2->vendor_key); + if (res) + return res; + + res = cmp_int(key1->kernel_vendor_key, key2->kernel_vendor_key); + if (res) + return res; + + /* + * allow using handles that support more DMA blocks, up to twice the + * requested number + */ + res = cmp_int(key1->num_dma_blocks, key2->num_dma_blocks); + if (res > 0) { + if (key1->num_dma_blocks - key2->num_dma_blocks < + key2->num_dma_blocks) + return 0; + } + + return res; +} + +static int frmr_pool_cmp_find(const void *key, const struct rb_node *node) +{ + struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node); + + return compare_keys(&pool->key, (struct ib_frmr_key *)key); +} + +static int frmr_pool_cmp_add(struct rb_node *new, const struct rb_node *node) +{ + struct ib_frmr_pool *new_pool = + rb_entry(new, struct ib_frmr_pool, node); + struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node); + + return compare_keys(&pool->key, &new_pool->key); +} + +static struct ib_frmr_pool *ib_frmr_pool_find(struct ib_frmr_pools *pools, + struct ib_frmr_key *key) +{ + struct ib_frmr_pool *pool; + struct rb_node *node; + + /* find operation is done under read lock for performance reasons. + * The case of threads failing to find the same pool and creating it + * is handled by the create_frmr_pool function. 
+ */ + read_lock(&pools->rb_lock); + node = rb_find(key, &pools->rb_root, frmr_pool_cmp_find); + pool = rb_entry_safe(node, struct ib_frmr_pool, node); + read_unlock(&pools->rb_lock); + + return pool; +} + +static struct ib_frmr_pool *create_frmr_pool(struct ib_device *device, + struct ib_frmr_key *key) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct ib_frmr_pool *pool; + struct rb_node *existing; + + pool = kzalloc_obj(*pool); + if (!pool) + return ERR_PTR(-ENOMEM); + + memcpy(&pool->key, key, sizeof(*key)); + INIT_LIST_HEAD(&pool->queue.pages_list); + spin_lock_init(&pool->lock); + + write_lock(&pools->rb_lock); + existing = rb_find_add(&pool->node, &pools->rb_root, frmr_pool_cmp_add); + write_unlock(&pools->rb_lock); + + /* If a different thread has already created the pool, return it. + * The insert operation is done under the write lock so we are sure + * that the pool is not inserted twice. + */ + if (existing) { + kfree(pool); + return rb_entry(existing, struct ib_frmr_pool, node); + } + + return pool; +} + +static int get_frmr_from_pool(struct ib_device *device, + struct ib_frmr_pool *pool, struct ib_mr *mr) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + u32 handle; + int err; + + spin_lock(&pool->lock); + if (pool->queue.ci == 0) { + spin_unlock(&pool->lock); + err = pools->pool_ops->create_frmrs(device, &pool->key, &handle, + 1); + if (err) + return err; + } else { + handle = pop_handle_from_queue_locked(&pool->queue); + spin_unlock(&pool->lock); + } + + mr->frmr.pool = pool; + mr->frmr.handle = handle; + + return 0; +} + +/* + * Pop an FRMR handle from the pool. + * + * @device: The device to pop the FRMR handle from. + * @mr: The MR to pop the FRMR handle from. + * + * Returns 0 on success, negative error code on failure. 
+ */ +int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct ib_frmr_pool *pool; + + WARN_ON_ONCE(!device->frmr_pools); + pool = ib_frmr_pool_find(pools, &mr->frmr.key); + if (!pool) { + pool = create_frmr_pool(device, &mr->frmr.key); + if (IS_ERR(pool)) + return PTR_ERR(pool); + } + + return get_frmr_from_pool(device, pool, mr); +} +EXPORT_SYMBOL(ib_frmr_pool_pop); + +/* + * Push an FRMR handle back to the pool. + * + * @device: The device to push the FRMR handle to. + * @mr: The MR containing the FRMR handle to push back to the pool. + * + * Returns 0 on success, negative error code on failure. + */ +int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr) +{ + struct ib_frmr_pool *pool = mr->frmr.pool; + int ret; + + spin_lock(&pool->lock); + ret = push_handle_to_queue_locked(&pool->queue, mr->frmr.handle); + spin_unlock(&pool->lock); + + return ret; +} +EXPORT_SYMBOL(ib_frmr_pool_push); diff --git a/drivers/infiniband/core/frmr_pools.h b/drivers/infiniband/core/frmr_pools.h new file mode 100644 index 000000000000..0433db5061bd --- /dev/null +++ b/drivers/infiniband/core/frmr_pools.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + * + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#ifndef RDMA_CORE_FRMR_POOLS_H +#define RDMA_CORE_FRMR_POOLS_H + +#include +#include +#include +#include +#include + +#define NUM_HANDLES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct list_head)) / sizeof(u32)) + +struct frmr_handles_page { + struct list_head list; + u32 handles[NUM_HANDLES_PER_PAGE]; +}; + +/* FRMR queue holds a list of frmr_handles_page. + * num_pages: number of pages in the queue. + * ci: current index in the handles array across all pages. 
+ */ +struct frmr_queue { + struct list_head pages_list; + u32 num_pages; + unsigned long ci; +}; + +struct ib_frmr_pool { + struct rb_node node; + struct ib_frmr_key key; /* Pool key */ + + /* Protect access to the queue */ + spinlock_t lock; + struct frmr_queue queue; +}; + +struct ib_frmr_pools { + struct rb_root rb_root; + rwlock_t rb_lock; + const struct ib_frmr_pool_ops *pool_ops; +}; + +#endif /* RDMA_CORE_FRMR_POOLS_H */ diff --git a/include/rdma/frmr_pools.h b/include/rdma/frmr_pools.h new file mode 100644 index 000000000000..9ef41eb43e4b --- /dev/null +++ b/include/rdma/frmr_pools.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + * + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#ifndef FRMR_POOLS_H +#define FRMR_POOLS_H + +#include +#include + +struct ib_device; +struct ib_mr; + +struct ib_frmr_key { + u64 vendor_key; + /* A pool with non-zero kernel_vendor_key is a kernel-only pool. */ + u64 kernel_vendor_key; + size_t num_dma_blocks; + int access_flags; + u8 ats:1; +}; + +struct ib_frmr_pool_ops { + int (*create_frmrs)(struct ib_device *device, struct ib_frmr_key *key, + u32 *handles, u32 count); + void (*destroy_frmrs)(struct ib_device *device, u32 *handles, + u32 count); +}; + +int ib_frmr_pools_init(struct ib_device *device, + const struct ib_frmr_pool_ops *pool_ops); +void ib_frmr_pools_cleanup(struct ib_device *device); +int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr); +int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr); + +#endif /* FRMR_POOLS_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 1b77fd88d0fb..ba34b131e9be 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN @@ -1905,6 +1906,11 @@ struct ib_mr { struct ib_dm *dm; struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */ struct 
ib_dmah *dmah; + struct { + struct ib_frmr_pool *pool; + struct ib_frmr_key key; + u32 handle; + } frmr; /* * Implementation details of the RDMA core, don't use in drivers: */ @@ -2907,6 +2913,8 @@ struct ib_device { struct list_head subdev_list; enum rdma_nl_name_assign_type name_assign_type; + + struct ib_frmr_pools *frmr_pools; }; static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size, -- cgit v1.2.3 From 020d189d16a62ed56115cce7e255459cf0eeb4e6 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 26 Feb 2026 15:52:10 +0200 Subject: RDMA/core: Add pinned handles to FRMR pools Add a configuration of pinned handles on a specific FRMR pool. The configured amount of pinned handles will not be aged and will stay available for users to claim. Upon setting the amount of pinned handles to an FRMR pool, we will make sure we have at least the pinned amount of handles associated with the pool and create more, if necessary. The count for pinned handles take into account handles that are used by user MRs and handles in the queue. Introduce a new FRMR operation of build_key that allows drivers to manipulate FRMR keys supplied by the user, allowing failing for unsupported properties and masking of properties that are modifiable. 
Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260226-frmr_pools-v4-5-95360b54f15e@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/frmr_pools.c | 127 +++++++++++++++++++++++++++++++++++ drivers/infiniband/core/frmr_pools.h | 3 + include/rdma/frmr_pools.h | 2 + 3 files changed, 132 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c index 5a9c60f19e4e..0e1330807b88 100644 --- a/drivers/infiniband/core/frmr_pools.c +++ b/drivers/infiniband/core/frmr_pools.c @@ -97,6 +97,50 @@ static void destroy_all_handles_in_queue(struct ib_device *device, } } +static bool age_pinned_pool(struct ib_device *device, struct ib_frmr_pool *pool) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + u32 total, to_destroy, destroyed = 0; + bool has_work = false; + u32 *handles; + u32 handle; + + spin_lock(&pool->lock); + total = pool->queue.ci + pool->inactive_queue.ci + pool->in_use; + if (total <= pool->pinned_handles) { + spin_unlock(&pool->lock); + return false; + } + + to_destroy = total - pool->pinned_handles; + + handles = kcalloc(to_destroy, sizeof(*handles), GFP_ATOMIC); + if (!handles) { + spin_unlock(&pool->lock); + return true; + } + + /* Destroy all excess handles in the inactive queue */ + while (pool->inactive_queue.ci && destroyed < to_destroy) { + handles[destroyed++] = pop_handle_from_queue_locked( + &pool->inactive_queue); + } + + /* Move all handles from regular queue to inactive queue */ + while (pool->queue.ci) { + handle = pop_handle_from_queue_locked(&pool->queue); + push_handle_to_queue_locked(&pool->inactive_queue, handle); + has_work = true; + } + + spin_unlock(&pool->lock); + + if (destroyed) + pools->pool_ops->destroy_frmrs(device, handles, destroyed); + kfree(handles); + return has_work; +} + static void pool_aging_work(struct work_struct *work) { struct ib_frmr_pool *pool = container_of( 
@@ -104,6 +148,11 @@ static void pool_aging_work(struct work_struct *work) struct ib_frmr_pools *pools = pool->device->frmr_pools; bool has_work = false; + if (pool->pinned_handles) { + has_work = age_pinned_pool(pool->device, pool); + goto out; + } + destroy_all_handles_in_queue(pool->device, pool, &pool->inactive_queue); /* Move all pages from regular queue to inactive queue */ @@ -120,6 +169,7 @@ static void pool_aging_work(struct work_struct *work) } spin_unlock(&pool->lock); +out: /* Reschedule if there are handles to age in next aging period */ if (has_work) queue_delayed_work( @@ -298,6 +348,83 @@ static struct ib_frmr_pool *create_frmr_pool(struct ib_device *device, return pool; } +int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key, + u32 pinned_handles) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct ib_frmr_key driver_key = {}; + struct ib_frmr_pool *pool; + u32 needed_handles; + u32 current_total; + int i, ret = 0; + u32 *handles; + + if (!pools) + return -EINVAL; + + ret = ib_check_mr_access(device, key->access_flags); + if (ret) + return ret; + + if (pools->pool_ops->build_key) { + ret = pools->pool_ops->build_key(device, key, &driver_key); + if (ret) + return ret; + } else { + memcpy(&driver_key, key, sizeof(*key)); + } + + pool = ib_frmr_pool_find(pools, &driver_key); + if (!pool) { + pool = create_frmr_pool(device, &driver_key); + if (IS_ERR(pool)) + return PTR_ERR(pool); + } + + spin_lock(&pool->lock); + current_total = pool->in_use + pool->queue.ci + pool->inactive_queue.ci; + + if (current_total < pinned_handles) + needed_handles = pinned_handles - current_total; + else + needed_handles = 0; + + pool->pinned_handles = pinned_handles; + spin_unlock(&pool->lock); + + if (!needed_handles) + goto schedule_aging; + + handles = kcalloc(needed_handles, sizeof(*handles), GFP_KERNEL); + if (!handles) + return -ENOMEM; + + ret = pools->pool_ops->create_frmrs(device, key, handles, + needed_handles); + if (ret) { + 
kfree(handles); + return ret; + } + + spin_lock(&pool->lock); + for (i = 0; i < needed_handles; i++) { + ret = push_handle_to_queue_locked(&pool->queue, + handles[i]); + if (ret) + goto end; + } + +end: + spin_unlock(&pool->lock); + kfree(handles); + +schedule_aging: + /* Ensure aging is scheduled to adjust to new pinned handles count */ + mod_delayed_work(pools->aging_wq, &pool->aging_work, 0); + + return ret; +} + static int get_frmr_from_pool(struct ib_device *device, struct ib_frmr_pool *pool, struct ib_mr *mr) { diff --git a/drivers/infiniband/core/frmr_pools.h b/drivers/infiniband/core/frmr_pools.h index a30f7ce45d38..f7519beb6abd 100644 --- a/drivers/infiniband/core/frmr_pools.h +++ b/drivers/infiniband/core/frmr_pools.h @@ -45,6 +45,7 @@ struct ib_frmr_pool { u32 max_in_use; u32 in_use; + u32 pinned_handles; }; struct ib_frmr_pools { @@ -55,4 +56,6 @@ struct ib_frmr_pools { struct workqueue_struct *aging_wq; }; +int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key, + u32 pinned_handles); #endif /* RDMA_CORE_FRMR_POOLS_H */ diff --git a/include/rdma/frmr_pools.h b/include/rdma/frmr_pools.h index 9ef41eb43e4b..af1b88801fa4 100644 --- a/include/rdma/frmr_pools.h +++ b/include/rdma/frmr_pools.h @@ -26,6 +26,8 @@ struct ib_frmr_pool_ops { u32 *handles, u32 count); void (*destroy_frmrs)(struct ib_device *device, u32 *handles, u32 count); + int (*build_key)(struct ib_device *device, const struct ib_frmr_key *in, + struct ib_frmr_key *out); }; int ib_frmr_pools_init(struct ib_device *device, -- cgit v1.2.3 From ba51cf9fcf511df8c7026feda4b7d65999d3517c Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 26 Feb 2026 15:52:12 +0200 Subject: net/mlx5: Drop MR cache related code Following mlx5_ib move to using FRMR pools, drop all unused code of MR cache. 
Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260226-frmr_pools-v4-7-95360b54f15e@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 67 +------------------------- include/linux/mlx5/driver.h | 11 ----- 2 files changed, 1 insertion(+), 77 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index fdc3ba20912e..4b59f3f7c6f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -110,74 +110,9 @@ static struct mlx5_profile profile[] = { }, [2] = { - .mask = MLX5_PROF_MASK_QP_SIZE | - MLX5_PROF_MASK_MR_CACHE, + .mask = MLX5_PROF_MASK_QP_SIZE, .log_max_qp = LOG_MAX_SUPPORTED_QPS, .num_cmd_caches = MLX5_NUM_COMMAND_CACHES, - .mr_cache[0] = { - .size = 500, - .limit = 250 - }, - .mr_cache[1] = { - .size = 500, - .limit = 250 - }, - .mr_cache[2] = { - .size = 500, - .limit = 250 - }, - .mr_cache[3] = { - .size = 500, - .limit = 250 - }, - .mr_cache[4] = { - .size = 500, - .limit = 250 - }, - .mr_cache[5] = { - .size = 500, - .limit = 250 - }, - .mr_cache[6] = { - .size = 500, - .limit = 250 - }, - .mr_cache[7] = { - .size = 500, - .limit = 250 - }, - .mr_cache[8] = { - .size = 500, - .limit = 250 - }, - .mr_cache[9] = { - .size = 500, - .limit = 250 - }, - .mr_cache[10] = { - .size = 500, - .limit = 250 - }, - .mr_cache[11] = { - .size = 500, - .limit = 250 - }, - .mr_cache[12] = { - .size = 64, - .limit = 32 - }, - .mr_cache[13] = { - .size = 32, - .limit = 16 - }, - .mr_cache[14] = { - .size = 16, - .limit = 8 - }, - .mr_cache[15] = { - .size = 8, - .limit = 4 - }, }, [3] = { .mask = MLX5_PROF_MASK_QP_SIZE, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 04dcd09f7517..27d64f09683f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -705,23 +705,12 @@ 
struct mlx5_st; enum { MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, - MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, -}; - -enum { - MKEY_CACHE_LAST_STD_ENTRY = 20, - MLX5_IMR_KSM_CACHE_ENTRY, - MAX_MKEY_CACHE_ENTRIES }; struct mlx5_profile { u64 mask; u8 log_max_qp; u8 num_cmd_caches; - struct { - int size; - int limit; - } mr_cache[MAX_MKEY_CACHE_ENTRIES]; }; struct mlx5_hca_cap { -- cgit v1.2.3 From 50c035976af3d361cff27dd54b1736debd42c19a Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 26 Feb 2026 15:52:13 +0200 Subject: RDMA/nldev: Add command to get FRMR pools Add support for a new command in netlink to dump to user the state of the FRMR pools on the devices. Expose each pool with its key and the usage statistics for it. Signed-off-by: Michael Guralnik Reviewed-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260226-frmr_pools-v4-8-95360b54f15e@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 165 +++++++++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 17 ++++ 2 files changed, 182 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 2220a2dfab24..6637c76165be 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -37,11 +37,13 @@ #include #include #include +#include #include "core_priv.h" #include "cma_priv.h" #include "restrack.h" #include "uverbs.h" +#include "frmr_pools.h" /* * This determines whether a non-privileged user is allowed to specify a @@ -172,6 +174,16 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_FRMR_POOLS] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_FRMR_POOL_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_FRMR_POOL_KEY] = { .type = NLA_NESTED }, + 
[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_KEY_VENDOR_KEY] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_QUEUE_HANDLES] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE] = { .type = NLA_U64 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -2637,6 +2649,156 @@ static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh, return ib_del_sub_device_and_put(device); } +static int fill_frmr_pool_key(struct sk_buff *msg, struct ib_frmr_key *key) +{ + struct nlattr *key_attr; + + key_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY); + if (!key_attr) + return -EMSGSIZE; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS, key->ats)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS, + key->access_flags)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_VENDOR_KEY, + key->vendor_key, RDMA_NLDEV_ATTR_PAD)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS, + key->num_dma_blocks, RDMA_NLDEV_ATTR_PAD)) + goto err; + + nla_nest_end(msg, key_attr); + return 0; + +err: + return -EMSGSIZE; +} + +static int fill_frmr_pool_entry(struct sk_buff *msg, struct ib_frmr_pool *pool) +{ + if (fill_frmr_pool_key(msg, &pool->key)) + return -EMSGSIZE; + + spin_lock(&pool->lock); + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_FRMR_POOL_QUEUE_HANDLES, + pool->queue.ci + pool->inactive_queue.ci)) + goto err_unlock; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE, + pool->max_in_use, RDMA_NLDEV_ATTR_PAD)) + goto err_unlock; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, + pool->in_use, RDMA_NLDEV_ATTR_PAD)) + goto err_unlock; + spin_unlock(&pool->lock); + + return 0; + 
+err_unlock: + spin_unlock(&pool->lock); + return -EMSGSIZE; +} + +static int nldev_frmr_pools_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_frmr_pools *pools; + int err, ret = 0, idx = 0; + struct ib_frmr_pool *pool; + struct nlattr *table_attr; + struct nlattr *entry_attr; + struct ib_device *device; + int start = cb->args[0]; + struct rb_node *node; + struct nlmsghdr *nlh; + bool filled = false; + + err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + device = ib_device_get_by_index( + sock_net(skb->sk), nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX])); + if (!device) + return -EINVAL; + + pools = device->frmr_pools; + if (!pools) { + ib_device_put(device); + return 0; + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_FRMR_POOLS_GET), + 0, NLM_F_MULTI); + + if (!nlh || fill_nldev_handle(skb, device)) { + ret = -EMSGSIZE; + goto err; + } + + table_attr = nla_nest_start_noflag(skb, RDMA_NLDEV_ATTR_FRMR_POOLS); + if (!table_attr) { + ret = -EMSGSIZE; + goto err; + } + + read_lock(&pools->rb_lock); + for (node = rb_first(&pools->rb_root); node; node = rb_next(node)) { + pool = rb_entry(node, struct ib_frmr_pool, node); + if (pool->key.kernel_vendor_key) + continue; + + if (idx < start) { + idx++; + continue; + } + + filled = true; + + entry_attr = nla_nest_start_noflag( + skb, RDMA_NLDEV_ATTR_FRMR_POOL_ENTRY); + if (!entry_attr) { + ret = -EMSGSIZE; + goto end_msg; + } + + if (fill_frmr_pool_entry(skb, pool)) { + nla_nest_cancel(skb, entry_attr); + ret = -EMSGSIZE; + goto end_msg; + } + + nla_nest_end(skb, entry_attr); + idx++; + } +end_msg: + read_unlock(&pools->rb_lock); + + nla_nest_end(skb, table_attr); + nlmsg_end(skb, nlh); + cb->args[0] = idx; + + /* + * No more entries to fill, cancel the message 
and + * return 0 to mark end of dumpit. + */ + if (!filled) + goto err; + + ib_device_put(device); + return skb->len; + +err: + nlmsg_cancel(skb, nlh); + ib_device_put(device); + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -2743,6 +2905,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_deldev, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_FRMR_POOLS_GET] = { + .dump = nldev_frmr_pools_get_dumpit, + }, }; static int fill_mon_netdev_rename(struct sk_buff *msg, diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index f41f0228fcd0..8f17ffe0190c 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -308,6 +308,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_MONITOR, + RDMA_NLDEV_CMD_FRMR_POOLS_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -582,6 +584,21 @@ enum rdma_nldev_attr { RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, /* u8 */ RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, /* u8 */ + + /* + * FRMR Pools attributes + */ + RDMA_NLDEV_ATTR_FRMR_POOLS, /* nested table */ + RDMA_NLDEV_ATTR_FRMR_POOL_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY, /* nested table */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS, /* u8 */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS, /* u32 */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY_VENDOR_KEY, /* u64 */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS, /* u64 */ + RDMA_NLDEV_ATTR_FRMR_POOL_QUEUE_HANDLES, /* u32 */ + RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE, /* u64 */ + RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, /* u64 */ + /* * Always the end */ -- cgit v1.2.3 From d2ea675e86bab6563bbc0841e6c74eef54e83be7 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 26 Feb 2026 15:52:14 +0200 Subject: RDMA/core: Add netlink command to modify FRMR aging Allow users to set FRMR pools aging timer through netlink. 
This functionality will allow user to control how long handles reside in the kernel before being destroyed, thus being able to tune the tradeoff between memory and HW object consumption and memory registration optimization. Since FRMR pools is highly beneficial for application restart scenarios, this command allows users to modify the aging timer to their application restart time, making sure the FRMR handles deregistered on application teardown are kept for long enough in the pools for reuse in the application startup. Signed-off-by: Michael Guralnik Reviewed-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260226-frmr_pools-v4-9-95360b54f15e@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/frmr_pools.c | 31 ++++++++++++++++++++++++++++-- drivers/infiniband/core/frmr_pools.h | 2 ++ drivers/infiniband/core/nldev.c | 37 ++++++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 3 +++ 4 files changed, 71 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c index 0e1330807b88..5e992ff3d7cf 100644 --- a/drivers/infiniband/core/frmr_pools.c +++ b/drivers/infiniband/core/frmr_pools.c @@ -174,7 +174,7 @@ out: if (has_work) queue_delayed_work( pools->aging_wq, &pool->aging_work, - secs_to_jiffies(FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS)); + secs_to_jiffies(READ_ONCE(pools->aging_period_sec))); } static void destroy_frmr_pool(struct ib_device *device, @@ -213,6 +213,8 @@ int ib_frmr_pools_init(struct ib_device *device, return -ENOMEM; } + pools->aging_period_sec = FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS; + device->frmr_pools = pools; return 0; } @@ -245,6 +247,31 @@ void ib_frmr_pools_cleanup(struct ib_device *device) } EXPORT_SYMBOL(ib_frmr_pools_cleanup); +int ib_frmr_pools_set_aging_period(struct ib_device *device, u32 period_sec) +{ + struct ib_frmr_pools *pools = device->frmr_pools; + struct ib_frmr_pool *pool; + struct 
rb_node *node; + + if (!pools) + return -EINVAL; + + if (period_sec == 0) + return -EINVAL; + + WRITE_ONCE(pools->aging_period_sec, period_sec); + + read_lock(&pools->rb_lock); + for (node = rb_first(&pools->rb_root); node; node = rb_next(node)) { + pool = rb_entry(node, struct ib_frmr_pool, node); + mod_delayed_work(pools->aging_wq, &pool->aging_work, + secs_to_jiffies(period_sec)); + } + read_unlock(&pools->rb_lock); + + return 0; +} + static inline int compare_keys(struct ib_frmr_key *key1, struct ib_frmr_key *key2) { @@ -513,7 +540,7 @@ int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr) if (ret == 0 && schedule_aging) queue_delayed_work(pools->aging_wq, &pool->aging_work, - secs_to_jiffies(FRMR_POOLS_DEFAULT_AGING_PERIOD_SECS)); + secs_to_jiffies(READ_ONCE(pools->aging_period_sec))); return ret; } diff --git a/drivers/infiniband/core/frmr_pools.h b/drivers/infiniband/core/frmr_pools.h index f7519beb6abd..67e1402169ae 100644 --- a/drivers/infiniband/core/frmr_pools.h +++ b/drivers/infiniband/core/frmr_pools.h @@ -54,8 +54,10 @@ struct ib_frmr_pools { const struct ib_frmr_pool_ops *pool_ops; struct workqueue_struct *aging_wq; + u32 aging_period_sec; }; int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key, u32 pinned_handles); +int ib_frmr_pools_set_aging_period(struct ib_device *device, u32 period_sec); #endif /* RDMA_CORE_FRMR_POOLS_H */ diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 6637c76165be..8d004b7568b7 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -184,6 +184,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_FRMR_POOL_QUEUE_HANDLES] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff 
*msg, const char *name, @@ -2799,6 +2800,38 @@ err: return ret; } +static int nldev_frmr_pools_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 aging_period; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (err) + return err; + + if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + if (!tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]) + return -EINVAL; + + device = ib_device_get_by_index( + sock_net(skb->sk), nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX])); + if (!device) + return -EINVAL; + + aging_period = nla_get_u32(tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]); + + err = ib_frmr_pools_set_aging_period(device, aging_period); + + ib_device_put(device); + return err; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -2908,6 +2941,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_FRMR_POOLS_GET] = { .dump = nldev_frmr_pools_get_dumpit, }, + [RDMA_NLDEV_CMD_FRMR_POOLS_SET] = { + .doit = nldev_frmr_pools_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, }; static int fill_mon_netdev_rename(struct sk_buff *msg, diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 8f17ffe0190c..f9c295caf2b1 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -310,6 +310,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_FRMR_POOLS_GET, /* can dump */ + RDMA_NLDEV_CMD_FRMR_POOLS_SET, + RDMA_NLDEV_NUM_OPS }; @@ -598,6 +600,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_FRMR_POOL_QUEUE_HANDLES, /* u32 */ RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE, /* u64 */ RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, /* u64 */ + RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD, /* u32 */ /* * Always the end -- cgit v1.2.3 From f6c2996709ca864a8226f54377b4b07da002b807 Mon Sep 17 00:00:00 2001 
From: Ricardo Robaina Date: Thu, 26 Feb 2026 14:15:46 -0300 Subject: audit: fix whitespace alignment in include/uapi/linux/audit.h Fixed minor indentation inconsistencies in the audit macros to align with standard kernel coding style using 8-character hard tabs. Signed-off-by: Ricardo Robaina [PM: fixed a space before tab issue in the patch] Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 14a1c1fe013a..71cbdc542ce9 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -350,7 +350,7 @@ enum { #define AUDIT_STATUS_ENABLED 0x0001 #define AUDIT_STATUS_FAILURE 0x0002 #define AUDIT_STATUS_PID 0x0004 -#define AUDIT_STATUS_RATE_LIMIT 0x0008 +#define AUDIT_STATUS_RATE_LIMIT 0x0008 #define AUDIT_STATUS_BACKLOG_LIMIT 0x0010 #define AUDIT_STATUS_BACKLOG_WAIT_TIME 0x0020 #define AUDIT_STATUS_LOST 0x0040 @@ -386,8 +386,8 @@ enum { * These bits disambiguate different calling conventions that share an * ELF machine type, bitness, and endianness */ -#define __AUDIT_ARCH_CONVENTION_MASK 0x30000000 -#define __AUDIT_ARCH_CONVENTION_MIPS64_N32 0x20000000 +#define __AUDIT_ARCH_CONVENTION_MASK 0x30000000 +#define __AUDIT_ARCH_CONVENTION_MIPS64_N32 0x20000000 /* distinguish syscall tables */ #define __AUDIT_ARCH_64BIT 0x80000000 -- cgit v1.2.3 From d8e760b7996df37b6c1f25ca8a4a5645f144f63c Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Thu, 26 Feb 2026 21:14:27 +0530 Subject: drm/amdgpu: update type for num_syncobj_handles in drm_amdgpu_userq_signal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit update the type for num_syncobj_handles from __u64 to __u16 with required padding. This breaks the UAPI for big-endian platforms but this is deliberate and harmless since userqueues is still a beta feature.
It is enabled via module parameter and needs the right fw support to work. Signed-off-by: Sunil Khatri Reviewed-by: Christian König Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index ebbd861ef0bc..3ab41571f511 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -479,7 +479,9 @@ struct drm_amdgpu_userq_signal { * @num_syncobj_handles: A count that represents the number of syncobj handles in * @syncobj_handles. */ - __u64 num_syncobj_handles; + __u16 num_syncobj_handles; + __u16 pad0; + __u32 pad1; /** * @bo_read_handles: The list of BO handles that the submitted user queue job * is using for read only. This will update BO fences in the kernel. -- cgit v1.2.3 From c561d2320492e0dbe50a37437a525a2e91c471bd Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Thu, 26 Feb 2026 21:18:51 +0530 Subject: drm/amdgpu: update type for num_syncobj_handles in drm_amdgpu_userq_wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit update the type for num_syncobj_handles from __u32 to __u16 with required padding. This breaks the UAPI for big-endian platforms but this is deliberate and harmless since userqueues is still a beta feature. It is enabled via module parameter and needs the right fw support to work. Signed-off-by: Sunil Khatri Reviewed-by: Christian König Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 3ab41571f511..9f3090db2f16 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -565,7 +565,8 @@ struct drm_amdgpu_userq_wait { * @num_syncobj_handles: A count that represents the number of syncobj handles in * @syncobj_handles.
*/ - __u32 num_syncobj_handles; + __u16 num_syncobj_handles; + __u16 pad0; /** * @num_bo_read_handles: A count that represents the number of read BO handles in * @bo_read_handles. -- cgit v1.2.3 From fe3c03b84ae69f34992a5e72cbb8384b9ebad738 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:51:54 -0800 Subject: cred: fix kernel-doc warnings in cred.h Use the correct function parameter names, function names, or kernel-doc format, and add function return comment sections to avoid kernel-doc warnings: Warning: include/linux/cred.h:43 function parameter 'gi' not described in 'get_group_info' Warning: include/linux/cred.h:43 No description found for return value of 'get_group_info' Warning: include/linux/cred.h:213 No description found for return value of 'get_cred_many' Warning: include/linux/cred.h:260 function parameter '_cred' not described in 'put_cred_many' Warning: include/linux/cred.h:260 expecting prototype for put_cred(). Prototype was for put_cred_many() instead Signed-off-by: Randy Dunlap [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/cred.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index ed1609d78cd7..c6676265a985 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -33,12 +33,14 @@ struct group_info { /** * get_group_info - Get a reference to a group info structure - * @group_info: The group info to reference + * @gi: The group info to reference * * This gets a reference to a set of supplementary groups. * * If the caller is accessing a task's credentials, they must hold the RCU read * lock when reading. + * + * Returns: @gi */ static inline struct group_info *get_group_info(struct group_info *gi) { @@ -209,6 +211,8 @@ DEFINE_CLASS(override_creds, * usage count. The purpose of this is to attempt to catch at compile time the * accidental alteration of a set of credentials that should be considered * immutable. 
+ * + * Returns: @cred when the references are acquired, NULL otherwise. */ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) { @@ -246,8 +250,8 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred) } /** - * put_cred - Release a reference to a set of credentials - * @cred: The credentials to release + * put_cred_many - Release a reference to a set of credentials + * @_cred: The credentials to release * @nr: Number of references to release * * Release a reference to a set of credentials, deleting them when the last ref -- cgit v1.2.3 From 0b16e69d17d8c35c5c9d5918bf596c75a44655d3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 Feb 2026 17:20:36 -0800 Subject: KVM: x86: Use scratch field in MMIO fragment to hold small write values When exiting to userspace to service an emulated MMIO write, copy the to-be-written value to a scratch field in the MMIO fragment if the size of the data payload is 8 bytes or less, i.e. can fit in a single chunk, instead of pointing the fragment directly at the source value. This fixes a class of use-after-free bugs that occur when the emulator initiates a write using an on-stack, local variable as the source, the write splits a page boundary, *and* both pages are MMIO pages. Because KVM's ABI only allows for physically contiguous MMIO requests, accesses that split MMIO pages are separated into two fragments, and are sent to userspace one at a time. When KVM attempts to complete userspace MMIO in response to KVM_RUN after the first fragment, KVM will detect the second fragment and generate a second userspace exit, and reference the on-stack variable. The issue is most visible if the second KVM_RUN is performed by a separate task, in which case the stack of the initiating task can show up as truly freed data. 
================================================================== BUG: KASAN: use-after-free in complete_emulated_mmio+0x305/0x420 Read of size 1 at addr ffff888009c378d1 by task syz-executor417/984 CPU: 1 PID: 984 Comm: syz-executor417 Not tainted 5.10.0-182.0.0.95.h2627.eulerosv2r13.x86_64 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0xbe/0xfd print_address_description.constprop.0+0x19/0x170 __kasan_report.cold+0x6c/0x84 kasan_report+0x3a/0x50 check_memory_region+0xfd/0x1f0 memcpy+0x20/0x60 complete_emulated_mmio+0x305/0x420 kvm_arch_vcpu_ioctl_run+0x63f/0x6d0 kvm_vcpu_ioctl+0x413/0xb20 __se_sys_ioctl+0x111/0x160 do_syscall_64+0x30/0x40 entry_SYSCALL_64_after_hwframe+0x67/0xd1 RIP: 0033:0x42477d Code: <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007faa8e6890e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000004d7338 RCX: 000000000042477d RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 0000000000000005 RBP: 00000000004d7330 R08: 00007fff28d546df R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000004d733c R13: 0000000000000000 R14: 000000000040a200 R15: 00007fff28d54720 The buggy address belongs to the page: page:0000000029f6a428 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x9c37 flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff) raw: 000fffffc0000000 0000000000000000 ffffea0000270dc8 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888009c37780: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888009c37800: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff888009c37880: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff888009c37900: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888009c37980: ff ff ff ff ff 
ff ff ff ff ff ff ff ff ff ff ff ================================================================== The bug can also be reproduced with a targeted KVM-Unit-Test by hacking KVM to fill a large on-stack variable in complete_emulated_mmio(), i.e. by overwriting the data value with garbage. Limit the use of the scratch fields to 8-byte or smaller accesses, and to just writes, as larger accesses and reads are not affected thanks to implementation details in the emulator, but add a sanity check to ensure those details don't change in the future. Specifically, KVM never uses on-stack variables for accesses larger than 8 bytes, e.g. uses an operand in the emulator context, and *all* reads are buffered through the mem_read cache. Note! Using the scratch field for reads is not only unnecessary, it's also extremely difficult to handle correctly. As above, KVM buffers all reads through the mem_read cache, and heavily relies on that behavior when re-emulating the instruction after a userspace MMIO read exit. If a read splits a page, the first page is NOT an MMIO page, and the second page IS an MMIO page, then the MMIO fragment needs to point at _just_ the second chunk of the destination, i.e. its position in the mem_read cache. Taking the "obvious" approach of copying the fragment value into the destination when re-emulating the instruction would clobber the first chunk of the destination, i.e. would clobber the data that was read from guest memory.
Fixes: f78146b0f923 ("KVM: Fix page-crossing MMIO") Suggested-by: Yashu Zhang Reported-by: Yashu Zhang Closes: https://lore.kernel.org/all/369eaaa2b3c1425c85e8477066391bc7@huawei.com Cc: stable@vger.kernel.org Tested-by: Tom Lendacky Tested-by: Rick Edgecombe Link: https://patch.msgid.link/20260225012049.920665-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 14 +++++++++++++- include/linux/kvm_host.h | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a03530795707..9bb9d7f078fc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8225,7 +8225,13 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; frag->gpa = gpa; - frag->data = val; + if (write && bytes <= 8u) { + frag->val = 0; + frag->data = &frag->val; + memcpy(&frag->val, val, bytes); + } else { + frag->data = val; + } frag->len = bytes; return X86EMUL_CONTINUE; } @@ -8240,6 +8246,9 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, gpa_t gpa; int rc; + if (WARN_ON_ONCE((bytes > 8u || !ops->write) && object_is_on_stack(val))) + return X86EMUL_UNHANDLEABLE; + if (ops->read_write_prepare && ops->read_write_prepare(vcpu, val, bytes)) return X86EMUL_CONTINUE; @@ -11846,6 +11855,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) frag++; vcpu->mmio_cur_fragment++; } else { + if (WARN_ON_ONCE(frag->data == &frag->val)) + return -EIO; + /* Go forward to the next mmio piece. 
*/ frag->data += len; frag->gpa += len; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 34759a262b28..abb309372035 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -318,7 +318,8 @@ static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop) struct kvm_mmio_fragment { gpa_t gpa; void *data; - unsigned len; + u64 val; + unsigned int len; }; struct kvm_vcpu { -- cgit v1.2.3 From 8341c989ac77d712c7d6e2bce29e8a4bcb2eeae4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Feb 2026 15:11:20 +0000 Subject: net: remove addr_len argument of recvmsg() handlers Use msg->msg_namelen as a place holder instead of a temporary variable, notably in inet[6]_recvmsg(). This removes stack canaries and allows tail-calls. $ scripts/bloat-o-meter -t vmlinux.old vmlinux add/remove: 0/0 grow/shrink: 2/19 up/down: 26/-532 (-506) Function old new delta rawv6_recvmsg 744 767 +23 vsock_dgram_recvmsg 55 58 +3 vsock_connectible_recvmsg 50 47 -3 unix_stream_recvmsg 161 158 -3 unix_seqpacket_recvmsg 62 59 -3 unix_dgram_recvmsg 42 39 -3 tcp_recvmsg 546 543 -3 mptcp_recvmsg 1568 1565 -3 ping_recvmsg 806 800 -6 tcp_bpf_recvmsg_parser 983 974 -9 ip_recv_error 588 576 -12 ipv6_recv_rxpmtu 442 428 -14 udp_recvmsg 1243 1224 -19 ipv6_recv_error 1046 1024 -22 udpv6_recvmsg 1487 1461 -26 raw_recvmsg 465 437 -28 udp_bpf_recvmsg 1027 984 -43 sock_common_recvmsg 103 27 -76 inet_recvmsg 257 175 -82 inet6_recvmsg 257 175 -82 tcp_bpf_recvmsg 663 568 -95 Total: Before=25143834, After=25143328, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260227151120.1346573-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/chelsio/inline_crypto/chtls/chtls.h | 2 +- .../net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c | 8 ++++---- drivers/net/ovpn/tcp.c | 2 +- include/net/inet_common.h | 3 +-- include/net/ip.h | 2 +- include/net/ipv6.h | 6 ++---- include/net/ping.h | 5 ++--- include/net/sock.h 
| 2 +- include/net/tcp.h | 2 +- net/core/sock.c | 7 +------ net/ieee802154/socket.c | 6 +++--- net/ipv4/af_inet.c | 17 ++++++----------- net/ipv4/ip_sockglue.c | 4 ++-- net/ipv4/ping.c | 9 ++++----- net/ipv4/raw.c | 6 +++--- net/ipv4/tcp.c | 5 ++--- net/ipv4/tcp_bpf.c | 17 ++++++++--------- net/ipv4/udp.c | 9 ++++----- net/ipv4/udp_bpf.c | 16 ++++++++-------- net/ipv4/udp_impl.h | 3 +-- net/ipv6/af_inet6.c | 11 +++-------- net/ipv6/datagram.c | 9 ++++----- net/ipv6/ping.c | 3 +-- net/ipv6/raw.c | 8 ++++---- net/ipv6/udp.c | 10 +++++----- net/ipv6/udp_impl.h | 3 +-- net/l2tp/l2tp_ip.c | 4 ++-- net/l2tp/l2tp_ip6.c | 6 +++--- net/mptcp/protocol.c | 4 ++-- net/phonet/datagram.c | 4 ++-- net/phonet/pep.c | 2 +- net/sctp/socket.c | 12 ++++++------ net/tls/tls.h | 2 +- net/tls/tls_sw.c | 3 +-- net/unix/af_unix.c | 4 ++-- net/unix/unix_bpf.c | 2 +- net/vmw_vsock/af_vsock.c | 4 ++-- net/vmw_vsock/vsock_bpf.c | 2 +- net/xfrm/espintcp.c | 2 +- 39 files changed, 99 insertions(+), 127 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h index 21e0dfeff158..1de5744a49b0 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h @@ -567,7 +567,7 @@ void chtls_shutdown(struct sock *sk, int how); void chtls_destroy_sock(struct sock *sk); int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int chtls_recvmsg(struct sock *sk, struct msghdr *msg, - size_t len, int flags, int *addr_len); + size_t len, int flags); void chtls_splice_eof(struct socket *sock); int send_tx_flowc_wr(struct sock *sk, int compl, u32 snd_nxt, u32 rcv_nxt); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c index ee19933e2cca..c8e99409a52a 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c +++ 
b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c @@ -1332,7 +1332,7 @@ static void chtls_cleanup_rbuf(struct sock *sk, int copied) } static int chtls_pt_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct chtls_sock *csk = rcu_dereference_sk_user_data(sk); struct chtls_hws *hws = &csk->tlshws; @@ -1656,7 +1656,7 @@ found_ok_skb: } int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct tcp_sock *tp = tcp_sk(sk); struct chtls_sock *csk; @@ -1670,7 +1670,7 @@ int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, buffers_freed = 0; if (unlikely(flags & MSG_OOB)) - return tcp_prot.recvmsg(sk, msg, len, flags, addr_len); + return tcp_prot.recvmsg(sk, msg, len, flags); if (unlikely(flags & MSG_PEEK)) return peekmsg(sk, msg, len, flags); @@ -1684,7 +1684,7 @@ int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, csk = rcu_dereference_sk_user_data(sk); if (is_tls_rx(csk)) - return chtls_pt_recvmsg(sk, msg, len, flags, addr_len); + return chtls_pt_recvmsg(sk, msg, len, flags); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c index 5499c1572f3e..65054cc84be5 100644 --- a/drivers/net/ovpn/tcp.c +++ b/drivers/net/ovpn/tcp.c @@ -158,7 +158,7 @@ err_nopeer: } static int ovpn_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { int err = 0, off, copied = 0, ret; struct ovpn_socket *sock; diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 5dd2bf24449e..3d747896be30 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -59,8 +59,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net 
*net); -int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, - int *addr_len); +int inet_recv_error(struct sock *sk, struct msghdr *msg, int len); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb); int inet_gro_complete(struct sk_buff *skb, int nhoff); diff --git a/include/net/ip.h b/include/net/ip.h index 69d5cef46004..52264c459357 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -812,7 +812,7 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)); -int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len); void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 53c5056508be..1c0ce5151275 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1129,10 +1129,8 @@ int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *addr, int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); void ip6_datagram_release_cb(struct sock *sk); -int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, - int *addr_len); -int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, - int *addr_len); +int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len); +int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); diff --git a/include/net/ping.h b/include/net/ping.h index 05bfd594a64c..bcbdb5a136e3 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -20,8 +20,7 @@ /* Compatibility glue so we can support IPv6 
when it's compiled as a module */ struct pingv6_ops { - int (*ipv6_recv_error)(struct sock *sk, struct msghdr *msg, int len, - int *addr_len); + int (*ipv6_recv_error)(struct sock *sk, struct msghdr *msg, int len); void (*ip6_datagram_recv_common_ctl)(struct sock *sk, struct msghdr *msg, struct sk_buff *skb); @@ -64,7 +63,7 @@ int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *); int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len); + int flags); int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, void *user_icmph, size_t icmph_len); int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); diff --git a/include/net/sock.h b/include/net/sock.h index a7a8b31e9877..6c3f1340e8ef 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1321,7 +1321,7 @@ struct proto { int (*sendmsg)(struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, - size_t len, int flags, int *addr_len); + size_t len, int flags); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); diff --git a/include/net/tcp.h b/include/net/tcp.h index 9921c47491a5..9cf8785ef0b4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -500,7 +500,7 @@ void tcp_reset_keepalive_timer(struct sock *sk, unsigned long timeout); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len); + int flags); int tcp_set_rcvlowat(struct sock *sk, int val); int tcp_set_window_clamp(struct sock *sk, int val); void tcp_update_recv_tstamps(struct sk_buff *skb, diff --git a/net/core/sock.c b/net/core/sock.c index fba4d5b8553c..f4e2ff23d60e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3967,13 +3967,8 @@ int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t 
size, int flags) { struct sock *sk = sock->sk; - int addr_len = 0; - int err; - err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); - if (err >= 0) - msg->msg_namelen = addr_len; - return err; + return sk->sk_prot->recvmsg(sk, msg, size, flags); } EXPORT_SYMBOL(sock_common_recvmsg); diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index e542fbe113e7..85dce296d751 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -313,7 +313,7 @@ out: } static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { size_t copied = 0; int err = -EOPNOTSUPP; @@ -703,7 +703,7 @@ out: } static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { size_t copied = 0; int err = -EOPNOTSUPP; @@ -737,7 +737,7 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, saddr->family = AF_IEEE802154; ieee802154_addr_to_sa(&saddr->addr, &mac_cb(skb)->source); - *addr_len = sizeof(*saddr); + msg->msg_namelen = sizeof(*saddr); } if (ro->want_lqi) { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8036e76aa1e4..babcd75a08e2 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -877,22 +877,17 @@ void inet_splice_eof(struct socket *sock) EXPORT_SYMBOL_GPL(inet_splice_eof); INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, - size_t, int, int *)); + size_t, int)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; - int addr_len = 0; - int err; if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); - err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, - sk, msg, size, flags, &addr_len); - if (err >= 0) - msg->msg_namelen = addr_len; - return err; + return INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, + sk, msg, size, flags); } EXPORT_SYMBOL(inet_recvmsg); @@ -1577,15 +1572,15 @@ __be32 
inet_current_timestamp(void) } EXPORT_SYMBOL(inet_current_timestamp); -int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) +int inet_recv_error(struct sock *sk, struct msghdr *msg, int len) { unsigned int family = READ_ONCE(sk->sk_family); if (family == AF_INET) - return ip_recv_error(sk, msg, len, addr_len); + return ip_recv_error(sk, msg, len); #if IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) - return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); + return pingv6_ops.ipv6_recv_error(sk, msg, len); #endif return -EINVAL; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 697e18242d6c..a55ef327ec93 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -520,7 +520,7 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk, /* * Handle MSG_ERRQUEUE */ -int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) { struct sock_exterr_skb *serr; struct sk_buff *skb; @@ -557,7 +557,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr->addr_offset); sin->sin_port = serr->port; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); - *addr_len = sizeof(*sin); + msg->msg_namelen = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 71d5e17719de..92ab0e0f6f71 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -848,8 +848,7 @@ do_confirm: goto out; } -int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len) +int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct inet_sock *isk = inet_sk(sk); int family = sk->sk_family; @@ -864,7 +863,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, goto out; if (flags & MSG_ERRQUEUE) - return inet_recv_error(sk, msg, len, addr_len); + return inet_recv_error(sk, msg, len); 
skb = skb_recv_datagram(sk, flags, &err); if (!skb) @@ -892,7 +891,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, sin->sin_port = 0 /* skb->h.uh->source */; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - *addr_len = sizeof(*sin); + msg->msg_namelen = sizeof(*sin); } if (inet_cmsg_flags(isk)) @@ -913,7 +912,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); - *addr_len = sizeof(*sin6); + msg->msg_namelen = sizeof(*sin6); } if (inet6_sk(sk)->rxopt.all) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e20c41206e29..34859e537b49 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -739,7 +739,7 @@ out: */ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; @@ -751,7 +751,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out; if (flags & MSG_ERRQUEUE) { - err = ip_recv_error(sk, msg, len, addr_len); + err = ip_recv_error(sk, msg, len); goto out; } @@ -777,7 +777,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); - *addr_len = sizeof(*sin); + msg->msg_namelen = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f84d9a45cc9d..1790d2fa75ad 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2961,14 +2961,13 @@ recv_sndq: goto out; } -int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len) +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { int cmsg_flags = 0, ret; struct scm_timestamping_internal tss; if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, 
len, addr_len); + return inet_recv_error(sk, msg, len); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index c449a044895e..a0215a2da324 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -221,8 +221,7 @@ static bool is_next_msg_fin(struct sk_psock *psock) static int tcp_bpf_recvmsg_parser(struct sock *sk, struct msghdr *msg, size_t len, - int flags, - int *addr_len) + int flags) { int peek = flags & MSG_PEEK; struct sk_psock *psock; @@ -232,14 +231,14 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, u32 seq; if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); + return inet_recv_error(sk, msg, len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, flags, addr_len); + return tcp_recvmsg(sk, msg, len, flags); lock_sock(sk); tcp = tcp_sk(sk); @@ -352,24 +351,24 @@ static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) } static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct sk_psock *psock; int copied, ret; if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); + return inet_recv_error(sk, msg, len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, flags, addr_len); + return tcp_recvmsg(sk, msg, len, flags); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, flags, addr_len); + return tcp_recvmsg(sk, msg, len, flags); } lock_sock(sk); msg_bytes_ready: @@ -389,7 +388,7 @@ msg_bytes_ready: goto msg_bytes_ready; release_sock(sk); sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, flags, addr_len); + return tcp_recvmsg(sk, msg, len, flags); } copied = -EAGAIN; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 
6c6b68a66dcd..27384024ebc0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2070,8 +2070,7 @@ EXPORT_IPV6_MOD(udp_read_skb); * return it, otherwise we block. */ -int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len) +int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); @@ -2082,7 +2081,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, bool checksum_valid = false; if (flags & MSG_ERRQUEUE) - return ip_recv_error(sk, msg, len, addr_len); + return ip_recv_error(sk, msg, len); try_again: off = sk_peek_offset(sk, flags); @@ -2145,11 +2144,11 @@ try_again: sin->sin_port = udp_hdr(skb)->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - *addr_len = sizeof(*sin); + msg->msg_namelen = sizeof(*sin); BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, (struct sockaddr *)sin, - addr_len); + &msg->msg_namelen); } if (udp_test_bit(GRO_ENABLED, sk)) diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 91233e37cd97..912f0bfef4af 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -12,13 +12,13 @@ static struct proto *udpv6_prot_saved __read_mostly; static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) - return udpv6_prot_saved->recvmsg(sk, msg, len, flags, addr_len); + return udpv6_prot_saved->recvmsg(sk, msg, len, flags); #endif - return udp_prot.recvmsg(sk, msg, len, flags, addr_len); + return udp_prot.recvmsg(sk, msg, len, flags); } static bool udp_sk_has_data(struct sock *sk) @@ -61,23 +61,23 @@ static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, } static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct sk_psock *psock; 
int copied, ret; if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); + return inet_recv_error(sk, msg, len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) - return sk_udp_recvmsg(sk, msg, len, flags, addr_len); + return sk_udp_recvmsg(sk, msg, len, flags); if (!psock_has_data(psock)) { - ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len); + ret = sk_udp_recvmsg(sk, msg, len, flags); goto out; } @@ -92,7 +92,7 @@ msg_bytes_ready: if (data) { if (psock_has_data(psock)) goto msg_bytes_ready; - ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len); + ret = sk_udp_recvmsg(sk, msg, len, flags); goto out; } copied = -EAGAIN; diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index c7142213fc21..17a6fa8b1409 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -18,8 +18,7 @@ int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len); +int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 23cc9b4cb2f1..0b995a961359 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -661,25 +661,20 @@ int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *, - size_t, int, int *)); + size_t, int)); int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; const struct proto *prot; - int addr_len = 0; - int err; if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/ prot = READ_ONCE(sk->sk_prot); - err = INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, - sk, msg, size, flags, &addr_len); - if (err >= 0) - msg->msg_namelen = addr_len; - return err; + return INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, + sk, msg, size, flags); } const struct proto_ops inet6_stream_ops = { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c564b68a0562..3cb84022a461 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -452,7 +452,7 @@ static bool ip6_datagram_support_cmsg(struct sk_buff *skb, /* * Handle MSG_ERRQUEUE */ -int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) +int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; @@ -503,7 +503,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) &sin->sin6_addr); sin->sin6_scope_id = 0; } - *addr_len = sizeof(*sin); + msg->msg_namelen = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); @@ -545,8 +545,7 @@ EXPORT_SYMBOL_GPL(ipv6_recv_error); /* * Handle IPV6_RECVPATHMTU */ -int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, - int *addr_len) +int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; @@ -579,7 +578,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, sin->sin6_port = 0; sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr; - *addr_len = sizeof(*sin); + msg->msg_namelen = sizeof(*sin); } put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index e4afc651731a..6e90d0bf9f3d 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -24,8 +24,7 @@ #include /* Compatibility glue so we can support IPv6 when it's compiled as a module */ -static int 
dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, - int *addr_len) +static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) { return -EAFNOSUPPORT; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 27a268059168..0ac704691100 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -432,7 +432,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) */ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); @@ -444,10 +444,10 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return -EOPNOTSUPP; if (flags & MSG_ERRQUEUE) - return ipv6_recv_error(sk, msg, len, addr_len); + return ipv6_recv_error(sk, msg, len); if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) - return ipv6_recv_rxpmtu(sk, msg, len, addr_len); + return ipv6_recv_rxpmtu(sk, msg, len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) @@ -481,7 +481,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); - *addr_len = sizeof(*sin6); + msg->msg_namelen = sizeof(*sin6); } sock_recv_cmsgs(msg, sk, skb); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 48f73401adf4..5a3984e59c90 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -465,7 +465,7 @@ static int udp6_skb_len(struct sk_buff *skb) */ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); @@ -478,10 +478,10 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int is_udp4; if (flags & MSG_ERRQUEUE) - return ipv6_recv_error(sk, msg, len, addr_len); + return ipv6_recv_error(sk, msg, len); if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) - return 
ipv6_recv_rxpmtu(sk, msg, len, addr_len); + return ipv6_recv_rxpmtu(sk, msg, len); try_again: off = sk_peek_offset(sk, flags); @@ -553,11 +553,11 @@ try_again: ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); } - *addr_len = sizeof(*sin6); + msg->msg_namelen = sizeof(*sin6); BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, (struct sockaddr *)sin6, - addr_len); + &msg->msg_namelen); } if (udp_test_bit(GRO_ENABLED, sk)) diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 8a406be25a3a..1bd4a573e1bb 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -22,8 +22,7 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname, int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len); +int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags); void udpv6_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index cac1ff59cb83..acb2570c11f6 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -537,7 +537,7 @@ no_route: } static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg, - size_t len, int flags, int *addr_len) + size_t len, int flags) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; @@ -570,7 +570,7 @@ static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg, sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); - *addr_len = sizeof(*sin); + msg->msg_namelen = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 05a396ba6a3e..bdaae1b64d25 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -679,7 +679,7 @@ do_confirm: } static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int 
flags, int *addr_len) + int flags) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); @@ -691,7 +691,7 @@ static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out; if (flags & MSG_ERRQUEUE) - return ipv6_recv_error(sk, msg, len, addr_len); + return ipv6_recv_error(sk, msg, len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) @@ -719,7 +719,7 @@ static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, lsa->l2tp_conn_id = 0; if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = inet6_iif(skb); - *addr_len = sizeof(*lsa); + msg->msg_namelen = sizeof(*lsa); } if (np->rxopt.all) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index cf1852b99963..3da3da2c81b1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2259,7 +2259,7 @@ static unsigned int mptcp_inq_hint(const struct sock *sk) } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct mptcp_sock *msk = mptcp_sk(sk); struct scm_timestamping_internal tss; @@ -2269,7 +2269,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); + return inet_recv_error(sk, msg, len); lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 976fe250b509..22cf23f06832 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -109,7 +109,7 @@ static int pn_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } static int pn_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct sk_buff *skb = NULL; struct sockaddr_pn sa; @@ -143,7 +143,7 @@ static int pn_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if 
(msg->msg_name != NULL) { __sockaddr_check_size(sizeof(sa)); memcpy(msg->msg_name, &sa, sizeof(sa)); - *addr_len = sizeof(sa); + msg->msg_namelen = sizeof(sa); } out: diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 120e711ea78c..4dbf0914df7d 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1262,7 +1262,7 @@ struct sk_buff *pep_read(struct sock *sk) } static int pep_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct sk_buff *skb; int err; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 05fb00c9c335..d2665bbd41a2 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2087,7 +2087,7 @@ static int sctp_skb_pull(struct sk_buff *skb, int len) * 5 for complete description of the flags. */ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct sctp_ulpevent *event = NULL; struct sctp_sock *sp = sctp_sk(sk); @@ -2096,11 +2096,11 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int err = 0; int skb_len; - pr_debug("%s: sk:%p, msghdr:%p, len:%zd, flags:0x%x, addr_len:%p)\n", - __func__, sk, msg, len, flags, addr_len); + pr_debug("%s: sk:%p, msghdr:%p, len:%zd, flags:0x%x)\n", + __func__, sk, msg, len, flags); if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); + return inet_recv_error(sk, msg, len); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) @@ -2141,9 +2141,9 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, sock_recv_cmsgs(msg, sk, head_skb); if (sctp_ulpevent_is_notification(event)) { msg->msg_flags |= MSG_NOTIFICATION; - sp->pf->event_msgname(event, msg->msg_name, addr_len); + sp->pf->event_msgname(event, msg->msg_name, &msg->msg_namelen); } else { - sp->pf->skb_msgname(head_skb, msg->msg_name, addr_len); + sp->pf->skb_msgname(head_skb, msg->msg_name, &msg->msg_namelen); } /* Check if we allow 
SCTP_NXTINFO. */ diff --git a/net/tls/tls.h b/net/tls/tls.h index 2f86baeb71fc..e8f81a006520 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -161,7 +161,7 @@ void tls_sw_free_resources_rx(struct sock *sk); void tls_sw_release_resources_rx(struct sock *sk); void tls_sw_free_ctx_rx(struct tls_context *tls_ctx); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len); + int flags); bool tls_sw_sock_is_readable(struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5fe07f110fe8..a656ce235758 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2031,8 +2031,7 @@ static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, - int *addr_len) + int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3756a93dc63a..3a7e00c063c3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2665,7 +2665,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_dgram_proto) - return prot->recvmsg(sk, msg, size, flags, NULL); + return prot->recvmsg(sk, msg, size, flags); #endif return __unix_dgram_recvmsg(sk, msg, size, flags); } @@ -3139,7 +3139,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_stream_proto) - return prot->recvmsg(sk, msg, size, flags, NULL); + return prot->recvmsg(sk, msg, size, flags); #endif return unix_stream_read_generic(&state, true); } diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index e0d30d6d22ac..d14cd5454a8d 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -49,7 +49,7 @@ 
static int __unix_recvmsg(struct sock *sk, struct msghdr *msg, } static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg, - size_t len, int flags, int *addr_len) + size_t len, int flags) { struct unix_sock *u = unix_sk(sk); struct sk_psock *psock; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 2f7d94d682cb..f0ab2f13e9db 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1502,7 +1502,7 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) - return prot->recvmsg(sk, msg, len, flags, NULL); + return prot->recvmsg(sk, msg, len, flags); #endif return __vsock_dgram_recvmsg(sock, msg, len, flags); @@ -2575,7 +2575,7 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) - return prot->recvmsg(sk, msg, len, flags, NULL); + return prot->recvmsg(sk, msg, len, flags); #endif return __vsock_connectible_recvmsg(sock, msg, len, flags); diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c index 07b96d56f3a5..9049d2648646 100644 --- a/net/vmw_vsock/vsock_bpf.c +++ b/net/vmw_vsock/vsock_bpf.c @@ -74,7 +74,7 @@ static int __vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int } static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, - size_t len, int flags, int *addr_len) + size_t len, int flags) { struct sk_psock *psock; struct vsock_sock *vsk; diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index e1b11ab59f6e..998832419097 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -133,7 +133,7 @@ static int espintcp_parse(struct strparser *strp, struct sk_buff *skb) } static int espintcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) + int flags) { struct espintcp_ctx *ctx = espintcp_getctx(sk); struct sk_buff *skb; -- cgit v1.2.3 From 44a2ec96d374806ee74454ea915615536a76b152 Mon Sep 17 
00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 27 Feb 2026 09:53:13 +0000 Subject: net: stmmac: remove plat_dat->port_node There are repeated instances of: fwnode = priv->plat->port_node; if (!fwnode) fwnode = dev_fwnode(priv->device); However, the only place that ->port_node is set is stmmac_probe_config_dt(): struct device_node *np = pdev->dev.of_node; ... /* PHYLINK automatically parses the phy-handle property */ plat->port_node = of_fwnode_handle(np); which is equivalent to dev_fwnode(&pdev->dev) and, as priv->device will be &pdev->dev, is also equivalent to dev_fwnode(priv->device). Thus, plat_dat->port_node doesn't provide any extra benefit over using dev_fwnode(priv->device) directly. There is one case where port_node is used directly, which can be found in stmmac_pcs_setup(). This may cause a change of behaviour as PCI drivers do not populate plat_dat->port_node, but dev_fwnode(priv->device) may be valid. PCI-based stmmac should be tested. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vvuX3-0000000Avme-3oej@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 13 +++---------- drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 7 ++----- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 --- include/linux/stmmac.h | 1 - 4 files changed, 5 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 5c144ac259af..4e788f54bbbc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1252,10 +1252,7 @@ static int stmmac_init_phy(struct net_device *dev) xpcs_get_an_mode(priv->hw->xpcs, mode) == DW_AN_C73) return 0; - fwnode = priv->plat->port_node; - if (!fwnode) - fwnode = dev_fwnode(priv->device); - + fwnode = dev_fwnode(priv->device); if (fwnode) phy_fwnode = fwnode_get_phy_node(fwnode); 
else @@ -1313,7 +1310,6 @@ static int stmmac_phylink_setup(struct stmmac_priv *priv) { struct stmmac_mdio_bus_data *mdio_bus_data; struct phylink_config *config; - struct fwnode_handle *fwnode; struct phylink_pcs *pcs; struct phylink *phylink; @@ -1400,11 +1396,8 @@ static int stmmac_phylink_setup(struct stmmac_priv *priv) config->wol_mac_support |= WAKE_MAGIC; } - fwnode = priv->plat->port_node; - if (!fwnode) - fwnode = dev_fwnode(priv->device); - - phylink = phylink_create(config, fwnode, priv->plat->phy_interface, + phylink = phylink_create(config, dev_fwnode(priv->device), + priv->plat->phy_interface, &stmmac_phylink_mac_ops); if (IS_ERR(phylink)) return PTR_ERR(phylink); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index a7c2496b39f2..485a0d790baa 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -430,7 +430,7 @@ int stmmac_pcs_setup(struct net_device *ndev) struct dw_xpcs *xpcs = NULL; int addr, ret; - devnode = priv->plat->port_node; + devnode = dev_fwnode(priv->device); if (priv->plat->pcs_init) { ret = priv->plat->pcs_init(priv); @@ -649,10 +649,7 @@ int stmmac_mdio_register(struct net_device *ndev) stmmac_xgmac2_mdio_read_c45(new_bus, 0, 0, 0); /* If fixed-link is set, skip PHY scanning */ - fwnode = priv->plat->port_node; - if (!fwnode) - fwnode = dev_fwnode(priv->device); - + fwnode = dev_fwnode(priv->device); if (fwnode) { fixed_node = fwnode_get_named_child_node(fwnode, "fixed-link"); if (fixed_node) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 5c9fd91a1db9..c34998486293 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -446,9 +446,6 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) * they are not converted to phylink. 
*/ plat->phy_node = of_parse_phandle(np, "phy-handle", 0); - /* PHYLINK automatically parses the phy-handle property */ - plat->port_node = of_fwnode_handle(np); - /* Get max speed of operation from device tree */ of_property_read_u32(np, "max-speed", &plat->max_speed); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index b96ae9dadfab..77e51eaa5ec5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -225,7 +225,6 @@ struct plat_stmmacenet_data { phy_interface_t phy_interface; struct stmmac_mdio_bus_data *mdio_bus_data; struct device_node *phy_node; - struct fwnode_handle *port_node; struct device_node *mdio_node; struct stmmac_dma_cfg *dma_cfg; struct stmmac_safety_feature_cfg *safety_feat_cfg; -- cgit v1.2.3 From 1558705afbb293549fdedd539682bc5240e1964b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 27 Feb 2026 09:53:59 +0000 Subject: net: stmmac: make dma_cfg mixed/fixed burst boolean struct stmmac_dma_cfg mixed_burst/fixed_burst members are both boolean in nature - of_property_read_bool() are used to read these from DT, and they are only tested for non-zero values. Use bool to avoid unnecessary padding in this structure. Update dwmac-intel to initialise these using true rather than '1', and remove the '0' initialisers as the struct is already zero initialised on allocation. 
Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vvuXn-0000000AvnX-4A1u@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 4 +--- include/linux/stmmac.h | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 92d77b0c2f54..ece2a0c38562 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -636,8 +636,6 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->dma_cfg->pbl = 32; plat->dma_cfg->pblx8 = true; - plat->dma_cfg->fixed_burst = 0; - plat->dma_cfg->mixed_burst = 0; plat->dma_cfg->aal = 0; plat->dma_cfg->dche = true; @@ -1106,7 +1104,7 @@ static int quark_default_data(struct pci_dev *pdev, plat->dma_cfg->pbl = 16; plat->dma_cfg->pblx8 = true; - plat->dma_cfg->fixed_burst = 1; + plat->dma_cfg->fixed_burst = true; /* AXI (TODO) */ return 0; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 77e51eaa5ec5..2fc169c7117e 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -97,8 +97,8 @@ struct stmmac_dma_cfg { int txpbl; int rxpbl; bool pblx8; - int fixed_burst; - int mixed_burst; + bool fixed_burst; + bool mixed_burst; bool aal; bool eame; bool multi_msi_en; -- cgit v1.2.3 From 1c36d186a0c81f3b55b2722736163233b05f8756 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:30 +0000 Subject: ipmr: Define net->ipv4.{ipmr_notifier_ops,ipmr_seq} under CONFIG_IP_MROUTE. net->ipv4.ipmr_notifier_ops and net->ipv4.ipmr_seq are used only in net/ipv4/ipmr.c. Let's move these definitions under CONFIG_IP_MROUTE. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-13-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8e971c7bf164..380ff34c0233 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -279,6 +279,8 @@ struct netns_ipv4 { struct list_head mr_tables; struct fib_rules_ops *mr_rules_ops; #endif + struct fib_notifier_ops *ipmr_notifier_ops; + unsigned int ipmr_seq; /* protected by rtnl_mutex */ #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; @@ -290,9 +292,6 @@ struct netns_ipv4 { struct fib_notifier_ops *notifier_ops; unsigned int fib_seq; /* writes protected by rtnl_mutex */ - struct fib_notifier_ops *ipmr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ - atomic_t rt_genid; siphash_key_t ip_id_key; struct hlist_head *inet_addr_lst; -- cgit v1.2.3 From 4480d5fa1f6ebe7dfc546e14371d63c8b915a82d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:31 +0000 Subject: ipmr/ip6mr: Convert net->ipv[46].ipmr_seq to atomic_t. We will no longer hold RTNL for ipmr_mfc_add() and ipmr_mfc_delete(). MFC entry can be loosely connected with VIF by its index for mrt->vif_table[] (stored in mfc_parent), but the two tables are not synchronised. i.e. Even if VIF 1 is removed, MFC for VIF 1 is not automatically removed. The only field that the MFC/VIF interfaces share is net->ipv[46].ipmr_seq, which is protected by RTNL. Adding a new mutex for both just to protect a single field is overkill. Let's convert the field to atomic_t. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-14-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 8 ++++---- include/net/netns/ipv4.h | 2 +- include/net/netns/ipv6.h | 2 +- net/ipv4/ipmr.c | 4 ++-- net/ipv6/ip6mr.c | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0075f6e5c3da..0baa6f994da9 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -76,7 +76,7 @@ static inline int mr_call_vif_notifiers(struct net *net, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, - unsigned int *ipmr_seq) + atomic_t *ipmr_seq) { struct vif_entry_notifier_info info = { .info = { @@ -89,7 +89,7 @@ static inline int mr_call_vif_notifiers(struct net *net, }; ASSERT_RTNL(); - (*ipmr_seq)++; + atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } @@ -198,7 +198,7 @@ static inline int mr_call_mfc_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, - unsigned int *ipmr_seq) + atomic_t *ipmr_seq) { struct mfc_entry_notifier_info info = { .info = { @@ -209,7 +209,7 @@ static inline int mr_call_mfc_notifiers(struct net *net, }; ASSERT_RTNL(); - (*ipmr_seq)++; + atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 380ff34c0233..94dca64fec41 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -280,7 +280,7 @@ struct netns_ipv4 { struct fib_rules_ops *mr_rules_ops; #endif struct fib_notifier_ops *ipmr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t ipmr_seq; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; diff --git 
a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 34bdb1308e8f..499e4288170f 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -118,7 +118,7 @@ struct netns_ipv6 { struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; struct fib_notifier_ops *ip6mr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t ipmr_seq; struct { struct hlist_head head; spinlock_t lock; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 07f2d4f8dcbe..6ec73796d84d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3226,7 +3226,7 @@ static const struct net_protocol pim_protocol = { static unsigned int ipmr_seq_read(const struct net *net) { - return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); + return atomic_read(&net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); } static int ipmr_dump(struct net *net, struct notifier_block *nb, @@ -3247,7 +3247,7 @@ static int __net_init ipmr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; - net->ipv4.ipmr_seq = 0; + atomic_set(&net->ipv4.ipmr_seq, 0); ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); if (IS_ERR(ops)) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e047a4680ab0..85010ff21c98 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1280,7 +1280,7 @@ static int ip6mr_device_event(struct notifier_block *this, static unsigned int ip6mr_seq_read(const struct net *net) { - return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); + return atomic_read(&net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, @@ -1305,7 +1305,7 @@ static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; - net->ipv6.ipmr_seq = 0; + atomic_set(&net->ipv6.ipmr_seq, 0); ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) -- cgit v1.2.3 From 3c1e53e55418d4ca4040e281501643a96e227974 Mon Sep 17 00:00:00 2001 
From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:32 +0000 Subject: ipmr: Add dedicated mutex for mrt->{mfc_hash,mfc_cache_list}. We will no longer hold RTNL for ipmr_rtm_route() to modify the MFC hash table. Only __dev_get_by_index() in rtm_to_ipmr_mfcc() is the RTNL dependant, otherwise, we just need protection for mrt->mfc_hash and mrt->mfc_cache_list. Let's add a new mutex for ipmr_mfc_add(), ipmr_mfc_delete(), and mroute_clean_tables() (setsockopt(MRT_FLUSH or MRT_DONE)). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-15-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + net/ipv4/ipmr.c | 28 ++++++++++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 94dca64fec41..4c249aeaf7f1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -281,6 +281,7 @@ struct netns_ipv4 { #endif struct fib_notifier_ops *ipmr_notifier_ops; atomic_t ipmr_seq; + struct mutex mfc_mutex; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 6ec73796d84d..d4983d8a9b2a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1329,6 +1329,8 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags, /* Wipe the cache */ if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) { + mutex_lock(&net->ipv4.mfc_mutex); + list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC))) @@ -1341,6 +1343,8 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags, mroute_netlink_event(mrt, cache, RTM_DELROUTE); mr_cache_put(c); } + + mutex_unlock(&net->ipv4.mfc_mutex); } if (flags & MRT_FLUSH_MFC) { @@ -1498,12 +1502,17 @@ int 
ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, } if (parent == 0) parent = mfc.mfcc_parent; + + mutex_lock(&net->ipv4.mfc_mutex); + if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) ret = ipmr_mfc_delete(mrt, &mfc, parent); else ret = ipmr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); + + mutex_unlock(&net->ipv4.mfc_mutex); break; case MRT_FLUSH: { LIST_HEAD(dev_kill_list); @@ -2913,21 +2922,26 @@ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - int ret, mrtsock, parent; - struct mr_table *tbl; + int ret, mrtsock = 0, parent; + struct mr_table *tbl = NULL; struct mfcctl mfcc; - mrtsock = 0; - tbl = NULL; ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack); if (ret < 0) return ret; parent = ret ? mfcc.mfcc_parent : -1; + + mutex_lock(&net->ipv4.mfc_mutex); + + if (nlh->nlmsg_type == RTM_NEWROUTE) - return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); + ret = ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); else - return ipmr_mfc_delete(tbl, &mfcc, parent); + ret = ipmr_mfc_delete(tbl, &mfcc, parent); + + mutex_unlock(&net->ipv4.mfc_mutex); + + return ret; } static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) @@ -3269,6 +3283,8 @@ static int __net_init ipmr_net_init(struct net *net) { LIST_HEAD(dev_kill_list); int err; + mutex_init(&net->ipv4.mfc_mutex); + err = ipmr_notifier_init(net); if (err) goto ipmr_notifier_fail; -- cgit v1.2.3 From bddafc06ca5ee1be4d10061f7954c6d6be5dc1d8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:33 +0000 Subject: ipmr: Don't hold RTNL for ipmr_rtm_route(). ipmr_mfc_add() and ipmr_mfc_delete() are already protected by a dedicated mutex. rtm_to_ipmr_mfcc() calls __ipmr_get_table(), __dev_get_by_index(), and ipmr_find_vif().
Once __dev_get_by_index() is converted to dev_get_by_index_rcu(), we can move the other two functions under that same RCU section and drop RTNL for ipmr_rtm_route(). Let's do that conversion and drop ASSERT_RTNL() in mr_call_mfc_notifiers(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-16-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 1 - net/ipv4/ipmr.c | 34 +++++++++++++++++++++------------- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0baa6f994da9..cf3374580f74 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -208,7 +208,6 @@ static inline int mr_call_mfc_notifiers(struct net *net, .tb_id = tb_id }; - ASSERT_RTNL(); atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d4983d8a9b2a..8a08d09b4c30 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1211,7 +1211,6 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; - /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); @@ -1238,7 +1237,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (mfc->mfcc_parent >= MAXVIFS) return -ENFILE; - /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); @@ -2853,10 +2851,10 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, { struct net_device *dev = NULL; u32 tblid = RT_TABLE_DEFAULT; + int ret, rem, iif = 0; struct mr_table *mrt; struct nlattr *attr; struct rtmsg *rtm; - int ret, rem; ret = nlmsg_validate_deprecated(nlh, 
sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, extack); @@ -2883,11 +2881,7 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr); break; case RTA_IIF: - dev = __dev_get_by_index(net, nla_get_u32(attr)); - if (!dev) { - ret = -ENODEV; - goto out; - } + iif = nla_get_u32(attr); break; case RTA_MULTIPATH: if (ipmr_nla_get_ttls(attr, mfcc) < 0) { @@ -2903,16 +2897,30 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, break; } } + + rcu_read_lock(); + mrt = __ipmr_get_table(net, tblid); if (!mrt) { ret = -ENOENT; - goto out; + goto unlock; } + + if (iif) { + dev = dev_get_by_index_rcu(net, iif); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + + mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); + } + *mrtret = mrt; *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0; - if (dev) - mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); +unlock: + rcu_read_unlock(); out: return ret; } @@ -3343,9 +3351,9 @@ static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK, .dumpit = ipmr_rtm_dumplink, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE, - .doit = ipmr_rtm_route}, + .doit = ipmr_rtm_route, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE, - .doit = ipmr_rtm_route}, + .doit = ipmr_rtm_route, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE, .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, -- cgit v1.2.3 From 425e080a1c34859395efcc377efead05dc6fae3b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sun, 1 Mar 2026 06:37:55 +0000 Subject: dccp Remove inet_hashinfo2_init_mod(). Commit c92c81df93df ("net: dccp: fix kernel crash on module load") added inet_hashinfo2_init_mod() for DCCP. 
Commit 22d6c9eebf2e ("net: Unexport shared functions for DCCP.") removed its EXPORT_SYMBOL_GPL() but forgot to remove the function itself. Let's remove inet_hashinfo2_init_mod(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260301063756.1581685-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 1 - net/ipv4/inet_hashtables.c | 34 ++++++++-------------------------- 2 files changed, 8 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ac05a52d9e13..8bddf58b1a85 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -286,7 +286,6 @@ void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit); -int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fca980772c81..52847950b28a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1246,22 +1246,13 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, __inet_check_established); } -static void init_hashinfo_lhash2(struct inet_hashinfo *h) -{ - int i; - - for (i = 0; i <= h->lhash2_mask; i++) { - spin_lock_init(&h->lhash2[i].lock); - INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, - i + LISTENING_NULLS_BASE); - } -} - void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit) { + unsigned int i; + h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2), numentries, @@ -1271,7 +1262,12 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, &h->lhash2_mask, low_limit, high_limit); -
init_hashinfo_lhash2(h); + + for (i = 0; i <= h->lhash2_mask; i++) { + spin_lock_init(&h->lhash2[i].lock); + INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, + i + LISTENING_NULLS_BASE); + } /* this one is used for source ports of outgoing connections */ table_perturb = alloc_large_system_hash("Table-perturb", @@ -1282,20 +1278,6 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, INET_TABLE_PERTURB_SIZE); } -int inet_hashinfo2_init_mod(struct inet_hashinfo *h) -{ - h->lhash2 = kmalloc_objs(*h->lhash2, INET_LHTABLE_SIZE); - if (!h->lhash2) - return -ENOMEM; - - h->lhash2_mask = INET_LHTABLE_SIZE - 1; - /* INET_LHTABLE_SIZE must be a power of 2 */ - BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); - - init_hashinfo_lhash2(h); - return 0; -} - int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); -- cgit v1.2.3 From c69855ada28656fdd7e197b6e24cd40a04fe14d3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 14:08:45 -0800 Subject: atm: atmdev: add function parameter names and description kernel-doc reports function parameters not described for parameters that are not named. Add parameter names for these functions and then describe the function parameters in kernel-doc format. 
Fixes these warnings: Warning: include/linux/atmdev.h:316 function parameter '' not described in 'register_atm_ioctl' Warning: include/linux/atmdev.h:321 function parameter '' not described in 'deregister_atm_ioctl' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260228220845.2978547-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/linux/atmdev.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 70807c679f1a..82a32526df64 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -309,17 +309,19 @@ struct atm_ioctl { /** * register_atm_ioctl - register handler for ioctl operations + * @ioctl: ioctl handler to register * * Special (non-device) handlers of ioctl's should * register here. If you're a normal device, you should * set .ioctl in your atmdev_ops instead. */ -void register_atm_ioctl(struct atm_ioctl *); +void register_atm_ioctl(struct atm_ioctl *ioctl); /** * deregister_atm_ioctl - remove the ioctl handler + * @ioctl: ioctl handler to deregister */ -void deregister_atm_ioctl(struct atm_ioctl *); +void deregister_atm_ioctl(struct atm_ioctl *ioctl); /* register_atmdevice_notifier - register atm_dev notify events -- cgit v1.2.3 From e0afbfe321d5131c56005f56fbf5d548340da749 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 6 Feb 2026 14:21:54 +0100 Subject: drm/client: Export drm_client_buffer_create() The helper drm_client_buffer_create() will be required by various drivers for fbdev emulation. 
Signed-off-by: Thomas Zimmermann Acked-by: Patrik Jakobsson Link: https://patch.msgid.link/20260206133458.226467-2-tzimmermann@suse.de --- drivers/gpu/drm/drm_client.c | 3 ++- include/drm/drm_client.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c index 6236ec46d62a..46c465bce98c 100644 --- a/drivers/gpu/drm/drm_client.c +++ b/drivers/gpu/drm/drm_client.c @@ -204,7 +204,7 @@ void drm_client_buffer_delete(struct drm_client_buffer *buffer) } EXPORT_SYMBOL(drm_client_buffer_delete); -static struct drm_client_buffer * +struct drm_client_buffer * drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, u32 format, u32 handle, u32 pitch) { @@ -265,6 +265,7 @@ err_delete: kfree(buffer); return ERR_PTR(ret); } +EXPORT_SYMBOL(drm_client_buffer_create); /** * drm_client_buffer_vmap_local - Map DRM client buffer into address space diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h index c972a8a3385b..49a21f3dcb36 100644 --- a/include/drm/drm_client.h +++ b/include/drm/drm_client.h @@ -195,6 +195,9 @@ struct drm_client_buffer { struct drm_framebuffer *fb; }; +struct drm_client_buffer * +drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, + u32 format, u32 handle, u32 pitch); struct drm_client_buffer * drm_client_buffer_create_dumb(struct drm_client_dev *client, u32 width, u32 height, u32 format); void drm_client_buffer_delete(struct drm_client_buffer *buffer); -- cgit v1.2.3 From 2b12ffb669553972f5cd017c69a2b81593b09106 Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Fri, 27 Feb 2026 00:15:02 -0800 Subject: net: mana: Trigger VF reset/recovery on health check failure due to HWC timeout The GF stats periodic query is used as a mechanism to monitor HWC health check. If this HWC command times out, it is a strong indication that the device/SoC is in a faulty state and requires recovery.
Today, when a timeout is detected, the driver marks hwc_timeout_occurred, clears cached stats, and stops rescheduling the periodic work. However, the device itself is left in the same failing state. Extend the timeout handling path to trigger the existing MANA VF recovery service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item. This is expected to initiate the appropriate recovery flow by suspend/resume first and if it fails then trigger a bus rescan. This change is intentionally limited to HWC command timeouts and does not trigger recovery for errors reported by the SoC as a normal command response. Signed-off-by: Dipayaan Roy Reviewed-by: Haiyang Zhang Reviewed-by: Simon Horman Link: https://patch.msgid.link/aaFShvKnwR5FY8dH@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 65 +++++++++++++------------ drivers/net/ethernet/microsoft/mana/mana_en.c | 9 +++- include/net/mana/gdma.h | 16 +++++- 3 files changed, 55 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 37d2f108a839..aef8612b73cb 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -490,15 +490,9 @@ static void mana_serv_reset(struct pci_dev *pdev) dev_info(&pdev->dev, "MANA reset cycle completed\n"); out: - gc->in_service = false; + clear_bit(GC_IN_SERVICE, &gc->flags); } -struct mana_serv_work { - struct work_struct serv_work; - struct pci_dev *pdev; - enum gdma_eqe_type type; -}; - static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev) { switch (type) { @@ -558,12 +552,42 @@ static void mana_serv_func(struct work_struct *w) module_put(THIS_MODULE); } +int mana_schedule_serv_work(struct gdma_context *gc, enum gdma_eqe_type type) +{ + struct mana_serv_work *mns_wk; + + if
(test_and_set_bit(GC_IN_SERVICE, &gc->flags)) { + dev_info(gc->dev, "Already in service\n"); + return -EBUSY; + } + + if (!try_module_get(THIS_MODULE)) { + dev_info(gc->dev, "Module is unloading\n"); + clear_bit(GC_IN_SERVICE, &gc->flags); + return -ENODEV; + } + + mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC); + if (!mns_wk) { + module_put(THIS_MODULE); + clear_bit(GC_IN_SERVICE, &gc->flags); + return -ENOMEM; + } + + dev_info(gc->dev, "Start MANA service type:%d\n", type); + mns_wk->pdev = to_pci_dev(gc->dev); + mns_wk->type = type; + pci_dev_get(mns_wk->pdev); + INIT_WORK(&mns_wk->serv_work, mana_serv_func); + schedule_work(&mns_wk->serv_work); + return 0; +} + static void mana_gd_process_eqe(struct gdma_queue *eq) { u32 head = eq->head % (eq->queue_size / GDMA_EQE_SIZE); struct gdma_context *gc = eq->gdma_dev->gdma_context; struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr; - struct mana_serv_work *mns_wk; union gdma_eqe_info eqe_info; enum gdma_eqe_type type; struct gdma_event event; @@ -623,30 +647,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) "Service is to be processed in probe\n"); break; } - - if (gc->in_service) { - dev_info(gc->dev, "Already in service\n"); - break; - } - - if (!try_module_get(THIS_MODULE)) { - dev_info(gc->dev, "Module is unloading\n"); - break; - } - - mns_wk = kzalloc_obj(*mns_wk, GFP_ATOMIC); - if (!mns_wk) { - module_put(THIS_MODULE); - break; - } - - dev_info(gc->dev, "Start MANA service type:%d\n", type); - gc->in_service = true; - mns_wk->pdev = to_pci_dev(gc->dev); - mns_wk->type = type; - pci_dev_get(mns_wk->pdev); - INIT_WORK(&mns_wk->serv_work, mana_serv_func); - schedule_work(&mns_wk->serv_work); + mana_schedule_serv_work(gc, type); break; default: diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 933e9d681ded..56ee993e3a43 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -875,7 +875,7 
@@ static void mana_tx_timeout(struct net_device *netdev, unsigned int txqueue) struct gdma_context *gc = ac->gdma_dev->gdma_context; /* Already in service, hence tx queue reset is not required.*/ - if (gc->in_service) + if (test_bit(GC_IN_SERVICE, &gc->flags)) return; /* Note: If there are pending queue reset work for this port(apc), @@ -3525,6 +3525,7 @@ static void mana_gf_stats_work_handler(struct work_struct *work) { struct mana_context *ac = container_of(to_delayed_work(work), struct mana_context, gf_stats_work); + struct gdma_context *gc = ac->gdma_dev->gdma_context; int err; err = mana_query_gf_stats(ac); @@ -3532,6 +3533,12 @@ static void mana_gf_stats_work_handler(struct work_struct *work) /* HWC timeout detected - reset stats and stop rescheduling */ ac->hwc_timeout_occurred = true; memset(&ac->hc_stats, 0, sizeof(ac->hc_stats)); + dev_warn(gc->dev, + "Gf stats wk handler: gf stats query timed out.\n"); + /* As HWC timed out, indicating a faulty HW state and needs a + * reset. 
+ */ + mana_schedule_serv_work(gc, GDMA_EQE_HWC_RESET_REQUEST); return; } schedule_delayed_work(&ac->gf_stats_work, MANA_GF_STATS_PERIOD); diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 766f4fb25e26..ec17004b10c0 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -215,6 +215,12 @@ enum gdma_page_type { #define GDMA_INVALID_DMA_REGION 0 +struct mana_serv_work { + struct work_struct serv_work; + struct pci_dev *pdev; + enum gdma_eqe_type type; +}; + struct gdma_mem_info { struct device *dev; @@ -386,6 +392,7 @@ struct gdma_irq_context { enum gdma_context_flags { GC_PROBE_SUCCEEDED = 0, + GC_IN_SERVICE = 1, }; struct gdma_context { @@ -411,7 +418,6 @@ struct gdma_context { u32 test_event_eq_id; bool is_pf; - bool in_service; phys_addr_t bar0_pa; void __iomem *bar0_va; @@ -473,6 +479,8 @@ int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe); void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit); +int mana_schedule_serv_work(struct gdma_context *gc, enum gdma_eqe_type type); + struct gdma_wqe { u32 reserved :24; u32 last_vbytes :8; @@ -615,6 +623,9 @@ enum { /* Driver can handle hardware recovery events during probe */ #define GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY BIT(22) +/* Driver supports self recovery on Hardware Channel timeouts */ +#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY BIT(25) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ @@ -628,7 +639,8 @@ enum { GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY | \ GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \ GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY | \ - GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY) + GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY | \ + GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY) #define GDMA_DRV_CAP_FLAGS2 0 -- cgit v1.2.3 From dc5f903b3ab6675721c8aa943d5cd0cb5ca2f5c8 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 27 Feb 2026 19:17:12 +0200 Subject: drm/i915: add VMA to parent 
interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's unclear what the direction of the VMA abstraction in the parent interface should be, but convert i915_vma_fence_id() to parent interface for starters. This paves the way for making struct i915_vma opaque towards display. Reviewed-by: Michał Grzelak Link: https://patch.msgid.link/036f4b2d20cc1b0a7ab814beb5bb914c53b6eb53.1772212579.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_fbc.c | 5 ++--- drivers/gpu/drm/i915/display/intel_parent.c | 9 +++++++++ drivers/gpu/drm/i915/display/intel_parent.h | 3 +++ drivers/gpu/drm/i915/i915_driver.c | 1 + drivers/gpu/drm/i915/i915_vma.c | 10 ++++++++++ drivers/gpu/drm/i915/i915_vma.h | 7 ++----- drivers/gpu/drm/xe/compat-i915-headers/i915_vma.h | 2 -- include/drm/intel/display_parent_interface.h | 7 +++++++ 8 files changed, 34 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_fbc.c b/drivers/gpu/drm/i915/display/intel_fbc.c index 91de38379282..3e9b3e532499 100644 --- a/drivers/gpu/drm/i915/display/intel_fbc.c +++ b/drivers/gpu/drm/i915/display/intel_fbc.c @@ -45,7 +45,6 @@ #include #include -#include "i915_vma.h" #include "i9xx_plane_regs.h" #include "intel_de.h" #include "intel_display_device.h" @@ -1463,7 +1462,7 @@ static void intel_fbc_update_state(struct intel_atomic_state *state, !intel_fbc_has_fences(display)); if (plane_state->flags & PLANE_HAS_FENCE) - fbc_state->fence_id = i915_vma_fence_id(plane_state->ggtt_vma); + fbc_state->fence_id = intel_parent_vma_fence_id(display, plane_state->ggtt_vma); else fbc_state->fence_id = -1; @@ -1490,7 +1489,7 @@ static bool intel_fbc_is_fence_ok(const struct intel_plane_state *plane_state) */ return DISPLAY_VER(display) >= 9 || (plane_state->flags & PLANE_HAS_FENCE && - i915_vma_fence_id(plane_state->ggtt_vma) != -1); + intel_parent_vma_fence_id(display, plane_state->ggtt_vma) != -1); } 
static bool intel_fbc_is_cfb_ok(const struct intel_plane_state *plane_state) diff --git a/drivers/gpu/drm/i915/display/intel_parent.c b/drivers/gpu/drm/i915/display/intel_parent.c index 89f78ca1cd15..0c5962cb2f6d 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.c +++ b/drivers/gpu/drm/i915/display/intel_parent.c @@ -317,6 +317,15 @@ void intel_parent_stolen_node_free(struct intel_display *display, const struct i display->parent->stolen->node_free(node); } +/* vma */ +int intel_parent_vma_fence_id(struct intel_display *display, const struct i915_vma *vma) +{ + if (!display->parent->vma) + return -1; + + return display->parent->vma->fence_id(vma); +} + /* generic */ void intel_parent_fence_priority_display(struct intel_display *display, struct dma_fence *fence) { diff --git a/drivers/gpu/drm/i915/display/intel_parent.h b/drivers/gpu/drm/i915/display/intel_parent.h index 2317482ef072..6e7d09133aee 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.h +++ b/drivers/gpu/drm/i915/display/intel_parent.h @@ -102,6 +102,9 @@ u64 intel_parent_stolen_node_size(struct intel_display *display, const struct in struct intel_stolen_node *intel_parent_stolen_node_alloc(struct intel_display *display); void intel_parent_stolen_node_free(struct intel_display *display, const struct intel_stolen_node *node); +/* vma */ +int intel_parent_vma_fence_id(struct intel_display *display, const struct i915_vma *vma); + /* generic */ bool intel_parent_has_auxccs(struct intel_display *display); bool intel_parent_has_fenced_regions(struct intel_display *display); diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index 5f77e891604d..18f912043f90 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -775,6 +775,7 @@ static const struct intel_display_parent_interface parent = { .rpm = &i915_display_rpm_interface, .rps = &i915_display_rps_interface, .stolen = &i915_display_stolen_interface, + .vma = 
&i915_display_vma_interface, .fence_priority_display = fence_priority_display, .has_auxccs = has_auxccs, diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index afc192d9931b..6a3a4d4244dc 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -27,6 +27,7 @@ #include #include +#include #include "display/intel_fb.h" #include "display/intel_frontbuffer.h" @@ -2332,3 +2333,12 @@ int __init i915_vma_module_init(void) return 0; } + +static int i915_vma_fence_id(const struct i915_vma *vma) +{ + return vma->fence ? vma->fence->id : -1; +} + +const struct intel_display_vma_interface i915_display_vma_interface = { + .fence_id = i915_vma_fence_id, +}; diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h index 8054047840aa..fa2d9b429db6 100644 --- a/drivers/gpu/drm/i915/i915_vma.h +++ b/drivers/gpu/drm/i915/i915_vma.h @@ -404,11 +404,6 @@ i915_vma_unpin_fence(struct i915_vma *vma) __i915_vma_unpin_fence(vma); } -static inline int i915_vma_fence_id(const struct i915_vma *vma) -{ - return vma->fence ? 
vma->fence->id : -1; -} - void i915_vma_parked(struct intel_gt *gt); static inline bool i915_vma_is_scanout(const struct i915_vma *vma) @@ -481,4 +476,6 @@ int i915_vma_module_init(void); I915_SELFTEST_DECLARE(int i915_vma_get_pages(struct i915_vma *vma)); I915_SELFTEST_DECLARE(void i915_vma_put_pages(struct i915_vma *vma)); +extern const struct intel_display_vma_interface i915_display_vma_interface; + #endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_vma.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_vma.h index c4b5adaaa99a..da1d97b48fee 100644 --- a/drivers/gpu/drm/xe/compat-i915-headers/i915_vma.h +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_vma.h @@ -26,8 +26,6 @@ struct i915_vma { struct xe_ggtt_node *node; }; -#define i915_vma_fence_id(vma) -1 - static inline u32 i915_ggtt_offset(const struct i915_vma *vma) { return xe_ggtt_node_addr(vma->node); diff --git a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index b4b0f58ae3ee..d02ab7cc1c92 100644 --- a/include/drm/intel/display_parent_interface.h +++ b/include/drm/intel/display_parent_interface.h @@ -149,6 +149,10 @@ struct intel_display_stolen_interface { void (*node_free)(const struct intel_stolen_node *node); }; +struct intel_display_vma_interface { + int (*fence_id)(const struct i915_vma *vma); +}; + /** * struct intel_display_parent_interface - services parent driver provides to display * @@ -198,6 +202,9 @@ struct intel_display_parent_interface { /** @stolen: Stolen memory. */ const struct intel_display_stolen_interface *stolen; + /** @vma: VMA interface. Optional. */ + const struct intel_display_vma_interface *vma; + /* Generic independent functions */ struct { /** @fence_priority_display: Set display priority. Optional. 
*/ -- cgit v1.2.3 From 83419c8fdbbc1dacd12fa614c5a3561e498aac5f Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sat, 28 Feb 2026 13:47:55 -0500 Subject: bpf: Factor out program return value calculation Factor the return value range calculation logic in check_return_code out of the function in preparation for separating the return value validation logic for BPF_EXIT and bpf_throw(). No functional changes. The change made in return_retval_code's handling of PROG_TRACING program types (not error'ing on the default case) is a no-op because the match on the program attach type is exhaustive. Acked-by: Eduard Zingerman Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260228184759.108145-2-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 226 ++++++++++++++++++++++++------------------- 2 files changed, 126 insertions(+), 101 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c1e30096ea7b..090aa26d1c98 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -265,6 +265,7 @@ struct bpf_reference_state { struct bpf_retval_range { s32 minval; s32 maxval; + bool return_32bit; }; /* state of the program: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 34f89ed29c47..89e6d2f6bdf6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2953,7 +2953,11 @@ static void init_reg_state(struct bpf_verifier_env *env, static struct bpf_retval_range retval_range(s32 minval, s32 maxval) { - return (struct bpf_retval_range){ minval, maxval }; + /* + * return_32bit is set to false by default and set explicitly + * by the caller when necessary. 
+ */ + return (struct bpf_retval_range){ minval, maxval, false }; } #define BPF_MAIN_FUNC (-1) @@ -11175,10 +11179,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) return is_rbtree_lock_required_kfunc(kfunc_btf_id); } -static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg, - bool return_32bit) +static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) { - if (return_32bit) + if (range.return_32bit) return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval; else return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; @@ -11222,7 +11225,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; /* enforce R0 return value range, and bpf_callback_t returns 64bit */ - if (!retval_range_within(callee->callback_ret_range, r0, false)) { + if (!retval_range_within(callee->callback_ret_range, r0)) { verbose_invalid_scalar(env, r0, callee->callback_ret_range, "At callback return", "R0"); return -EINVAL; @@ -17841,6 +17844,115 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } + +static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_range *range) +{ + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + + /* Default return value range. 
*/ + *range = retval_range(0, 1); + + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (env->prog->expected_attach_type) { + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: + *range = retval_range(1, 1); + break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + *range = retval_range(0, 3); + break; + default: + break; + } + break; + case BPF_PROG_TYPE_CGROUP_SKB: + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) + *range = retval_range(0, 3); + break; + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + if (!env->prog->aux->attach_btf_id) + return false; + *range = retval_range(0, 0); + break; + case BPF_PROG_TYPE_TRACING: + switch (env->prog->expected_attach_type) { + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: + *range = retval_range(0, 0); + break; + case BPF_TRACE_RAW_TP: + case BPF_MODIFY_RETURN: + return false; + case BPF_TRACE_ITER: + default: + break; + } + break; + case BPF_PROG_TYPE_KPROBE: + switch (env->prog->expected_attach_type) { + case BPF_TRACE_KPROBE_SESSION: + case BPF_TRACE_UPROBE_SESSION: + break; + default: + return false; + } + break; + case BPF_PROG_TYPE_SK_LOOKUP: + *range = retval_range(SK_DROP, SK_PASS); + break; + + case BPF_PROG_TYPE_LSM: + if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { + /* no range found, any return value is allowed */ + if (!get_func_retval_range(env->prog, range)) + return false; + /* no restricted range, any return value is allowed */ + if (range->minval == S32_MIN && range->maxval == S32_MAX) + return 
false; + range->return_32bit = true; + } else if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + *range = retval_range(1, 1); + } + break; + + case BPF_PROG_TYPE_NETFILTER: + *range = retval_range(NF_DROP, NF_ACCEPT); + break; + case BPF_PROG_TYPE_STRUCT_OPS: + *range = retval_range(0, 0); + break; + case BPF_PROG_TYPE_EXT: + /* freplace program can return anything as its return value + * depends on the to-be-replaced kernel func or bpf program. + */ + default: + return false; + } + + /* Continue calculating. */ + + return true; +} + static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) { const char *exit_ctx = "At program exit"; @@ -17849,18 +17961,17 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_retval_range range = retval_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - int err; struct bpf_func_state *frame = env->cur_state->frame[0]; const bool is_subprog = frame->subprogno; - bool return_32bit = false; const struct btf_type *reg_type, *ret_type = NULL; + int err; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog || frame->in_exception_callback_fn) { switch (prog_type) { case BPF_PROG_TYPE_LSM: if (prog->expected_attach_type == BPF_LSM_CGROUP) - /* See below, can be 0 or 0-1 depending on hook. */ + /* See return_retval_range, can be 0 or 0-1 depending on hook. 
*/ break; if (!prog->aux->attach_func_proto->type) return 0; @@ -17918,101 +18029,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return 0; } - switch (prog_type) { - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME) - range = retval_range(1, 1); - if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || - env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) - range = retval_range(0, 3); - break; - case BPF_PROG_TYPE_CGROUP_SKB: - if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { - range = retval_range(0, 3); - enforce_attach_type_range = tnum_range(2, 3); - } - break; - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_SOCK_OPS: - case BPF_PROG_TYPE_CGROUP_DEVICE: - case BPF_PROG_TYPE_CGROUP_SYSCTL: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - break; - case BPF_PROG_TYPE_RAW_TRACEPOINT: - if (!env->prog->aux->attach_btf_id) - return 0; - range = retval_range(0, 0); - break; - case BPF_PROG_TYPE_TRACING: - switch (env->prog->expected_attach_type) { - case BPF_TRACE_FENTRY: - case BPF_TRACE_FEXIT: - case BPF_TRACE_FSESSION: - range = retval_range(0, 0); - break; - case BPF_TRACE_RAW_TP: - case BPF_MODIFY_RETURN: - return 0; - case BPF_TRACE_ITER: - break; - default: - return -ENOTSUPP; - } - break; - case BPF_PROG_TYPE_KPROBE: - switch (env->prog->expected_attach_type) { - case BPF_TRACE_KPROBE_SESSION: - case 
BPF_TRACE_UPROBE_SESSION: - range = retval_range(0, 1); - break; - default: - return 0; - } - break; - case BPF_PROG_TYPE_SK_LOOKUP: - range = retval_range(SK_DROP, SK_PASS); - break; + if (prog_type == BPF_PROG_TYPE_STRUCT_OPS && !ret_type) + return 0; - case BPF_PROG_TYPE_LSM: - if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { - /* no range found, any return value is allowed */ - if (!get_func_retval_range(env->prog, &range)) - return 0; - /* no restricted range, any return value is allowed */ - if (range.minval == S32_MIN && range.maxval == S32_MAX) - return 0; - return_32bit = true; - } else if (!env->prog->aux->attach_func_proto->type) { - /* Make sure programs that attach to void - * hooks don't try to modify return value. - */ - range = retval_range(1, 1); - } - break; + if (prog_type == BPF_PROG_TYPE_CGROUP_SKB && (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS)) + enforce_attach_type_range = tnum_range(2, 3); - case BPF_PROG_TYPE_NETFILTER: - range = retval_range(NF_DROP, NF_ACCEPT); - break; - case BPF_PROG_TYPE_STRUCT_OPS: - if (!ret_type) - return 0; - range = retval_range(0, 0); - break; - case BPF_PROG_TYPE_EXT: - /* freplace program can return anything as its return value - * depends on the to-be-replaced kernel func or bpf program. 
- */ - default: + if (!return_retval_range(env, &range)) return 0; - } enforce_retval: if (reg->type != SCALAR_VALUE) { @@ -18025,7 +18049,7 @@ enforce_retval: if (err) return err; - if (!retval_range_within(range, reg, return_32bit)) { + if (!retval_range_within(range, reg)) { verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); if (!is_subprog && prog->expected_attach_type == BPF_LSM_CGROUP && -- cgit v1.2.3 From 2864fb6aa947703d290b52b1b030b0b74d0a6128 Mon Sep 17 00:00:00 2001 From: André Draszik Date: Mon, 2 Mar 2026 13:32:08 +0000 Subject: power: supply: max17042: initial support for Maxim MAX77759 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Maxim MAX77759 is a companion PMIC intended for use in mobile phones and tablets. It is used on Google Pixel 6 and 6 Pro (oriole and raven). Amongst others, it contains a fuel gauge that is similar to the ones supported by this driver. The fuel gauge can measure battery charge and discharge current, battery voltage, battery temperature, and the Type C connector's temperature. The MAX77759 incorporates the Maxim ModelGauge m5 algorithm. It, as well as previous generations like m3 on max17047/max17050, requires the host to save/restore some register values across power cycles to maintain full accuracy. Extending the driver for such support is out of scope in this initial commit. 
Reviewed-by: Peter Griffin Signed-off-by: André Draszik Link: https://patch.msgid.link/20260302-max77759-fg-v3-9-3c5f01dbda23@linaro.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/max17042_battery.c | 59 ++++++++++++++++++++++++++++++--- include/linux/power/max17042_battery.h | 24 ++++++++++++-- 2 files changed, 77 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index e21d2bd7e231..b9a21cef2cc6 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -650,7 +650,8 @@ static void max17042_write_config_regs(struct max17042_chip *chip) regmap_write(map, MAX17042_RelaxCFG, config->relax_cfg); if (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17047 || chip->chip_type == MAXIM_DEVICE_TYPE_MAX17050 || - chip->chip_type == MAXIM_DEVICE_TYPE_MAX17055) + chip->chip_type == MAXIM_DEVICE_TYPE_MAX17055 || + chip->chip_type == MAXIM_DEVICE_TYPE_MAX77759) regmap_write(map, MAX17047_FullSOCThr, config->full_soc_thresh); } @@ -787,7 +788,8 @@ static inline void max17042_override_por_values(struct max17042_chip *chip) if ((chip->chip_type == MAXIM_DEVICE_TYPE_MAX17042) || (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17047) || - (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17050)) { + (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17050) || + (chip->chip_type == MAXIM_DEVICE_TYPE_MAX77759)) { max17042_override_por(map, MAX17042_IAvg_empty, config->iavg_empty); max17042_override_por(map, MAX17042_TempNom, config->temp_nom); max17042_override_por(map, MAX17042_TempLim, config->temp_lim); @@ -796,7 +798,8 @@ static inline void max17042_override_por_values(struct max17042_chip *chip) if ((chip->chip_type == MAXIM_DEVICE_TYPE_MAX17047) || (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17050) || - (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17055)) { + (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17055) || + (chip->chip_type == MAXIM_DEVICE_TYPE_MAX77759)) { 
max17042_override_por(map, MAX17047_V_empty, config->vempty); } } @@ -1019,6 +1022,45 @@ static const struct regmap_config max17042_regmap_config = { .val_format_endian = REGMAP_ENDIAN_NATIVE, }; +static const struct regmap_range max77759_fg_registers[] = { + regmap_reg_range(MAX17042_STATUS, MAX77759_MixAtFull), + regmap_reg_range(MAX17042_VFSOC0Enable, MAX17042_VFSOC0Enable), + regmap_reg_range(MAX17042_MLOCKReg1, MAX17042_MLOCKReg2), + regmap_reg_range(MAX17042_MODELChrTbl, MAX17055_TimerH), + regmap_reg_range(MAX77759_IIn, MAX77759_IIn), + regmap_reg_range(MAX17055_AtQResidual, MAX17055_AtAvCap), + regmap_reg_range(MAX17042_OCVInternal, MAX17042_OCVInternal), + regmap_reg_range(MAX17042_VFSOC, MAX17042_VFSOC), +}; + +static const struct regmap_range max77759_fg_ro_registers[] = { + regmap_reg_range(MAX17042_FSTAT, MAX17042_FSTAT), + regmap_reg_range(MAX17042_OCVInternal, MAX17042_OCVInternal), + regmap_reg_range(MAX17042_VFSOC, MAX17042_VFSOC), +}; + +static const struct regmap_access_table max77759_fg_write_table = { + .yes_ranges = max77759_fg_registers, + .n_yes_ranges = ARRAY_SIZE(max77759_fg_registers), + .no_ranges = max77759_fg_ro_registers, + .n_no_ranges = ARRAY_SIZE(max77759_fg_ro_registers), +}; + +static const struct regmap_access_table max77759_fg_rd_table = { + .yes_ranges = max77759_fg_registers, + .n_yes_ranges = ARRAY_SIZE(max77759_fg_registers), +}; + +static const struct regmap_config max77759_fg_regmap_cfg = { + .reg_bits = 8, + .val_bits = 16, + .max_register = 0xff, + .wr_table = &max77759_fg_write_table, + .rd_table = &max77759_fg_rd_table, + .val_format_endian = REGMAP_ENDIAN_NATIVE, + .cache_type = REGCACHE_NONE, +}; + static const struct power_supply_desc max17042_psy_desc = { .name = "max170xx_battery", .type = POWER_SUPPLY_TYPE_BATTERY, @@ -1045,6 +1087,7 @@ static int max17042_probe(struct i2c_client *client, struct device *dev, int irq { struct i2c_adapter *adapter = client->adapter; const struct power_supply_desc *max17042_desc = 
&max17042_psy_desc; + const struct regmap_config *regmap_config; struct power_supply_config psy_cfg = {}; struct max17042_chip *chip; int ret; @@ -1060,7 +1103,12 @@ static int max17042_probe(struct i2c_client *client, struct device *dev, int irq chip->dev = dev; chip->chip_type = chip_type; - chip->regmap = devm_regmap_init_i2c(client, &max17042_regmap_config); + + if (chip->chip_type == MAXIM_DEVICE_TYPE_MAX77759) + regmap_config = &max77759_fg_regmap_cfg; + else + regmap_config = &max17042_regmap_config; + chip->regmap = devm_regmap_init_i2c(client, regmap_config); if (IS_ERR(chip->regmap)) return dev_err_probe(dev, PTR_ERR(chip->regmap), "Failed to initialize regmap\n"); @@ -1241,6 +1289,8 @@ static const struct of_device_id max17042_dt_match[] __used = { .data = (void *) MAXIM_DEVICE_TYPE_MAX17055 }, { .compatible = "maxim,max77705-battery", .data = (void *) MAXIM_DEVICE_TYPE_MAX17047 }, + { .compatible = "maxim,max77759-fg", + .data = (void *) MAXIM_DEVICE_TYPE_MAX77759 }, { .compatible = "maxim,max77849-battery", .data = (void *) MAXIM_DEVICE_TYPE_MAX17047 }, { }, @@ -1253,6 +1303,7 @@ static const struct i2c_device_id max17042_id[] = { { "max17047", MAXIM_DEVICE_TYPE_MAX17047 }, { "max17050", MAXIM_DEVICE_TYPE_MAX17050 }, { "max17055", MAXIM_DEVICE_TYPE_MAX17055 }, + { "max77759-fg", MAXIM_DEVICE_TYPE_MAX77759 }, { "max77849-battery", MAXIM_DEVICE_TYPE_MAX17047 }, { } }; diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index c417abd2ab70..05097f08ea36 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -105,7 +105,7 @@ enum max17042_register { MAX17042_OCV = 0xEE, - MAX17042_OCVInternal = 0xFB, /* MAX17055 VFOCV */ + MAX17042_OCVInternal = 0xFB, /* MAX17055/77759 VFOCV */ MAX17042_VFSOC = 0xFF, }; @@ -156,7 +156,7 @@ enum max17055_register { MAX17055_AtAvCap = 0xDF, }; -/* Registers specific to max17047/50/55 */ +/* Registers specific to max17047/50/55/77759 */ enum 
max17047_register { MAX17047_QRTbl00 = 0x12, MAX17047_FullSOCThr = 0x13, @@ -167,12 +167,32 @@ enum max17047_register { MAX17047_QRTbl30 = 0x42, }; +enum max77759_register { + MAX77759_AvgTA0 = 0x26, + MAX77759_AtTTF = 0x33, + MAX77759_Tconvert = 0x34, + MAX77759_AvgCurrent0 = 0x3B, + MAX77759_THMHOT = 0x40, + MAX77759_CTESample = 0x41, + MAX77759_ISys = 0x43, + MAX77759_AvgVCell0 = 0x44, + MAX77759_RlxSOC = 0x47, + MAX77759_AvgISys = 0x4B, + MAX77759_QH0 = 0x4C, + MAX77759_MixAtFull = 0x4F, + MAX77759_VSys = 0xB1, + MAX77759_TAlrtTh2 = 0xB2, + MAX77759_VByp = 0xB3, + MAX77759_IIn = 0xD0, +}; + enum max170xx_chip_type { MAXIM_DEVICE_TYPE_UNKNOWN = 0, MAXIM_DEVICE_TYPE_MAX17042, MAXIM_DEVICE_TYPE_MAX17047, MAXIM_DEVICE_TYPE_MAX17050, MAXIM_DEVICE_TYPE_MAX17055, + MAXIM_DEVICE_TYPE_MAX77759, MAXIM_DEVICE_TYPE_NUM }; -- cgit v1.2.3 From 83a86e27c34d06ec2dc117fb293e80f78402df49 Mon Sep 17 00:00:00 2001 From: André Draszik Date: Mon, 2 Mar 2026 13:32:09 +0000 Subject: power: supply: max17042: consider task period (max77759) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several (register) values reported by the fuel gauge depend on its internal task period and it needs to be taken into account when calculating results. All relevant example formulas in the data sheet assume the default task period (of 5760) and final results need to be adjusted based on the task period in effect. Update the code as and where necessary. 
Reviewed-by: Peter Griffin Signed-off-by: André Draszik Link: https://patch.msgid.link/20260302-max77759-fg-v3-10-3c5f01dbda23@linaro.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/max17042_battery.c | 20 ++++++++++++++++++++ include/linux/power/max17042_battery.h | 1 + 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index b9a21cef2cc6..bafbf8706055 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -61,6 +61,7 @@ struct max17042_chip { struct work_struct work; int init_complete; int irq; + int task_period; }; static enum power_supply_property max17042_battery_props[] = { @@ -331,6 +332,8 @@ static int max17042_get_property(struct power_supply *psy, return ret; data64 = data * 5000000ll; + data64 *= chip->task_period; + do_div(data64, MAX17042_DEFAULT_TASK_PERIOD); do_div(data64, chip->pdata->r_sns); val->intval = data64; break; @@ -340,6 +343,8 @@ static int max17042_get_property(struct power_supply *psy, return ret; data64 = data * 5000000ll; + data64 *= chip->task_period; + do_div(data64, MAX17042_DEFAULT_TASK_PERIOD); do_div(data64, chip->pdata->r_sns); val->intval = data64; break; @@ -349,6 +354,8 @@ static int max17042_get_property(struct power_supply *psy, return ret; data64 = data * 5000000ll; + data64 *= chip->task_period; + do_div(data64, MAX17042_DEFAULT_TASK_PERIOD); do_div(data64, chip->pdata->r_sns); val->intval = data64; break; @@ -358,6 +365,8 @@ static int max17042_get_property(struct power_supply *psy, return ret; data64 = sign_extend64(data, 15) * 5000000ll; + data64 *= chip->task_period; + data64 = div_s64(data64, MAX17042_DEFAULT_TASK_PERIOD); val->intval = div_s64(data64, chip->pdata->r_sns); break; case POWER_SUPPLY_PROP_TEMP: @@ -1142,6 +1151,17 @@ static int max17042_probe(struct i2c_client *client, struct device *dev, int irq regmap_write(chip->regmap, MAX17042_LearnCFG, 
0x0007); } + chip->task_period = MAX17042_DEFAULT_TASK_PERIOD; + if (chip->chip_type == MAXIM_DEVICE_TYPE_MAX77759) { + ret = regmap_read(chip->regmap, MAX17042_TaskPeriod, &val); + if (ret) + return dev_err_probe(dev, ret, + "failed to read task period\n"); + chip->task_period = val; + } + dev_dbg(dev, "task period: %#.4x (%d)\n", chip->task_period, + chip->task_period); + chip->battery = devm_power_supply_register(dev, max17042_desc, &psy_cfg); if (IS_ERR(chip->battery)) diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 05097f08ea36..d5b08313cf11 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -17,6 +17,7 @@ #define MAX17042_DEFAULT_VMAX (4500) /* LiHV cell max */ #define MAX17042_DEFAULT_TEMP_MIN (0) /* For sys without temp sensor */ #define MAX17042_DEFAULT_TEMP_MAX (700) /* 70 degrees Celcius */ +#define MAX17042_DEFAULT_TASK_PERIOD (5760) /* Consider RepCap which is less then 10 units below FullCAP full */ #define MAX17042_FULL_THRESHOLD 10 -- cgit v1.2.3 From 48f7a50c027dd2abb9e7b8a6ecc8e531d87f2c21 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 17 Feb 2026 14:11:11 +0100 Subject: stop_machine: Fix the documentation for a NULL cpus argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A recent refactoring of the kernel-docs for stop machine changed the description of the cpus parameter from "NULL = any online cpu" to "NULL = run on each online CPU". However the callback is only executed on a single CPU, not all of them. The old wording was a bit ambiguous and could have been read both ways. Reword the documentation to be correct again and hopefully also clearer. Fixes: fc6f89dc7078 ("stop_machine: Improve kernel-doc function-header comments") Signed-off-by: Thomas Weißschuh Signed-off-by: Paul E. 
McKenney Reviewed-by: Sebastian Andrzej Siewior --- include/linux/stop_machine.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 72820503514c..01011113d226 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -99,7 +99,7 @@ static inline void print_stop_info(const char *log_lvl, struct task_struct *task * stop_machine: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr to pass to @fn() - * @cpus: the cpus to run @fn() on (NULL = run on each online CPU) + * @cpus: the cpus to run @fn() on (NULL = one unspecified online CPU) * * Description: This causes a thread to be scheduled on every CPU, which * will run with interrupts disabled. Each CPU specified by @cpus will @@ -133,7 +133,7 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); * stop_machine_cpuslocked: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr to pass to @fn() - * @cpus: the cpus to run @fn() on (NULL = run on each online CPU) + * @cpus: the cpus to run @fn() on (NULL = one unspecified online CPU) * * Same as above. Avoids nested calls to cpus_read_lock(). * -- cgit v1.2.3 From b93311673263bb98a200ab1cb6304f969bdada5c Mon Sep 17 00:00:00 2001 From: Jouni Högander Date: Wed, 25 Feb 2026 09:42:20 +0200 Subject: drm/dp: Add definition for Panel Replay full-line granularity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DP specification is saying value 0xff 0xff in PANEL REPLAY SELECTIVE UPDATE X GRANULARITY CAPABILITY registers (0xb2 and 0xb3) means full-line granularity. Add definition for this. 
Cc: dri-devel@lists.freedesktop.org Signed-off-by: Jouni Högander Reviewed-by: Uma Shankar Acked-by: Maarten Lankhorst Link: https://patch.msgid.link/20260225074221.1744330-1-jouni.hogander@intel.com --- include/drm/display/drm_dp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h index e4eebabab975..8b15d3eeb716 100644 --- a/include/drm/display/drm_dp.h +++ b/include/drm/display/drm_dp.h @@ -571,6 +571,8 @@ # define DP_PANEL_REPLAY_LINK_OFF_SUPPORTED_IN_PR_AFTER_ADAPTIVE_SYNC_SDP (1 << 7) #define DP_PANEL_REPLAY_CAP_X_GRANULARITY 0xb2 +# define DP_PANEL_REPLAY_FULL_LINE_GRANULARITY 0xffff + #define DP_PANEL_REPLAY_CAP_Y_GRANULARITY 0xb4 /* Link Configuration */ -- cgit v1.2.3 From 9cc60ec453fe5d58d4faa70829814769a8af24d4 Mon Sep 17 00:00:00 2001 From: Qinxin Xia Date: Wed, 25 Feb 2026 17:37:58 +0800 Subject: dma-mapping: benchmark: modify the framework to adapt to more map modes This patch adjusts the DMA map benchmark framework to make it more flexible and adaptable to other mapping modes in the future. By abstracting the framework into five interfaces (prepare, unprepare, initialize_data, do_map, and do_unmap), new map schemes can be introduced more easily without major modifications to the existing code structure.
Reviewed-by: Barry Song Signed-off-by: Qinxin Xia Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260225093800.3625054-2-xiaqinxin@huawei.com --- include/uapi/linux/map_benchmark.h | 8 ++- kernel/dma/map_benchmark.c | 131 ++++++++++++++++++++++++++++++------- 2 files changed, 115 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/map_benchmark.h b/include/uapi/linux/map_benchmark.h index c2d91088a40d..e076748f2120 100644 --- a/include/uapi/linux/map_benchmark.h +++ b/include/uapi/linux/map_benchmark.h @@ -17,6 +17,11 @@ #define DMA_MAP_TO_DEVICE 1 #define DMA_MAP_FROM_DEVICE 2 +enum { + DMA_MAP_BENCH_SINGLE_MODE, + DMA_MAP_BENCH_MODE_MAX +}; + struct map_benchmark { __u64 avg_map_100ns; /* average map latency in 100ns */ __u64 map_stddev; /* standard deviation of map latency */ @@ -29,7 +34,8 @@ struct map_benchmark { __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ - __u8 expansion[76]; /* For future use */ + __u8 map_mode; /* the mode of dma map */ + __u8 expansion[75]; /* For future use */ }; #endif /* _UAPI_DMA_BENCHMARK_H */ diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 0f33b3ea7daf..b80e0fb399b1 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -5,6 +5,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -31,17 +32,105 @@ struct map_benchmark_data { atomic64_t loops; }; +struct map_benchmark_ops { + void *(*prepare)(struct map_benchmark_data *map); + void (*unprepare)(void *mparam); + void (*initialize_data)(void *mparam); + int (*do_map)(void *mparam); + void (*do_unmap)(void *mparam); +}; + +struct dma_single_map_param { + struct device *dev; + dma_addr_t addr; + void *xbuf; + u32 npages; + u32 dma_dir; +}; + +static void *dma_single_map_benchmark_prepare(struct map_benchmark_data *map) +{ + struct 
dma_single_map_param *params __free(kfree) = kzalloc(sizeof(*params), + GFP_KERNEL); + if (!params) + return NULL; + + params->npages = map->bparam.granule; + params->dma_dir = map->bparam.dma_dir; + params->dev = map->dev; + params->xbuf = alloc_pages_exact(params->npages * PAGE_SIZE, GFP_KERNEL); + if (!params->xbuf) + return NULL; + + return_ptr(params); +} + +static void dma_single_map_benchmark_unprepare(void *mparam) +{ + struct dma_single_map_param *params = mparam; + + free_pages_exact(params->xbuf, params->npages * PAGE_SIZE); + kfree(params); +} + +static void dma_single_map_benchmark_initialize_data(void *mparam) +{ + struct dma_single_map_param *params = mparam; + + /* + * for a non-coherent device, if we don't stain them in the + * cache, this will give an underestimate of the real-world + * overhead of BIDIRECTIONAL or TO_DEVICE mappings; + * 66 means everything goes well! 66 is lucky. + */ + if (params->dma_dir != DMA_FROM_DEVICE) + memset(params->xbuf, 0x66, params->npages * PAGE_SIZE); +} + +static int dma_single_map_benchmark_do_map(void *mparam) +{ + struct dma_single_map_param *params = mparam; + + params->addr = dma_map_single(params->dev, params->xbuf, + params->npages * PAGE_SIZE, params->dma_dir); + if (unlikely(dma_mapping_error(params->dev, params->addr))) { + pr_err("dma_map_single failed on %s\n", dev_name(params->dev)); + return -ENOMEM; + } + + return 0; +} + +static void dma_single_map_benchmark_do_unmap(void *mparam) +{ + struct dma_single_map_param *params = mparam; + + dma_unmap_single(params->dev, params->addr, + params->npages * PAGE_SIZE, params->dma_dir); +} + +static struct map_benchmark_ops dma_single_map_benchmark_ops = { + .prepare = dma_single_map_benchmark_prepare, + .unprepare = dma_single_map_benchmark_unprepare, + .initialize_data = dma_single_map_benchmark_initialize_data, + .do_map = dma_single_map_benchmark_do_map, + .do_unmap = dma_single_map_benchmark_do_unmap, +}; + +static struct map_benchmark_ops 
*dma_map_benchmark_ops[DMA_MAP_BENCH_MODE_MAX] = { + [DMA_MAP_BENCH_SINGLE_MODE] = &dma_single_map_benchmark_ops, +}; + static int map_benchmark_thread(void *data) { - void *buf; - dma_addr_t dma_addr; struct map_benchmark_data *map = data; - int npages = map->bparam.granule; - u64 size = npages * PAGE_SIZE; + __u8 map_mode = map->bparam.map_mode; int ret = 0; - buf = alloc_pages_exact(size, GFP_KERNEL); - if (!buf) + struct map_benchmark_ops *mb_ops = dma_map_benchmark_ops[map_mode]; + void *mparam = mb_ops->prepare(map); + + if (!mparam) return -ENOMEM; while (!kthread_should_stop()) { @@ -49,23 +138,12 @@ static int map_benchmark_thread(void *data) ktime_t map_stime, map_etime, unmap_stime, unmap_etime; ktime_t map_delta, unmap_delta; - /* - * for a non-coherent device, if we don't stain them in the - * cache, this will give an underestimate of the real-world - * overhead of BIDIRECTIONAL or TO_DEVICE mappings; - * 66 means evertything goes well! 66 is lucky. - */ - if (map->dir != DMA_FROM_DEVICE) - memset(buf, 0x66, size); - + mb_ops->initialize_data(mparam); map_stime = ktime_get(); - dma_addr = dma_map_single(map->dev, buf, size, map->dir); - if (unlikely(dma_mapping_error(map->dev, dma_addr))) { - pr_err("dma_map_single failed on %s\n", - dev_name(map->dev)); - ret = -ENOMEM; + ret = mb_ops->do_map(mparam); + if (ret) goto out; - } + map_etime = ktime_get(); map_delta = ktime_sub(map_etime, map_stime); @@ -73,7 +151,8 @@ static int map_benchmark_thread(void *data) ndelay(map->bparam.dma_trans_ns); unmap_stime = ktime_get(); - dma_unmap_single(map->dev, dma_addr, size, map->dir); + mb_ops->do_unmap(mparam); + unmap_etime = ktime_get(); unmap_delta = ktime_sub(unmap_etime, unmap_stime); @@ -108,7 +187,7 @@ static int map_benchmark_thread(void *data) } out: - free_pages_exact(buf, size); + mb_ops->unprepare(mparam); return ret; } @@ -209,6 +288,12 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case DMA_MAP_BENCHMARK: + 
if (map->bparam.map_mode < 0 || + map->bparam.map_mode >= DMA_MAP_BENCH_MODE_MAX) { + pr_err("invalid map mode\n"); + return -EINVAL; + } + if (map->bparam.threads == 0 || map->bparam.threads > DMA_MAP_MAX_THREADS) { pr_err("invalid thread number\n"); -- cgit v1.2.3 From a8d14dd6e621f47344d0eda72f7ce9203bdef4f1 Mon Sep 17 00:00:00 2001 From: Qinxin Xia Date: Wed, 25 Feb 2026 17:37:59 +0800 Subject: dma-mapping: benchmark: add support for dma_map_sg Add support for DMA scatter-gather mapping, intended for testing mapping performance. This is achieved by introducing the dma_sg_map_param structure and related functions, which enable the implementation of scatter-gather mapping preparation, mapping, and unmapping operations. Additionally, the dma_map_benchmark_ops array is updated to include operations for scatter-gather mapping. This commit aims to provide a wider range of mapping performance tests to cater to different scenarios. Reviewed-by: Barry Song Signed-off-by: Qinxin Xia Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260225093800.3625054-3-xiaqinxin@huawei.com --- include/uapi/linux/map_benchmark.h | 5 +- kernel/dma/map_benchmark.c | 115 +++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/map_benchmark.h b/include/uapi/linux/map_benchmark.h index e076748f2120..4b17829a9f17 100644 --- a/include/uapi/linux/map_benchmark.h +++ b/include/uapi/linux/map_benchmark.h @@ -19,6 +19,7 @@ enum { DMA_MAP_BENCH_SINGLE_MODE, + DMA_MAP_BENCH_SG_MODE, DMA_MAP_BENCH_MODE_MAX }; @@ -33,7 +34,9 @@ struct map_benchmark { __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u32 granule; /* - SINGLE_MODE: number of pages mapped/unmapped per operation + * - SG_MODE: number of scatterlist entries (each maps one
page) + */ __u8 map_mode; /* the mode of dma map */ __u8 expansion[75]; /* For future use */ }; diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index b80e0fb399b1..29eeb5fdf199 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -117,8 +118,122 @@ static struct map_benchmark_ops dma_single_map_benchmark_ops = { .do_unmap = dma_single_map_benchmark_do_unmap, }; +struct dma_sg_map_param { + struct sg_table sgt; + struct device *dev; + void **buf; + u32 npages; + u32 dma_dir; +}; + +static void *dma_sg_map_benchmark_prepare(struct map_benchmark_data *map) +{ + struct scatterlist *sg; + int i; + + struct dma_sg_map_param *params = kzalloc(sizeof(*params), GFP_KERNEL); + + if (!params) + return NULL; + /* + * Set the number of scatterlist entries based on the granule. + * In SG mode, 'granule' represents the number of scatterlist entries. + * Each scatterlist entry corresponds to a single page. 
+ */ + params->npages = map->bparam.granule; + params->dma_dir = map->bparam.dma_dir; + params->dev = map->dev; + params->buf = kmalloc_array(params->npages, sizeof(*params->buf), + GFP_KERNEL); + if (!params->buf) + goto out; + + if (sg_alloc_table(¶ms->sgt, params->npages, GFP_KERNEL)) + goto free_buf; + + for_each_sgtable_sg(¶ms->sgt, sg, i) { + params->buf[i] = (void *)__get_free_page(GFP_KERNEL); + if (!params->buf[i]) + goto free_page; + + sg_set_buf(sg, params->buf[i], PAGE_SIZE); + } + + return params; + +free_page: + while (i-- > 0) + free_page((unsigned long)params->buf[i]); + + sg_free_table(¶ms->sgt); +free_buf: + kfree(params->buf); +out: + kfree(params); + return NULL; +} + +static void dma_sg_map_benchmark_unprepare(void *mparam) +{ + struct dma_sg_map_param *params = mparam; + int i; + + for (i = 0; i < params->npages; i++) + free_page((unsigned long)params->buf[i]); + + sg_free_table(¶ms->sgt); + + kfree(params->buf); + kfree(params); +} + +static void dma_sg_map_benchmark_initialize_data(void *mparam) +{ + struct dma_sg_map_param *params = mparam; + struct scatterlist *sg; + int i = 0; + + if (params->dma_dir == DMA_FROM_DEVICE) + return; + + for_each_sgtable_sg(¶ms->sgt, sg, i) + memset(params->buf[i], 0x66, PAGE_SIZE); +} + +static int dma_sg_map_benchmark_do_map(void *mparam) +{ + struct dma_sg_map_param *params = mparam; + int ret = 0; + + int sg_mapped = dma_map_sg(params->dev, params->sgt.sgl, + params->npages, params->dma_dir); + if (!sg_mapped) { + pr_err("dma_map_sg failed on %s\n", dev_name(params->dev)); + ret = -ENOMEM; + } + + return ret; +} + +static void dma_sg_map_benchmark_do_unmap(void *mparam) +{ + struct dma_sg_map_param *params = mparam; + + dma_unmap_sg(params->dev, params->sgt.sgl, params->npages, + params->dma_dir); +} + +static struct map_benchmark_ops dma_sg_map_benchmark_ops = { + .prepare = dma_sg_map_benchmark_prepare, + .unprepare = dma_sg_map_benchmark_unprepare, + .initialize_data = 
dma_sg_map_benchmark_initialize_data, + .do_map = dma_sg_map_benchmark_do_map, + .do_unmap = dma_sg_map_benchmark_do_unmap, +}; + static struct map_benchmark_ops *dma_map_benchmark_ops[DMA_MAP_BENCH_MODE_MAX] = { [DMA_MAP_BENCH_SINGLE_MODE] = &dma_single_map_benchmark_ops, + [DMA_MAP_BENCH_SG_MODE] = &dma_sg_map_benchmark_ops, }; static int map_benchmark_thread(void *data) -- cgit v1.2.3 From 831fb31b76aea1453229dfd7cbd1946ffe1c03b5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 24 Feb 2026 21:09:31 +0100 Subject: ipv6: make ipv6_anycast_destination logic usable without dst_entry nft_fib_ipv6 uses ipv6_anycast_destination(), but upcoming patch removes the dst_entry usage in favor of fib6_result. Move the 'plen > 127' logic to a new helper and call it from the existing one. Signed-off-by: Florian Westphal --- include/net/ip6_route.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index a55f9bf95fe3..0c8eeb6abe7a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -252,15 +252,22 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb) return rt->rt6i_flags & RTF_LOCAL; } +static inline bool __ipv6_anycast_destination(const struct rt6key *rt6i_dst, + u32 rt6i_flags, + const struct in6_addr *daddr) +{ + return rt6i_flags & RTF_ANYCAST || + (rt6i_dst->plen < 127 && + !(rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && + ipv6_addr_equal(&rt6i_dst->addr, daddr)); +} + static inline bool ipv6_anycast_destination(const struct dst_entry *dst, const struct in6_addr *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); - return rt->rt6i_flags & RTF_ANYCAST || - (rt->rt6i_dst.plen < 127 && - !(rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && - ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)); + return __ipv6_anycast_destination(&rt->rt6i_dst, rt->rt6i_flags, daddr); } int ip6_fragment(struct net *net, struct sock *sk, struct 
sk_buff *skb, -- cgit v1.2.3 From 1ac252ad036cdb18f5fb7f76bb6061adfed9cedf Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 3 Mar 2026 23:04:04 +0200 Subject: rculist_bl: add hlist_bl_for_each_entry_continue_rcu Change the old hlist_bl_first_rcu to hlist_bl_first_rcu_dereference to indicate that it is a RCU dereference. Add hlist_bl_next_rcu and hlist_bl_first_rcu to use RCU pointers and use them to fix sparse warnings. Add hlist_bl_for_each_entry_continue_rcu. Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- include/linux/rculist_bl.h | 49 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index 0b952d06eb0b..36363b876e53 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -8,21 +8,31 @@ #include #include +/* return the first ptr or next element in an RCU protected list */ +#define hlist_bl_first_rcu(head) \ + (*((struct hlist_bl_node __rcu **)(&(head)->first))) +#define hlist_bl_next_rcu(node) \ + (*((struct hlist_bl_node __rcu **)(&(node)->next))) + static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); - rcu_assign_pointer(h->first, + rcu_assign_pointer(hlist_bl_first_rcu(h), (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } -static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) -{ - return (struct hlist_bl_node *) - ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); -} +#define hlist_bl_first_rcu_dereference(head) \ +({ \ + struct hlist_bl_head *__head = (head); \ + \ + (struct hlist_bl_node *) \ + ((unsigned long)rcu_dereference_check(hlist_bl_first_rcu(__head), \ + hlist_bl_is_locked(__head)) & \ + ~LIST_BL_LOCKMASK); 
\ +}) /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization @@ -73,7 +83,7 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, { struct hlist_bl_node *first; - /* don't need hlist_bl_first_rcu because we're under lock */ + /* don't need hlist_bl_first_rcu* because we're under lock */ first = hlist_bl_first(h); n->next = first; @@ -93,9 +103,30 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = hlist_bl_first_rcu(head); \ + for (pos = hlist_bl_first_rcu_dereference(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) + +/** + * hlist_bl_for_each_entry_continue_rcu - continue iteration over list of given + * type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_bl_node to use as a loop cursor. + * @member: the name of the hlist_bl_node within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position which must have been in the list when the RCU read + * lock was taken. + * This would typically require either that you obtained the node from a + * previous walk of the list in the same RCU read-side critical section, or + * that you held some sort of non-RCU reference (such as a reference count) + * to keep the node alive *and* in the list. 
+ */ +#define hlist_bl_for_each_entry_continue_rcu(tpos, pos, member) \ + for (pos = rcu_dereference_raw(hlist_bl_next_rcu(&(tpos)->member)); \ + pos && \ + ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) #endif -- cgit v1.2.3 From b655388111cf7e43f70e49db64bdaa42bcb8a038 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 3 Mar 2026 23:04:05 +0200 Subject: ipvs: add resizable hash tables Add infrastructure for resizable hash tables based on hlist_bl which we will use in followup patches. The tables allow RCU lookups during resizing, bucket modifications are protected with per-bucket bit lock and additional custom locking, the tables are resized when load reaches thresholds determined based on load factor parameter. Compared to other implementations we rely on: * fast entry removal by using node unlinking without pre-lookup * entry rehashing when hash key changes * entries can contain multiple hash nodes * custom locking depending on different contexts * adjustable load factor to customize the grow/shrink process Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- include/net/ip_vs.h | 198 ++++++++++++++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_conn.c | 5 - net/netfilter/ipvs/ip_vs_core.c | 179 ++++++++++++++++++++++++++++++++++++ 3 files changed, 377 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ad8a16146ac5..c373fbdd2d0f 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -11,6 +11,7 @@ #include /* for __uXX types */ #include /* for struct list_head */ +#include /* for struct hlist_bl_head */ #include /* for struct rwlock_t */ #include /* for struct atomic_t */ #include /* for struct refcount_t */ @@ -30,6 +31,7 @@ #endif #include /* Netw namespace */ #include +#include #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 @@ -271,6 +273,10 @@ static inline const char *ip_vs_dbg_addr(int 
af, char *buf, size_t buf_len, pr_err(msg, ##__VA_ARGS__); \ } while (0) +struct ip_vs_aligned_lock { + spinlock_t l; /* Protect buckets */ +} ____cacheline_aligned_in_smp; + /* For arrays per family */ enum { IP_VS_AF_INET, @@ -484,6 +490,198 @@ struct ip_vs_est_kt_data { int est_row; /* estimated row */ }; +/* IPVS resizable hash tables */ +struct ip_vs_rht { + struct hlist_bl_head *buckets; + struct ip_vs_rht __rcu *new_tbl; /* New/Same table */ + seqcount_t *seqc; /* Protects moves */ + struct ip_vs_aligned_lock *lock; /* Protect seqc */ + int mask; /* Buckets mask */ + int size; /* Buckets */ + int seqc_mask; /* seqc mask */ + int lock_mask; /* lock mask */ + u32 table_id; + int u_thresh; /* upper threshold */ + int l_thresh; /* lower threshold */ + int lfactor; /* Load Factor (shift)*/ + int bits; /* size = 1 << bits */ + siphash_key_t hash_key; + struct rcu_head rcu_head; +}; + +/** + * ip_vs_rht_for_each_table() - Walk the hash tables + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * Walk tables assuming others can not change the installed tables + */ +#define ip_vs_rht_for_each_table(table, t, p) \ + for (p = NULL, t = rcu_dereference_protected(table, 1); \ + t != p; \ + p = t, t = rcu_dereference_protected(t->new_tbl, 1)) + +/** + * ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * We usually search in one table and also in second table on resizing + */ +#define ip_vs_rht_for_each_table_rcu(table, t, p) \ + for (p = NULL, t = rcu_dereference(table); \ + t != p; \ + p = t, t = rcu_dereference(t->new_tbl)) + +/** + * ip_vs_rht_for_each_bucket() - Walk all table buckets + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: bucket index, used as 
cursor, u32 var + * @head: bucket address, used as cursor, struct hlist_bl_head *var + */ +#define ip_vs_rht_for_each_bucket(t, bucket, head) \ + for (bucket = 0, head = (t)->buckets; \ + bucket < t->size; bucket++, head++) + +/** + * ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @sc: temp seqcount_t *var + * @seq: temp unsigned int var for sequence count + * @retry: temp int var + */ +#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \ + for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \ + retry && ({ seq = read_seqcount_begin(sc); 1; }); \ + retry = read_seqcount_retry(sc, seq)) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \ + struct ip_vs_rht *_t, *_p; \ + unsigned int _seq; \ + seqcount_t *_sc; \ + u32 _bucket; \ + int _retry +/** + * ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_buckets_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) \ + ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \ + _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_bucket_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \ + unsigned int _seq; \ + seqcount_t *_sc; \ + int _retry +/** + * 
ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock + * @t: current table, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \ + if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \ + {} \ + else \ + ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_safe_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket +/** + * ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete entries but moving is disabled + * Using cond_resched_rcu() should be safe if tables do not change + */ +#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket + +/** + * ip_vs_rht_walk_buckets() - Walk all buckets + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Use if others can not add/delete/move entries + */ +#define ip_vs_rht_walk_buckets(table, head) \ + ip_vs_rht_for_each_table(table, _t, 
_p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/* Entries can be in one of two tables, so we flip bit when new table is + * created and store it as highest bit in hash keys + */ +#define IP_VS_RHT_TABLE_ID_MASK BIT(31) + +/* Check if hash key is from this table */ +static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key) +{ + return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK); +} + +/* Build per-table hash key from hash value */ +static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash) +{ + return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK); +} + +void ip_vs_rht_free(struct ip_vs_rht *t); +void ip_vs_rht_rcu_free(struct rcu_head *head); +struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks); +int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, + int lfactor, int min_bits, int max_bits); +void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, + int min_bits, int max_bits); +u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, + const union nf_inet_addr *addr, u32 v1, u32 v2); + struct dst_entry; struct iphdr; struct ip_vs_conn; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index deaf16a90e2f..a6fd3b64428f 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -76,11 +76,6 @@ static unsigned int ip_vs_conn_rnd __read_mostly; #define IP_VS_ADDRSTRLEN (8+1) #endif -struct ip_vs_aligned_lock -{ - spinlock_t l; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); - /* lock array for conn table */ static struct ip_vs_aligned_lock __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 869f18e0e835..f5b7a2047291 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -117,6 +117,185 @@ void ip_vs_init_hash_table(struct list_head *table, int rows) 
INIT_LIST_HEAD(&table[rows]); } +/* IPVS Resizable Hash Tables: + * - list_bl buckets with bit lock + * + * Goals: + * - RCU lookup for entry can run in parallel with add/del/move operations + * - hash keys can be on non-contiguous memory + * - support entries with duplicate keys + * - unlink entries without lookup, use the saved table and bucket id + * - resizing can trigger on load change or depending on key refresh period + * - customizable load factor to balance between speed and memory usage + * - add/del/move operations should be allowed for any context + * + * Resizing: + * - new table is attached to the current table and all entries are moved + * with new hash key. Finally, the new table is installed as current one and + * the old table is released after RCU grace period. + * - RCU read-side critical sections will walk two tables while resizing is + * in progress + * - new entries are added to the new table + * - entries will be deleted from the old or from the new table, the table_id + * can be saved into entry as part of the hash key to know where the entry is + * hashed + * - move operations may delay readers or to cause retry for the modified + * bucket. As result, searched entry will be found but walkers that operate + * on multiple entries may see same entry twice if bucket walking is retried. + * - for fast path the number of entries (load) can be compared to u_thresh + * and l_thresh to decide when to trigger table growing/shrinking. 
They + * are calculated based on load factor (shift count), negative value allows + * load to be below 100% to reduce collisions by maintaining larger table + * while positive value tolerates collisions by using smaller table and load + * above 100%: u_thresh(load) = size * (2 ^ lfactor) + * + * Locking: + * - lock: protect seqc if other context except resizer can move entries + * - seqc: seqcount_t, delay/retry readers while entries are moved to + * new table on resizing + * - bit lock: serialize bucket modifications + * - writers may use other locking mechanisms to serialize operations for + * resizing, moving and installing new tables + */ + +void ip_vs_rht_free(struct ip_vs_rht *t) +{ + kvfree(t->buckets); + kvfree(t->seqc); + kvfree(t->lock); + kfree(t); +} + +void ip_vs_rht_rcu_free(struct rcu_head *head) +{ + struct ip_vs_rht *t; + + t = container_of(head, struct ip_vs_rht, rcu_head); + ip_vs_rht_free(t); +} + +struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks) +{ + struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL); + int i; + + if (!t) + return NULL; + if (scounts) { + int ml = roundup_pow_of_two(nr_cpu_ids); + + scounts = min(scounts, buckets); + scounts = min(scounts, ml); + t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL); + if (!t->seqc) + goto err; + for (i = 0; i < scounts; i++) + seqcount_init(&t->seqc[i]); + + if (locks) { + locks = min(locks, scounts); + t->lock = kvmalloc_array(locks, sizeof(*t->lock), + GFP_KERNEL); + if (!t->lock) + goto err; + for (i = 0; i < locks; i++) + spin_lock_init(&t->lock[i].l); + } + } + + t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL); + if (!t->buckets) + goto err; + for (i = 0; i < buckets; i++) + INIT_HLIST_BL_HEAD(&t->buckets[i]); + t->mask = buckets - 1; + t->size = buckets; + t->seqc_mask = scounts - 1; + t->lock_mask = locks - 1; + t->u_thresh = buckets; + t->l_thresh = buckets >> 4; + t->bits = order_base_2(buckets); + /* new_tbl points to self if 
no new table is filled */ + RCU_INIT_POINTER(t->new_tbl, t); + get_random_bytes(&t->hash_key, sizeof(t->hash_key)); + return t; + +err: + ip_vs_rht_free(t); + return NULL; +} + +/* Get the desired table size for n entries based on current table size and + * by using the formula size = n / (2^lfactor) + * lfactor: shift value for the load factor: + * - >0: u_thresh=size << lfactor, for load factor above 100% + * - <0: u_thresh=size >> -lfactor, for load factor below 100% + * - 0: for load factor of 100% + */ +int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, + int lfactor, int min_bits, int max_bits) +{ + if (!t) + return 1 << min_bits; + n = roundup_pow_of_two(n); + if (lfactor < 0) { + int factor = min(-lfactor, max_bits); + + n = min(n, 1 << (max_bits - factor)); + n <<= factor; + } else { + n = min(n >> lfactor, 1 << max_bits); + } + if (lfactor != t->lfactor) + return clamp(n, 1 << min_bits, 1 << max_bits); + if (n > t->size) + return n; + if (n > t->size >> 4) + return t->size; + /* Shrink but keep it n * 2 to prevent frequent resizing */ + return clamp(n << 1, 1 << min_bits, 1 << max_bits); +} + +/* Set thresholds based on table size and load factor: + * u_thresh = size * (2^lfactor) + * l_thresh = u_thresh / 16 + * u_thresh/l_thresh can be used to check if load triggers a table grow/shrink + */ +void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, + int min_bits, int max_bits) +{ + if (size >= 1 << max_bits) + t->u_thresh = INT_MAX; /* stop growing */ + else if (lfactor <= 0) + t->u_thresh = size >> min(-lfactor, max_bits); + else + t->u_thresh = min(size, 1 << (30 - lfactor)) << lfactor; + + /* l_thresh: shrink when load is 16 times lower, can be 0 */ + if (size >= 1 << max_bits) + t->l_thresh = (1 << max_bits) >> 4; + else if (size > 1 << min_bits) + t->l_thresh = t->u_thresh >> 4; + else + t->l_thresh = 0; /* stop shrinking */ +} + +/* Return hash value for local info (fast, insecure) */ +u32 
ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, + const union nf_inet_addr *addr, u32 v1, u32 v2) +{ + u32 v3; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + v3 = ipv6_addr_hash(&addr->in6); + else +#endif + v3 = addr->all[0]; + + return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]); +} + static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { -- cgit v1.2.3 From 840aac3d900d09ec8fb8efe41bd7d09f9eb15538 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 3 Mar 2026 23:04:06 +0200 Subject: ipvs: use resizable hash table for services Make the hash table for services resizable in the bit range of 4-20. Table is attached only while services are present. Resizing is done by delayed work based on load (the number of hashed services). Table grows when load increases 2+ times (above 12.5% with lfactor=-3) and shrinks 8+ times when load decreases 16+ times (below 0.78%). Switch to jhash hashing to reduce the collisions for multiple services. Add a hash_key field into the service to store the table ID in the highest bit and the entry's hash value in the lowest bits. The lowest part of the hash value is used as bucket ID, the remaining part is used to filter the entries in the bucket before matching the keys and as result, helps the lookup operation to access only one cache line. By knowing the table ID and bucket ID for entry, we can unlink it without calculating the hash value and doing lookup by keys. We need only to validate the saved hash_key under lock. 
Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- include/net/ip_vs.h | 49 ++- net/netfilter/ipvs/ip_vs_ctl.c | 673 ++++++++++++++++++++++++++++++++++------- 2 files changed, 593 insertions(+), 129 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index c373fbdd2d0f..663ad6ad9518 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -35,12 +35,10 @@ #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE BIT(IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* svc_table limits */ +#define IP_VS_SVC_TAB_MIN_BITS 4 +#define IP_VS_SVC_TAB_MAX_BITS 20 /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) @@ -51,8 +49,6 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; -extern struct mutex __ip_vs_mutex; - struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -289,6 +285,12 @@ static inline int ip_vs_af_index(int af) return af == AF_INET6 ? IP_VS_AF_INET6 : IP_VS_AF_INET; } +/* work_flags */ +enum { + IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */ + IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */ +}; + /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) @@ -889,14 +891,15 @@ struct ip_vs_dest_user_kern { * forwarding entries. 
*/ struct ip_vs_service { - struct hlist_node s_list; /* node in service table */ - atomic_t refcnt; /* reference counter */ - + struct hlist_bl_node s_list; /* node in service table */ + u32 hash_key; /* Key for the hash table */ u16 af; /* address family */ __u16 protocol; /* which protocol (TCP/UDP) */ + union nf_inet_addr addr; /* IP address for virtual service */ - __be16 port; /* port number for the service */ __u32 fwmark; /* firewall mark of the service */ + atomic_t refcnt; /* reference counter */ + __be16 port; /* port number for the service */ unsigned int flags; /* service status flags */ unsigned int timeout; /* persistent timeout in ticks */ __be32 netmask; /* grouping granularity, mask/plen */ @@ -1155,6 +1158,10 @@ struct netns_ipvs { struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ + struct mutex service_mutex; /* service reconfig */ + struct rw_semaphore svc_resize_sem; /* svc_table resizing */ + struct delayed_work svc_resize_work; /* resize svc_table */ + atomic_t svc_table_changes;/* ++ on new table */ /* Service counters */ atomic_t num_services[IP_VS_AF_MAX]; /* Services */ atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ @@ -1219,6 +1226,7 @@ struct netns_ipvs { int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif + int sysctl_svc_lfactor; /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1228,6 +1236,7 @@ struct netns_ipvs { int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; + unsigned long work_flags; /* IP_VS_WORK_* flags */ /* ip_vs_est */ struct delayed_work est_reload_work;/* Reload kthread tasks */ struct mutex est_mutex; /* protect kthread tasks */ @@ -1259,9 +1268,7 @@ struct netns_ipvs { unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ - /* the service mutex that protect svc_table and svc_fwm_table */ - struct mutex service_mutex; - 
struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; /* Services */ + struct ip_vs_rht __rcu *svc_table; /* Services */ }; #define DEFAULT_SYNC_THRESHOLD 3 @@ -1511,6 +1518,18 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs) #endif +/* Get load factor to map num_services/u_thresh to t->size + * Smaller value decreases u_thresh to reduce collisions but increases + * the table size + * Returns factor where: + * - <0: u_thresh = size >> -factor, eg. lfactor -2 = 25% load + * - >=0: u_thresh = size << factor, eg. lfactor 1 = 200% load + */ +static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_svc_lfactor); +} + /* IPVS core functions * (from ip_vs_core.c) */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f7d454df2b58..2baef945c56f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -293,47 +294,59 @@ ip_vs_use_count_dec(void) } - +/* Service hashing: + * Operation Locking order + * --------------------------------------------------------------------------- + * add table service_mutex, svc_resize_sem(W) + * del table service_mutex + * move between tables svc_resize_sem(W), seqcount_t(W), bit lock + * add/del service service_mutex, bit lock + * find service RCU, seqcount_t(R) + * walk services(blocking) service_mutex, svc_resize_sem(R) + * walk services(non-blocking) RCU, seqcount_t(R) + * + * - new tables are linked/unlinked under service_mutex and svc_resize_sem + * - new table is linked on resizing and all operations can run in parallel + * in 2 tables until the new table is registered as current one + * - two contexts can modify buckets: config and table resize, both in + * process context + * - only table resizer can move entries, so we do not protect t->seqc[] + * items with t->lock[] + * - lookups occur under RCU lock and seqcount reader lock to detect if + * services are 
moved to new table + * - move operations may disturb readers: find operation will not miss entries + * but walkers may see same entry twice if they are forced to retry chains + * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold + * service_mutex to disallow new tables to be installed or to check + * svc_table_changes and repeat the RCU read section if new table is installed + */ /* * Returns hash value for virtual service */ -static inline unsigned int -ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, +static inline u32 +ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { - unsigned int porth = ntohs(port); - __be32 addr_fold = addr->ip; - __u32 ahash; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0]^addr->ip6[1]^ - addr->ip6[2]^addr->ip6[3]; -#endif - ahash = ntohl(addr_fold); - ahash ^= ((size_t) ipvs >> 8); - - return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & - IP_VS_SVC_TAB_MASK; + return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto); } /* * Returns hash value of fwmark for virtual service lookup */ -static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) +static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af, + __u32 fwmark) { - return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; + return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]); } -/* - * Hashes a service in the svc_table by - * or by fwmark. - * Should be called with locked tables. 
- */ +/* Hashes a service in the svc_table by or by fwmark */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { - unsigned int hash; + struct netns_ipvs *ipvs = svc->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; + u32 hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pS\n", @@ -341,23 +354,32 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) return 0; } + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + + /* New entries go into recent table */ + t = rcu_dereference_protected(ipvs->svc_table, 1); + t = rcu_dereference_protected(t->new_tbl, 1); + if (svc->fwmark == 0) { /* - * Hash it by + * Hash it by */ - hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, + hash = ip_vs_svc_hashval(t, svc->af, svc->protocol, &svc->addr, svc->port); } else { /* * Hash it by fwmark */ - hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); + hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark); } - hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); - + head = t->buckets + (hash & t->mask); + hlist_bl_lock(head); + WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash)); svc->flags |= IP_VS_SVC_F_HASHED; - /* increase its refcnt because it is referenced by the svc table */ - atomic_inc(&svc->refcnt); + hlist_bl_add_head_rcu(&svc->s_list, head); + hlist_bl_unlock(head); + return 1; } @@ -368,17 +390,45 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) { + struct netns_ipvs *ipvs = svc->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; + u32 hash_key2; + u32 hash_key; + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { pr_err("%s(): request for unhash flagged, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } + t = rcu_dereference_protected(ipvs->svc_table, 1); + hash_key = READ_ONCE(svc->hash_key); + /* We need to lock the bucket in the right table */ + 
if (ip_vs_rht_same_table(t, hash_key)) { + head = t->buckets + (hash_key & t->mask); + hlist_bl_lock(head); + /* Ensure hash_key is read under lock */ + hash_key2 = READ_ONCE(svc->hash_key); + /* Moved to new table ? */ + if (hash_key != hash_key2) { + hlist_bl_unlock(head); + t = rcu_dereference_protected(t->new_tbl, 1); + head = t->buckets + (hash_key2 & t->mask); + hlist_bl_lock(head); + } + } else { + /* It is already moved to new table */ + t = rcu_dereference_protected(t->new_tbl, 1); + head = t->buckets + (hash_key & t->mask); + hlist_bl_lock(head); + } /* Remove it from svc_table */ - hlist_del_rcu(&svc->s_list); + hlist_bl_del_rcu(&svc->s_list); svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); + hlist_bl_unlock(head); return 1; } @@ -390,18 +440,29 @@ static inline struct ip_vs_service * __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - - /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); - - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { - if (svc->af == af && ip_vs_addr_equal(af, &svc->addr, vaddr) && - svc->port == vport && svc->protocol == protocol && - !svc->fwmark) { - /* HIT */ - return svc; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + u32 hash, hash_key; + + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport); + + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { + if (READ_ONCE(svc->hash_key) == hash_key && + svc->af == af && + ip_vs_addr_equal(af, &svc->addr, vaddr) && + svc->port == vport && + svc->protocol == protocol && !svc->fwmark) { + /* HIT */ + return svc; + } + } } 
} @@ -415,16 +476,26 @@ __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - - /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); - - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { - if (svc->fwmark == fwmark && svc->af == af) { - /* HIT */ - return svc; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + u32 hash, hash_key; + + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashval(t, af, fwmark); + + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { + if (READ_ONCE(svc->hash_key) == hash_key && + svc->fwmark == fwmark && svc->af == af) { + /* HIT */ + return svc; + } + } } } @@ -487,6 +558,220 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol return svc; } +/* Return the number of registered services */ +static int ip_vs_get_num_services(struct netns_ipvs *ipvs) +{ + int ns = 0, ni = IP_VS_AF_MAX; + + while (--ni >= 0) + ns += atomic_read(&ipvs->num_services[ni]); + return ns; +} + +/* Get default load factor to map num_services/u_thresh to t->size */ +static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs) +{ + int factor; + + if (net_eq(ipvs->net, &init_net)) + factor = -3; /* grow if load is above 12.5% */ + else + factor = -2; /* grow if load is above 25% */ + return factor; +} + +/* Get the desired svc_table size */ +static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor) +{ + return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs), + lfactor, IP_VS_SVC_TAB_MIN_BITS, + IP_VS_SVC_TAB_MAX_BITS); +} + +/* 
Allocate svc_table */ +static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs, + int buckets, int lfactor) +{ + struct ip_vs_rht *t; + int scounts, locks; + + /* No frequent lookups to race with resizing, so use max of 64 + * seqcounts. Only resizer moves entries, so use 0 locks. + */ + scounts = clamp(buckets >> 4, 1, 64); + locks = 0; + + t = ip_vs_rht_alloc(buckets, scounts, locks); + if (!t) + return NULL; + t->lfactor = lfactor; + ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS, + IP_VS_SVC_TAB_MAX_BITS); + return t; +} + +/* svc_table resizer work */ +static void svc_resize_work_handler(struct work_struct *work) +{ + struct hlist_bl_head *head, *head2; + struct ip_vs_rht *t_free = NULL; + unsigned int resched_score = 0; + struct hlist_bl_node *cn, *nn; + struct ip_vs_rht *t, *t_new; + struct ip_vs_service *svc; + struct netns_ipvs *ipvs; + bool more_work = true; + seqcount_t *sc; + int limit = 0; + int new_size; + int lfactor; + u32 bucket; + + ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work); + + if (!down_write_trylock(&ipvs->svc_resize_sem)) + goto out; + if (!mutex_trylock(&ipvs->service_mutex)) + goto unlock_sem; + more_work = false; + clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags); + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_m; + t = rcu_dereference_protected(ipvs->svc_table, 1); + /* Do nothing if table is removed */ + if (!t) + goto unlock_m; + /* New table needs to be registered? BUG! */ + if (t != rcu_dereference_protected(t->new_tbl, 1)) + goto unlock_m; + + lfactor = sysctl_svc_lfactor(ipvs); + /* Should we resize ? 
*/ + new_size = ip_vs_svc_desired_size(ipvs, t, lfactor); + if (new_size == t->size && lfactor == t->lfactor) + goto unlock_m; + + t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); + if (!t_new) { + more_work = true; + goto unlock_m; + } + /* Flip the table_id */ + t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + + rcu_assign_pointer(t->new_tbl, t_new); + /* Allow add/del to new_tbl while moving from old table */ + mutex_unlock(&ipvs->service_mutex); + + ip_vs_rht_for_each_bucket(t, bucket, head) { +same_bucket: + if (++limit >= 16) { + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, + &ipvs->work_flags)) + goto unlock_sem; + if (resched_score >= 100) { + resched_score = 0; + cond_resched(); + } + limit = 0; + } + if (hlist_bl_empty(head)) { + resched_score++; + continue; + } + /* Preemption calls ahead... */ + resched_score = 0; + + sc = &t->seqc[bucket & t->seqc_mask]; + /* seqcount_t usage considering PREEMPT_RT rules: + * - we are the only writer => preemption can be allowed + * - readers (SoftIRQ) => disable BHs + * - readers (processes) => preemption should be disabled + */ + local_bh_disable(); + preempt_disable_nested(); + write_seqcount_begin(sc); + hlist_bl_lock(head); + + hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) { + u32 hash; + + /* New hash for the new table */ + if (svc->fwmark == 0) { + /* Hash it by */ + hash = ip_vs_svc_hashval(t_new, svc->af, + svc->protocol, + &svc->addr, svc->port); + } else { + /* Hash it by fwmark */ + hash = ip_vs_svc_fwm_hashval(t_new, svc->af, + svc->fwmark); + } + hlist_bl_del_rcu(&svc->s_list); + head2 = t_new->buckets + (hash & t_new->mask); + + hlist_bl_lock(head2); + WRITE_ONCE(svc->hash_key, + ip_vs_rht_build_hash_key(t_new, hash)); + /* t_new->seqc are not used at this stage, we race + * only with add/del, so only lock the bucket. + */ + hlist_bl_add_head_rcu(&svc->s_list, head2); + hlist_bl_unlock(head2); + /* Too long chain? 
Do it in steps */ + if (++limit >= 64) + break; + } + + hlist_bl_unlock(head); + write_seqcount_end(sc); + preempt_enable_nested(); + local_bh_enable(); + if (limit >= 64) + goto same_bucket; + } + + /* Tables can be switched only under service_mutex */ + while (!mutex_trylock(&ipvs->service_mutex)) { + cond_resched(); + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_sem; + } + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_m; + + rcu_assign_pointer(ipvs->svc_table, t_new); + /* Inform readers that new table is installed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); + t_free = t; + +unlock_m: + mutex_unlock(&ipvs->service_mutex); + +unlock_sem: + up_write(&ipvs->svc_resize_sem); + + if (t_free) { + /* RCU readers should not see more than two tables in chain. + * To prevent new table to be attached wait here instead of + * freeing the old table in RCU callback. 
+ */ + synchronize_rcu(); + ip_vs_rht_free(t_free); + } + +out: + if (!READ_ONCE(ipvs->enable) || !more_work || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + return; + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); +} static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) @@ -1357,12 +1642,13 @@ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0; struct ip_vs_scheduler *sched = NULL; + struct ip_vs_rht *t, *t_new = NULL; int af_id = ip_vs_af_index(u->af); - struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; + struct ip_vs_pe *pe = NULL; int ret_hooks = -1; + int ret = 0; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1404,6 +1690,18 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif + t = rcu_dereference_protected(ipvs->svc_table, 1); + if (!t) { + int lfactor = sysctl_svc_lfactor(ipvs); + int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); + + t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); + if (!t_new) { + ret = -ENOMEM; + goto out_err; + } + } + if (!atomic_read(&ipvs->num_services[af_id])) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) @@ -1449,6 +1747,12 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (ret < 0) goto out_err; + if (t_new) { + clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); + rcu_assign_pointer(ipvs->svc_table, t_new); + t_new = NULL; + } + /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter[af_id]); @@ -1470,6 +1774,12 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, /* Hash the service into the service table */ ip_vs_svc_hash(svc); + /* Schedule resize work */ + if (t && ip_vs_get_num_services(ipvs) > t->u_thresh && + !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) + 
queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, + 1); + *svc_p = svc; if (!READ_ONCE(ipvs->enable)) { @@ -1484,6 +1794,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, out_err: + if (t_new) + ip_vs_rht_free(t_new); if (ret_hooks >= 0) ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { @@ -1671,10 +1983,38 @@ static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) */ static int ip_vs_del_service(struct ip_vs_service *svc) { + struct netns_ipvs *ipvs; + struct ip_vs_rht *t, *p; + int ns; + if (svc == NULL) return -EEXIST; + ipvs = svc->ipvs; ip_vs_unlink_service(svc, false); - + t = rcu_dereference_protected(ipvs->svc_table, 1); + + /* Drop the table if no more services */ + ns = ip_vs_get_num_services(ipvs); + if (!ns) { + /* Stop the resizer and drop the tables */ + set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); + cancel_delayed_work_sync(&ipvs->svc_resize_work); + if (t) { + rcu_assign_pointer(ipvs->svc_table, NULL); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); + if (p == t) + break; + t = p; + } + } + } else if (ns <= t->l_thresh && + !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, + &ipvs->work_flags)) { + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, + 1); + } return 0; } @@ -1684,14 +2024,36 @@ static int ip_vs_del_service(struct ip_vs_service *svc) */ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - struct hlist_node *n; + struct hlist_bl_node *ne; + struct hlist_bl_node *e; + struct ip_vs_rht *t, *p; + + /* Stop the resizer and drop the tables */ + if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + cancel_delayed_work_sync(&ipvs->svc_resize_work); + /* No resizer, so now we have exclusive write access */ + + if (ip_vs_get_num_services(ipvs)) { + 
ip_vs_rht_walk_buckets(ipvs->svc_table, head) { + hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list) + ip_vs_unlink_service(svc, cleanup); + } + } - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx], - s_list) - ip_vs_unlink_service(svc, cleanup); + /* Unregister the hash table and release it after RCU grace period */ + t = rcu_dereference_protected(ipvs->svc_table, 1); + if (t) { + rcu_assign_pointer(ipvs->svc_table, NULL); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); + if (p == t) + break; + t = p; + } } return 0; } @@ -1742,19 +2104,44 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct netns_ipvs *ipvs = net_ipvs(net); + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; + struct hlist_bl_head *head; struct ip_vs_service *svc; + struct hlist_bl_node *e; struct ip_vs_dest *dest; - unsigned int idx; + int old_gen, new_gen; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + + old_gen = atomic_read(&ipvs->svc_table_changes); + rcu_read_lock(); - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) + +repeat: + smp_rmb(); /* ipvs->svc_table and svc_table_changes */ + ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { list_for_each_entry_rcu(dest, &svc->destinations, - n_list) + n_list) { ip_vs_forget_dev(dest, dev); + resched_score += 10; + } + resched_score++; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + new_gen = atomic_read(&ipvs->svc_table_changes); + /* New table installed ? 
*/ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat; + } + } } rcu_read_unlock(); @@ -1777,14 +2164,28 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) static int ip_vs_zero_all(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; + struct hlist_bl_head *head; struct ip_vs_service *svc; + struct hlist_bl_node *e; - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) + rcu_read_lock(); + + ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { ip_vs_zero_service(svc); + resched_score += 10; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + } } + rcu_read_unlock(); + ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } @@ -2218,7 +2619,8 @@ static struct ctl_table vs_vars[] = { struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ - int bucket; + struct ip_vs_rht *t; + u32 bucket; }; /* @@ -2239,17 +2641,23 @@ static inline const char *ip_vs_fwd_name(unsigned int flags) } } - +/* Do not expect consistent view during add, del and move(table resize). + * We may miss entries and even show duplicates. 
+ */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { - struct net *net = seq_file_net(seq); - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_iter *iter = seq->private; - int idx; + struct ip_vs_rht *t = iter->t; struct ip_vs_service *svc; + struct hlist_bl_node *e; + int idx; - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) { + if (!t) + return NULL; + for (idx = 0; idx < t->size; idx++) { + hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) { + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; if (pos-- == 0) { iter->bucket = idx; return svc; @@ -2262,18 +2670,22 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { + struct ip_vs_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + rcu_read_lock(); + iter->t = rcu_dereference(ipvs->svc_table); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct hlist_node *e; - struct ip_vs_iter *iter; struct ip_vs_service *svc; - struct net *net = seq_file_net(seq); - struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_iter *iter; + struct hlist_bl_node *e; + struct ip_vs_rht *t; ++*pos; if (v == SEQ_START_TOKEN) @@ -2281,15 +2693,22 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) svc = v; iter = seq->private; + t = iter->t; + if (!t) + return NULL; - e = rcu_dereference(hlist_next_rcu(&svc->s_list)); - if (e) - return hlist_entry(e, struct ip_vs_service, s_list); + hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) { + /* Our cursor was moved to new table ? 
*/ + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; + return svc; + } - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - hlist_for_each_entry_rcu(svc, - &ipvs->svc_table[iter->bucket], - s_list) { + while (++iter->bucket < t->size) { + hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket], + s_list) { + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; return svc; } } @@ -2770,13 +3189,18 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { - int idx, count=0; - struct ip_vs_service *svc; struct ip_vs_service_entry entry; + DECLARE_IP_VS_RHT_WALK_BUCKETS(); + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct hlist_bl_node *e; + int count = 0; int ret = 0; - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { + lockdep_assert_held(&ipvs->svc_resize_sem); + /* All service modifications are disabled, go ahead */ + ip_vs_rht_walk_buckets(ipvs->svc_table, head) { + hlist_bl_for_each_entry(svc, e, head, s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET) continue; @@ -2948,6 +3372,35 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } + if (cmd == IP_VS_SO_GET_SERVICES) { + struct ip_vs_get_services *get; + size_t size; + + get = (struct ip_vs_get_services *)arg; + size = struct_size(get, entrytable, get->num_services); + if (*len != size) { + pr_err("length: %u != %zu\n", *len, size); + return -EINVAL; + } + /* Protect against table resizer moving the entries. + * Try reverse locking, so that we do not hold the mutex + * while waiting for semaphore. 
+ */ + while (1) { + ret = down_read_killable(&ipvs->svc_resize_sem); + if (ret < 0) + return ret; + if (mutex_trylock(&ipvs->service_mutex)) + break; + up_read(&ipvs->svc_resize_sem); + cond_resched(); + } + ret = __ip_vs_get_service_entries(ipvs, get, user); + up_read(&ipvs->svc_resize_sem); + mutex_unlock(&ipvs->service_mutex); + return ret; + } + mutex_lock(&ipvs->service_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: @@ -2976,22 +3429,6 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } break; - case IP_VS_SO_GET_SERVICES: - { - struct ip_vs_get_services *get; - size_t size; - - get = (struct ip_vs_get_services *)arg; - size = struct_size(get, entrytable, get->num_services); - if (*len != size) { - pr_err("length: %u != %zu\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_service_entries(ipvs, get, user); - } - break; - case IP_VS_SO_GET_SERVICE: { struct ip_vs_service_entry *entry; @@ -3277,15 +3714,19 @@ nla_put_failure: static int ip_vs_genl_dump_services(struct sk_buff *skb, struct netlink_callback *cb) { - int idx = 0, i; - int start = cb->args[0]; - struct ip_vs_service *svc; + DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct hlist_bl_node *e; + int start = cb->args[0]; + int idx = 0; + down_read(&ipvs->svc_resize_sem); rcu_read_lock(); - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[i], s_list) { + ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -3297,6 +3738,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, nla_put_failure: rcu_read_unlock(); + up_read(&ipvs->svc_resize_sem); cb->args[0] = idx; return skb->len; @@ -4306,8 +4748,10 @@ int __net_init 
ip_vs_control_net_init(struct netns_ipvs *ipvs) /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) - INIT_HLIST_HEAD(&ipvs->svc_table[idx]); + init_rwsem(&ipvs->svc_resize_sem); + INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler); + atomic_set(&ipvs->svc_table_changes, 0); + RCU_INIT_POINTER(ipvs->svc_table, NULL); /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -4326,6 +4770,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) } INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); + ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs); /* procfs stats */ ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats); -- cgit v1.2.3 From 2fa7cc9c70254d42a82bf82827d8d20cafe975d2 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 3 Mar 2026 23:04:07 +0200 Subject: ipvs: switch to per-net connection table Use per-net resizable hash table for connections. The global table is slow to walk when using many namespaces. The table can be resized in the range of [256 - ip_vs_conn_tab_size]. Table is attached only while services are present. Resizing is done by delayed work based on load (the number of connections). Add a hash_key field into the connection to store the table ID in the highest bit and the entry's hash value in the lowest bits. The lowest part of the hash value is used as bucket ID, the remaining part is used to filter the entries in the bucket before matching the keys and as result, helps the lookup operation to access only one cache line. By knowing the table ID and bucket ID for entry, we can unlink it without calculating the hash value and doing lookup by keys. We need only to validate the saved hash_key under lock. For better security switch from jhash to siphash for the default connection hashing but the persistence engines may use their own function. 
Keeping the hash table loaded with entries below the size (12%) allows to avoid collision for 96+% of the conns. ip_vs_conn_fill_cport() now will rehash the connection with proper locking because unhash+hash is not safe for RCU readers. To invalidate the templates setting just dport to 0xffff is enough, no need to rehash them. As result, ip_vs_conn_unhash() is now unused and removed. Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- include/net/ip_vs.h | 34 +- net/netfilter/ipvs/ip_vs_conn.c | 849 ++++++++++++++++++++++++++------------ net/netfilter/ipvs/ip_vs_ctl.c | 18 + net/netfilter/ipvs/ip_vs_pe_sip.c | 4 +- net/netfilter/ipvs/ip_vs_sync.c | 23 ++ 5 files changed, 667 insertions(+), 261 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 663ad6ad9518..3d595bd99eb3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -36,6 +36,14 @@ #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 +/* conn_tab limits (as per Kconfig) */ +#define IP_VS_CONN_TAB_MIN_BITS 8 +#if BITS_PER_LONG > 32 +#define IP_VS_CONN_TAB_MAX_BITS 27 +#else +#define IP_VS_CONN_TAB_MAX_BITS 20 +#endif + /* svc_table limits */ #define IP_VS_SVC_TAB_MIN_BITS 4 #define IP_VS_SVC_TAB_MAX_BITS 20 @@ -289,6 +297,7 @@ static inline int ip_vs_af_index(int af) enum { IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */ IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */ + IP_VS_WORK_CONN_RESIZE, /* Schedule conn_resize_work */ }; /* The port number of FTP service (in network order). 
*/ @@ -779,18 +788,19 @@ struct ip_vs_conn_param { /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { - struct hlist_node c_list; /* hashed list heads */ + struct hlist_bl_node c_list; /* node in conn_tab */ + __u32 hash_key; /* Key for the hash table */ /* Protocol, addresses and port numbers */ __be16 cport; __be16 dport; __be16 vport; u16 af; /* address family */ + __u16 protocol; /* Which protocol (TCP/UDP) */ + __u16 daf; /* Address family of the dest */ union nf_inet_addr caddr; /* client address */ union nf_inet_addr vaddr; /* virtual address */ union nf_inet_addr daddr; /* destination address */ volatile __u32 flags; /* status flags */ - __u16 protocol; /* Which protocol (TCP/UDP) */ - __u16 daf; /* Address family of the dest */ struct netns_ipvs *ipvs; /* counter and timer */ @@ -1009,8 +1019,8 @@ struct ip_vs_pe { int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); bool (*ct_match)(const struct ip_vs_conn_param *p, struct ip_vs_conn *ct); - u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, - bool inverse); + u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, + struct ip_vs_rht *t, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); /* create connections for real-server outgoing packets */ struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, @@ -1150,6 +1160,7 @@ struct netns_ipvs { /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ atomic_t no_cport_conns[IP_VS_AF_MAX]; + struct delayed_work conn_resize_work;/* resize conn_tab */ /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. 
*/ @@ -1226,6 +1237,7 @@ struct netns_ipvs { int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif + int sysctl_conn_lfactor; int sysctl_svc_lfactor; /* ip_vs_lblc */ @@ -1269,6 +1281,8 @@ struct netns_ipvs { unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ struct ip_vs_rht __rcu *svc_table; /* Services */ + struct ip_vs_rht __rcu *conn_tab; /* Connections */ + atomic_t conn_tab_changes;/* ++ on new table */ }; #define DEFAULT_SYNC_THRESHOLD 3 @@ -1518,6 +1532,12 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs) #endif +/* Get load factor to map conn_count/u_thresh to t->size */ +static inline int sysctl_conn_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_conn_lfactor); +} + /* Get load factor to map num_services/u_thresh to t->size * Smaller value decreases u_thresh to reduce collisions but increases * the table size @@ -1603,6 +1623,10 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) } void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); +int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor); +struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, + int lfactor); struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a6fd3b64428f..07a47e525f01 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -47,28 +47,12 @@ static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); -/* size and mask values */ +/* Max table size */ int ip_vs_conn_tab_size __read_mostly; -static int ip_vs_conn_tab_mask __read_mostly; - -/* - * Connection hash table: for input and output packets lookups of 
IPVS - */ -static struct hlist_head *ip_vs_conn_tab __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; -/* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd __read_mostly; - -/* - * Fine locking granularity for big connection hash table - */ -#define CT_LOCKARRAY_BITS 5 -#define CT_LOCKARRAY_SIZE (1<lock protects conn fields like cp->flags, cp->dest + */ -static inline void ct_write_lock_bh(unsigned int key) +/* Lock conn_tab bucket for conn hash/unhash, not for rehash */ +static __always_inline void +conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key, + bool new_hash, struct hlist_bl_head **head_ret) { - spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + struct hlist_bl_head *head; + u32 hash_key_new; + + if (!new_hash) { + /* We need to lock the bucket in the right table */ + +retry: + if (!ip_vs_rht_same_table(t, hash_key)) { + /* It is already moved to new table */ + t = rcu_dereference(t->new_tbl); + } + } + + head = t->buckets + (hash_key & t->mask); + + local_bh_disable(); + /* Do not touch seqcount, this is a safe operation */ + + hlist_bl_lock(head); + if (!new_hash) { + /* Ensure hash_key is read under lock */ + hash_key_new = READ_ONCE(cp->hash_key); + /* Hash changed ? 
*/ + if (hash_key != hash_key_new) { + hlist_bl_unlock(head); + local_bh_enable(); + hash_key = hash_key_new; + goto retry; + } + } + *head_ret = head; } -static inline void ct_write_unlock_bh(unsigned int key) +static inline void conn_tab_unlock(struct hlist_bl_head *head) { - spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + hlist_bl_unlock(head); + local_bh_enable(); } static void ip_vs_conn_expire(struct timer_list *t); @@ -95,30 +122,31 @@ static void ip_vs_conn_expire(struct timer_list *t); /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, - const union nf_inet_addr *addr, - __be16 port) +static u32 ip_vs_conn_hashkey(struct ip_vs_rht *t, int af, unsigned int proto, + const union nf_inet_addr *addr, __be16 port) { + u64 a = (u32)proto << 16 | (__force u32)port; + #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), - (__force u32)port, proto, ip_vs_conn_rnd) ^ - ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; + if (af == AF_INET6) { + u64 b = (u64)addr->all[0] << 32 | addr->all[1]; + u64 c = (u64)addr->all[2] << 32 | addr->all[3]; + + return (u32)siphash_3u64(a, b, c, &t->hash_key); + } #endif - return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, - ip_vs_conn_rnd) ^ - ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; + a |= (u64)addr->all[0] << 32; + return (u32)siphash_1u64(a, &t->hash_key); } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, - bool inverse) + struct ip_vs_rht *t, bool inverse) { const union nf_inet_addr *addr; __be16 port; if (p->pe_data && p->pe->hashkey_raw) - return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & - ip_vs_conn_tab_mask; + return p->pe->hashkey_raw(p, t, inverse); if (likely(!inverse)) { addr = p->caddr; @@ -128,10 +156,11 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, port = p->vport; } - 
return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port); } -static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) +static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, + const struct ip_vs_conn *cp) { struct ip_vs_conn_param p; @@ -144,31 +173,36 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) p.pe_data_len = cp->pe_data_len; } - return ip_vs_conn_hashkey_param(&p, false); + return ip_vs_conn_hashkey_param(&p, t, false); } -/* - * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. +/* Hashes ip_vs_conn in conn_tab * returns bool success. */ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) { - unsigned int hash; + struct netns_ipvs *ipvs = cp->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; + u32 hash_key; int ret; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return 0; - /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey_conn(cp); + /* New entries go into recent table */ + t = rcu_dereference(ipvs->conn_tab); + t = rcu_dereference(t->new_tbl); - ct_write_lock_bh(hash); + hash_key = ip_vs_rht_build_hash_key(t, ip_vs_conn_hashkey_conn(t, cp)); + conn_tab_lock(t, cp, hash_key, true /* new_hash */, &head); spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { cp->flags |= IP_VS_CONN_F_HASHED; + WRITE_ONCE(cp->hash_key, hash_key); refcount_inc(&cp->refcnt); - hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); + hlist_bl_add_head_rcu(&cp->c_list, head); ret = 1; } else { pr_err("%s(): request for already hashed, called from %pS\n", @@ -177,75 +211,58 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) } spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); - - return ret; -} + conn_tab_unlock(head); - -/* - * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. Caller should hold conn reference. 
- */ -static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) -{ - unsigned int hash; - int ret; - - /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey_conn(cp); - - ct_write_lock_bh(hash); - spin_lock(&cp->lock); - - if (cp->flags & IP_VS_CONN_F_HASHED) { - hlist_del_rcu(&cp->c_list); - cp->flags &= ~IP_VS_CONN_F_HASHED; - refcount_dec(&cp->refcnt); - ret = 1; - } else - ret = 0; - - spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); + /* Schedule resizing if load increases */ + if (atomic_read(&ipvs->conn_count) > t->u_thresh && + !test_and_set_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags)) + mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0); return ret; } -/* Try to unlink ip_vs_conn from ip_vs_conn_tab. +/* Try to unlink ip_vs_conn from conn_tab. * returns bool success. */ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) { - unsigned int hash; + struct netns_ipvs *ipvs = cp->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; bool ret = false; + u32 hash_key; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return refcount_dec_if_one(&cp->refcnt); - hash = ip_vs_conn_hashkey_conn(cp); + rcu_read_lock(); + + t = rcu_dereference(ipvs->conn_tab); + hash_key = READ_ONCE(cp->hash_key); - ct_write_lock_bh(hash); + conn_tab_lock(t, cp, hash_key, false /* new_hash */, &head); spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { /* Decrease refcnt and unlink conn only if we are last user */ if (refcount_dec_if_one(&cp->refcnt)) { - hlist_del_rcu(&cp->c_list); + hlist_bl_del_rcu(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; ret = true; } } spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); + conn_tab_unlock(head); + + rcu_read_unlock(); return ret; } /* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Gets ip_vs_conn associated with supplied parameters in the conn_tab. * Called for pkts coming from OUTside-to-INside. 
* p->caddr, p->cport: pkt source address (foreign host) * p->vaddr, p->vport: pkt dest address (load balancer) @@ -253,26 +270,38 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) static inline struct ip_vs_conn * __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey_param(p, false); + u32 hash, hash_key; rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (p->cport == cp->cport && p->vport == cp->vport && - cp->af == p->af && - ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && - ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && - ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (!__ip_vs_conn_get(cp)) - continue; - /* HIT */ - rcu_read_unlock(); - return cp; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + if (READ_ONCE(cp->hash_key) == hash_key && + p->cport == cp->cport && + p->vport == cp->vport && cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, + &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, + &cp->vaddr) && + (!p->cport ^ + (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol) { + if (__ip_vs_conn_get(cp)) { + /* HIT */ + rcu_read_unlock(); + return cp; + } + } + } } } @@ -345,37 +374,50 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; + struct hlist_bl_head *head; + 
struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey_param(p, false); + u32 hash, hash_key; rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (unlikely(p->pe_data && p->pe->ct_match)) { - if (cp->ipvs != p->ipvs) - continue; - if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { - if (__ip_vs_conn_get(cp)) - goto out; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + if (READ_ONCE(cp->hash_key) != hash_key) + continue; + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (p->pe == cp->pe && + p->pe->ct_match(p, cp) && + __ip_vs_conn_get(cp)) + goto out; + continue; + } + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, + &cp->caddr) && + /* protocol should only be IPPROTO_IP if + * p->vaddr is a fwmark + */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? + AF_UNSPEC : p->af, + p->vaddr, &cp->vaddr) && + p->vport == cp->vport && + p->cport == cp->cport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + p->protocol == cp->protocol && + cp->dport != htons(0xffff)) { + if (__ip_vs_conn_get(cp)) + goto out; + } } - continue; } - if (cp->af == p->af && - ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && - /* protocol should only be IPPROTO_IP if - * p->vaddr is a fwmark */ - ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : - p->af, p->vaddr, &cp->vaddr) && - p->vport == cp->vport && p->cport == cp->cport && - cp->flags & IP_VS_CONN_F_TEMPLATE && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (__ip_vs_conn_get(cp)) - goto out; - } } cp = NULL; @@ -391,58 +433,64 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) return cp; } -/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 
+/* Gets ip_vs_conn associated with supplied parameters in the conn_tab. * Called for pkts coming from inside-to-OUTside. * p->caddr, p->cport: pkt source address (inside host) * p->vaddr, p->vport: pkt dest address (foreign host) */ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { - unsigned int hash; - struct ip_vs_conn *cp, *ret=NULL; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; const union nf_inet_addr *saddr; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; + struct ip_vs_conn *cp; + u32 hash, hash_key; __be16 sport; - /* - * Check for "full" addressed entries - */ - hash = ip_vs_conn_hashkey_param(p, true); - rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (p->vport != cp->cport) - continue; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, true); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + if (READ_ONCE(cp->hash_key) != hash_key || + p->vport != cp->cport) + continue; - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - sport = cp->vport; - saddr = &cp->vaddr; - } else { - sport = cp->dport; - saddr = &cp->daddr; - } + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + sport = cp->vport; + saddr = &cp->vaddr; + } else { + sport = cp->dport; + saddr = &cp->daddr; + } - if (p->cport == sport && cp->af == p->af && - ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && - ip_vs_addr_equal(p->af, p->caddr, saddr) && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (!__ip_vs_conn_get(cp)) - continue; - /* HIT */ - ret = cp; - break; + if (p->cport == sport && cp->af == p->af && + ip_vs_addr_equal(p->af, p->vaddr, + &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, saddr) && + p->protocol == cp->protocol) { + if (__ip_vs_conn_get(cp)) + goto out; + } + } } } + cp = NULL; +out: 
rcu_read_unlock(); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), - ret ? "hit" : "not hit"); + cp ? "hit" : "not hit"); - return ret; + return cp; } struct ip_vs_conn * @@ -487,23 +535,261 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) */ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { - if (ip_vs_conn_unhash(cp)) { - struct netns_ipvs *ipvs = cp->ipvs; - int af_id = ip_vs_af_index(cp->af); + struct hlist_bl_head *head, *head2, *head_new; + struct netns_ipvs *ipvs = cp->ipvs; + int af_id = ip_vs_af_index(cp->af); + u32 hash_r = 0, hash_key_r = 0; + struct ip_vs_rht *t, *tp, *t2; + u32 hash_key, hash_key_new; + struct ip_vs_conn_param p; + int ntbl; + + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr, + cport, &cp->vaddr, cp->vport, &p); + ntbl = 0; + + /* Attempt to rehash cp safely, by informing seqcount readers */ + t = rcu_dereference(ipvs->conn_tab); + hash_key = READ_ONCE(cp->hash_key); + tp = NULL; + +retry: + /* Moved to new table ? */ + if (!ip_vs_rht_same_table(t, hash_key)) { + t = rcu_dereference(t->new_tbl); + ntbl++; + /* We are lost? */ + if (ntbl >= 2) + return; + } - spin_lock_bh(&cp->lock); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) { - atomic_dec(&ipvs->no_cport_conns[af_id]); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } - spin_unlock_bh(&cp->lock); + /* Rehashing during resize? 
Use the recent table for adds */ + t2 = rcu_dereference(t->new_tbl); + /* Calc new hash once per table */ + if (tp != t2) { + hash_r = ip_vs_conn_hashkey_param(&p, t2, false); + hash_key_r = ip_vs_rht_build_hash_key(t2, hash_r); + tp = t2; + } + head = t->buckets + (hash_key & t->mask); + head2 = t2->buckets + (hash_key_r & t2->mask); + head_new = head2; + + if (head > head2 && t == t2) + swap(head, head2); - /* hash on new dport */ - ip_vs_conn_hash(cp); + /* Lock seqcount only for the old bucket, even if we are on new table + * because it affects the del operation, not the adding. + */ + spin_lock_bh(&t->lock[hash_key & t->lock_mask].l); + preempt_disable_nested(); + write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]); + + /* Lock buckets in same (increasing) order */ + hlist_bl_lock(head); + if (head != head2) + hlist_bl_lock(head2); + + /* Ensure hash_key is read under lock */ + hash_key_new = READ_ONCE(cp->hash_key); + /* Racing with another rehashing ? */ + if (unlikely(hash_key != hash_key_new)) { + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + hash_key = hash_key_new; + goto retry; } + + spin_lock(&cp->lock); + if ((cp->flags & IP_VS_CONN_F_NO_CPORT) && + (cp->flags & IP_VS_CONN_F_HASHED)) { + /* We do not recalc hash_key_r under lock, we assume the + * parameters in cp do not change, i.e. cport is + * the only possible change. 
+ */ + WRITE_ONCE(cp->hash_key, hash_key_r); + if (head != head2) { + hlist_bl_del_rcu(&cp->c_list); + hlist_bl_add_head_rcu(&cp->c_list, head_new); + } + atomic_dec(&ipvs->no_cport_conns[af_id]); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock(&cp->lock); + + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); +} + +/* Get default load factor to map conn_count/u_thresh to t->size */ +static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs) +{ + int factor; + + if (net_eq(ipvs->net, &init_net)) + factor = -3; + else + factor = -1; + return factor; +} + +/* Get the desired conn_tab size */ +int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor) +{ + return ip_vs_rht_desired_size(ipvs, t, atomic_read(&ipvs->conn_count), + lfactor, IP_VS_CONN_TAB_MIN_BITS, + ip_vs_conn_tab_bits); } +/* Allocate conn_tab */ +struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, + int lfactor) +{ + struct ip_vs_rht *t; + int scounts, locks; + + /* scounts: affects readers during resize */ + scounts = clamp(buckets >> 6, 1, 256); + /* locks: based on parallel IP_VS_CONN_F_NO_CPORT operations + resize */ + locks = clamp(8, 1, scounts); + + t = ip_vs_rht_alloc(buckets, scounts, locks); + if (!t) + return NULL; + t->lfactor = lfactor; + ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_CONN_TAB_MIN_BITS, + ip_vs_conn_tab_bits); + return t; +} + +/* conn_tab resizer work */ +static void conn_resize_work_handler(struct work_struct *work) +{ + struct hlist_bl_head *head, *head2; + unsigned int resched_score = 0; + struct hlist_bl_node *cn, *nn; + struct ip_vs_rht *t, *t_new; + struct netns_ipvs *ipvs; + struct ip_vs_conn *cp; + bool more_work = false; + u32 hash, hash_key; + int limit = 0; + int new_size; + int lfactor; + u32 bucket; + + ipvs 
= container_of(work, struct netns_ipvs, conn_resize_work.work); + + /* Allow work to be queued again */ + clear_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags); + t = rcu_dereference_protected(ipvs->conn_tab, 1); + /* Do nothing if table is removed */ + if (!t) + goto out; + /* New table needs to be registered? BUG! */ + if (t != rcu_dereference_protected(t->new_tbl, 1)) + goto out; + + lfactor = sysctl_conn_lfactor(ipvs); + /* Should we resize ? */ + new_size = ip_vs_conn_desired_size(ipvs, t, lfactor); + if (new_size == t->size && lfactor == t->lfactor) + goto out; + + t_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!t_new) { + more_work = true; + goto out; + } + /* Flip the table_id */ + t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + + rcu_assign_pointer(t->new_tbl, t_new); + + /* Wait RCU readers to see the new table, we do not want new + * conns to go into old table and to be left there. + */ + synchronize_rcu(); + + ip_vs_rht_for_each_bucket(t, bucket, head) { +same_bucket: + if (++limit >= 16) { + if (resched_score >= 100) { + resched_score = 0; + cond_resched(); + } + limit = 0; + } + if (hlist_bl_empty(head)) { + resched_score++; + continue; + } + /* Preemption calls ahead... */ + resched_score = 0; + + /* seqcount_t usage considering PREEMPT_RT rules: + * - other writers (SoftIRQ) => serialize with spin_lock_bh + * - readers (SoftIRQ) => disable BHs + * - readers (processes) => preemption should be disabled + */ + spin_lock_bh(&t->lock[bucket & t->lock_mask].l); + preempt_disable_nested(); + write_seqcount_begin(&t->seqc[bucket & t->seqc_mask]); + hlist_bl_lock(head); + + hlist_bl_for_each_entry_safe(cp, cn, nn, head, c_list) { + hash = ip_vs_conn_hashkey_conn(t_new, cp); + hash_key = ip_vs_rht_build_hash_key(t_new, hash); + + head2 = t_new->buckets + (hash & t_new->mask); + hlist_bl_lock(head2); + /* t_new->seqc are not used at this stage, we race + * only with add/del, so only lock the bucket. 
+ */ + hlist_bl_del_rcu(&cp->c_list); + WRITE_ONCE(cp->hash_key, hash_key); + hlist_bl_add_head_rcu(&cp->c_list, head2); + hlist_bl_unlock(head2); + /* Too long chain? Do it in steps */ + if (++limit >= 64) + break; + } + + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[bucket & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[bucket & t->lock_mask].l); + if (limit >= 64) + goto same_bucket; + } + + rcu_assign_pointer(ipvs->conn_tab, t_new); + /* Inform readers that new table is installed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->conn_tab_changes); + + /* RCU readers should not see more than two tables in chain. + * To prevent new table to be attached wait here instead of + * freeing the old table in RCU callback. + */ + synchronize_rcu(); + ip_vs_rht_free(t); + +out: + /* Monitor if we need to shrink table */ + queue_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, + more_work ? 1 : 2 * HZ); +} /* * Bind a connection entry with the corresponding packet_xmit. @@ -787,17 +1073,11 @@ int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest) IP_VS_DBG_ADDR(ct->daf, &ct->daddr), ntohs(ct->dport)); - /* - * Invalidate the connection template + /* Invalidate the connection template. Prefer to avoid + * rehashing, it will move it as first in chain, so use + * only dport as indication, it is not a hash key. 
*/ - if (ct->vport != htons(0xffff)) { - if (ip_vs_conn_unhash(ct)) { - ct->dport = htons(0xffff); - ct->vport = htons(0xffff); - ct->cport = 0; - ip_vs_conn_hash(ct); - } - } + ct->dport = htons(0xffff); /* * Simply decrease the refcnt of the template, @@ -938,7 +1218,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) /* - * Create a new connection entry and hash it into the ip_vs_conn_tab + * Create a new connection entry and hash it into the conn_tab */ struct ip_vs_conn * ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, @@ -956,7 +1236,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, return NULL; } - INIT_HLIST_NODE(&cp->c_list); + INIT_HLIST_BL_NODE(&cp->c_list); timer_setup(&cp->timer, ip_vs_conn_expire, 0); cp->ipvs = ipvs; cp->af = p->af; @@ -1040,7 +1320,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, if (ip_vs_conntrack_enabled(ipvs)) cp->flags |= IP_VS_CONN_F_NFCT; - /* Hash it in the ip_vs_conn_tab finally */ + /* Hash it in the conn_tab finally */ ip_vs_conn_hash(cp); return cp; @@ -1052,22 +1332,33 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, #ifdef CONFIG_PROC_FS struct ip_vs_iter_state { struct seq_net_private p; - unsigned int bucket; + struct ip_vs_rht *t; + int gen; + u32 bucket; unsigned int skip_elems; }; -static void *ip_vs_conn_array(struct ip_vs_iter_state *iter) +static void *ip_vs_conn_array(struct seq_file *seq) { - int idx; + struct ip_vs_iter_state *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_rht *t = iter->t; + struct hlist_bl_node *e; struct ip_vs_conn *cp; + int idx; - for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) { + if (!t) + return NULL; + for (idx = iter->bucket; idx < t->size; idx++) { unsigned int skip = 0; - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[idx], c_list) { /* __ip_vs_conn_get() is not needed 
by * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show */ + if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key))) + break; if (skip >= iter->skip_elems) { iter->bucket = idx; return cp; @@ -1076,8 +1367,13 @@ static void *ip_vs_conn_array(struct ip_vs_iter_state *iter) ++skip; } + if (!(idx & 31)) { + cond_resched_rcu(); + /* New table installed ? */ + if (iter->gen != atomic_read(&ipvs->conn_tab_changes)) + break; + } iter->skip_elems = 0; - cond_resched_rcu(); } iter->bucket = idx; @@ -1088,38 +1384,50 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ip_vs_iter_state *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); rcu_read_lock(); + iter->gen = atomic_read(&ipvs->conn_tab_changes); + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + iter->t = rcu_dereference(ipvs->conn_tab); if (*pos == 0) { iter->skip_elems = 0; iter->bucket = 0; return SEQ_START_TOKEN; } - return ip_vs_conn_array(iter); + return ip_vs_conn_array(seq); } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct ip_vs_conn *cp = v; struct ip_vs_iter_state *iter = seq->private; - struct hlist_node *e; + struct ip_vs_conn *cp = v; + struct hlist_bl_node *e; + struct ip_vs_rht *t; ++*pos; if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(iter); + return ip_vs_conn_array(seq); + + t = iter->t; + if (!t) + return NULL; /* more on same hash chain? */ - e = rcu_dereference(hlist_next_rcu(&cp->c_list)); - if (e) { + hlist_bl_for_each_entry_continue_rcu(cp, e, c_list) { + /* Our cursor was moved to new table ? 
*/ + if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key))) + break; iter->skip_elems++; - return hlist_entry(e, struct ip_vs_conn, c_list); + return cp; } iter->skip_elems = 0; iter->bucket++; - return ip_vs_conn_array(iter); + return ip_vs_conn_array(seq); } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) @@ -1136,13 +1444,10 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { const struct ip_vs_conn *cp = v; - struct net *net = seq_file_net(seq); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; char dbuf[IP_VS_ADDRSTRLEN]; - if (!net_eq(cp->ipvs->net, net)) - return 0; if (cp->pe_data) { pe_data[0] = ' '; len = strlen(cp->pe->name); @@ -1214,10 +1519,6 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); else { const struct ip_vs_conn *cp = v; - struct net *net = seq_file_net(seq); - - if (!net_eq(cp->ipvs->net, net)) - return 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->daf == AF_INET6) @@ -1307,22 +1608,29 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); } -/* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { - int idx; + struct hlist_bl_node *e; struct ip_vs_conn *cp; + struct ip_vs_rht *t; + unsigned int r; + int idx; + r = get_random_u32(); rcu_read_lock(); + t = rcu_dereference(ipvs->conn_tab); + if (!t) + goto out; /* * Randomly scan 1/32 of the whole table every second */ - for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask; + for (idx = 0; idx < (t->size >> 5); idx++) { + unsigned int hash = (r + idx) & t->mask; - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->ipvs != ipvs) - continue; + /* Don't care if due to moved entry we 
jump to another bucket + * and even to new table + */ + hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[hash], c_list) { if (atomic_read(&cp->n_control)) continue; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { @@ -1369,27 +1677,39 @@ drop: IP_VS_DBG(4, "drop connection\n"); ip_vs_conn_del(cp); } - cond_resched_rcu(); + if (!(idx & 31)) { + cond_resched_rcu(); + t = rcu_dereference(ipvs->conn_tab); + if (!t) + goto out; + } } + +out: rcu_read_unlock(); } #endif -/* - * Flush all the connection entries in the ip_vs_conn_tab - */ +/* Flush all the connection entries in the conn_tab */ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); struct ip_vs_conn *cp, *cp_c; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) + return; + cancel_delayed_work_sync(&ipvs->conn_resize_work); + if (!atomic_read(&ipvs->conn_count)) + goto unreg; flush_again: + /* Rely on RCU grace period while accessing cp after ip_vs_conn_del */ rcu_read_lock(); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (cp->ipvs != ipvs) - continue; + ip_vs_rht_walk_buckets_safe_rcu(ipvs->conn_tab, head) { + hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { if (atomic_read(&cp->n_control)) continue; cp_c = cp->control; @@ -1410,21 +1730,47 @@ flush_again: schedule(); goto flush_again; } + +unreg: + /* Unregister the hash table and release it after RCU grace period. + * This is needed because other works may not be stopped yet and + * they may walk the tables. 
+ */ + t = rcu_dereference_protected(ipvs->conn_tab, 1); + rcu_assign_pointer(ipvs->conn_tab, NULL); + /* Inform readers that conn_tab is changed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->conn_tab_changes); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); + if (p == t) + break; + t = p; + } } #ifdef CONFIG_SYSCTL void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; struct ip_vs_conn *cp, *cp_c; + struct hlist_bl_head *head; struct ip_vs_dest *dest; + struct hlist_bl_node *e; + int old_gen, new_gen; + if (!atomic_read(&ipvs->conn_count)) + return; + old_gen = atomic_read(&ipvs->conn_tab_changes); rcu_read_lock(); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (cp->ipvs != ipvs) - continue; +repeat: + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + ip_vs_rht_walk_buckets_rcu(ipvs->conn_tab, head) { + hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + resched_score++; dest = cp->dest; if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) continue; @@ -1439,13 +1785,25 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) IP_VS_DBG(4, "del controlling connection\n"); ip_vs_conn_del(cp_c); } + resched_score += 10; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + /* netns clean up started, abort delayed work */ + if (!READ_ONCE(ipvs->enable)) + goto out; + new_gen = atomic_read(&ipvs->conn_tab_changes); + /* New table installed ? 
*/ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat; + } } - cond_resched_rcu(); - - /* netns clean up started, abort delayed work */ - if (!READ_ONCE(ipvs->enable)) - break; } + +out: rcu_read_unlock(); } #endif @@ -1460,6 +1818,10 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) atomic_set(&ipvs->conn_count, 0); for (idx = 0; idx < IP_VS_AF_MAX; idx++) atomic_set(&ipvs->no_cport_conns[idx], 0); + INIT_DELAYED_WORK(&ipvs->conn_resize_work, conn_resize_work_handler); + RCU_INIT_POINTER(ipvs->conn_tab, NULL); + atomic_set(&ipvs->conn_tab_changes, 0); + ipvs->sysctl_conn_lfactor = ip_vs_conn_default_load_factor(ipvs); #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, @@ -1495,56 +1857,36 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_conn_init(void) { + int min = IP_VS_CONN_TAB_MIN_BITS; + int max = IP_VS_CONN_TAB_MAX_BITS; size_t tab_array_size; int max_avail; -#if BITS_PER_LONG > 32 - int max = 27; -#else - int max = 20; -#endif - int min = 8; - int idx; max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT; - max_avail -= 2; /* ~4 in hash row */ + /* 64-bit: 27 bits at 64GB, 32-bit: 20 bits at 512MB */ + max_avail += 1; /* hash table loaded at 50% */ max_avail -= 1; /* IPVS up to 1/2 of mem */ max_avail -= order_base_2(sizeof(struct ip_vs_conn)); max = clamp(max_avail, min, max); ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; - ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; /* * Allocate the connection hash table and initialize its list heads */ tab_array_size = array_size(ip_vs_conn_tab_size, - sizeof(*ip_vs_conn_tab)); - ip_vs_conn_tab = kvmalloc_objs(*ip_vs_conn_tab, ip_vs_conn_tab_size); - if (!ip_vs_conn_tab) - return -ENOMEM; + sizeof(struct hlist_bl_head)); /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN); - if (!ip_vs_conn_cachep) { - 
kvfree(ip_vs_conn_tab); + if (!ip_vs_conn_cachep) return -ENOMEM; - } pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n", ip_vs_conn_tab_size, tab_array_size / 1024); IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", sizeof(struct ip_vs_conn)); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) - INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); - - for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); - } - - /* calculate the random value for connection hash */ - get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - return 0; } @@ -1554,5 +1896,4 @@ void ip_vs_conn_cleanup(void) rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - kvfree(ip_vs_conn_tab); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 2baef945c56f..032425025d88 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1643,6 +1643,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { struct ip_vs_scheduler *sched = NULL; + struct ip_vs_rht *tc_new = NULL; struct ip_vs_rht *t, *t_new = NULL; int af_id = ip_vs_af_index(u->af); struct ip_vs_service *svc = NULL; @@ -1702,6 +1703,17 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } } + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { + int lfactor = sysctl_conn_lfactor(ipvs); + int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor); + + tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!tc_new) { + ret = -ENOMEM; + goto out_err; + } + } + if (!atomic_read(&ipvs->num_services[af_id])) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) @@ -1752,6 +1764,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, rcu_assign_pointer(ipvs->svc_table, t_new); t_new = NULL; } + if (tc_new) { + rcu_assign_pointer(ipvs->conn_tab, tc_new); + tc_new 
= NULL; + } /* Update the virtual service counters */ if (svc->port == FTPPORT) @@ -1794,6 +1810,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, out_err: + if (tc_new) + ip_vs_rht_free(tc_new); if (t_new) ip_vs_rht_free(t_new); if (ret_hooks >= 0) diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 85f31d71e29a..0c83c7b69581 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -132,9 +132,9 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, } static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, - u32 initval, bool inverse) + struct ip_vs_rht *t, bool inverse) { - return jhash(p->pe_data, p->pe_data_len, initval); + return jhash(p->pe_data, p->pe_data_len, (u32)t->hash_key.key[0]); } static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index b2ba3befbd55..93038abbf5e0 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1755,6 +1755,28 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, if (!ip_vs_use_count_inc()) return -ENOPROTOOPT; + /* Backup server can be started without services just to sync conns, + * make sure conn_tab is created even if ipvs->enable is 0. 
+ */ + if (state == IP_VS_STATE_BACKUP) { + mutex_lock(&ipvs->service_mutex); + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { + int lfactor = sysctl_conn_lfactor(ipvs); + int new_size = ip_vs_conn_desired_size(ipvs, NULL, + lfactor); + struct ip_vs_rht *tc_new; + + tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!tc_new) { + mutex_unlock(&ipvs->service_mutex); + result = -ENOMEM; + goto out_module; + } + rcu_assign_pointer(ipvs->conn_tab, tc_new); + } + mutex_unlock(&ipvs->service_mutex); + } + + /* Do not hold one mutex and then to block on another */ for (;;) { rtnl_lock(); @@ -1922,6 +1944,7 @@ out_early: mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); +out_module: /* decrease the module use count */ ip_vs_use_count_dec(); return result; -- cgit v1.2.3 From f20c73b0460d15301cf1bddf0f85d060a38a75df Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 3 Mar 2026 23:04:08 +0200 Subject: ipvs: use more keys for connection hashing Simon Kirby reported a long time ago that IPVS connection hashing based only on the client address/port (caddr, cport) as hash keys is not suitable for setups that accept traffic on multiple virtual IPs and ports. It can happen for multiple VIP:VPORT services, for single or many fwmark service(s) that match multiple virtual IPs and ports or even for passive FTP with persistence in DR/TUN mode where we expect traffic on multiple ports for the virtual IP. Fix it by adding virtual addresses and ports to the hash function. This causes the traffic from NAT real servers to clients to use second hashing for the in->out direction. 
As result: - the IN direction from client will use hash node hn0 where the source/dest addresses and ports used by client will be used as hash keys - the OUT direction from NAT real servers will use hash node hn1 for the traffic from real server to client - the persistence templates are hashed only with parameters based on the IN direction, so they now will also use the virtual address, port and fwmark from the service. OLD: - all methods: c_list node: proto, caddr:cport - persistence templates: c_list node: proto, caddr_net:0 - persistence engine templates: c_list node: per-PE, PE-SIP uses jhash NEW: - all methods: hn0 node (dir 0): proto, caddr:cport -> vaddr:vport - MASQ method: hn1 node (dir 1): proto, daddr:dport -> caddr:cport - persistence templates: hn0 node (dir 0): proto, caddr_net:0 -> vaddr:vport_or_0 proto, caddr_net:0 -> fwmark:0 - persistence engine templates: hn0 node (dir 0): as before Also reorder the ip_vs_conn fields, so that hash nodes are on same read-mostly cache line while write-mostly fields are on separate cache line. 
Reported-by: Simon Kirby Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- include/net/ip_vs.h | 104 +++++++++------ net/netfilter/ipvs/ip_vs_conn.c | 271 ++++++++++++++++++++++++++++++---------- 2 files changed, 275 insertions(+), 100 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3d595bd99eb3..72d325c81313 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -786,51 +786,48 @@ struct ip_vs_conn_param { __u8 pe_data_len; }; +/* Hash node in conn_tab */ +struct ip_vs_conn_hnode { + struct hlist_bl_node node; /* node in conn_tab */ + u32 hash_key; /* Key for the hash table */ + u8 dir; /* 0=out->in, 1=in->out */ +} __packed; + /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { - struct hlist_bl_node c_list; /* node in conn_tab */ - __u32 hash_key; /* Key for the hash table */ - /* Protocol, addresses and port numbers */ + /* Cacheline for hash table nodes - rarely modified */ + + struct ip_vs_conn_hnode hn0; /* Original direction */ + u8 af; /* address family */ __be16 cport; + struct ip_vs_conn_hnode hn1; /* Reply direction */ + u8 daf; /* Address family of the dest */ __be16 dport; - __be16 vport; - u16 af; /* address family */ - __u16 protocol; /* Which protocol (TCP/UDP) */ - __u16 daf; /* Address family of the dest */ - union nf_inet_addr caddr; /* client address */ - union nf_inet_addr vaddr; /* virtual address */ - union nf_inet_addr daddr; /* destination address */ + struct ip_vs_dest *dest; /* real server */ + atomic_t n_control; /* Number of controlled ones */ volatile __u32 flags; /* status flags */ - struct netns_ipvs *ipvs; - - /* counter and timer */ - refcount_t refcnt; /* reference count */ - struct timer_list timer; /* Expiration timer */ - volatile unsigned long timeout; /* timeout */ + /* 44/64 */ - /* Flags and state transition */ - spinlock_t lock; /* lock for state transition */ + struct ip_vs_conn *control; /* Master control 
connection */ + const struct ip_vs_pe *pe; + char *pe_data; + __u8 pe_data_len; volatile __u16 state; /* state info */ volatile __u16 old_state; /* old state, to be used for * state transition triggered * synchronization */ - __u32 fwmark; /* Fire wall mark from skb */ - unsigned long sync_endtime; /* jiffies + sent_retries */ + /* 2-byte hole */ + /* 64/96 */ - /* Control members */ - struct ip_vs_conn *control; /* Master control connection */ - atomic_t n_control; /* Number of controlled ones */ - struct ip_vs_dest *dest; /* real server */ - atomic_t in_pkts; /* incoming packet counter */ + union nf_inet_addr caddr; /* client address */ + union nf_inet_addr vaddr; /* virtual address */ + /* 96/128 */ - /* Packet transmitter for different forwarding methods. If it - * mangles the packet, it must return NF_DROP or better NF_STOLEN, - * otherwise this must be changed to a sk_buff **. - * NF_ACCEPT can be returned when destination is local. - */ - int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); + union nf_inet_addr daddr; /* destination address */ + __u32 fwmark; /* Fire wall mark from skb */ + __be16 vport; + __u16 protocol; /* Which protocol (TCP/UDP) */ /* Note: we can group the following members into a structure, * in order to save more space, and the following members are @@ -838,14 +835,31 @@ struct ip_vs_conn { */ struct ip_vs_app *app; /* bound ip_vs_app object */ void *app_data; /* Application private data */ + /* 128/168 */ struct_group(sync_conn_opt, struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. 
struct */ ); + /* 152/192 */ - const struct ip_vs_pe *pe; - char *pe_data; - __u8 pe_data_len; + struct timer_list timer; /* Expiration timer */ + volatile unsigned long timeout; /* timeout */ + spinlock_t lock; /* lock for state transition */ + refcount_t refcnt; /* reference count */ + atomic_t in_pkts; /* incoming packet counter */ + /* 64-bit: 4-byte gap */ + + /* 188/256 */ + unsigned long sync_endtime; /* jiffies + sent_retries */ + struct netns_ipvs *ipvs; + + /* Packet transmitter for different forwarding methods. If it + * mangles the packet, it must return NF_DROP or better NF_STOLEN, + * otherwise this must be changed to a sk_buff **. + * NF_ACCEPT can be returned when destination is local. + */ + int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); struct rcu_head rcu_head; }; @@ -1628,6 +1642,19 @@ int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, int lfactor); +static inline struct ip_vs_conn * +ip_vs_hn0_to_conn(struct ip_vs_conn_hnode *hn) +{ + return container_of(hn, struct ip_vs_conn, hn0); +} + +static inline struct ip_vs_conn * +ip_vs_hn_to_conn(struct ip_vs_conn_hnode *hn) +{ + return hn->dir ? 
container_of(hn, struct ip_vs_conn, hn1) : + container_of(hn, struct ip_vs_conn, hn0); +} + struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, @@ -1980,6 +2007,13 @@ static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp) return fwd; } +/* Check if connection uses double hashing */ +static inline bool ip_vs_conn_use_hash2(struct ip_vs_conn *cp) +{ + return IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ && + !(cp->flags & IP_VS_CONN_F_TEMPLATE); +} + void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 07a47e525f01..2082bfb2d93c 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -76,11 +76,19 @@ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; /* Lock conn_tab bucket for conn hash/unhash, not for rehash */ static __always_inline void conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key, - bool new_hash, struct hlist_bl_head **head_ret) + u32 hash_key2, bool use2, bool new_hash, + struct hlist_bl_head **head_ret, struct hlist_bl_head **head2_ret) { - struct hlist_bl_head *head; - u32 hash_key_new; + struct hlist_bl_head *head, *head2; + u32 hash_key_new, hash_key_new2; + struct ip_vs_rht *t2 = t; + u32 idx, idx2; + idx = hash_key & t->mask; + if (use2) + idx2 = hash_key2 & t->mask; + else + idx2 = idx; if (!new_hash) { /* We need to lock the bucket in the right table */ @@ -88,31 +96,64 @@ retry: if (!ip_vs_rht_same_table(t, hash_key)) { /* It is already moved to new table */ t = rcu_dereference(t->new_tbl); + /* Rehashing works in two steps and we may detect + * both nodes in different tables, use idx/idx2 + * for proper lock ordering for heads. 
+ */ + idx = hash_key & t->mask; + idx |= IP_VS_RHT_TABLE_ID_MASK; + } + if (use2) { + if (!ip_vs_rht_same_table(t2, hash_key2)) { + /* It is already moved to new table */ + t2 = rcu_dereference(t2->new_tbl); + idx2 = hash_key2 & t2->mask; + idx2 |= IP_VS_RHT_TABLE_ID_MASK; + } + } else { + idx2 = idx; } } head = t->buckets + (hash_key & t->mask); + head2 = use2 ? t2->buckets + (hash_key2 & t2->mask) : head; local_bh_disable(); /* Do not touch seqcount, this is a safe operation */ - hlist_bl_lock(head); + if (idx <= idx2) { + hlist_bl_lock(head); + if (head != head2) + hlist_bl_lock(head2); + } else { + hlist_bl_lock(head2); + hlist_bl_lock(head); + } if (!new_hash) { /* Ensure hash_key is read under lock */ - hash_key_new = READ_ONCE(cp->hash_key); + hash_key_new = READ_ONCE(cp->hn0.hash_key); + hash_key_new2 = READ_ONCE(cp->hn1.hash_key); /* Hash changed ? */ - if (hash_key != hash_key_new) { + if (hash_key != hash_key_new || + (hash_key2 != hash_key_new2 && use2)) { + if (head != head2) + hlist_bl_unlock(head2); hlist_bl_unlock(head); local_bh_enable(); hash_key = hash_key_new; + hash_key2 = hash_key_new2; goto retry; } } *head_ret = head; + *head2_ret = head2; } -static inline void conn_tab_unlock(struct hlist_bl_head *head) +static inline void conn_tab_unlock(struct hlist_bl_head *head, + struct hlist_bl_head *head2) { + if (head != head2) + hlist_bl_unlock(head2); hlist_bl_unlock(head); local_bh_enable(); } @@ -123,26 +164,34 @@ static void ip_vs_conn_expire(struct timer_list *t); * Returns hash value for IPVS connection entry */ static u32 ip_vs_conn_hashkey(struct ip_vs_rht *t, int af, unsigned int proto, - const union nf_inet_addr *addr, __be16 port) + const union nf_inet_addr *addr, __be16 port, + const union nf_inet_addr *laddr, __be16 lport) { u64 a = (u32)proto << 16 | (__force u32)port; + u64 d; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { u64 b = (u64)addr->all[0] << 32 | addr->all[1]; u64 c = (u64)addr->all[2] << 32 | addr->all[3]; - return 
(u32)siphash_3u64(a, b, c, &t->hash_key); + a |= (u64)laddr->all[2] << 32 ^ (__force u32)lport; + c ^= laddr->all[1]; + d = (u64)laddr->all[0] << 32 | laddr->all[3]; + return (u32)siphash_4u64(a, b, c, d, &t->hash_key); } #endif a |= (u64)addr->all[0] << 32; - return (u32)siphash_1u64(a, &t->hash_key); + d = (u64)laddr->all[0] << 32 | (__force u32)lport; + return (u32)siphash_2u64(a, d, &t->hash_key); } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, struct ip_vs_rht *t, bool inverse) { + const union nf_inet_addr *laddr; const union nf_inet_addr *addr; + __be16 lport; __be16 port; if (p->pe_data && p->pe->hashkey_raw) @@ -151,21 +200,33 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, if (likely(!inverse)) { addr = p->caddr; port = p->cport; + laddr = p->vaddr; + lport = p->vport; } else { addr = p->vaddr; port = p->vport; + laddr = p->caddr; + lport = p->cport; } - return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port, laddr, + lport); } static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, - const struct ip_vs_conn *cp) + const struct ip_vs_conn *cp, + bool out) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, - &cp->caddr, cp->cport, NULL, 0, &p); + if (!out) + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, + &cp->caddr, cp->cport, &cp->vaddr, + cp->vport, &p); + else + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, + &cp->daddr, cp->dport, &cp->caddr, + cp->cport, &p); if (cp->pe) { p.pe = cp->pe; @@ -173,7 +234,7 @@ static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, p.pe_data_len = cp->pe_data_len; } - return ip_vs_conn_hashkey_param(&p, t, false); + return ip_vs_conn_hashkey_param(&p, t, out); } /* Hashes ip_vs_conn in conn_tab @@ -182,9 +243,11 @@ static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, static inline int ip_vs_conn_hash(struct 
ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; - struct hlist_bl_head *head; + struct hlist_bl_head *head, *head2; + u32 hash_key, hash_key2; struct ip_vs_rht *t; - u32 hash_key; + u32 hash, hash2; + bool use2; int ret; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) @@ -194,15 +257,28 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) t = rcu_dereference(ipvs->conn_tab); t = rcu_dereference(t->new_tbl); - hash_key = ip_vs_rht_build_hash_key(t, ip_vs_conn_hashkey_conn(t, cp)); - conn_tab_lock(t, cp, hash_key, true /* new_hash */, &head); + hash = ip_vs_conn_hashkey_conn(t, cp, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + if (ip_vs_conn_use_hash2(cp)) { + hash2 = ip_vs_conn_hashkey_conn(t, cp, true); + hash_key2 = ip_vs_rht_build_hash_key(t, hash2); + use2 = true; + } else { + hash_key2 = hash_key; + use2 = false; + } + conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */, + &head, &head2); spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { cp->flags |= IP_VS_CONN_F_HASHED; - WRITE_ONCE(cp->hash_key, hash_key); + WRITE_ONCE(cp->hn0.hash_key, hash_key); + WRITE_ONCE(cp->hn1.hash_key, hash_key2); refcount_inc(&cp->refcnt); - hlist_bl_add_head_rcu(&cp->c_list, head); + hlist_bl_add_head_rcu(&cp->hn0.node, head); + if (use2) + hlist_bl_add_head_rcu(&cp->hn1.node, head2); ret = 1; } else { pr_err("%s(): request for already hashed, called from %pS\n", @@ -211,7 +287,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) } spin_unlock(&cp->lock); - conn_tab_unlock(head); + conn_tab_unlock(head, head2); /* Schedule resizing if load increases */ if (atomic_read(&ipvs->conn_count) > t->u_thresh && @@ -227,10 +303,11 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; - struct hlist_bl_head *head; + struct hlist_bl_head *head, *head2; + u32 hash_key, hash_key2; struct ip_vs_rht *t; bool ret = false; - u32 hash_key; + 
bool use2; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return refcount_dec_if_one(&cp->refcnt); @@ -238,22 +315,27 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) rcu_read_lock(); t = rcu_dereference(ipvs->conn_tab); - hash_key = READ_ONCE(cp->hash_key); + hash_key = READ_ONCE(cp->hn0.hash_key); + hash_key2 = READ_ONCE(cp->hn1.hash_key); + use2 = ip_vs_conn_use_hash2(cp); - conn_tab_lock(t, cp, hash_key, false /* new_hash */, &head); + conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */, + &head, &head2); spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { /* Decrease refcnt and unlink conn only if we are last user */ if (refcount_dec_if_one(&cp->refcnt)) { - hlist_bl_del_rcu(&cp->c_list); + hlist_bl_del_rcu(&cp->hn0.node); + if (use2) + hlist_bl_del_rcu(&cp->hn1.node); cp->flags &= ~IP_VS_CONN_F_HASHED; ret = true; } } spin_unlock(&cp->lock); - conn_tab_unlock(head); + conn_tab_unlock(head, head2); rcu_read_unlock(); @@ -272,6 +354,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); struct netns_ipvs *ipvs = p->ipvs; + struct ip_vs_conn_hnode *hn; struct hlist_bl_head *head; struct ip_vs_rht *t, *pt; struct hlist_bl_node *e; @@ -284,9 +367,12 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) hash = ip_vs_conn_hashkey_param(p, t, false); hash_key = ip_vs_rht_build_hash_key(t, hash); ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { - hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { - if (READ_ONCE(cp->hash_key) == hash_key && - p->cport == cp->cport && + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (READ_ONCE(hn->hash_key) != hash_key || + hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); + if (p->cport == cp->cport && p->vport == cp->vport && cp->af == p->af && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && @@ -376,6 +462,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); struct netns_ipvs *ipvs = 
p->ipvs; + struct ip_vs_conn_hnode *hn; struct hlist_bl_head *head; struct ip_vs_rht *t, *pt; struct hlist_bl_node *e; @@ -388,9 +475,11 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) hash = ip_vs_conn_hashkey_param(p, t, false); hash_key = ip_vs_rht_build_hash_key(t, hash); ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { - hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { - if (READ_ONCE(cp->hash_key) != hash_key) + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (READ_ONCE(hn->hash_key) != hash_key || + hn->dir != 0) continue; + cp = ip_vs_hn0_to_conn(hn); if (unlikely(p->pe_data && p->pe->ct_match)) { if (p->pe == cp->pe && p->pe->ct_match(p, cp) && @@ -442,6 +531,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); struct netns_ipvs *ipvs = p->ipvs; const union nf_inet_addr *saddr; + struct ip_vs_conn_hnode *hn; struct hlist_bl_head *head; struct ip_vs_rht *t, *pt; struct hlist_bl_node *e; @@ -455,9 +545,12 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) hash = ip_vs_conn_hashkey_param(p, t, true); hash_key = ip_vs_rht_build_hash_key(t, hash); ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { - hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { - if (READ_ONCE(cp->hash_key) != hash_key || - p->vport != cp->cport) + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + /* dir can be 0 for DR/TUN */ + if (READ_ONCE(hn->hash_key) != hash_key) + continue; + cp = ip_vs_hn_to_conn(hn); + if (p->vport != cp->cport) continue; if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { @@ -536,21 +629,33 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { struct hlist_bl_head *head, *head2, *head_new; + bool use2 = ip_vs_conn_use_hash2(cp); struct netns_ipvs *ipvs = cp->ipvs; int af_id = ip_vs_af_index(cp->af); u32 hash_r = 0, hash_key_r = 0; struct ip_vs_rht *t, *tp, *t2; + struct ip_vs_conn_hnode *hn; u32 
hash_key, hash_key_new; struct ip_vs_conn_param p; int ntbl; + int dir; + + /* No packets from inside, so we can do it in 2 steps. */ + dir = use2 ? 1 : 0; - ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr, - cport, &cp->vaddr, cp->vport, &p); +next_dir: + if (dir) + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->daddr, + cp->dport, &cp->caddr, cport, &p); + else + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr, + cport, &cp->vaddr, cp->vport, &p); + hn = dir ? &cp->hn1 : &cp->hn0; ntbl = 0; /* Attempt to rehash cp safely, by informing seqcount readers */ t = rcu_dereference(ipvs->conn_tab); - hash_key = READ_ONCE(cp->hash_key); + hash_key = READ_ONCE(hn->hash_key); tp = NULL; retry: @@ -567,7 +672,7 @@ retry: t2 = rcu_dereference(t->new_tbl); /* Calc new hash once per table */ if (tp != t2) { - hash_r = ip_vs_conn_hashkey_param(&p, t2, false); + hash_r = ip_vs_conn_hashkey_param(&p, t2, dir); hash_key_r = ip_vs_rht_build_hash_key(t2, hash_r); tp = t2; } @@ -591,7 +696,7 @@ retry: hlist_bl_lock(head2); /* Ensure hash_key is read under lock */ - hash_key_new = READ_ONCE(cp->hash_key); + hash_key_new = READ_ONCE(hn->hash_key); /* Racing with another rehashing ? */ if (unlikely(hash_key != hash_key_new)) { if (head != head2) @@ -611,14 +716,21 @@ retry: * parameters in cp do not change, i.e. cport is * the only possible change. */ - WRITE_ONCE(cp->hash_key, hash_key_r); + WRITE_ONCE(hn->hash_key, hash_key_r); + if (!use2) + WRITE_ONCE(cp->hn1.hash_key, hash_key_r); + /* For dir=1 we do not check in flags if hn is already + * rehashed but this check will do it. 
+ */ if (head != head2) { - hlist_bl_del_rcu(&cp->c_list); - hlist_bl_add_head_rcu(&cp->c_list, head_new); + hlist_bl_del_rcu(&hn->node); + hlist_bl_add_head_rcu(&hn->node, head_new); + } + if (!dir) { + atomic_dec(&ipvs->no_cport_conns[af_id]); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; } - atomic_dec(&ipvs->no_cport_conns[af_id]); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; } spin_unlock(&cp->lock); @@ -628,6 +740,8 @@ retry: write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); preempt_enable_nested(); spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + if (dir--) + goto next_dir; } /* Get default load factor to map conn_count/u_thresh to t->size */ @@ -639,6 +753,8 @@ static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs) factor = -3; else factor = -1; + /* Double hashing adds twice more nodes for NAT */ + factor--; return factor; } @@ -679,6 +795,7 @@ static void conn_resize_work_handler(struct work_struct *work) unsigned int resched_score = 0; struct hlist_bl_node *cn, *nn; struct ip_vs_rht *t, *t_new; + struct ip_vs_conn_hnode *hn; struct netns_ipvs *ipvs; struct ip_vs_conn *cp; bool more_work = false; @@ -747,8 +864,9 @@ same_bucket: write_seqcount_begin(&t->seqc[bucket & t->seqc_mask]); hlist_bl_lock(head); - hlist_bl_for_each_entry_safe(cp, cn, nn, head, c_list) { - hash = ip_vs_conn_hashkey_conn(t_new, cp); + hlist_bl_for_each_entry_safe(hn, cn, nn, head, node) { + cp = ip_vs_hn_to_conn(hn); + hash = ip_vs_conn_hashkey_conn(t_new, cp, hn->dir); hash_key = ip_vs_rht_build_hash_key(t_new, hash); head2 = t_new->buckets + (hash & t_new->mask); @@ -756,9 +874,12 @@ same_bucket: /* t_new->seqc are not used at this stage, we race * only with add/del, so only lock the bucket. 
*/ - hlist_bl_del_rcu(&cp->c_list); - WRITE_ONCE(cp->hash_key, hash_key); - hlist_bl_add_head_rcu(&cp->c_list, head2); + hlist_bl_del_rcu(&hn->node); + WRITE_ONCE(hn->hash_key, hash_key); + /* Keep both hash keys in sync if no double hashing */ + if (!ip_vs_conn_use_hash2(cp)) + WRITE_ONCE(cp->hn1.hash_key, hash_key); + hlist_bl_add_head_rcu(&hn->node, head2); hlist_bl_unlock(head2); /* Too long chain? Do it in steps */ if (++limit >= 64) @@ -1236,10 +1357,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, return NULL; } - INIT_HLIST_BL_NODE(&cp->c_list); + INIT_HLIST_BL_NODE(&cp->hn0.node); + INIT_HLIST_BL_NODE(&cp->hn1.node); timer_setup(&cp->timer, ip_vs_conn_expire, 0); cp->ipvs = ipvs; + cp->hn0.dir = 0; cp->af = p->af; + cp->hn1.dir = 1; cp->daf = dest_af; cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); @@ -1344,8 +1468,8 @@ static void *ip_vs_conn_array(struct seq_file *seq) struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_rht *t = iter->t; + struct ip_vs_conn_hnode *hn; struct hlist_bl_node *e; - struct ip_vs_conn *cp; int idx; if (!t) @@ -1353,15 +1477,17 @@ static void *ip_vs_conn_array(struct seq_file *seq) for (idx = iter->bucket; idx < t->size; idx++) { unsigned int skip = 0; - hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[idx], c_list) { + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[idx], node) { /* __ip_vs_conn_get() is not needed by * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show */ - if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key))) + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) break; + if (hn->dir != 0) + continue; if (skip >= iter->skip_elems) { iter->bucket = idx; - return cp; + return hn; } ++skip; @@ -1403,7 +1529,7 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_vs_iter_state *iter = seq->private; - struct ip_vs_conn *cp = v; 
+ struct ip_vs_conn_hnode *hn = v; struct hlist_bl_node *e; struct ip_vs_rht *t; @@ -1416,12 +1542,14 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) return NULL; /* more on same hash chain? */ - hlist_bl_for_each_entry_continue_rcu(cp, e, c_list) { + hlist_bl_for_each_entry_continue_rcu(hn, e, node) { /* Our cursor was moved to new table ? */ - if (!ip_vs_rht_same_table(t, READ_ONCE(cp->hash_key))) + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) break; + if (hn->dir != 0) + continue; iter->skip_elems++; - return cp; + return hn; } iter->skip_elems = 0; @@ -1443,7 +1571,8 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { - const struct ip_vs_conn *cp = v; + struct ip_vs_conn_hnode *hn = v; + const struct ip_vs_conn *cp = ip_vs_hn0_to_conn(hn); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; char dbuf[IP_VS_ADDRSTRLEN]; @@ -1610,6 +1739,7 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { + struct ip_vs_conn_hnode *hn; struct hlist_bl_node *e; struct ip_vs_conn *cp; struct ip_vs_rht *t; @@ -1630,7 +1760,10 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) /* Don't care if due to moved entry we jump to another bucket * and even to new table */ - hlist_bl_for_each_entry_rcu(cp, e, &t->buckets[hash], c_list) { + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[hash], node) { + if (hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); if (atomic_read(&cp->n_control)) continue; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { @@ -1695,6 +1828,7 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) { DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); struct ip_vs_conn *cp, *cp_c; + struct ip_vs_conn_hnode *hn; struct hlist_bl_head *head; struct ip_vs_rht *t, *p; struct hlist_bl_node *e; @@ -1709,7 +1843,10 @@ flush_again: /* Rely on RCU 
grace period while accessing cp after ip_vs_conn_del */ rcu_read_lock(); ip_vs_rht_walk_buckets_safe_rcu(ipvs->conn_tab, head) { - hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); if (atomic_read(&cp->n_control)) continue; cp_c = cp->control; @@ -1756,6 +1893,7 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); unsigned int resched_score = 0; struct ip_vs_conn *cp, *cp_c; + struct ip_vs_conn_hnode *hn; struct hlist_bl_head *head; struct ip_vs_dest *dest; struct hlist_bl_node *e; @@ -1769,7 +1907,10 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) repeat: smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ ip_vs_rht_walk_buckets_rcu(ipvs->conn_tab, head) { - hlist_bl_for_each_entry_rcu(cp, e, head, c_list) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); resched_score++; dest = cp->dest; if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) -- cgit v1.2.3 From 44d93cf1abb6a85d65c3b4b027c82d44263de6a5 Mon Sep 17 00:00:00 2001 From: Karthikeyan Kathirvel Date: Wed, 4 Mar 2026 14:23:42 +0530 Subject: wifi: UHR: define DPS/DBE/P-EDCA elements and fix size parsing Add UHR Operation and Capability definitions and parsing helpers: - Define ieee80211_uhr_dps_info, ieee80211_uhr_dbe_info, ieee80211_uhr_p_edca_info with masks. - Update ieee80211_uhr_oper_size_ok() to account for optional DPS/DBE/P-EDCA blocks. - Move NPCA pointer position after DPS Operation Parameter if it is present in ieee80211_uhr_oper_size_ok(). - Move NPCA pointer position after DPS info if it is present in ieee80211_uhr_npca_info(). 
Signed-off-by: Karthikeyan Kathirvel Link: https://patch.msgid.link/20260304085343.1093993-2-karthikeyan.kathirvel@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211-uhr.h | 271 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 265 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211-uhr.h b/include/linux/ieee80211-uhr.h index 132acced7d79..9729d23e4766 100644 --- a/include/linux/ieee80211-uhr.h +++ b/include/linux/ieee80211-uhr.h @@ -29,11 +29,216 @@ struct ieee80211_uhr_operation { #define IEEE80211_UHR_NPCA_PARAMS_MOPLEN 0x00400000 #define IEEE80211_UHR_NPCA_PARAMS_DIS_SUBCH_BMAP_PRES 0x00800000 +/** + * struct ieee80211_uhr_npca_info - npca operation information + * + * This structure is the "NPCA Operation Parameters field format" of "UHR + * Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.2.353. See Figure 9-aa4. + * + * Refer to IEEE80211_UHR_NPCA* + * @params: + * NPCA Primary Channel - NPCA primary channel + * NPCA_Min Duration Threshold - Minimum duration of inter-BSS activity + * NPCA Switching Delay - + * Time needed by an NPCA AP to switch from the + * BSS primary channel to the NPCA primary channel + * in the unit of 4 µs. + * NPCA Switching Back Delay - + * Time to switch from the NPCA primary channel + * to the BSS primary channel in the unit of 4 µs. + * NPCA Initial QSRC - + * Initialize the EDCAF QSRC[AC] variables + * when an NPCA STA in the BSS + * switches to NPCA operation. + * NPCA MOPLEN - + * Indicates which conditions can be used to + * initiate an NPCA operation, + * 1 -> both PHYLEN NPCA operation and MOPLEN + * NPCA operation are + * permitted in the BSS + * 0 -> only PHYLEN NPCA operation is allowed in the BSS. + * NPCA Disabled Subchannel Bitmap Present - + * Indicates whether the NPCA Disabled Subchannel + * Bitmap field is present. 
A 1 in this field indicates that + * the NPCA Disabled Subchannel Bitmap field is present + * @dis_subch_bmap: + * A bit in the bitmap that lies within the BSS bandwidth is set + * to 1 to indicate that the corresponding 20 MHz subchannel is + * punctured and is set to 0 to indicate that the corresponding + * 20 MHz subchannel is not punctured. A bit in the bitmap that + * falls outside of the BSS bandwidth is reserved. This field is + * present when the value of the NPCA Disabled Subchannel Bitmap + * Field Present field is equal to 1, and not present, otherwise + */ struct ieee80211_uhr_npca_info { __le32 params; __le16 dis_subch_bmap[]; } __packed; +#define IEEE80211_UHR_DPS_PADDING_DELAY 0x0000003F +#define IEEE80211_UHR_DPS_TRANSITION_DELAY 0x00003F00 +#define IEEE80211_UHR_DPS_ICF_REQUIRED 0x00010000 +#define IEEE80211_UHR_DPS_PARAMETERIZED_FLAG 0x00020000 +#define IEEE80211_UHR_DPS_LC_MODE_BW 0x001C0000 +#define IEEE80211_UHR_DPS_LC_MODE_NSS 0x01E00000 +#define IEEE80211_UHR_DPS_LC_MODE_MCS 0x1E000000 +#define IEEE80211_UHR_DPS_MOBILE_AP_DPS_STATIC_HCM 0x20000000 + +/** + * struct ieee80211_uhr_dps_info - DPS operation information + * + * This structure is the "DPS Operation Parameter field" of "UHR + * Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.1.87. See Figure 9-207u. + * + * Refer to IEEE80211_UHR_DPS* + * @params: + * DPS Padding Delay - + * Indicates the minimum MAC padding + * duration that is required by a DPS STA + * in an ICF to cause the STA to transition + * from the lower capability mode to the + * higher capability mode. The DPS Padding + * Delay field is in units of 4 µs. + * DPS Transition Delay - + * Indicates the amount of time required by a + * DPS STA to transition from the higher + * capability mode to the lower capability + * mode. The DPS Transition Delay field is in + * units of 4 µs. 
+ * ICF Required - + * Indicates when the DPS assisting STA needs + * to transmit an ICF frame to the peer DPS STA + * before performing the frame exchanges with + * the peer DPS STA in a TXOP. + * 1 -> indicates that the transmission of the + * ICF frame to the peer DPS STA prior to + * any frame exchange is needed. + * 0 -> ICF transmission before the frame + * exchanges with the peer DPS STA is only + * needed if the frame exchange is performed + * in the HC mode. + * Parameterized Flag - + * 0 -> indicates that only 20 MHz, 1 SS, + * non-HT PPDU format with the data + * rate of 6, 12, and 24 Mb/s as the + * default mode are supported by the + * DPS STA in the LC mode + * 1 -> indicates that a bandwidth up to the + * bandwidth indicated in the LC Mode + * Bandwidth field, a number of spatial + * streams up to the NSS indicated in + * the LC Mode Nss field, and an MCS up + * to the MCS indicated in the LC Mode + * MCS fields are supported by the DPS + * STA in the LC mode as the + * parameterized mode. + * LC Mode Bandwidth - + * Indicates the maximum bandwidth supported + * by the STA in the LC mode. + * LC Mode NSS - + * Indicates the maximum number of the spatial + * streams supported by the STA in the LC mode. + * LC Mode MCS - + * Indicates the highest MCS supported by the STA + * in the LC mode. + * Mobile AP DPS Static HCM - + * 1 -> indicates that it will remain in the DPS high + * capability mode until the next TBTT on that + * link. + * 0 -> otherwise. + */ +struct ieee80211_uhr_dps_info { + __le32 params; +} __packed; + +#define IEEE80211_UHR_DBE_OPER_BANDWIDTH 0x07 +#define IEEE80211_UHR_DBE_OPER_DIS_SUBCHANNEL_BITMAP_PRES 0x08 + +/** + * enum ieee80211_uhr_dbe_oper_bw - DBE Operational Bandwidth + * + * Encoding for the DBE Operational Bandwidth field in the UHR Operation + * element (DBE Operation Parameters). 
+ * + * @IEEE80211_UHR_DBE_OPER_BW_40: 40 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_80: 80 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_160: 160 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_320_1: 320-1 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_320_2: 320-2 MHz operational DBE bandwidth + */ +enum ieee80211_uhr_dbe_oper_bw { + IEEE80211_UHR_DBE_OPER_BW_40 = 1, + IEEE80211_UHR_DBE_OPER_BW_80 = 2, + IEEE80211_UHR_DBE_OPER_BW_160 = 3, + IEEE80211_UHR_DBE_OPER_BW_320_1 = 4, + IEEE80211_UHR_DBE_OPER_BW_320_2 = 5, +}; + +/** + * struct ieee80211_uhr_dbe_info - DBE operation information + * + * This structure is the "DBE Operation Parameters field" of + * "UHR Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.2.353. See Figure 9-aa6. + * + * Refer to IEEE80211_UHR_DBE_OPER* + * @params: + * B0-B2 - DBE Operational Bandwidth field, see + * "enum ieee80211_uhr_dbe_oper_bw" for values. + * Value 0 is reserved. + * Value 1 indicates 40 MHz operational DBE bandwidth. + * Value 2 indicates 80 MHz operational DBE bandwidth. + * Value 3 indicates 160 MHz operational DBE bandwidth. + * Value 4 indicates 320-1 MHz operational DBE bandwidth. + * Value 5 indicates 320-2 MHz operational DBE bandwidth. + * Values 6 to 7 are reserved. + * B3 - DBE Disabled Subchannel Bitmap Present. + * @dis_subch_bmap: DBE Disabled Subchannel Bitmap field is set to indicate + * disabled 20 MHz subchannels within the DBE Bandwidth. 
+ */ +struct ieee80211_uhr_dbe_info { + u8 params; + __le16 dis_subch_bmap[]; +} __packed; + +#define IEEE80211_UHR_P_EDCA_ECWMIN 0x0F +#define IEEE80211_UHR_P_EDCA_ECWMAX 0xF0 +#define IEEE80211_UHR_P_EDCA_AIFSN 0x000F +#define IEEE80211_UHR_P_EDCA_CW_DS 0x0030 +#define IEEE80211_UHR_P_EDCA_PSRC_THRESHOLD 0x01C0 +#define IEEE80211_UHR_P_EDCA_QSRC_THRESHOLD 0x0600 + +/** + * struct ieee80211_uhr_p_edca_info - P-EDCA operation information + * + * This structure is the "P-EDCA Operation Parameters field" of + * "UHR Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.2.353. See Figure 9-aa5. + * + * Refer to IEEE80211_UHR_P_EDCA* + * @p_edca_ec: P-EDCA ECWmin and ECWmax. + * These fields indicate the CWmin and CWmax values used by a + * P-EDCA STA during P-EDCA contention. + * @params: AIFSN, CW DS, PSRC threshold, and QSRC threshold. + * - The AIFSN field indicates the AIFSN value used by a P-EDCA STA + * during P-EDCA contention. + * - The CW DS field indicates the value used for randomization of the + * transmission slot of the DS-CTS frame. The value 3 is reserved. + * The value 0 indicates that randomization is not enabled. + * - The P-EDCA PSRC threshold field indicates the maximum number of + * allowed consecutive DS-CTS transmissions. The value 0 and values + * greater than 4 are reserved. + * - The P-EDCA QSRC threshold field indicates the value of the + * QSRC[AC_VO] counter required to start P-EDCA contention. The + * value 0 is reserved. 
+ */ +struct ieee80211_uhr_p_edca_info { + u8 p_edca_ec; + __le16 params; +} __packed; + static inline bool ieee80211_uhr_oper_size_ok(const u8 *data, u8 len, bool beacon) { @@ -47,19 +252,52 @@ static inline bool ieee80211_uhr_oper_size_ok(const u8 *data, u8 len, if (beacon) return true; - /* FIXME: DPS, DBE, P-EDCA (consider order, also relative to NPCA) */ + /* DPS Operation Parameters (fixed 4 bytes) */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_DPS_ENA)) { + needed += sizeof(struct ieee80211_uhr_dps_info); + if (len < needed) + return false; + } + /* NPCA Operation Parameters (fixed 4 bytes + optional 2 bytes) */ if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_NPCA_ENA)) { const struct ieee80211_uhr_npca_info *npca = - (const void *)oper->variable; + (const void *)(data + needed); needed += sizeof(*npca); - if (len < needed) return false; - if (npca->params & cpu_to_le32(IEEE80211_UHR_NPCA_PARAMS_DIS_SUBCH_BMAP_PRES)) + if (npca->params & + cpu_to_le32(IEEE80211_UHR_NPCA_PARAMS_DIS_SUBCH_BMAP_PRES)) { needed += sizeof(npca->dis_subch_bmap[0]); + if (len < needed) + return false; + } + } + + /* P-EDCA Operation Parameters (fixed 3 bytes) */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_PEDCA_ENA)) { + needed += sizeof(struct ieee80211_uhr_p_edca_info); + if (len < needed) + return false; + } + + /* DBE Operation Parameters (fixed 1 byte + optional 2 bytes) */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_DBE_ENA)) { + const struct ieee80211_uhr_dbe_info *dbe = + (const void *)(data + needed); + + needed += sizeof(*dbe); + if (len < needed) + return false; + + if (dbe->params & + IEEE80211_UHR_DBE_OPER_DIS_SUBCHANNEL_BITMAP_PRES) { + needed += sizeof(dbe->dis_subch_bmap[0]); + if (len < needed) + return false; + } } return len >= needed; @@ -72,12 +310,15 @@ static inline bool ieee80211_uhr_oper_size_ok(const u8 *data, u8 len, static inline const struct ieee80211_uhr_npca_info * ieee80211_uhr_npca_info(const 
struct ieee80211_uhr_operation *oper) { + const u8 *pos = oper->variable; + if (!(oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_NPCA_ENA))) return NULL; - /* FIXME: DPS */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_DPS_ENA)) + pos += sizeof(struct ieee80211_uhr_dps_info); - return (const void *)oper->variable; + return (const void *)pos; } static inline const __le16 * @@ -131,6 +372,24 @@ ieee80211_uhr_npca_dis_subch_bitmap(const struct ieee80211_uhr_operation *oper) #define IEEE80211_UHR_MAC_CAP_DBE_EHT_MCS_MAP_160_PRES 0x08 #define IEEE80211_UHR_MAC_CAP_DBE_EHT_MCS_MAP_320_PRES 0x10 +/** + * enum ieee80211_uhr_dbe_max_supported_bw - DBE Maximum Supported Bandwidth + * + * As per spec P802.11bn_D1.3 "Table 9-bb5—Encoding of the DBE Maximum + * Supported Bandwidth field". + * + * @IEEE80211_UHR_DBE_MAX_BW_40: Indicates 40 MHz DBE max supported bw + * @IEEE80211_UHR_DBE_MAX_BW_80: Indicates 80 MHz DBE max supported bw + * @IEEE80211_UHR_DBE_MAX_BW_160: Indicates 160 MHz DBE max supported bw + * @IEEE80211_UHR_DBE_MAX_BW_320: Indicates 320 MHz DBE max supported bw + */ +enum ieee80211_uhr_dbe_max_supported_bw { + IEEE80211_UHR_DBE_MAX_BW_40 = 1, + IEEE80211_UHR_DBE_MAX_BW_80 = 2, + IEEE80211_UHR_DBE_MAX_BW_160 = 3, + IEEE80211_UHR_DBE_MAX_BW_320 = 4, +}; + struct ieee80211_uhr_cap_mac { u8 mac_cap[5]; } __packed; -- cgit v1.2.3 From 242ab49aca7fa754c31d66f1c846396832dbece3 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 25 Feb 2026 19:57:08 +0200 Subject: drm/intel: add reg_bits.h for the various register content helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a shared header that's used by i915, xe, and i915 display. 
Reviewed-by: Michał Grzelak Acked-by: Rodrigo Vivi Link: https://patch.msgid.link/e641fe6dcecef92367471f3e0d150f9f47ae4edc.1772042022.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_reg_defs.h | 133 +-------------------------------- include/drm/intel/reg_bits.h | 139 +++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 132 deletions(-) create mode 100644 include/drm/intel/reg_bits.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/i915_reg_defs.h b/drivers/gpu/drm/i915/i915_reg_defs.h index 9d72f6fae4ae..a1dc7ff2bef7 100644 --- a/drivers/gpu/drm/i915/i915_reg_defs.h +++ b/drivers/gpu/drm/i915/i915_reg_defs.h @@ -6,116 +6,7 @@ #ifndef __I915_REG_DEFS__ #define __I915_REG_DEFS__ -#include -#include - -/* - * Wrappers over the generic fixed width BIT_U*() and GENMASK_U*() - * implementations, for compatibility reasons with previous implementation. - */ -#define REG_GENMASK(high, low) GENMASK_U32(high, low) -#define REG_GENMASK64(high, low) GENMASK_U64(high, low) -#define REG_GENMASK16(high, low) GENMASK_U16(high, low) -#define REG_GENMASK8(high, low) GENMASK_U8(high, low) - -#define REG_BIT(n) BIT_U32(n) -#define REG_BIT64(n) BIT_U64(n) -#define REG_BIT16(n) BIT_U16(n) -#define REG_BIT8(n) BIT_U8(n) - -/* - * Local integer constant expression version of is_power_of_2(). - */ -#define IS_POWER_OF_2(__x) ((__x) && (((__x) & ((__x) - 1)) == 0)) - -/** - * REG_FIELD_PREP() - Prepare a u32 bitfield value - * @__mask: shifted mask defining the field's length and position - * @__val: value to put in the field - * - * Local copy of FIELD_PREP() to generate an integer constant expression, force - * u32 and for consistency with REG_FIELD_GET(), REG_BIT() and REG_GENMASK(). - * - * @return: @__val masked and shifted into the field defined by @__mask. 
- */ -#define REG_FIELD_PREP(__mask, __val) \ - ((u32)((((typeof(__mask))(__val) << __bf_shf(__mask)) & (__mask)) + \ - BUILD_BUG_ON_ZERO(!__is_constexpr(__mask)) + \ - BUILD_BUG_ON_ZERO((__mask) == 0 || (__mask) > U32_MAX) + \ - BUILD_BUG_ON_ZERO(!IS_POWER_OF_2((__mask) + (1ULL << __bf_shf(__mask)))) + \ - BUILD_BUG_ON_ZERO(__builtin_choose_expr(__is_constexpr(__val), (~((__mask) >> __bf_shf(__mask)) & (__val)), 0)))) - -/** - * REG_FIELD_PREP8() - Prepare a u8 bitfield value - * @__mask: shifted mask defining the field's length and position - * @__val: value to put in the field - * - * Local copy of FIELD_PREP() to generate an integer constant expression, force - * u8 and for consistency with REG_FIELD_GET8(), REG_BIT8() and REG_GENMASK8(). - * - * @return: @__val masked and shifted into the field defined by @__mask. - */ -#define REG_FIELD_PREP8(__mask, __val) \ - ((u8)((((typeof(__mask))(__val) << __bf_shf(__mask)) & (__mask)) + \ - BUILD_BUG_ON_ZERO(!__is_constexpr(__mask)) + \ - BUILD_BUG_ON_ZERO((__mask) == 0 || (__mask) > U8_MAX) + \ - BUILD_BUG_ON_ZERO(!IS_POWER_OF_2((__mask) + (1ULL << __bf_shf(__mask)))) + \ - BUILD_BUG_ON_ZERO(__builtin_choose_expr(__is_constexpr(__val), (~((__mask) >> __bf_shf(__mask)) & (__val)), 0)))) - -/** - * REG_FIELD_GET() - Extract a u32 bitfield value - * @__mask: shifted mask defining the field's length and position - * @__val: value to extract the bitfield value from - * - * Local wrapper for FIELD_GET() to force u32 and for consistency with - * REG_FIELD_PREP(), REG_BIT() and REG_GENMASK(). - * - * @return: Masked and shifted value of the field defined by @__mask in @__val. - */ -#define REG_FIELD_GET(__mask, __val) ((u32)FIELD_GET(__mask, __val)) - -/** - * REG_FIELD_GET64() - Extract a u64 bitfield value - * @__mask: shifted mask defining the field's length and position - * @__val: value to extract the bitfield value from - * - * Local wrapper for FIELD_GET() to force u64 and for consistency with - * REG_GENMASK64(). 
- * - * @return: Masked and shifted value of the field defined by @__mask in @__val. - */ -#define REG_FIELD_GET64(__mask, __val) ((u64)FIELD_GET(__mask, __val)) - - -/** - * REG_FIELD_PREP16() - Prepare a u16 bitfield value - * @__mask: shifted mask defining the field's length and position - * @__val: value to put in the field - * - * Local copy of FIELD_PREP16() to generate an integer constant - * expression, force u8 and for consistency with - * REG_FIELD_GET16(), REG_BIT16() and REG_GENMASK16(). - * - * @return: @__val masked and shifted into the field defined by @__mask. - */ -#define REG_FIELD_PREP16(__mask, __val) \ - ((u16)((((typeof(__mask))(__val) << __bf_shf(__mask)) & (__mask)) + \ - BUILD_BUG_ON_ZERO(!__is_constexpr(__mask)) + \ - BUILD_BUG_ON_ZERO((__mask) == 0 || (__mask) > U16_MAX) + \ - BUILD_BUG_ON_ZERO(!IS_POWER_OF_2((__mask) + (1ULL << __bf_shf(__mask)))) + \ - BUILD_BUG_ON_ZERO(__builtin_choose_expr(__is_constexpr(__val), (~((__mask) >> __bf_shf(__mask)) & (__val)), 0)))) - -#define REG_MASKED_FIELD(mask, value) \ - (BUILD_BUG_ON_ZERO(__builtin_choose_expr(__builtin_constant_p(mask), (mask) & 0xffff0000, 0)) + \ - BUILD_BUG_ON_ZERO(__builtin_choose_expr(__builtin_constant_p(value), (value) & 0xffff0000, 0)) + \ - BUILD_BUG_ON_ZERO(__builtin_choose_expr(__builtin_constant_p(mask) && __builtin_constant_p(value), (value) & ~(mask), 0)) + \ - ((mask) << 16 | (value))) - -#define REG_MASKED_FIELD_ENABLE(a) \ - (__builtin_choose_expr(__builtin_constant_p(a), REG_MASKED_FIELD((a), (a)), ({ typeof(a) _a = (a); REG_MASKED_FIELD(_a, _a); }))) - -#define REG_MASKED_FIELD_DISABLE(a) \ - (REG_MASKED_FIELD((a), 0)) +#include /* * Given the first two numbers __a and __b of arbitrarily many evenly spaced @@ -161,28 +52,6 @@ */ #define _PICK(__index, ...) 
(((const u32 []){ __VA_ARGS__ })[__index]) -/** - * REG_FIELD_GET8() - Extract a u8 bitfield value - * @__mask: shifted mask defining the field's length and position - * @__val: value to extract the bitfield value from - * - * Local wrapper for FIELD_GET() to force u8 and for consistency with - * REG_FIELD_PREP(), REG_BIT() and REG_GENMASK(). - * - * @return: Masked and shifted value of the field defined by @__mask in @__val. - */ -#define REG_FIELD_GET8(__mask, __val) ((u8)FIELD_GET(__mask, __val)) - -/** - * REG_FIELD_MAX() - produce the maximum value representable by a field - * @__mask: shifted mask defining the field's length and position - * - * Local wrapper for FIELD_MAX() to return the maximum bit value that can - * be held in the field specified by @_mask, cast to u32 for consistency - * with other macros. - */ -#define REG_FIELD_MAX(__mask) ((u32)FIELD_MAX(__mask)) - typedef struct { u32 reg; } i915_reg_t; diff --git a/include/drm/intel/reg_bits.h b/include/drm/intel/reg_bits.h new file mode 100644 index 000000000000..2a9066e1d808 --- /dev/null +++ b/include/drm/intel/reg_bits.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef _REG_BITS_H_ +#define _REG_BITS_H_ + +#include +#include + +/* + * Wrappers over the generic fixed width BIT_U*() and GENMASK_U*() + * implementations, for compatibility reasons with previous implementation. + */ +#define REG_GENMASK(high, low) GENMASK_U32(high, low) +#define REG_GENMASK64(high, low) GENMASK_U64(high, low) +#define REG_GENMASK16(high, low) GENMASK_U16(high, low) +#define REG_GENMASK8(high, low) GENMASK_U8(high, low) + +#define REG_BIT(n) BIT_U32(n) +#define REG_BIT64(n) BIT_U64(n) +#define REG_BIT16(n) BIT_U16(n) +#define REG_BIT8(n) BIT_U8(n) + +/* + * Local integer constant expression version of is_power_of_2(). 
+ */ +#define IS_POWER_OF_2(__x) ((__x) && (((__x) & ((__x) - 1)) == 0)) + +/** + * REG_FIELD_PREP8() - Prepare a u8 bitfield value + * @__mask: shifted mask defining the field's length and position + * @__val: value to put in the field + * + * Local copy of FIELD_PREP() to generate an integer constant expression, force + * u8 and for consistency with REG_FIELD_GET8(), REG_BIT8() and REG_GENMASK8(). + * + * @return: @__val masked and shifted into the field defined by @__mask. + */ +#define REG_FIELD_PREP8(__mask, __val) \ + ((u8)((((typeof(__mask))(__val) << __bf_shf(__mask)) & (__mask)) + \ + BUILD_BUG_ON_ZERO(!__is_constexpr(__mask)) + \ + BUILD_BUG_ON_ZERO((__mask) == 0 || (__mask) > U8_MAX) + \ + BUILD_BUG_ON_ZERO(!IS_POWER_OF_2((__mask) + (1ULL << __bf_shf(__mask)))) + \ + BUILD_BUG_ON_ZERO(__builtin_choose_expr(__is_constexpr(__val), (~((__mask) >> __bf_shf(__mask)) & (__val)), 0)))) + +/** + * REG_FIELD_PREP16() - Prepare a u16 bitfield value + * @__mask: shifted mask defining the field's length and position + * @__val: value to put in the field + * + * Local copy of FIELD_PREP16() to generate an integer constant + * expression, force u8 and for consistency with + * REG_FIELD_GET16(), REG_BIT16() and REG_GENMASK16(). + * + * @return: @__val masked and shifted into the field defined by @__mask. 
+ */ +#define REG_FIELD_PREP16(__mask, __val) \ + ((u16)((((typeof(__mask))(__val) << __bf_shf(__mask)) & (__mask)) + \ + BUILD_BUG_ON_ZERO(!__is_constexpr(__mask)) + \ + BUILD_BUG_ON_ZERO((__mask) == 0 || (__mask) > U16_MAX) + \ + BUILD_BUG_ON_ZERO(!IS_POWER_OF_2((__mask) + (1ULL << __bf_shf(__mask)))) + \ + BUILD_BUG_ON_ZERO(__builtin_choose_expr(__is_constexpr(__val), (~((__mask) >> __bf_shf(__mask)) & (__val)), 0)))) + +/** + * REG_FIELD_PREP() - Prepare a u32 bitfield value + * @__mask: shifted mask defining the field's length and position + * @__val: value to put in the field + * + * Local copy of FIELD_PREP() to generate an integer constant expression, force + * u32 and for consistency with REG_FIELD_GET(), REG_BIT() and REG_GENMASK(). + * + * @return: @__val masked and shifted into the field defined by @__mask. + */ +#define REG_FIELD_PREP(__mask, __val) \ + ((u32)((((typeof(__mask))(__val) << __bf_shf(__mask)) & (__mask)) + \ + BUILD_BUG_ON_ZERO(!__is_constexpr(__mask)) + \ + BUILD_BUG_ON_ZERO((__mask) == 0 || (__mask) > U32_MAX) + \ + BUILD_BUG_ON_ZERO(!IS_POWER_OF_2((__mask) + (1ULL << __bf_shf(__mask)))) + \ + BUILD_BUG_ON_ZERO(__builtin_choose_expr(__is_constexpr(__val), (~((__mask) >> __bf_shf(__mask)) & (__val)), 0)))) + +/** + * REG_FIELD_GET8() - Extract a u8 bitfield value + * @__mask: shifted mask defining the field's length and position + * @__val: value to extract the bitfield value from + * + * Local wrapper for FIELD_GET() to force u8 and for consistency with + * REG_FIELD_PREP(), REG_BIT() and REG_GENMASK(). + * + * @return: Masked and shifted value of the field defined by @__mask in @__val. 
+ */ +#define REG_FIELD_GET8(__mask, __val) ((u8)FIELD_GET(__mask, __val)) + +/** + * REG_FIELD_GET() - Extract a u32 bitfield value + * @__mask: shifted mask defining the field's length and position + * @__val: value to extract the bitfield value from + * + * Local wrapper for FIELD_GET() to force u32 and for consistency with + * REG_FIELD_PREP(), REG_BIT() and REG_GENMASK(). + * + * @return: Masked and shifted value of the field defined by @__mask in @__val. + */ +#define REG_FIELD_GET(__mask, __val) ((u32)FIELD_GET(__mask, __val)) + +/** + * REG_FIELD_GET64() - Extract a u64 bitfield value + * @__mask: shifted mask defining the field's length and position + * @__val: value to extract the bitfield value from + * + * Local wrapper for FIELD_GET() to force u64 and for consistency with + * REG_GENMASK64(). + * + * @return: Masked and shifted value of the field defined by @__mask in @__val. + */ +#define REG_FIELD_GET64(__mask, __val) ((u64)FIELD_GET(__mask, __val)) + +/** + * REG_FIELD_MAX() - produce the maximum value representable by a field + * @__mask: shifted mask defining the field's length and position + * + * Local wrapper for FIELD_MAX() to return the maximum bit value that can + * be held in the field specified by @_mask, cast to u32 for consistency + * with other macros. 
+ */ +#define REG_FIELD_MAX(__mask) ((u32)FIELD_MAX(__mask)) + +#define REG_MASKED_FIELD(mask, value) \ + (BUILD_BUG_ON_ZERO(__builtin_choose_expr(__builtin_constant_p(mask), (mask) & 0xffff0000, 0)) + \ + BUILD_BUG_ON_ZERO(__builtin_choose_expr(__builtin_constant_p(value), (value) & 0xffff0000, 0)) + \ + BUILD_BUG_ON_ZERO(__builtin_choose_expr(__builtin_constant_p(mask) && __builtin_constant_p(value), (value) & ~(mask), 0)) + \ + ((mask) << 16 | (value))) + +#define REG_MASKED_FIELD_ENABLE(a) \ + (__builtin_choose_expr(__builtin_constant_p(a), REG_MASKED_FIELD((a), (a)), ({ typeof(a) _a = (a); REG_MASKED_FIELD(_a, _a); }))) + +#define REG_MASKED_FIELD_DISABLE(a) \ + (REG_MASKED_FIELD((a), 0)) + +#endif -- cgit v1.2.3 From 3c35731b7296a9c2e621387587a3e4cebb3bce8b Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 25 Feb 2026 19:57:09 +0200 Subject: drm/intel: add pick.h for the various "picker" helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a shared header that's used by i915, xe, and i915 display. This allows us to drop the compat-i915-headers/i915_reg_defs.h include from xe_reg_defs.h. All the register macro helpers were subtly pulled in from i915 to all of xe through this. 
Reviewed-by: Michał Grzelak Acked-by: Rodrigo Vivi Link: https://patch.msgid.link/fcd70f3317755bf98a6e7ae88974aa8ba06efd1e.1772042022.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_reg_defs.h | 45 +------------------ .../gpu/drm/xe/compat-i915-headers/intel_uncore.h | 1 + drivers/gpu/drm/xe/regs/xe_reg_defs.h | 5 ++- include/drm/intel/pick.h | 51 ++++++++++++++++++++++ 4 files changed, 56 insertions(+), 46 deletions(-) create mode 100644 include/drm/intel/pick.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/i915_reg_defs.h b/drivers/gpu/drm/i915/i915_reg_defs.h index a1dc7ff2bef7..e897d3ccbf9e 100644 --- a/drivers/gpu/drm/i915/i915_reg_defs.h +++ b/drivers/gpu/drm/i915/i915_reg_defs.h @@ -6,52 +6,9 @@ #ifndef __I915_REG_DEFS__ #define __I915_REG_DEFS__ +#include #include -/* - * Given the first two numbers __a and __b of arbitrarily many evenly spaced - * numbers, pick the 0-based __index'th value. - * - * Always prefer this over _PICK() if the numbers are evenly spaced. - */ -#define _PICK_EVEN(__index, __a, __b) ((__a) + (__index) * ((__b) - (__a))) - -/* - * Like _PICK_EVEN(), but supports 2 ranges of evenly spaced address offsets. - * @__c_index corresponds to the index in which the second range starts to be - * used. Using math interval notation, the first range is used for indexes [ 0, - * @__c_index), while the second range is used for [ @__c_index, ... ). Example: - * - * #define _FOO_A 0xf000 - * #define _FOO_B 0xf004 - * #define _FOO_C 0xf008 - * #define _SUPER_FOO_A 0xa000 - * #define _SUPER_FOO_B 0xa100 - * #define FOO(x) _MMIO(_PICK_EVEN_2RANGES(x, 3, \ - * _FOO_A, _FOO_B, \ - * _SUPER_FOO_A, _SUPER_FOO_B)) - * - * This expands to: - * 0: 0xf000, - * 1: 0xf004, - * 2: 0xf008, - * 3: 0xa000, - * 4: 0xa100, - * 5: 0xa200, - * ... - */ -#define _PICK_EVEN_2RANGES(__index, __c_index, __a, __b, __c, __d) \ - (BUILD_BUG_ON_ZERO(!__is_constexpr(__c_index)) + \ - ((__index) < (__c_index) ? 
_PICK_EVEN(__index, __a, __b) : \ - _PICK_EVEN((__index) - (__c_index), __c, __d))) - -/* - * Given the arbitrary numbers in varargs, pick the 0-based __index'th number. - * - * Always prefer _PICK_EVEN() over this if the numbers are evenly spaced. - */ -#define _PICK(__index, ...) (((const u32 []){ __VA_ARGS__ })[__index]) - typedef struct { u32 reg; } i915_reg_t; diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_uncore.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_uncore.h index c5e198ace7bc..a8cfd65119e0 100644 --- a/drivers/gpu/drm/xe/compat-i915-headers/intel_uncore.h +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_uncore.h @@ -6,6 +6,7 @@ #ifndef __INTEL_UNCORE_H__ #define __INTEL_UNCORE_H__ +#include "i915_reg_defs.h" #include "xe_device.h" #include "xe_device_types.h" #include "xe_mmio.h" diff --git a/drivers/gpu/drm/xe/regs/xe_reg_defs.h b/drivers/gpu/drm/xe/regs/xe_reg_defs.h index c39aab843e35..27ac0bf1f6cd 100644 --- a/drivers/gpu/drm/xe/regs/xe_reg_defs.h +++ b/drivers/gpu/drm/xe/regs/xe_reg_defs.h @@ -6,12 +6,13 @@ #ifndef _XE_REG_DEFS_H_ #define _XE_REG_DEFS_H_ +#include +#include + #include #include #include -#include "compat-i915-headers/i915_reg_defs.h" - /** * XE_REG_ADDR_MAX - The upper limit on MMIO register address * diff --git a/include/drm/intel/pick.h b/include/drm/intel/pick.h new file mode 100644 index 000000000000..d976fab8f270 --- /dev/null +++ b/include/drm/intel/pick.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef _PICK_H_ +#define _PICK_H_ + +/* + * Given the first two numbers __a and __b of arbitrarily many evenly spaced + * numbers, pick the 0-based __index'th value. + * + * Always prefer this over _PICK() if the numbers are evenly spaced. + */ +#define _PICK_EVEN(__index, __a, __b) ((__a) + (__index) * ((__b) - (__a))) + +/* + * Like _PICK_EVEN(), but supports 2 ranges of evenly spaced address offsets. 
+ * @__c_index corresponds to the index in which the second range starts to be + * used. Using math interval notation, the first range is used for indexes [ 0, + * @__c_index), while the second range is used for [ @__c_index, ... ). Example: + * + * #define _FOO_A 0xf000 + * #define _FOO_B 0xf004 + * #define _FOO_C 0xf008 + * #define _SUPER_FOO_A 0xa000 + * #define _SUPER_FOO_B 0xa100 + * #define FOO(x) _MMIO(_PICK_EVEN_2RANGES(x, 3, \ + * _FOO_A, _FOO_B, \ + * _SUPER_FOO_A, _SUPER_FOO_B)) + * + * This expands to: + * 0: 0xf000, + * 1: 0xf004, + * 2: 0xf008, + * 3: 0xa000, + * 4: 0xa100, + * 5: 0xa200, + * ... + */ +#define _PICK_EVEN_2RANGES(__index, __c_index, __a, __b, __c, __d) \ + (BUILD_BUG_ON_ZERO(!__is_constexpr(__c_index)) + \ + ((__index) < (__c_index) ? _PICK_EVEN(__index, __a, __b) : \ + _PICK_EVEN((__index) - (__c_index), __c, __d))) + +/* + * Given the arbitrary numbers in varargs, pick the 0-based __index'th number. + * + * Always prefer _PICK_EVEN() over this if the numbers are evenly spaced. + */ +#define _PICK(__index, ...) (((const u32 []){ __VA_ARGS__ })[__index]) + +#endif -- cgit v1.2.3 From 4059172b2a78a71d15d8fcd8d3fd8ea1ba65d25b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 13 Feb 2026 17:26:47 -0800 Subject: KVM: x86: Move kvm_rebooting to x86 Move kvm_rebooting, which is only read by x86, to KVM x86 so that it can be moved again to core x86 code. Add a "shutdown" arch hook to facilate setting the flag in KVM x86, along with a pile of comments to provide more context around what KVM x86 is doing and why. 
Reviewed-by: Chao Gao Acked-by: Dave Hansen Tested-by: Chao Gao Reviewed-by: Dan Williams Tested-by: Sagi Shahar Link: https://patch.msgid.link/20260214012702.2368778-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ arch/x86/kvm/x86.h | 1 + include/linux/kvm_host.h | 8 +++++++- virt/kvm/kvm_main.c | 14 +++++++------- 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a03530795707..7ac3578e6ec0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -700,6 +700,9 @@ static void drop_user_return_notifiers(void) kvm_on_user_return(&msrs->urn); } +__visible bool kvm_rebooting; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting); + /* * Handle a fault on a hardware virtualization (VMX or SVM) instruction. * @@ -13177,6 +13180,25 @@ int kvm_arch_enable_virtualization_cpu(void) return 0; } +void kvm_arch_shutdown(void) +{ + /* + * Set kvm_rebooting to indicate that KVM has asynchronously disabled + * hardware virtualization, i.e. that errors and/or exceptions on SVM + * and VMX instructions are expected and should be ignored. + */ + kvm_rebooting = true; + + /* + * Ensure kvm_rebooting is visible before IPIs are sent to other CPUs + * to disable virtualization. Effectively pairs with the reception of + * the IPI (kvm_rebooting is read in task/exception context, but only + * _needs_ to be read as %true after the IPI function callback disables + * virtualization). 
+ */ + smp_wmb(); +} + void kvm_arch_disable_virtualization_cpu(void) { kvm_x86_call(disable_virtualization_cpu)(); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 94d4f07aaaa0..b314649e5c02 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -54,6 +54,7 @@ struct kvm_host_values { u64 arch_capabilities; }; +extern bool kvm_rebooting; void kvm_spurious_fault(void); #define SIZE_OF_MEMSLOTS_HASHTABLE \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 34759a262b28..7c4ebd5210ec 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1627,6 +1627,13 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #endif #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +/* + * kvm_arch_shutdown() is invoked immediately prior to forcefully disabling + * hardware virtualization on all CPUs via IPI function calls (in preparation + * for shutdown or reboot), e.g. to allow arch code to prepare for disabling + * virtualization while KVM may be actively running vCPUs. 
+ */ +void kvm_arch_shutdown(void); /* * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of @@ -2313,7 +2320,6 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING extern bool enable_virt_at_load; -extern bool kvm_rebooting; #endif extern unsigned int halt_poll_ns; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1bc1da66b4b0..d27bf2488b12 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5578,13 +5578,15 @@ bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load); -__visible bool kvm_rebooting; -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting); - static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; +__weak void kvm_arch_shutdown(void) +{ + +} + __weak void kvm_arch_enable_virtualization(void) { @@ -5638,10 +5640,9 @@ static int kvm_offline_cpu(unsigned int cpu) static void kvm_shutdown(void *data) { + kvm_arch_shutdown(); + /* - * Disable hardware virtualization and set kvm_rebooting to indicate - * that KVM has asynchronously disabled hardware virtualization, i.e. - * that relevant errors and exceptions aren't entirely unexpected. * Some flavors of hardware virtualization need to be disabled before * transferring control to firmware (to perform shutdown/reboot), e.g. * on x86, virtualization can block INIT interrupts, which are used by @@ -5650,7 +5651,6 @@ static void kvm_shutdown(void *data) * 100% comprehensive. 
*/ pr_info("kvm: exiting hardware virtualization\n"); - kvm_rebooting = true; on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); } -- cgit v1.2.3 From d30372d0b7e637475c79a785d055f4eb8c863656 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 13 Feb 2026 17:27:01 -0800 Subject: KVM: Bury kvm_{en,dis}able_virtualization() in kvm_main.c once more Now that TDX handles doing VMXON without KVM's involvement, bury the top-level APIs to enable and disable virtualization back in kvm_main.c. No functional change intended. Reviewed-by: Dan Williams Reviewed-by: Chao Gao Tested-by: Chao Gao Tested-by: Sagi Shahar Link: https://patch.msgid.link/20260214012702.2368778-16-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 8 -------- virt/kvm/kvm_main.c | 17 +++++++++++++---- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7c4ebd5210ec..fbd549bdf052 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2613,12 +2613,4 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING -int kvm_enable_virtualization(void); -void kvm_disable_virtualization(void); -#else -static inline int kvm_enable_virtualization(void) { return 0; } -static inline void kvm_disable_virtualization(void) { } -#endif - #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d27bf2488b12..a9ccf9a1c41e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1102,6 +1102,9 @@ static inline struct kvm_io_bus *kvm_get_bus_for_destruction(struct kvm *kvm, !refcount_read(&kvm->users_count)); } +static int kvm_enable_virtualization(void); +static void kvm_disable_virtualization(void); + static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); @@ -5689,7 +5692,7 @@ static struct syscore 
kvm_syscore = { .ops = &kvm_syscore_ops, }; -int kvm_enable_virtualization(void) +static int kvm_enable_virtualization(void) { int r; @@ -5734,9 +5737,8 @@ err_cpuhp: --kvm_usage_count; return r; } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization); -void kvm_disable_virtualization(void) +static void kvm_disable_virtualization(void) { guard(mutex)(&kvm_usage_lock); @@ -5747,7 +5749,6 @@ void kvm_disable_virtualization(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization); static int kvm_init_virtualization(void) { @@ -5763,6 +5764,14 @@ static void kvm_uninit_virtualization(void) kvm_disable_virtualization(); } #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ +static int kvm_enable_virtualization(void) +{ + return 0; +} +static void kvm_disable_virtualization(void) +{ + +} static int kvm_init_virtualization(void) { return 0; -- cgit v1.2.3 From c3d1892569afad7cdd5fbe94b4698e3b87fbde9f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 20 Jan 2026 17:47:07 +0100 Subject: dt-bindings: arm: qcom,ids: Add SoC ID for CQ7790 Document the IDs used by Eliza SoC IoT variant: CQ7790S (without modem) and CQ7790M, present for example on MTP7790 IoT and evalkit boards. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20260120164706.501119-3-krzysztof.kozlowski@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/arm/qcom,ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h index 8776844e0eeb..febb69b0438f 100644 --- a/include/dt-bindings/arm/qcom,ids.h +++ b/include/dt-bindings/arm/qcom,ids.h @@ -294,6 +294,8 @@ #define QCOM_ID_QCS8275 675 #define QCOM_ID_QCS9075 676 #define QCOM_ID_QCS615 680 +#define QCOM_ID_CQ7790M 731 +#define QCOM_ID_CQ7790S 732 /* * The board type and revision information, used by Qualcomm bootloaders and -- cgit v1.2.3 From c26b8c4e291c55c7b2138d7bcb27348ca3a5ae59 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Mar 2026 16:39:33 +0000 Subject: net: fix off-by-one in udp_flow_src_port() / psp_write_headers() udp_flow_src_port() and psp_write_headers() use ip_local_port_range. ip_local_port_range is inclusive : all ports between min and max can be used. Before this patch, if ip_local_port_range was set to 40000-40001 40001 would not be used as a source port. Use reciprocal_scale() to help code readability. Not tagged for stable trees, as this change could break user expectations. 
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260302163933.1754393-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/udp.h | 3 ++- net/psp/psp_main.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index da68702ddf6e..b648003e5792 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -29,6 +29,7 @@ #include #include #include +#include /** * struct udp_skb_cb - UDP(-Lite) private variables @@ -376,7 +377,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, */ hash ^= hash << 16; - return htons((((u64) hash * (max - min)) >> 32) + min); + return htons(reciprocal_scale(hash, max - min + 1) + min); } static inline int udp_rqueue_get(struct sock *sk) diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index d4c04c923c5a..9508b6c38003 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -202,7 +202,7 @@ static void psp_write_headers(struct net *net, struct sk_buff *skb, __be32 spi, * reciprocal divide. */ hash ^= hash << 16; - uh->source = htons((((u64)hash * (max - min)) >> 32) + min); + uh->source = htons(reciprocal_scale(hash, max - min + 1) + min); } else { uh->source = udp_flow_src_port(net, skb, 0, 0, false); } -- cgit v1.2.3 From 42a101775bc515a77ac0c39de6cef42aa7abb3a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Mar 2026 18:14:26 +0000 Subject: net: add rps_tag_ptr type and helpers Add a new rps_tag_ptr type to encode a pointer and a size to a power-of-two table. Three helpers are added converting an rps_tag_ptr to: 1) A log of the size. 2) A mask : (size - 1). 3) A pointer to the array. 
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260302181432.1836150-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps-types.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 include/net/rps-types.h (limited to 'include') diff --git a/include/net/rps-types.h b/include/net/rps-types.h new file mode 100644 index 000000000000..6b90a66866c1 --- /dev/null +++ b/include/net/rps-types.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_RPS_TYPES_H +#define _NET_RPS_TYPES_H + +/* Define a rps_tag_ptr: + * Low order 5 bits are used to store the ilog2(size) of an RPS table. + */ +typedef unsigned long rps_tag_ptr; + +static inline u8 rps_tag_to_log(rps_tag_ptr tag_ptr) +{ + return tag_ptr & 31U; +} + +static inline u32 rps_tag_to_mask(rps_tag_ptr tag_ptr) +{ + return (1U << rps_tag_to_log(tag_ptr)) - 1; +} + +static inline void *rps_tag_to_table(rps_tag_ptr tag_ptr) +{ + return (void *)(tag_ptr & ~31UL); +} +#endif /* _NET_RPS_TYPES_H */ -- cgit v1.2.3 From 61753849b8bc6420cc5834fb3de331ce1134060d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Mar 2026 18:14:27 +0000 Subject: net-sysfs: remove rcu field from 'struct rps_sock_flow_table' Removing rcu_head (and @mask in a following patch) will allow a power-of-two allocation and thus high-order allocation for better performance. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260302181432.1836150-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 1 - net/core/sysctl_net_core.c | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/rps.h b/include/net/rps.h index f1794cd2e7fb..32cfa250d9f9 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -60,7 +60,6 @@ struct rps_dev_flow_table { * meaning we use 32-6=26 bits for the hash. 
*/ struct rps_sock_flow_table { - struct rcu_head rcu; u32 mask; u32 ents[] ____cacheline_aligned_in_smp; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 03aea10073f0..0b659c932cff 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -147,6 +147,7 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, }; struct rps_sock_flow_table *orig_sock_table, *sock_table; static DEFINE_MUTEX(sock_flow_mutex); + void *tofree = NULL; mutex_lock(&sock_flow_mutex); @@ -193,13 +194,14 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - kvfree_rcu(orig_sock_table, rcu); + tofree = orig_sock_table; } } } mutex_unlock(&sock_flow_mutex); + kvfree_rcu_mightsleep(tofree); return ret; } #endif /* CONFIG_RPS */ -- cgit v1.2.3 From 9cde131cdd888873363b5d9dfd8d4d4c1fae6986 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Mar 2026 18:14:28 +0000 Subject: net-sysfs: add rps_sock_flow_table_mask() helper In preparation of the following patch, abstract access to the @mask field in 'struct rps_sock_flow_table'. Also cleanup rps_sock_flow_sysctl() a bit : - Rename orig_sock_table to o_sock_table. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260302181432.1836150-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 11 ++++++++--- net/core/dev.c | 4 +++- net/core/sysctl_net_core.c | 19 ++++++++++--------- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/rps.h b/include/net/rps.h index 32cfa250d9f9..82cdffdf3e6b 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -60,18 +60,23 @@ struct rps_dev_flow_table { * meaning we use 32-6=26 bits for the hash. 
*/ struct rps_sock_flow_table { - u32 mask; + u32 _mask; u32 ents[] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) +static inline u32 rps_sock_flow_table_mask(const struct rps_sock_flow_table *table) +{ + return table->_mask; +} + #define RPS_NO_CPU 0xffff static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, u32 hash) { - unsigned int index = hash & table->mask; + unsigned int index = hash & rps_sock_flow_table_mask(table); u32 val = hash & ~net_hotdata.rps_cpu_mask; /* We only give a hint, preemption can change CPU under us */ @@ -129,7 +134,7 @@ static inline void _sock_rps_delete_flow(const struct sock *sk) rcu_read_lock(); table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (table) { - index = hash & table->mask; + index = hash & rps_sock_flow_table_mask(table); if (READ_ONCE(table->ents[index]) != RPS_NO_CPU) WRITE_ONCE(table->ents[index], RPS_NO_CPU); } diff --git a/net/core/dev.c b/net/core/dev.c index 19b84eaa2643..92f8eeac8de3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5112,12 +5112,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; + u32 flow_id; u32 ident; /* First check into global flow table if there is a match. * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). 
*/ - ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); + flow_id = hash & rps_sock_flow_table_mask(sock_flow_table); + ident = READ_ONCE(sock_flow_table->ents[flow_id]); if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0b659c932cff..cfbe798493b5 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -145,16 +145,17 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, .maxlen = sizeof(size), .mode = table->mode }; - struct rps_sock_flow_table *orig_sock_table, *sock_table; + struct rps_sock_flow_table *o_sock_table, *sock_table; static DEFINE_MUTEX(sock_flow_mutex); void *tofree = NULL; mutex_lock(&sock_flow_mutex); - orig_sock_table = rcu_dereference_protected( + o_sock_table = rcu_dereference_protected( net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); - size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; + size = o_sock_table ? 
rps_sock_flow_table_mask(o_sock_table) + 1 : 0; + orig_size = size; ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); @@ -165,6 +166,7 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -EINVAL; } + sock_table = o_sock_table; size = roundup_pow_of_two(size); if (size != orig_size) { sock_table = @@ -175,26 +177,25 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, } net_hotdata.rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; - sock_table->mask = size - 1; - } else - sock_table = orig_sock_table; + sock_table->_mask = size - 1; + } for (i = 0; i < size; i++) sock_table->ents[i] = RPS_NO_CPU; } else sock_table = NULL; - if (sock_table != orig_sock_table) { + if (sock_table != o_sock_table) { rcu_assign_pointer(net_hotdata.rps_sock_flow_table, sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); } - if (orig_sock_table) { + if (o_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - tofree = orig_sock_table; + tofree = o_sock_table; } } } -- cgit v1.2.3 From dd378109d20ff6789091fa3558607c1d242d80ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Mar 2026 18:14:29 +0000 Subject: net-sysfs: use rps_tag_ptr and remove metadata from rps_sock_flow_table Instead of storing the @mask at the beginning of rps_sock_flow_table, use 5 low order bits of the rps_tag_ptr to store the log of the size. This removes a potential cache line miss to fetch @mask. More importantly, we can switch to vmalloc_huge() without wasting memory. 
Tested with: numactl --interleave=all bash -c "echo 4194304 >/proc/sys/net/core/rps_sock_flow_entries" Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260302181432.1836150-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/scaling.rst | 13 ++++-- include/net/hotdata.h | 5 +- include/net/rps.h | 42 ++++++++--------- net/core/dev.c | 12 +++-- net/core/sysctl_net_core.c | 89 +++++++++++++++++++----------------- 5 files changed, 86 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/Documentation/networking/scaling.rst b/Documentation/networking/scaling.rst index 0023afa530ec..6c261eb48845 100644 --- a/Documentation/networking/scaling.rst +++ b/Documentation/networking/scaling.rst @@ -403,16 +403,21 @@ Both of these need to be set before RFS is enabled for a receive queue. Values for both are rounded up to the nearest power of two. The suggested flow count depends on the expected number of active connections at any given time, which may be significantly less than the number of open -connections. We have found that a value of 32768 for rps_sock_flow_entries -works fairly well on a moderately loaded server. +connections. We have found that a value of 65536 for rps_sock_flow_entries +works fairly well on a moderately loaded server. Big servers might +need 1048576 or even higher values. + +On a NUMA host it is advisable to spread rps_sock_flow_entries on all nodes. + +numactl --interleave=all bash -c "echo 1048576 >/proc/sys/net/core/rps_sock_flow_entries" For a single queue device, the rps_flow_cnt value for the single queue would normally be configured to the same value as rps_sock_flow_entries. For a multi-queue device, the rps_flow_cnt for each queue might be configured as rps_sock_flow_entries / N, where N is the number of -queues. So for instance, if rps_sock_flow_entries is set to 32768 and there +queues. 
So for instance, if rps_sock_flow_entries is set to 131072 and there are 16 configured receive queues, rps_flow_cnt for each queue might be -configured as 2048. +configured as 8192. Accelerated RFS diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 6632b1aa7584..62534d1f3c70 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -6,6 +6,9 @@ #include #include #include +#ifdef CONFIG_RPS +#include +#endif struct skb_defer_node { struct llist_head defer_list; @@ -33,7 +36,7 @@ struct net_hotdata { struct kmem_cache *skbuff_fclone_cache; struct kmem_cache *skb_small_head_cache; #ifdef CONFIG_RPS - struct rps_sock_flow_table __rcu *rps_sock_flow_table; + rps_tag_ptr rps_sock_flow_table; u32 rps_cpu_mask; #endif struct skb_defer_node __percpu *skb_defer_nodes; diff --git a/include/net/rps.h b/include/net/rps.h index 82cdffdf3e6b..dee930d9dd38 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -8,6 +8,7 @@ #include #ifdef CONFIG_RPS +#include extern struct static_key_false rps_needed; extern struct static_key_false rfs_needed; @@ -60,45 +61,38 @@ struct rps_dev_flow_table { * meaning we use 32-6=26 bits for the hash. 
*/ struct rps_sock_flow_table { - u32 _mask; - - u32 ents[] ____cacheline_aligned_in_smp; + u32 ent; }; -#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) - -static inline u32 rps_sock_flow_table_mask(const struct rps_sock_flow_table *table) -{ - return table->_mask; -} #define RPS_NO_CPU 0xffff -static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, - u32 hash) +static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash) { - unsigned int index = hash & rps_sock_flow_table_mask(table); + unsigned int index = hash & rps_tag_to_mask(tag_ptr); u32 val = hash & ~net_hotdata.rps_cpu_mask; + struct rps_sock_flow_table *table; /* We only give a hint, preemption can change CPU under us */ val |= raw_smp_processor_id(); + table = rps_tag_to_table(tag_ptr); /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in get_rps_cpu(). */ - if (READ_ONCE(table->ents[index]) != val) - WRITE_ONCE(table->ents[index], val); + if (READ_ONCE(table[index].ent) != val) + WRITE_ONCE(table[index].ent, val); } static inline void _sock_rps_record_flow_hash(__u32 hash) { - struct rps_sock_flow_table *sock_flow_table; + rps_tag_ptr tag_ptr; if (!hash) return; rcu_read_lock(); - sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); - if (sock_flow_table) - rps_record_sock_flow(sock_flow_table, hash); + tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); + if (tag_ptr) + rps_record_sock_flow(tag_ptr, hash); rcu_read_unlock(); } @@ -125,6 +119,7 @@ static inline void _sock_rps_record_flow(const struct sock *sk) static inline void _sock_rps_delete_flow(const struct sock *sk) { struct rps_sock_flow_table *table; + rps_tag_ptr tag_ptr; u32 hash, index; hash = READ_ONCE(sk->sk_rxhash); @@ -132,11 +127,12 @@ static inline void _sock_rps_delete_flow(const struct sock *sk) return; rcu_read_lock(); - table = rcu_dereference(net_hotdata.rps_sock_flow_table); - if (table) { - index = hash & 
rps_sock_flow_table_mask(table); - if (READ_ONCE(table->ents[index]) != RPS_NO_CPU) - WRITE_ONCE(table->ents[index], RPS_NO_CPU); + tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); + if (tag_ptr) { + index = hash & rps_tag_to_mask(tag_ptr); + table = rps_tag_to_table(tag_ptr); + if (READ_ONCE(table[index].ent) != RPS_NO_CPU) + WRITE_ONCE(table[index].ent, RPS_NO_CPU); } rcu_read_unlock(); } diff --git a/net/core/dev.c b/net/core/dev.c index 92f8eeac8de3..7ae87be81afc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5075,9 +5075,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { - const struct rps_sock_flow_table *sock_flow_table; struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; + rps_tag_ptr global_tag_ptr; struct rps_map *map; int cpu = -1; u32 tcpu; @@ -5108,8 +5108,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (!hash) goto done; - sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); - if (flow_table && sock_flow_table) { + global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); + if (flow_table && global_tag_ptr) { + struct rps_sock_flow_table *sock_flow_table; struct rps_dev_flow *rflow; u32 next_cpu; u32 flow_id; @@ -5118,8 +5119,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* First check into global flow table if there is a match. * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). 
*/ - flow_id = hash & rps_sock_flow_table_mask(sock_flow_table); - ident = READ_ONCE(sock_flow_table->ents[flow_id]); + flow_id = hash & rps_tag_to_mask(global_tag_ptr); + sock_flow_table = rps_tag_to_table(global_tag_ptr); + ident = READ_ONCE(sock_flow_table[flow_id].ent); if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cfbe798493b5..502705e04649 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -138,68 +138,73 @@ done: static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { + struct rps_sock_flow_table *o_sock_table, *sock_table; + static DEFINE_MUTEX(sock_flow_mutex); + rps_tag_ptr o_tag_ptr, tag_ptr; unsigned int orig_size, size; - int ret, i; struct ctl_table tmp = { .data = &size, .maxlen = sizeof(size), .mode = table->mode }; - struct rps_sock_flow_table *o_sock_table, *sock_table; - static DEFINE_MUTEX(sock_flow_mutex); void *tofree = NULL; + int ret, i; + u8 log; mutex_lock(&sock_flow_mutex); - o_sock_table = rcu_dereference_protected( - net_hotdata.rps_sock_flow_table, - lockdep_is_held(&sock_flow_mutex)); - size = o_sock_table ? rps_sock_flow_table_mask(o_sock_table) + 1 : 0; + o_tag_ptr = tag_ptr = net_hotdata.rps_sock_flow_table; + + size = o_tag_ptr ? 
rps_tag_to_mask(o_tag_ptr) + 1 : 0; + o_sock_table = rps_tag_to_table(o_tag_ptr); orig_size = size; ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); - if (write) { - if (size) { - if (size > 1<<29) { - /* Enforce limit to prevent overflow */ + if (!write) + goto unlock; + + if (size) { + if (size > 1<<29) { + /* Enforce limit to prevent overflow */ + mutex_unlock(&sock_flow_mutex); + return -EINVAL; + } + sock_table = o_sock_table; + size = roundup_pow_of_two(size); + if (size != orig_size) { + sock_table = vmalloc_huge(size * sizeof(*sock_table), + GFP_KERNEL); + if (!sock_table) { mutex_unlock(&sock_flow_mutex); - return -EINVAL; - } - sock_table = o_sock_table; - size = roundup_pow_of_two(size); - if (size != orig_size) { - sock_table = - vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); - if (!sock_table) { - mutex_unlock(&sock_flow_mutex); - return -ENOMEM; - } - net_hotdata.rps_cpu_mask = - roundup_pow_of_two(nr_cpu_ids) - 1; - sock_table->_mask = size - 1; + return -ENOMEM; } + net_hotdata.rps_cpu_mask = + roundup_pow_of_two(nr_cpu_ids) - 1; + log = ilog2(size); + tag_ptr = (rps_tag_ptr)sock_table | log; + } - for (i = 0; i < size; i++) - sock_table->ents[i] = RPS_NO_CPU; - } else - sock_table = NULL; - - if (sock_table != o_sock_table) { - rcu_assign_pointer(net_hotdata.rps_sock_flow_table, - sock_table); - if (sock_table) { - static_branch_inc(&rps_needed); - static_branch_inc(&rfs_needed); - } - if (o_sock_table) { - static_branch_dec(&rps_needed); - static_branch_dec(&rfs_needed); - tofree = o_sock_table; - } + for (i = 0; i < size; i++) + sock_table[i].ent = RPS_NO_CPU; + } else { + sock_table = NULL; + tag_ptr = 0UL; + } + if (tag_ptr != o_tag_ptr) { + smp_store_release(&net_hotdata.rps_sock_flow_table, tag_ptr); + if (sock_table) { + static_branch_inc(&rps_needed); + static_branch_inc(&rfs_needed); + } + if (o_sock_table) { + static_branch_dec(&rps_needed); + static_branch_dec(&rfs_needed); + tofree = o_sock_table; } } +unlock: 
mutex_unlock(&sock_flow_mutex); kvfree_rcu_mightsleep(tofree); -- cgit v1.2.3 From b2cc61857e3cf7e103089dd54c0548d54a6ae381 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Mar 2026 18:14:31 +0000 Subject: net-sysfs: remove rcu field from 'struct rps_dev_flow_table' Remove rps_dev_flow_table_release() in favor of kvfree_rcu_mightsleep(). In the following patch, we will remove "u8 @log" field and 'struct rps_dev_flow_table' size will be a power-of-two. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260302181432.1836150-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 1 - net/core/net-sysfs.c | 11 ++--------- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/rps.h b/include/net/rps.h index dee930d9dd38..e900480e828b 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -44,7 +44,6 @@ struct rps_dev_flow { */ struct rps_dev_flow_table { u8 log; - struct rcu_head rcu; struct rps_dev_flow flows[]; }; #define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 52fcf7fa58a8..fd6f81930bc6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1072,13 +1072,6 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return sysfs_emit(buf, "%lu\n", val); } -static void rps_dev_flow_table_release(struct rcu_head *rcu) -{ - struct rps_dev_flow_table *table = container_of(rcu, - struct rps_dev_flow_table, rcu); - vfree(table); -} - static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, const char *buf, size_t len) { @@ -1131,7 +1124,7 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, RCU_INITIALIZER(table))); if (old_table) - call_rcu(&old_table->rcu, rps_dev_flow_table_release); + kvfree_rcu_mightsleep(old_table); return len; } @@ -1168,7 +1161,7 @@ static void rx_queue_release(struct kobject
*kobj) old_table = unrcu_pointer(xchg(&queue->rps_flow_table, NULL)); if (old_table) - call_rcu(&old_table->rcu, rps_dev_flow_table_release); + kvfree_rcu_mightsleep(old_table); #endif memset(kobj, 0, sizeof(*kobj)); -- cgit v1.2.3 From a435163d3100b044d620990772a5ce1684ff02ca Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Mar 2026 18:14:32 +0000 Subject: net-sysfs: use rps_tag_ptr and remove metadata from rps_dev_flow_table Instead of storing the @log at the beginning of rps_dev_flow_table use 5 low order bits of the rps_tag_ptr to store the log of the size. This removes a potential cache line miss (for light traffic). This allows us to switch to one high-order allocation instead of vmalloc() when CONFIG_RFS_ACCEL is not set. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260302181432.1836150-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netdev_rx_queue.h | 3 +- include/net/rps.h | 10 ------- net/core/dev.c | 53 ++++++++++++++++++-------------- net/core/net-sysfs.c | 70 +++++++++++++++++++++---------------------- 4 files changed, 67 insertions(+), 69 deletions(-) (limited to 'include') diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index cfa72c485387..08f81329fc11 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -8,13 +8,14 @@ #include #include #include +#include /* This structure contains an instance of an RX queue. 
*/ struct netdev_rx_queue { struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_RPS struct rps_map __rcu *rps_map; - struct rps_dev_flow_table __rcu *rps_flow_table; + rps_tag_ptr rps_flow_table; #endif struct kobject kobj; const struct attribute_group **groups; diff --git a/include/net/rps.h b/include/net/rps.h index e900480e828b..e33c6a2fa8bb 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -39,16 +39,6 @@ struct rps_dev_flow { }; #define RPS_NO_FILTER 0xffff -/* - * The rps_dev_flow_table structure contains a table of flow mappings. - */ -struct rps_dev_flow_table { - u8 log; - struct rps_dev_flow flows[]; -}; -#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ - ((_num) * sizeof(struct rps_dev_flow))) - /* * The rps_sock_flow_table contains mappings of flows to the last CPU * on which they were processed by the application (set in recvmsg). diff --git a/net/core/dev.c b/net/core/dev.c index 7ae87be81afc..b470487788a2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4968,16 +4968,16 @@ EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); -static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) +static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr) { - return hash_32(hash, flow_table->log); + return hash_32(hash, rps_tag_to_log(tag_ptr)); } #ifdef CONFIG_RFS_ACCEL /** * rps_flow_is_active - check whether the flow is recently active. * @rflow: Specific flow to check activity. - * @flow_table: per-queue flowtable that @rflow belongs to. + * @log: ilog2(hashsize). * @cpu: CPU saved in @rflow. * * If the CPU has processed many packets since the flow's last activity @@ -4986,7 +4986,7 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) * Return: true if flow was recently active. 
*/ static bool rps_flow_is_active(struct rps_dev_flow *rflow, - struct rps_dev_flow_table *flow_table, + u8 log, unsigned int cpu) { unsigned int flow_last_active; @@ -4999,7 +4999,7 @@ static bool rps_flow_is_active(struct rps_dev_flow *rflow, flow_last_active = READ_ONCE(rflow->last_qtail); return (int)(sd_input_head - flow_last_active) < - (int)(10 << flow_table->log); + (int)(10 << log); } #endif @@ -5011,9 +5011,10 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; - struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *flow_table; struct rps_dev_flow *old_rflow; struct rps_dev_flow *tmp_rflow; + rps_tag_ptr q_tag_ptr; unsigned int tmp_cpu; u16 rxq_index; u32 flow_id; @@ -5028,16 +5029,18 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto out; rxqueue = dev->_rx + rxq_index; - flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (!flow_table) + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); + if (!q_tag_ptr) goto out; - flow_id = rfs_slot(hash, flow_table); - tmp_rflow = &flow_table->flows[flow_id]; + flow_id = rfs_slot(hash, q_tag_ptr); + flow_table = rps_tag_to_table(q_tag_ptr); + tmp_rflow = flow_table + flow_id; tmp_cpu = READ_ONCE(tmp_rflow->cpu); if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { - if (rps_flow_is_active(tmp_rflow, flow_table, + if (rps_flow_is_active(tmp_rflow, + rps_tag_to_log(q_tag_ptr), tmp_cpu)) { if (hash != READ_ONCE(tmp_rflow->hash) || next_cpu == tmp_cpu) @@ -5076,8 +5079,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { struct netdev_rx_queue *rxqueue = dev->_rx; - struct rps_dev_flow_table *flow_table; - rps_tag_ptr global_tag_ptr; + rps_tag_ptr global_tag_ptr, q_tag_ptr; struct rps_map *map; int cpu = -1; u32 tcpu; @@ -5098,9 +5100,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* Avoid computing hash if RFS/RPS is not active for this 
rxqueue */ - flow_table = rcu_dereference(rxqueue->rps_flow_table); + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); - if (!flow_table && !map) + if (!q_tag_ptr && !map) goto done; skb_reset_network_header(skb); @@ -5109,8 +5111,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto done; global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); - if (flow_table && global_tag_ptr) { + if (q_tag_ptr && global_tag_ptr) { struct rps_sock_flow_table *sock_flow_table; + struct rps_dev_flow *flow_table; struct rps_dev_flow *rflow; u32 next_cpu; u32 flow_id; @@ -5130,7 +5133,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; + flow_id = rfs_slot(hash, q_tag_ptr); + flow_table = rps_tag_to_table(q_tag_ptr); + rflow = flow_table + flow_id; tcpu = rflow->cpu; /* @@ -5190,19 +5195,23 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id) { struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; - struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *flow_table; struct rps_dev_flow *rflow; + rps_tag_ptr q_tag_ptr; bool expire = true; + u8 log; rcu_read_lock(); - flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (flow_table && flow_id < (1UL << flow_table->log)) { + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); + log = rps_tag_to_log(q_tag_ptr); + if (q_tag_ptr && flow_id < (1UL << log)) { unsigned int cpu; - rflow = &flow_table->flows[flow_id]; + flow_table = rps_tag_to_table(q_tag_ptr); + rflow = flow_table + flow_id; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && - rps_flow_is_active(rflow, flow_table, cpu)) + rps_flow_is_active(rflow, log, cpu)) expire = false; } rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 
fd6f81930bc6..2ce011fae249 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1060,14 +1060,12 @@ out: static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, char *buf) { - struct rps_dev_flow_table *flow_table; unsigned long val = 0; + rps_tag_ptr tag_ptr; - rcu_read_lock(); - flow_table = rcu_dereference(queue->rps_flow_table); - if (flow_table) - val = 1UL << flow_table->log; - rcu_read_unlock(); + tag_ptr = READ_ONCE(queue->rps_flow_table); + if (tag_ptr) + val = 1UL << rps_tag_to_log(tag_ptr); return sysfs_emit(buf, "%lu\n", val); } @@ -1075,8 +1073,10 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, const char *buf, size_t len) { + rps_tag_ptr otag, tag_ptr = 0UL; + struct rps_dev_flow *table; unsigned long mask, count; - struct rps_dev_flow_table *table, *old_table; + size_t sz; int rc; if (!capable(CAP_NET_ADMIN)) @@ -1093,38 +1093,36 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, */ while ((mask | (mask >> 1)) != mask) mask |= (mask >> 1); - /* On 64 bit arches, must check mask fits in table->mask (u32), - * and on 32bit arches, must check - * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. - */ -#if BITS_PER_LONG > 32 - if (mask > (unsigned long)(u32)mask) - return -EINVAL; -#else - if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) - / sizeof(struct rps_dev_flow)) { - /* Enforce a limit to prevent overflow */ + + /* Do not accept too large tables. 
*/ + if (mask > (INT_MAX / sizeof(*table) - 1)) return -EINVAL; - } -#endif - table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); + + sz = max_t(size_t, sizeof(*table) * (mask + 1), + PAGE_SIZE); + if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || + is_power_of_2(sizeof(*table))) + table = kvmalloc(sz, GFP_KERNEL); + else + table = vmalloc(sz); if (!table) return -ENOMEM; - - table->log = ilog2(mask) + 1; + tag_ptr = (rps_tag_ptr)table; + if (rps_tag_to_log(tag_ptr)) { + pr_err_once("store_rps_dev_flow_table_cnt() got a non page aligned allocation.\n"); + kvfree(table); + return -ENOMEM; + } + tag_ptr |= (ilog2(mask) + 1); for (count = 0; count <= mask; count++) { - table->flows[count].cpu = RPS_NO_CPU; - table->flows[count].filter = RPS_NO_FILTER; + table[count].cpu = RPS_NO_CPU; + table[count].filter = RPS_NO_FILTER; } - } else { - table = NULL; } - old_table = unrcu_pointer(xchg(&queue->rps_flow_table, - RCU_INITIALIZER(table))); - - if (old_table) - kvfree_rcu_mightsleep(old_table); + otag = xchg(&queue->rps_flow_table, tag_ptr); + if (otag) + kvfree_rcu_mightsleep(rps_tag_to_table(otag)); return len; } @@ -1150,7 +1148,7 @@ static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); #ifdef CONFIG_RPS - struct rps_dev_flow_table *old_table; + rps_tag_ptr tag_ptr; struct rps_map *map; map = rcu_dereference_protected(queue->rps_map, 1); @@ -1159,9 +1157,9 @@ static void rx_queue_release(struct kobject *kobj) kfree_rcu(map, rcu); } - old_table = unrcu_pointer(xchg(&queue->rps_flow_table, NULL)); - if (old_table) - kvfree_rcu_mightsleep(old_table); + tag_ptr = xchg(&queue->rps_flow_table, 0UL); + if (tag_ptr) + kvfree_rcu_mightsleep(rps_tag_to_table(tag_ptr)); #endif memset(kobj, 0, sizeof(*kobj)); -- cgit v1.2.3 From 39ae83b0f557969c461d93c608545443a2f5c307 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 3 Mar 2026 17:24:37 -0800 Subject: net: openvswitch: clean up some kernel-doc warnings Fix some kernel-doc 
warnings in openvswitch.h: Mark enum placeholders that are not used as "private" so that kernel-doc comments are not needed for them. Correct names for 2 enum values: Warning: include/uapi/linux/openvswitch.h:300 Excess enum value '@OVS_VPORT_UPCALL_SUCCESS' description in 'ovs_vport_upcall_attr' Warning: include/uapi/linux/openvswitch.h:300 Excess enum value '@OVS_VPORT_UPCALL_FAIL' description in 'ovs_vport_upcall_attr' Convert one comment from "/**" kernel-doc to a plain C "/*" comment: Warning: include/uapi/linux/openvswitch.h:638 This comment starts with '/**', but isn't a kernel-doc comment. * Omit attributes for notifications. Add more kernel-doc: - add kernel-doc for kernel-only enums; - add missing kernel-doc for enum ovs_datapath_attr; - add missing kernel-doc for enum ovs_flow_attr; - add missing kernel-doc for enum ovs_sample_attr; - add kernel-doc for enum ovs_check_pkt_len_attr; - add kernel-doc for enum ovs_action_attr; - add kernel-doc for enum ovs_action_push_eth; - add kernel-doc for enum ovs_vport_attr; Signed-off-by: Randy Dunlap Acked-by: Ilya Maximets Link: https://patch.msgid.link/20260304012437.469151-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/openvswitch.h | 76 +++++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 3092c2c6f1d2..aa2acdbda8f8 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -70,12 +70,15 @@ enum ovs_datapath_cmd { * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should * not be sent. + * @OVS_DP_ATTR_MASKS_CACHE_SIZE: Number of the entries in the flow table + * masks cache. * @OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set. 
* @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the * datapath. Always present in notifications. * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the * datapath. Always present in notifications. + * @OVS_DP_ATTR_USER_FEATURES: OVS_DP_F_* flags. * @OVS_DP_ATTR_IFINDEX: Interface index for a new datapath netdev. Only * valid for %OVS_DP_CMD_NEW requests. * @@ -83,18 +86,23 @@ enum ovs_datapath_cmd { * payload for %OVS_DP_* commands. */ enum ovs_datapath_attr { + /* private: */ OVS_DP_ATTR_UNSPEC, + /* public: */ OVS_DP_ATTR_NAME, /* name of dp_ifindex netdev */ OVS_DP_ATTR_UPCALL_PID, /* Netlink PID to receive upcalls */ OVS_DP_ATTR_STATS, /* struct ovs_dp_stats */ OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */ OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */ + /* private: */ OVS_DP_ATTR_PAD, + /* public: */ OVS_DP_ATTR_MASKS_CACHE_SIZE, OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls in * per-cpu dispatch mode */ OVS_DP_ATTR_IFINDEX, + /* private: */ __OVS_DP_ATTR_MAX }; @@ -181,6 +189,7 @@ enum ovs_packet_cmd { * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the * output port is actually a tunnel port. Contains the output tunnel key * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes. + * @OVS_PACKET_ATTR_PROBE: Packet operation is a feature probe. * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and * @OVS_PACKET_ATTR_LEN: Packet size before truncation. * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment @@ -196,21 +205,26 @@ enum ovs_packet_cmd { * payload for %OVS_PACKET_* commands. */ enum ovs_packet_attr { + /* private: */ OVS_PACKET_ATTR_UNSPEC, + /* public: */ OVS_PACKET_ATTR_PACKET, /* Packet data. */ OVS_PACKET_ATTR_KEY, /* Nested OVS_KEY_ATTR_* attributes. */ OVS_PACKET_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ OVS_PACKET_ATTR_USERDATA, /* OVS_ACTION_ATTR_USERSPACE arg. 
*/ OVS_PACKET_ATTR_EGRESS_TUN_KEY, /* Nested OVS_TUNNEL_KEY_ATTR_* attributes. */ + /* private: */ OVS_PACKET_ATTR_UNUSED1, OVS_PACKET_ATTR_UNUSED2, + /* public: */ OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe, error logging should be suppressed. */ OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ OVS_PACKET_ATTR_HASH, /* Packet hash. */ OVS_PACKET_ATTR_UPCALL_PID, /* u32 Netlink PID. */ + /* private: */ __OVS_PACKET_ATTR_MAX }; @@ -257,6 +271,11 @@ enum ovs_vport_type { * upcalls should not be sent. * @OVS_VPORT_ATTR_STATS: A &struct ovs_vport_stats giving statistics for * packets sent or received through the vport. + * @OVS_VPORT_ATTR_IFINDEX: Provides the ifindex of a vport, or sets the desired + * ifindex while creating a new vport with type %OVS_VPORT_TYPE_INTERNAL. + * @OVS_VPORT_ATTR_NETNSID: Provides the netns id of the vport if it's not local. + * @OVS_VPORT_ATTR_UPCALL_STATS: Provides upcall statistics for a vport. + * Contains nested %OVS_VPORT_UPCALL_ATTR_* attributes. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_VPORT_* commands. @@ -272,7 +291,9 @@ enum ovs_vport_type { * ovs_header plus %OVS_VPORT_ATTR_PORT_NO determine the vport. */ enum ovs_vport_attr { + /* private: */ OVS_VPORT_ATTR_UNSPEC, + /* public: */ OVS_VPORT_ATTR_PORT_NO, /* u32 port number within datapath */ OVS_VPORT_ATTR_TYPE, /* u32 OVS_VPORT_TYPE_* constant. 
*/ OVS_VPORT_ATTR_NAME, /* string name, up to IFNAMSIZ bytes long */ @@ -280,23 +301,27 @@ enum ovs_vport_attr { OVS_VPORT_ATTR_UPCALL_PID, /* array of u32 Netlink socket PIDs for */ /* receiving upcalls */ OVS_VPORT_ATTR_STATS, /* struct ovs_vport_stats */ + /* private: */ OVS_VPORT_ATTR_PAD, + /* public: */ OVS_VPORT_ATTR_IFINDEX, OVS_VPORT_ATTR_NETNSID, OVS_VPORT_ATTR_UPCALL_STATS, + /* private: */ __OVS_VPORT_ATTR_MAX }; #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) /** - * enum ovs_vport_upcall_attr - attributes for %OVS_VPORT_UPCALL* commands - * @OVS_VPORT_UPCALL_SUCCESS: 64-bit upcall success packets. - * @OVS_VPORT_UPCALL_FAIL: 64-bit upcall fail packets. + * enum ovs_vport_upcall_attr - attributes for %OVS_VPORT_ATTR_UPCALL_STATS + * @OVS_VPORT_UPCALL_ATTR_SUCCESS: 64-bit upcall success packets. + * @OVS_VPORT_UPCALL_ATTR_FAIL: 64-bit upcall fail packets. */ enum ovs_vport_upcall_attr { OVS_VPORT_UPCALL_ATTR_SUCCESS, OVS_VPORT_UPCALL_ATTR_FAIL, + /* private: */ __OVS_VPORT_UPCALL_ATTR_MAX }; @@ -431,6 +456,7 @@ enum ovs_frag_type { OVS_FRAG_TYPE_NONE, OVS_FRAG_TYPE_FIRST, OVS_FRAG_TYPE_LATER, + /* private: */ __OVS_FRAG_TYPE_MAX }; @@ -604,6 +630,8 @@ struct ovs_nsh_key_md1 { * a wildcarded match. Omitting attribute is treated as wildcarding all * corresponding fields. Optional for all requests. If not present, * all flow key bits are exact match bits. + * @OVS_FLOW_ATTR_PROBE: Flow operation is a feature probe, error logging + * should be suppressed. * @OVS_FLOW_ATTR_UFID: A value between 1-16 octets specifying a unique * identifier for the flow. Causes the flow to be indexed by this value rather * than the value of the %OVS_FLOW_ATTR_KEY attribute. Optional for all @@ -617,7 +645,9 @@ struct ovs_nsh_key_md1 { * payload for %OVS_FLOW_* commands. */ enum ovs_flow_attr { + /* private: */ OVS_FLOW_ATTR_UNSPEC, + /* public: */ OVS_FLOW_ATTR_KEY, /* Sequence of OVS_KEY_ATTR_* attributes. 
*/ OVS_FLOW_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ OVS_FLOW_ATTR_STATS, /* struct ovs_flow_stats. */ @@ -629,13 +659,14 @@ enum ovs_flow_attr { * logging should be suppressed. */ OVS_FLOW_ATTR_UFID, /* Variable length unique flow identifier. */ OVS_FLOW_ATTR_UFID_FLAGS,/* u32 of OVS_UFID_F_*. */ + /* private: */ OVS_FLOW_ATTR_PAD, __OVS_FLOW_ATTR_MAX }; #define OVS_FLOW_ATTR_MAX (__OVS_FLOW_ATTR_MAX - 1) -/** +/* * Omit attributes for notifications. * * If a datapath request contains an %OVS_UFID_F_OMIT_* flag, then the datapath @@ -653,17 +684,23 @@ enum ovs_flow_attr { * fractions of packets. * @OVS_SAMPLE_ATTR_ACTIONS: Set of actions to execute in sampling event. * Actions are passed as nested attributes. + * @OVS_SAMPLE_ATTR_ARG: For in-kernel use, passing &struct sample_arg + * derived from other attributes. * * Executes the specified actions with the given probability on a per-packet * basis. Nested actions will be able to access the probability value of the * parent @OVS_ACTION_ATTR_SAMPLE. */ enum ovs_sample_attr { + /* private: */ OVS_SAMPLE_ATTR_UNSPEC, + /* public: */ OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */ OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ + /* private: */ __OVS_SAMPLE_ATTR_MAX, + /* public: */ #ifdef __KERNEL__ OVS_SAMPLE_ATTR_ARG /* struct sample_arg */ #endif @@ -693,12 +730,15 @@ struct sample_arg { * @OVS_USERSPACE_ATTR_ACTIONS: If present, send actions with upcall. */ enum ovs_userspace_attr { + /* private: */ OVS_USERSPACE_ATTR_UNSPEC, + /* public: */ OVS_USERSPACE_ATTR_PID, /* u32 Netlink PID to receive upcalls. */ OVS_USERSPACE_ATTR_USERDATA, /* Optional user-specified cookie. */ OVS_USERSPACE_ATTR_EGRESS_TUN_PORT, /* Optional, u32 output port * to get tunnel info. */ OVS_USERSPACE_ATTR_ACTIONS, /* Optional flag to get actions. 
*/ + /* private: */ __OVS_USERSPACE_ATTR_MAX }; @@ -819,7 +859,9 @@ struct ovs_action_hash { * @OVS_CT_ATTR_TIMEOUT: Variable length string defining conntrack timeout. */ enum ovs_ct_attr { + /* private: */ OVS_CT_ATTR_UNSPEC, + /* public: */ OVS_CT_ATTR_COMMIT, /* No argument, commits connection. */ OVS_CT_ATTR_ZONE, /* u16 zone id. */ OVS_CT_ATTR_MARK, /* mark to associate with this connection. */ @@ -831,6 +873,7 @@ enum ovs_ct_attr { OVS_CT_ATTR_EVENTMASK, /* u32 mask of IPCT_* events. */ OVS_CT_ATTR_TIMEOUT, /* Associate timeout with this connection for * fine-grain timeout tuning. */ + /* private: */ __OVS_CT_ATTR_MAX }; @@ -859,7 +902,9 @@ enum ovs_ct_attr { * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping */ enum ovs_nat_attr { + /* private: */ OVS_NAT_ATTR_UNSPEC, + /* public: */ OVS_NAT_ATTR_SRC, OVS_NAT_ATTR_DST, OVS_NAT_ATTR_IP_MIN, @@ -869,38 +914,44 @@ enum ovs_nat_attr { OVS_NAT_ATTR_PERSISTENT, OVS_NAT_ATTR_PROTO_HASH, OVS_NAT_ATTR_PROTO_RANDOM, + /* private: */ __OVS_NAT_ATTR_MAX, }; #define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1) -/* +/** * struct ovs_action_push_eth - %OVS_ACTION_ATTR_PUSH_ETH action argument. * @addresses: Source and destination MAC addresses. - * @eth_type: Ethernet type */ struct ovs_action_push_eth { struct ovs_key_ethernet addresses; }; -/* +/** * enum ovs_check_pkt_len_attr - Attributes for %OVS_ACTION_ATTR_CHECK_PKT_LEN. * * @OVS_CHECK_PKT_LEN_ATTR_PKT_LEN: u16 Packet length to check for. * @OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER: Nested OVS_ACTION_ATTR_* * actions to apply if the packer length is greater than the specified * length in the attr - OVS_CHECK_PKT_LEN_ATTR_PKT_LEN. - * @OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL - Nested OVS_ACTION_ATTR_* + * @OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL: Nested OVS_ACTION_ATTR_* * actions to apply if the packer length is lesser or equal to the specified * length in the attr - OVS_CHECK_PKT_LEN_ATTR_PKT_LEN. 
+ * @OVS_CHECK_PKT_LEN_ATTR_ARG: For in-kernel use, passing &struct + * check_pkt_len_arg derived from other attributes. */ enum ovs_check_pkt_len_attr { + /* private: */ OVS_CHECK_PKT_LEN_ATTR_UNSPEC, + /* public: */ OVS_CHECK_PKT_LEN_ATTR_PKT_LEN, OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER, OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL, + /* private: */ __OVS_CHECK_PKT_LEN_ATTR_MAX, + /* public: */ #ifdef __KERNEL__ OVS_CHECK_PKT_LEN_ATTR_ARG /* struct check_pkt_len_arg */ #endif @@ -968,6 +1019,9 @@ enum ovs_psample_attr { * from the packet. * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in * the nested %OVS_SAMPLE_ATTR_* attributes. + * @OVS_ACTION_ATTR_RECIRC: Recirculate the clone of the packet through the + * datapath with the new id (u32 recirc_id). + * @OVS_ACTION_ATTR_HASH: Compute the packet hash, using &struct ovs_action_hash. * @OVS_ACTION_ATTR_PUSH_MPLS: Push a new MPLS label stack entry onto the * top of the packets MPLS label stack. Set the ethertype of the * encapsulating frame to either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC to @@ -997,6 +1051,8 @@ enum ovs_psample_attr { * start of the packet or at the start of the l3 header depending on the value * of l3 tunnel flag in the tun_flags field of OVS_ACTION_ATTR_ADD_MPLS * argument. + * @OVS_ACTION_ATTR_DEC_TTL: Decrement TTL or hop limit of the packet. Execute + * nested %OVS_DEC_TTL_ATTR_* actions if the value is less or equal to 1. * @OVS_ACTION_ATTR_DROP: Explicit drop action. * @OVS_ACTION_ATTR_PSAMPLE: Send a sample of the packet to external observers * via psample. @@ -1010,7 +1066,9 @@ enum ovs_psample_attr { */ enum ovs_action_attr { + /* private: */ OVS_ACTION_ATTR_UNSPEC, + /* public: */ OVS_ACTION_ATTR_OUTPUT, /* u32 port number. */ OVS_ACTION_ATTR_USERSPACE, /* Nested OVS_USERSPACE_ATTR_*. */ OVS_ACTION_ATTR_SET, /* One nested OVS_KEY_ATTR_*. */ @@ -1040,9 +1098,11 @@ enum ovs_action_attr { OVS_ACTION_ATTR_DROP, /* u32 error code. 
*/ OVS_ACTION_ATTR_PSAMPLE, /* Nested OVS_PSAMPLE_ATTR_*. */ + /* private: */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ + /* public: */ #ifdef __KERNEL__ OVS_ACTION_ATTR_SET_TO_MASKED, /* Kernel module internal masked * set action converted from -- cgit v1.2.3 From c66e0f453d1afa82534383c58d503238a43fa76c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Mar 2026 01:27:47 +0000 Subject: net: use ktime_t in struct scm_timestamping_internal Instead of using struct timespec64 in scm_timestamping_internal, use ktime_t, saving 24 bytes in kernel stack. This makes tcp_update_recv_tstamps() small enough to be inlined. The ktime_t -> timespec64 conversions happen after socket lock has been released in tcp_recvmsg(), and only if the application requested them. $ scripts/bloat-o-meter -t vmlinux.0 vmlinux add/remove: 0/2 grow/shrink: 5/4 up/down: 146/-277 (-131) Function old new delta tcp_zerocopy_receive 2383 2425 +42 mptcp_recvmsg 1565 1607 +42 tcp_recvmsg_locked 3797 3823 +26 put_cmsg_scm_timestamping64 131 149 +18 put_cmsg_scm_timestamping 131 149 +18 __pfx_tcp_update_recv_tstamps 16 - -16 do_tcp_getsockopt 4024 4006 -18 tcp_recv_timestamp 474 430 -44 tcp_zc_handle_leftover 417 371 -46 __sock_recv_timestamp 1087 1031 -56 tcp_update_recv_tstamps 97 - -97 Total: Before=25223788, After=25223657, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Link: https://patch.msgid.link/20260304012747.881644-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 +- include/net/tcp.h | 11 +++++++-- net/core/scm.c | 12 ++++++---- net/ipv4/tcp.c | 61 ++++++++++++++++++-------------------------------- net/socket.c | 23 +++++++++---------- 5 files changed, 51 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index ec715ad4bf25..ec4a0a025793 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ 
-415,7 +415,7 @@ struct __kernel_timespec; struct old_timespec32; struct scm_timestamping_internal { - struct timespec64 ts[3]; + ktime_t ts[3]; }; extern void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss); diff --git a/include/net/tcp.h b/include/net/tcp.h index 9cf8785ef0b4..fea6081cf6c7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -503,8 +503,15 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags); int tcp_set_rcvlowat(struct sock *sk, int val); int tcp_set_window_clamp(struct sock *sk, int val); -void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss); + +static inline void +tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) +{ + tss->ts[0] = skb->tstamp; + tss->ts[2] = skb_hwtstamps(skb)->hwtstamp; +} + void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss); void tcp_data_ready(struct sock *sk); diff --git a/net/core/scm.c b/net/core/scm.c index a29aa8fb8065..eec13f50ecaf 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -318,8 +318,10 @@ void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_int int i; for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { - tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; - tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + struct timespec64 tv = ktime_to_timespec64(tss_internal->ts[i]); + + tss.ts[i].tv_sec = tv.tv_sec; + tss.ts[i].tv_nsec = tv.tv_nsec; } put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss); @@ -332,8 +334,10 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter int i; for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { - tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; - tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + struct timespec64 tv = ktime_to_timespec64(tss_internal->ts[i]); + + tss.ts[i].tv_sec = tv.tv_sec; + tss.ts[i].tv_nsec = tv.tv_nsec; } put_cmsg(msg, 
SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5997e0fb7a45..1c8be22a361e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1871,20 +1871,6 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_IPV6_MOD(tcp_set_rcvlowat); -void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss) -{ - if (skb->tstamp) - tss->ts[0] = ktime_to_timespec64(skb->tstamp); - else - tss->ts[0] = (struct timespec64) {0}; - - if (skb_hwtstamps(skb)->hwtstamp) - tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); - else - tss->ts[2] = (struct timespec64) {0}; -} - #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; @@ -2376,22 +2362,23 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); u32 tsflags = READ_ONCE(sk->sk_tsflags); - bool has_timestamping = false; - if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { + if (tss->ts[0]) { if (sock_flag(sk, SOCK_RCVTSTAMP)) { + struct timespec64 tv = ktime_to_timespec64(tss->ts[0]); + if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_timespec kts = { - .tv_sec = tss->ts[0].tv_sec, - .tv_nsec = tss->ts[0].tv_nsec, + .tv_sec = tv.tv_sec, + .tv_nsec = tv.tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { struct __kernel_old_timespec ts_old = { - .tv_sec = tss->ts[0].tv_sec, - .tv_nsec = tss->ts[0].tv_nsec, + .tv_sec = tv.tv_sec, + .tv_nsec = tv.tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts_old), &ts_old); @@ -2399,41 +2386,37 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, } else { if (new_tstamp) { struct __kernel_sock_timeval stv = { - .tv_sec = tss->ts[0].tv_sec, - .tv_usec = tss->ts[0].tv_nsec / 1000, + .tv_sec = tv.tv_sec, + .tv_usec = tv.tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(stv), &stv); } else { - struct 
__kernel_old_timeval tv = { - .tv_sec = tss->ts[0].tv_sec, - .tv_usec = tss->ts[0].tv_nsec / 1000, + struct __kernel_old_timeval otv = { + .tv_sec = tv.tv_sec, + .tv_usec = tv.tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, - sizeof(tv), &tv); + sizeof(otv), &otv); } } } - if (tsflags & SOF_TIMESTAMPING_SOFTWARE && + if (!(tsflags & SOF_TIMESTAMPING_SOFTWARE && (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || - !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) - has_timestamping = true; - else - tss->ts[0] = (struct timespec64) {0}; + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))) + tss->ts[0] = 0; } - if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { - if (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && + if (tss->ts[2]) { + if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || - !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) - has_timestamping = true; - else - tss->ts[2] = (struct timespec64) {0}; + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))) + tss->ts[2] = 0; } - if (has_timestamping) { - tss->ts[1] = (struct timespec64) {0}; + if (tss->ts[0] | tss->ts[2]) { + tss->ts[1] = 0; if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, tss); else diff --git a/net/socket.c b/net/socket.c index 05952188127f..68829d09bcf1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -912,11 +912,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); - struct scm_timestamping_internal tss; - int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); - int if_index; + struct scm_timestamping_internal tss; + int if_index, false_tstamp = 0; ktime_t hwtstamp; u32 tsflags; @@ -961,12 +960,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, memset(&tss, 0, sizeof(tss)); tsflags = READ_ONCE(sk->sk_tsflags); - if ((tsflags & SOF_TIMESTAMPING_SOFTWARE && - (tsflags & 
SOF_TIMESTAMPING_RX_SOFTWARE || - skb_is_err_queue(skb) || - !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) && - ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0)) - empty = 0; + if (tsflags & SOF_TIMESTAMPING_SOFTWARE && + (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || + skb_is_err_queue(skb) || + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) + tss.ts[0] = skb->tstamp; + if (shhwtstamps && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || @@ -983,15 +982,15 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, hwtstamp = ptp_convert_timestamp(&hwtstamp, READ_ONCE(sk->sk_bind_phc)); - if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { - empty = 0; + if (hwtstamp) { + tss.ts[2] = hwtstamp; if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) put_ts_pktinfo(msg, skb, if_index); } } - if (!empty) { + if (tss.ts[0] | tss.ts[2]) { if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, &tss); else -- cgit v1.2.3 From 01b7768578a68abe597cfb36ebe0fc47c9305f88 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 25 Feb 2026 16:19:31 +0200 Subject: net/mlx5: Add TLP emulation device capabilities Introduce the hardware structures and definitions needed for the driver support of TLP emulation in mlx5_ifc. 
Signed-off-by: Maher Sanalla Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 775cb0c56865..a3948b36820d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1389,6 +1389,26 @@ struct mlx5_ifc_virtio_emulation_cap_bits { u8 reserved_at_1c0[0x640]; }; +struct mlx5_ifc_tlp_dev_emu_capabilities_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x13]; + u8 log_tlp_rsp_gw_page_stride[0x5]; + u8 reserved_at_38[0x8]; + + u8 reserved_at_40[0xc0]; + + u8 reserved_at_100[0xc]; + u8 tlp_rsp_gw_num_pages[0x4]; + u8 reserved_at_110[0x10]; + + u8 reserved_at_120[0xa0]; + + u8 tlp_rsp_gw_pages_bar_offset[0x40]; + + u8 reserved_at_200[0x600]; +}; + enum { MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_1_BYTE = 0x0, MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_2_BYTES = 0x2, @@ -1961,7 +1981,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_rqt[0x5]; u8 reserved_at_390[0x3]; u8 log_max_rqt_size[0x5]; - u8 reserved_at_398[0x1]; + u8 tlp_device_emulation_manager[0x1]; u8 vnic_env_cnt_bar_uar_access[0x1]; u8 vnic_env_cnt_odp_page_fault[0x1]; u8 log_max_tis_per_sq[0x5]; @@ -3830,6 +3850,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_tls_cap_bits tls_cap; struct mlx5_ifc_device_mem_cap_bits device_mem_cap; struct mlx5_ifc_virtio_emulation_cap_bits virtio_emulation_cap; + struct mlx5_ifc_tlp_dev_emu_capabilities_bits tlp_dev_emu_capabilities; struct mlx5_ifc_macsec_cap_bits macsec_cap; struct mlx5_ifc_crypto_cap_bits crypto_cap; struct mlx5_ifc_ipsec_cap_bits ipsec_cap; -- cgit v1.2.3 From 385a06f74ff7a03e3fb0b15fb87cfeb052d75073 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 25 Feb 2026 16:19:32 +0200 Subject: net/mlx5: Expose TLP emulation capabilities Expose and query TLP device emulation caps on driver load. 
Signed-off-by: Maher Sanalla Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 ++++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 1 + include/linux/mlx5/device.h | 9 +++++++++ 3 files changed, 16 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index eeb4437975f2..55249f405841 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -255,6 +255,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN(dev, tlp_device_emulation_manager)) { + err = mlx5_core_get_caps_mode(dev, MLX5_CAP_TLP_EMULATION, HCA_CAP_OPMOD_GET_CUR); + if (err) + return err; + } + if (MLX5_CAP_GEN(dev, ipsec_offload)) { err = mlx5_core_get_caps_mode(dev, MLX5_CAP_IPSEC, HCA_CAP_OPMOD_GET_CUR); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index fdc3ba20912e..b0bc4a7d4a93 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1772,6 +1772,7 @@ static const int types[] = { MLX5_CAP_CRYPTO, MLX5_CAP_SHAMPO, MLX5_CAP_ADV_RDMA, + MLX5_CAP_TLP_EMULATION, }; static void mlx5_hca_caps_free(struct mlx5_core_dev *dev) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b37fe39cef27..25c6b42140b2 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1259,6 +1259,7 @@ enum mlx5_cap_type { MLX5_CAP_PORT_SELECTION = 0x25, MLX5_CAP_ADV_VIRTUALIZATION = 0x26, MLX5_CAP_ADV_RDMA = 0x28, + MLX5_CAP_TLP_EMULATION = 0x2a, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1481,6 +1482,14 @@ enum mlx5_qcam_feature_groups { MLX5_GET64(virtio_emulation_cap, \ (mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION]->cur, cap) +#define MLX5_CAP_DEV_TLP_EMULATION(mdev, cap)\ + MLX5_GET(tlp_dev_emu_capabilities, \ + 
(mdev)->caps.hca[MLX5_CAP_TLP_EMULATION]->cur, cap) + +#define MLX5_CAP64_DEV_TLP_EMULATION(mdev, cap)\ + MLX5_GET64(tlp_dev_emu_capabilities, \ + (mdev)->caps.hca[MLX5_CAP_TLP_EMULATION]->cur, cap) + #define MLX5_CAP_IPSEC(mdev, cap)\ MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC]->cur, cap) -- cgit v1.2.3 From da73d7634f61a1d5dbedc237f392c04ae487ca46 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 26 Feb 2026 15:52:15 +0200 Subject: RDMA/nldev: Add command to set pinned FRMR handles Allow users to set through netlink, for a specific FRMR pool, the amount of handles that are not aged, and fill the pool to this amount. This allows users to warm-up the FRMR pools to an expected amount of handles with specific attributes that fits their expected usage. Signed-off-by: Michael Guralnik Reviewed-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260226-frmr_pools-v4-10-95360b54f15e@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 88 +++++++++++++++++++++++++++++++++++----- include/uapi/rdma/rdma_netlink.h | 1 + 2 files changed, 78 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 8d004b7568b7..0c2076d2f48c 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -185,6 +185,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -2692,6 +2693,9 @@ static int fill_frmr_pool_entry(struct sk_buff *msg, struct ib_frmr_pool *pool) if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, pool->in_use, RDMA_NLDEV_ATTR_PAD)) goto 
err_unlock; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES, + pool->pinned_handles)) + goto err_unlock; spin_unlock(&pool->lock); return 0; @@ -2701,6 +2705,54 @@ err_unlock: return -EMSGSIZE; } +static void nldev_frmr_pools_parse_key(struct nlattr *tb[], + struct ib_frmr_key *key, + struct netlink_ext_ack *extack) +{ + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS]) + key->ats = nla_get_u8(tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS]); + + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS]) + key->access_flags = nla_get_u32( + tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ACCESS_FLAGS]); + + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_VENDOR_KEY]) + key->vendor_key = nla_get_u64( + tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_VENDOR_KEY]); + + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS]) + key->num_dma_blocks = nla_get_u64( + tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS]); +} + +static int nldev_frmr_pools_set_pinned(struct ib_device *device, + struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + struct nlattr *key_tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_frmr_key key = { 0 }; + u32 pinned_handles = 0; + int err = 0; + + pinned_handles = + nla_get_u32(tb[RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES]); + + if (!tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY]) + return -EINVAL; + + err = nla_parse_nested(key_tb, RDMA_NLDEV_ATTR_MAX - 1, + tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY], nldev_policy, + extack); + if (err) + return err; + + nldev_frmr_pools_parse_key(key_tb, &key, extack); + + err = ib_frmr_pools_set_pinned(device, &key, pinned_handles); + + return err; +} + static int nldev_frmr_pools_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { @@ -2803,32 +2855,46 @@ err: static int nldev_frmr_pools_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; + struct nlattr **tb; u32 aging_period; int err; + tb = kzalloc_objs(*tb, RDMA_NLDEV_ATTR_MAX, GFP_KERNEL); + if (!tb) + return -ENOMEM; + 
err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err) - return err; - - if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX]) - return -EINVAL; + goto free_tb; - if (!tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]) - return -EINVAL; + if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { + err = -EINVAL; + goto free_tb; + } device = ib_device_get_by_index( sock_net(skb->sk), nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX])); - if (!device) - return -EINVAL; + if (!device) { + err = -EINVAL; + goto free_tb; + } - aging_period = nla_get_u32(tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]); + if (tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]) { + aging_period = nla_get_u32( + tb[RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD]); + err = ib_frmr_pools_set_aging_period(device, aging_period); + goto done; + } - err = ib_frmr_pools_set_aging_period(device, aging_period); + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES]) + err = nldev_frmr_pools_set_pinned(device, tb, extack); +done: ib_device_put(device); +free_tb: + kfree(tb); return err; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index f9c295caf2b1..39178df104f0 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -601,6 +601,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_FRMR_POOL_MAX_IN_USE, /* u64 */ RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, /* u64 */ RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD, /* u32 */ + RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES, /* u32 */ /* * Always the end -- cgit v1.2.3 From dbd0472fd7a5bdd0b86c21c36f8afa713baa7653 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 26 Feb 2026 15:52:16 +0200 Subject: RDMA/nldev: Expose kernel-internal FRMR pools in netlink Allow netlink users, through the usage of driver-details netlink attribute, to get information about internal FRMR pools that use the kernel_vendor_key FRMR key member. 
Signed-off-by: Michael Guralnik Reviewed-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260226-frmr_pools-v4-11-95360b54f15e@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 28 +++++++++++++++++++++++----- include/uapi/rdma/rdma_netlink.h | 1 + 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 0c2076d2f48c..cb18699633e8 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -186,6 +186,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FRMR_POOL_KEY_KERNEL_VENDOR_KEY] = { .type = NLA_U64 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -2671,6 +2672,12 @@ static int fill_frmr_pool_key(struct sk_buff *msg, struct ib_frmr_key *key) key->num_dma_blocks, RDMA_NLDEV_ATTR_PAD)) goto err; + if (key->kernel_vendor_key && + nla_put_u64_64bit(msg, + RDMA_NLDEV_ATTR_FRMR_POOL_KEY_KERNEL_VENDOR_KEY, + key->kernel_vendor_key, RDMA_NLDEV_ATTR_PAD)) + goto err; + nla_nest_end(msg, key_attr); return 0; @@ -2705,9 +2712,9 @@ err_unlock: return -EMSGSIZE; } -static void nldev_frmr_pools_parse_key(struct nlattr *tb[], - struct ib_frmr_key *key, - struct netlink_ext_ack *extack) +static int nldev_frmr_pools_parse_key(struct nlattr *tb[], + struct ib_frmr_key *key, + struct netlink_ext_ack *extack) { if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS]) key->ats = nla_get_u8(tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_ATS]); @@ -2723,6 +2730,11 @@ static void nldev_frmr_pools_parse_key(struct nlattr *tb[], if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS]) key->num_dma_blocks = nla_get_u64( 
tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_NUM_DMA_BLOCKS]); + + if (tb[RDMA_NLDEV_ATTR_FRMR_POOL_KEY_KERNEL_VENDOR_KEY]) + return -EINVAL; + + return 0; } static int nldev_frmr_pools_set_pinned(struct ib_device *device, @@ -2746,7 +2758,9 @@ static int nldev_frmr_pools_set_pinned(struct ib_device *device, if (err) return err; - nldev_frmr_pools_parse_key(key_tb, &key, extack); + err = nldev_frmr_pools_parse_key(key_tb, &key, extack); + if (err) + return err; err = ib_frmr_pools_set_pinned(device, &key, pinned_handles); @@ -2762,6 +2776,7 @@ static int nldev_frmr_pools_get_dumpit(struct sk_buff *skb, struct ib_frmr_pool *pool; struct nlattr *table_attr; struct nlattr *entry_attr; + bool show_details = false; struct ib_device *device; int start = cb->args[0]; struct rb_node *node; @@ -2778,6 +2793,9 @@ static int nldev_frmr_pools_get_dumpit(struct sk_buff *skb, if (!device) return -EINVAL; + if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) + show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); + pools = device->frmr_pools; if (!pools) { ib_device_put(device); @@ -2803,7 +2821,7 @@ static int nldev_frmr_pools_get_dumpit(struct sk_buff *skb, read_lock(&pools->rb_lock); for (node = rb_first(&pools->rb_root); node; node = rb_next(node)) { pool = rb_entry(node, struct ib_frmr_pool, node); - if (pool->key.kernel_vendor_key) + if (pool->key.kernel_vendor_key && !show_details) continue; if (idx < start) { diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 39178df104f0..aac9782ddc09 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -602,6 +602,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_FRMR_POOL_IN_USE, /* u64 */ RDMA_NLDEV_ATTR_FRMR_POOLS_AGING_PERIOD, /* u32 */ RDMA_NLDEV_ATTR_FRMR_POOL_PINNED_HANDLES, /* u32 */ + RDMA_NLDEV_ATTR_FRMR_POOL_KEY_KERNEL_VENDOR_KEY, /* u64 */ /* * Always the end -- cgit v1.2.3 From 9d2994f97ddf324ec1cb48333f62d3fbde6602da Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 
26 Feb 2026 15:44:12 +0200 Subject: RDMA/core: Delete not-implemented get_vector_affinity No drivers implement .get_vector_affinity(), and no callers invoke ib_get_vector_affinity(), so remove it. Link: https://patch.msgid.link/20260226-get_vector_affinity-v1-1-910a899c4e5d@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 1 - include/rdma/ib_verbs.h | 23 ----------------------- 2 files changed, 24 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index c7b227e2e657..8b1ec1f9c5e4 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2749,7 +2749,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, get_netdev); SET_DEVICE_OP(dev_ops, get_numa_node); SET_DEVICE_OP(dev_ops, get_port_immutable); - SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_guid); SET_DEVICE_OP(dev_ops, get_vf_stats); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ba34b131e9be..6142f7e39700 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2426,8 +2426,6 @@ struct ib_device_ops { int (*modify_device)(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); void (*get_dev_fw_str)(struct ib_device *device, char *str); - const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, - int comp_vector); int (*query_port)(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); int (*query_port_speed)(struct ib_device *device, u32 port_num, @@ -4834,27 +4832,6 @@ static inline __be16 ib_lid_be16(u32 lid) return cpu_to_be16((u16)lid); } -/** - * ib_get_vector_affinity - Get the affinity mappings of a given completion - * vector - * @device: the rdma device - * @comp_vector: index of completion vector - * - * Returns NULL on failure, 
otherwise a corresponding cpu map of the - * completion vector (returns all-cpus map if the device driver doesn't - * implement get_vector_affinity). - */ -static inline const struct cpumask * -ib_get_vector_affinity(struct ib_device *device, int comp_vector) -{ - if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || - !device->ops.get_vector_affinity) - return NULL; - - return device->ops.get_vector_affinity(device, comp_vector); - -} - /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. -- cgit v1.2.3 From 75b864f08773a6a69f8c467dc2516e5e06414fa7 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 25 Feb 2026 16:19:35 +0200 Subject: RDMA/mlx5: Add support for TLP VAR allocation Extend the VAR allocation UAPI to accept an optional flags attribute, allowing userspace to request TLP VAR allocation via the MLX5_IB_UAPI_VAR_ALLOC_FLAG_TLP flag. When the TLP flag "MLX5_IB_UAPI_VAR_ALLOC_FLAG_TLP" is specified, the driver selects the TLP VAR region for allocation instead of the regular VirtIO VAR region. 
Signed-off-by: Maher Sanalla Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 40 ++++++++++++++++++++++++++----- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + include/uapi/rdma/mlx5_user_ioctl_verbs.h | 4 ++++ 3 files changed, 39 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 64708a97f26c..ff2c02c85625 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4151,7 +4151,7 @@ static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c, } static struct mlx5_user_mmap_entry * -alloc_var_entry(struct mlx5_ib_ucontext *c) +alloc_var_entry(struct mlx5_ib_ucontext *c, u32 flags) { struct mlx5_user_mmap_entry *entry; struct mlx5_var_region *var_region; @@ -4160,7 +4160,11 @@ alloc_var_entry(struct mlx5_ib_ucontext *c) int err; var_table = &to_mdev(c->ibucontext.device)->var_table; - var_region = &var_table->var_region; + if (flags & MLX5_IB_UAPI_VAR_ALLOC_FLAG_TLP) + var_region = &var_table->tlp_var_region; + else + var_region = &var_table->var_region; + entry = kzalloc_obj(*entry); if (!entry) return ERR_PTR(-ENOMEM); @@ -4180,7 +4184,9 @@ alloc_var_entry(struct mlx5_ib_ucontext *c) entry->address = var_region->hw_start_addr + (page_idx * var_region->stride_size); entry->page_idx = page_idx; - entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR; + entry->mmap_flag = flags & MLX5_IB_UAPI_VAR_ALLOC_FLAG_TLP ? 
+ MLX5_IB_MMAP_TYPE_TLP_VAR : + MLX5_IB_MMAP_TYPE_VAR; err = mlx5_rdma_user_mmap_entry_insert(c, entry, var_region->stride_size); @@ -4203,9 +4209,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)( { struct ib_uobject *uobj = uverbs_attr_get_uobject( attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE); - struct mlx5_ib_ucontext *c; struct mlx5_user_mmap_entry *entry; + struct mlx5_ib_ucontext *c; u64 mmap_offset; + u32 flags = 0; u32 length; int err; @@ -4213,7 +4220,24 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)( if (IS_ERR(c)) return PTR_ERR(c); - entry = alloc_var_entry(c); + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_VAR_OBJ_ALLOC_FLAGS, + MLX5_IB_UAPI_VAR_ALLOC_FLAG_TLP); + if (err) + return err; + + if (flags & MLX5_IB_UAPI_VAR_ALLOC_FLAG_TLP) { + if (!MLX5_CAP_GEN(to_mdev(c->ibucontext.device)->mdev, + tlp_device_emulation_manager)) + return -EOPNOTSUPP; + } else { + if (!(MLX5_CAP_GEN_64(to_mdev(c->ibucontext.device)->mdev, + general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q)) + return -EOPNOTSUPP; + } + + entry = alloc_var_entry(c, flags); if (IS_ERR(entry)) return PTR_ERR(entry); @@ -4243,6 +4267,9 @@ DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_OBJECT_VAR, UVERBS_ACCESS_NEW, UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_VAR_OBJ_ALLOC_FLAGS, + enum mlx5_ib_uapi_var_alloc_flags, + UA_OPTIONAL), UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID, UVERBS_ATTR_TYPE(u32), UA_MANDATORY), @@ -4270,7 +4297,8 @@ static bool var_is_supported(struct ib_device *device) struct mlx5_ib_dev *dev = to_mdev(device); return (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) & - MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q); + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) || + MLX5_CAP_GEN(dev->mdev, tlp_device_emulation_manager); } static struct mlx5_user_mmap_entry * diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 18f9fe070213..01a2a050e468 100644 --- 
a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -139,6 +139,7 @@ enum mlx5_ib_var_alloc_attrs { MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID, + MLX5_IB_ATTR_VAR_OBJ_ALLOC_FLAGS, }; enum mlx5_ib_var_obj_destroy_attrs { diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 8f86e79d78a5..ef295b38a1cf 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -100,6 +100,10 @@ enum mlx5_ib_uapi_query_port_flags { MLX5_IB_UAPI_QUERY_PORT_ESW_OWNER_VHCA_ID = 1 << 5, }; +enum mlx5_ib_uapi_var_alloc_flags { + MLX5_IB_UAPI_VAR_ALLOC_FLAG_TLP = 1 << 0, +}; + struct mlx5_ib_uapi_reg { __u32 value; __u32 mask; -- cgit v1.2.3 From 90503f9ffee927c3abdc94a4862d13ae71ea9442 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:39 -0800 Subject: powercap: intel_rapl: Use unit conversion macros from units.h Replace hardcoded numeric constants with standard unit conversion macros from linux/units.h for better code clarity and self-documentation. Add MICROJOULE_PER_JOULE and NANOJOULE_PER_JOULE to units.h to support energy unit conversions, following the existing pattern for power units. No functional changes. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-8-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 19 ++++++++++--------- include/linux/units.h | 3 +++ 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 8dde3b0eb454..380893baf987 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -964,13 +965,13 @@ static int rapl_check_unit_core(struct rapl_domain *rd) } value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; - rd->energy_unit = (ENERGY_UNIT_SCALE * 1000000) >> value; + rd->energy_unit = (ENERGY_UNIT_SCALE * MICROJOULE_PER_JOULE) >> value; value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; - rd->power_unit = 1000000 >> value; + rd->power_unit = MICROWATT_PER_WATT >> value; value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; - rd->time_unit = 1000000 >> value; + rd->time_unit = USEC_PER_SEC >> value; pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -995,10 +996,10 @@ static int rapl_check_unit_atom(struct rapl_domain *rd) rd->energy_unit = ENERGY_UNIT_SCALE * (1ULL << value); value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; - rd->power_unit = (1ULL << value) * 1000; + rd->power_unit = (1ULL << value) * MILLIWATT_PER_WATT; value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; - rd->time_unit = 1000000 >> value; + rd->time_unit = USEC_PER_SEC >> value; pr_debug("Atom %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -1169,13 +1170,13 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) } value = (ra.value & TPMI_ENERGY_UNIT_MASK) >> TPMI_ENERGY_UNIT_OFFSET; - rd->energy_unit = (ENERGY_UNIT_SCALE * 1000000) >> value; + rd->energy_unit = (ENERGY_UNIT_SCALE * MICROJOULE_PER_JOULE) >> value; 
value = (ra.value & TPMI_POWER_UNIT_MASK) >> TPMI_POWER_UNIT_OFFSET; - rd->power_unit = 1000000 >> value; + rd->power_unit = MICROWATT_PER_WATT >> value; value = (ra.value & TPMI_TIME_UNIT_MASK) >> TPMI_TIME_UNIT_OFFSET; - rd->time_unit = 1000000 >> value; + rd->time_unit = USEC_PER_SEC >> value; pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -1208,7 +1209,7 @@ static const struct rapl_defaults rapl_defaults_spr_server = { .check_unit = rapl_check_unit_core, .set_floor_freq = set_floor_freq_default, .compute_time_window = rapl_compute_time_window_core, - .psys_domain_energy_unit = 1000000000, + .psys_domain_energy_unit = NANOJOULE_PER_JOULE, .spr_psys_bits = true, }; diff --git a/include/linux/units.h b/include/linux/units.h index 80d57c50b9e3..c6d78988613a 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -57,6 +57,9 @@ #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL +#define MICROJOULE_PER_JOULE 1000000UL +#define NANOJOULE_PER_JOULE 1000000000UL + #define BYTES_PER_KBIT (KILO / BITS_PER_BYTE) #define BYTES_PER_MBIT (MEGA / BITS_PER_BYTE) #define BYTES_PER_GBIT (GIGA / BITS_PER_BYTE) -- cgit v1.2.3 From d7ca7d1488cc916dbf0a6a594abbda81d4eaeee9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:40 -0800 Subject: powercap: intel_rapl: Allow interface drivers to configure rapl_defaults RAPL default settings vary across different RAPL interfaces (MSR, TPMI, MMIO). Currently, these defaults are stored in the common RAPL driver, which requires interface-specific handling logic and makes the common layer unnecessarily complex. There is no strong reason for the common code to own these defaults, since they are inherently interface-specific. To prepare for moving default configuration into the individual interface drivers, 1. 
Move struct rapl_defaults into a shared header so that interface drivers can directly populate their own default settings. 2. Change the @defaults field in struct rapl_if_priv from void * to const struct rapl_defaults * to improve type safety and readability and update the common driver to use the typed defaults structure. 3. Update all internal getter functions and local pointers to use const struct rapl_defaults * to maintain const-correctness. 4. Rename and export the common helper functions (check_unit, set_floor_freq, compute_time_window) so interface drivers may reuse or override them as appropriate. No functional changes. This is a preparatory refactoring to allow interface drivers to supply their own RAPL default settings. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-9-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 64 ++++++++++++++++-------------------- include/linux/intel_rapl.h | 17 ++++++++-- 2 files changed, 43 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 380893baf987..7c95eb658c16 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -221,20 +221,10 @@ static int get_pl_prim(struct rapl_domain *rd, int pl, enum pl_prims prim) #define power_zone_to_rapl_domain(_zone) \ container_of(_zone, struct rapl_domain, power_zone) -struct rapl_defaults { - u8 floor_freq_reg_addr; - int (*check_unit)(struct rapl_domain *rd); - void (*set_floor_freq)(struct rapl_domain *rd, bool mode); - u64 (*compute_time_window)(struct rapl_domain *rd, u64 val, - bool to_raw); - unsigned int dram_domain_energy_unit; - unsigned int psys_domain_energy_unit; - bool spr_psys_bits; -}; -static struct rapl_defaults *defaults_msr; +static 
const struct rapl_defaults *defaults_msr; static const struct rapl_defaults defaults_tpmi; -static struct rapl_defaults *get_defaults(struct rapl_package *rp) +static const struct rapl_defaults *get_defaults(struct rapl_package *rp) { return rp->priv->defaults; } @@ -351,7 +341,7 @@ static int find_nr_power_limit(struct rapl_domain *rd) static int set_domain_enable(struct powercap_zone *power_zone, bool mode) { struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u64 val; int ret; @@ -640,7 +630,7 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, u64 value, int to_raw) { u64 units = 1; - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u64 scale = 1; switch (type) { @@ -785,11 +775,11 @@ static int rapl_config(struct rapl_package *rp) /* MMIO I/F shares the same register layout as MSR registers */ case RAPL_IF_MMIO: case RAPL_IF_MSR: - rp->priv->defaults = (void *)defaults_msr; + rp->priv->defaults = defaults_msr; rp->priv->rpi = (void *)rpi_msr; break; case RAPL_IF_TPMI: - rp->priv->defaults = (void *)&defaults_tpmi; + rp->priv->defaults = &defaults_tpmi; rp->priv->rpi = (void *)rpi_tpmi; break; default: @@ -806,7 +796,7 @@ static int rapl_config(struct rapl_package *rp) static enum rapl_primitives prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) { - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); if (!defaults->spr_psys_bits) return prim; @@ -951,7 +941,7 @@ static int rapl_write_pl_data(struct rapl_domain *rd, int pl, * power unit : microWatts : Represented in milliWatts by default * time unit : microseconds: Represented in seconds by default */ -static int rapl_check_unit_core(struct rapl_domain *rd) +int rapl_default_check_unit(struct rapl_domain *rd) { 
struct reg_action ra; u32 value; @@ -978,6 +968,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd) return 0; } +EXPORT_SYMBOL_NS_GPL(rapl_default_check_unit, "INTEL_RAPL"); static int rapl_check_unit_atom(struct rapl_domain *rd) { @@ -1071,7 +1062,7 @@ static void package_power_limit_irq_restore(struct rapl_package *rp) wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); } -static void set_floor_freq_default(struct rapl_domain *rd, bool mode) +void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode) { int i; @@ -1085,11 +1076,12 @@ static void set_floor_freq_default(struct rapl_domain *rd, bool mode) rapl_write_pl_data(rd, i, PL_CLAMP, mode); } } +EXPORT_SYMBOL_NS_GPL(rapl_default_set_floor_freq, "INTEL_RAPL"); static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) { static u32 power_ctrl_orig_val; - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u32 mdata; if (!defaults->floor_freq_reg_addr) { @@ -1110,8 +1102,7 @@ static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) defaults->floor_freq_reg_addr, mdata); } -static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, - bool to_raw) +u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw) { u64 f, y; /* fraction and exp. 
used for time unit */ @@ -1142,6 +1133,7 @@ static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, } return value; } +EXPORT_SYMBOL_NS_GPL(rapl_default_compute_time_window, "INTEL_RAPL"); static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value, bool to_raw) @@ -1187,28 +1179,28 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) static const struct rapl_defaults defaults_tpmi = { .check_unit = rapl_check_unit_tpmi, /* Reuse existing logic, ignore the PL_CLAMP failures and enable all Power Limits */ - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, }; static const struct rapl_defaults rapl_defaults_core = { .floor_freq_reg_addr = 0, - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, }; static const struct rapl_defaults rapl_defaults_hsw_server = { - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, .dram_domain_energy_unit = 15300, }; static const struct rapl_defaults rapl_defaults_spr_server = { - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, .psys_domain_energy_unit = NANOJOULE_PER_JOULE, .spr_psys_bits = true, }; @@ -1242,7 +1234,7 @@ static const struct 
rapl_defaults rapl_defaults_cht = { }; static const struct rapl_defaults rapl_defaults_amd = { - .check_unit = rapl_check_unit_core, + .check_unit = rapl_default_check_unit, }; static const struct x86_cpu_id rapl_ids[] __initconst = { @@ -1448,7 +1440,7 @@ static int rapl_check_domain(int domain, struct rapl_package *rp) */ static int rapl_get_domain_unit(struct rapl_domain *rd) { - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); int ret; if (!rd->regs[RAPL_DOMAIN_REG_UNIT].val) { @@ -2341,7 +2333,7 @@ static int __init rapl_init(void) id = x86_match_cpu(rapl_ids); if (id) { - defaults_msr = (struct rapl_defaults *)id->driver_data; + defaults_msr = (const struct rapl_defaults *)id->driver_data; rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0); if (!rapl_msr_platdev) diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index fa1f328d6712..6d694099a3ad 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -128,6 +128,16 @@ struct reg_action { int err; }; +struct rapl_defaults { + u8 floor_freq_reg_addr; + int (*check_unit)(struct rapl_domain *rd); + void (*set_floor_freq)(struct rapl_domain *rd, bool mode); + u64 (*compute_time_window)(struct rapl_domain *rd, u64 val, bool to_raw); + unsigned int dram_domain_energy_unit; + unsigned int psys_domain_energy_unit; + bool spr_psys_bits; +}; + /** * struct rapl_if_priv: private data for different RAPL interfaces * @control_type: Each RAPL interface must have its own powercap @@ -142,7 +152,7 @@ struct reg_action { * registers. * @write_raw: Callback for writing RAPL interface specific * registers. 
- * @defaults: internal pointer to interface default settings + * @defaults: pointer to default settings * @rpi: internal pointer to interface primitive info */ struct rapl_if_priv { @@ -154,7 +164,7 @@ struct rapl_if_priv { int limits[RAPL_DOMAIN_MAX]; int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); - void *defaults; + const struct rapl_defaults *defaults; void *rpi; }; @@ -211,6 +221,9 @@ void rapl_remove_package_cpuslocked(struct rapl_package *rp); struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu); struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); +int rapl_default_check_unit(struct rapl_domain *rd); +void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode); +u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw); #ifdef CONFIG_PERF_EVENTS int rapl_package_add_pmu(struct rapl_package *rp); -- cgit v1.2.3 From cc39325f927850473d3a84b029ae6f9b508e9bd1 Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Mon, 2 Mar 2026 15:01:45 -0800 Subject: net: ethtool: Track pause storm events With TX pause enabled, if a device is unable to pass packets up to the stack (e.g., the CPU is hung), the device can cause a pause storm. Given that devices can have native support to protect the neighbor from such flooding, such events need some tracking. This support is to track TX pause storm events for better observability. 
Reviewed-by: Oleksij Rempel Signed-off-by: Jakub Kicinski Signed-off-by: Mohsin Bashir Link: https://patch.msgid.link/20260302230149.1580195-2-mohsin.bashr@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 13 +++++++++++++ include/linux/ethtool.h | 2 ++ include/uapi/linux/ethtool_netlink_generated.h | 1 + net/ethtool/pause.c | 4 +++- 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 0a2d2343f79a..4707063af3b4 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -879,6 +879,19 @@ attribute-sets: - name: rx-frames type: u64 + - + name: tx-pause-storm-events + type: u64 + doc: >- + TX pause storm event count. Increments each time device + detects that its pause assertion condition has been true + for too long for normal operation. As a result, the device + has temporarily disabled its own Pause TX function to + protect the network from itself. + This counter should never increment under normal overload + conditions; it indicates catastrophic failure like an OS + crash. The rate of incrementing is implementation specific. + - name: pause attr-cnt-name: __ethtool-a-pause-cnt diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 798abec67a1b..83c375840835 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -512,12 +512,14 @@ struct ethtool_eth_ctrl_stats { * * Equivalent to `30.3.4.3 aPAUSEMACCtrlFramesReceived` * from the standard. + * @tx_pause_storm_events: TX pause storm event count (see ethtool.yaml). 
*/ struct ethtool_pause_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 tx_pause_frames; u64 rx_pause_frames; + u64 tx_pause_storm_events; ); }; diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 556a0c834df5..114b83017297 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -381,6 +381,7 @@ enum { ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, + ETHTOOL_A_PAUSE_STAT_TX_PAUSE_STORM_EVENTS, __ETHTOOL_A_PAUSE_STAT_CNT, ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 0f9af1e66548..5d28f642764c 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -130,7 +130,9 @@ static int pause_put_stats(struct sk_buff *skb, if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || ethtool_put_stat(skb, pause_stats->rx_pause_frames, - ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) + ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad) || + ethtool_put_stat(skb, pause_stats->tx_pause_storm_events, + ETHTOOL_A_PAUSE_STAT_TX_PAUSE_STORM_EVENTS, pad)) goto err_cancel; nla_nest_end(skb, nest); -- cgit v1.2.3 From 817de93c348a3086ecca6e03ff459138832157cc Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Mon, 2 Mar 2026 15:01:46 -0800 Subject: net: ethtool: Update doc for tunable ETHTOOL_PFC_PREVENTION_TOUT enables the configuration of timeout value for PFC storm prevention. This can also be used to configure storm detection timeout for global pause settings. In fact some existing drivers are already using it for the said purpose. Highlight that the knob can formally be used to configure timeout value for pause storm prevention mechanism. The update to the ethtool man page will follow afterwards. 
Link: https://lore.kernel.org/aa5f189a-ac62-4633-97b5-ebf939e9c535@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Mohsin Bashir Link: https://patch.msgid.link/20260302230149.1580195-3-mohsin.bashr@gmail.com Signed-off-by: Paolo Abeni --- include/uapi/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index b74b80508553..1cdfb8341df2 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -225,7 +225,7 @@ enum tunable_id { ETHTOOL_ID_UNSPEC, ETHTOOL_RX_COPYBREAK, ETHTOOL_TX_COPYBREAK, - ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ + ETHTOOL_PFC_PREVENTION_TOUT, /* both pause and pfc, see man ethtool */ ETHTOOL_TX_COPYBREAK_BUF_SIZE, /* * Add your fresh new tunable attribute above and remember to update -- cgit v1.2.3 From 31a6a07eefeb4c84bd6730fbe9e95fd9221712cf Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 13 Feb 2026 09:28:46 +0800 Subject: integrity: Make arch_ima_get_secureboot integrity-wide EVM and other LSMs need the ability to query the secure boot status of the system, without directly calling the IMA arch_ima_get_secureboot function. Refactor the secure boot status check into a general function named arch_get_secureboot. 
Reported-and-suggested-by: Mimi Zohar Suggested-by: Roberto Sassu Signed-off-by: Coiby Xu Acked-by: Ard Biesheuvel Signed-off-by: Mimi Zohar --- MAINTAINERS | 1 + arch/powerpc/kernel/ima_arch.c | 5 --- arch/powerpc/kernel/secure_boot.c | 6 +++ arch/s390/kernel/ima_arch.c | 6 --- arch/s390/kernel/ipl.c | 5 +++ arch/x86/include/asm/efi.h | 4 +- arch/x86/platform/efi/efi.c | 2 +- include/linux/ima.h | 7 +--- include/linux/secure_boot.h | 19 +++++++++ security/integrity/Makefile | 3 +- security/integrity/efi_secureboot.c | 56 +++++++++++++++++++++++++++ security/integrity/ima/ima_appraise.c | 2 +- security/integrity/ima/ima_efi.c | 47 +--------------------- security/integrity/ima/ima_main.c | 3 +- security/integrity/integrity.h | 1 + security/integrity/platform_certs/load_uefi.c | 2 +- security/integrity/secure_boot.c | 16 ++++++++ 17 files changed, 115 insertions(+), 70 deletions(-) create mode 100644 include/linux/secure_boot.h create mode 100644 security/integrity/efi_secureboot.c create mode 100644 security/integrity/secure_boot.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 61bf550fd37c..04823afa8b74 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12668,6 +12668,7 @@ R: Eric Snowberg L: linux-integrity@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity.git +F: include/linux/secure_boot.h F: security/integrity/ F: security/integrity/ima/ diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c index b7029beed847..0d8892a03526 100644 --- a/arch/powerpc/kernel/ima_arch.c +++ b/arch/powerpc/kernel/ima_arch.c @@ -7,11 +7,6 @@ #include #include -bool arch_ima_get_secureboot(void) -{ - return is_ppc_secureboot_enabled(); -} - /* * The "secure_rules" are enabled only on "secureboot" enabled systems. * These rules verify the file signatures against known good values. 
diff --git a/arch/powerpc/kernel/secure_boot.c b/arch/powerpc/kernel/secure_boot.c index 3a28795b4ed8..28436c1599e0 100644 --- a/arch/powerpc/kernel/secure_boot.c +++ b/arch/powerpc/kernel/secure_boot.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include @@ -44,6 +45,11 @@ out: return enabled; } +bool arch_get_secureboot(void) +{ + return is_ppc_secureboot_enabled(); +} + bool is_ppc_trustedboot_enabled(void) { struct device_node *node; diff --git a/arch/s390/kernel/ima_arch.c b/arch/s390/kernel/ima_arch.c index f3c3e6e1c5d3..6ccbe34ce408 100644 --- a/arch/s390/kernel/ima_arch.c +++ b/arch/s390/kernel/ima_arch.c @@ -1,12 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include - -bool arch_ima_get_secureboot(void) -{ - return ipl_secure_flag; -} const char * const *arch_get_ima_policy(void) { diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 049c557c452f..bdbbedf52580 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -2504,6 +2504,11 @@ out: return buf; } +bool arch_get_secureboot(void) +{ + return ipl_secure_flag; +} + int ipl_report_free(struct ipl_report *report) { struct ipl_report_component *comp, *ncomp; diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index f227a70ac91f..ee382b56dd7b 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -401,9 +401,9 @@ extern int __init efi_memmap_split_count(efi_memory_desc_t *md, extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, struct efi_mem_range *mem); -extern enum efi_secureboot_mode __x86_ima_efi_boot_mode(void); +enum efi_secureboot_mode __x86_efi_boot_mode(void); -#define arch_ima_efi_boot_mode __x86_ima_efi_boot_mode() +#define arch_efi_boot_mode __x86_efi_boot_mode() #ifdef CONFIG_EFI_RUNTIME_MAP int efi_get_runtime_map_size(void); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index d00c6de7f3b7..74032f3ab9b0 100644 --- a/arch/x86/platform/efi/efi.c +++ 
b/arch/x86/platform/efi/efi.c @@ -920,7 +920,7 @@ umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n) return attr->mode; } -enum efi_secureboot_mode __x86_ima_efi_boot_mode(void) +enum efi_secureboot_mode __x86_efi_boot_mode(void) { return boot_params.secure_boot; } diff --git a/include/linux/ima.h b/include/linux/ima.h index abf8923f8fc5..8e08baf16c2f 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -11,6 +11,7 @@ #include #include #include +#include #include struct linux_binprm; @@ -73,14 +74,8 @@ int ima_validate_range(phys_addr_t phys, size_t size); #endif #ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT -extern bool arch_ima_get_secureboot(void); extern const char * const *arch_get_ima_policy(void); #else -static inline bool arch_ima_get_secureboot(void) -{ - return false; -} - static inline const char * const *arch_get_ima_policy(void) { return NULL; diff --git a/include/linux/secure_boot.h b/include/linux/secure_boot.h new file mode 100644 index 000000000000..3ded3f03655c --- /dev/null +++ b/include/linux/secure_boot.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2026 Red Hat, Inc. All Rights Reserved. + * + * Author: Coiby Xu + */ + +#ifndef _LINUX_SECURE_BOOT_H +#define _LINUX_SECURE_BOOT_H + +#include + +/* + * Returns true if the platform secure boot is enabled. + * Returns false if disabled or not supported. 
+ */ +bool arch_get_secureboot(void); + +#endif /* _LINUX_SECURE_BOOT_H */ diff --git a/security/integrity/Makefile b/security/integrity/Makefile index 92b63039c654..548665e2b702 100644 --- a/security/integrity/Makefile +++ b/security/integrity/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_INTEGRITY) += integrity.o -integrity-y := iint.o +integrity-y := iint.o secure_boot.o integrity-$(CONFIG_INTEGRITY_AUDIT) += integrity_audit.o integrity-$(CONFIG_INTEGRITY_SIGNATURE) += digsig.o integrity-$(CONFIG_INTEGRITY_ASYMMETRIC_KEYS) += digsig_asymmetric.o @@ -18,6 +18,7 @@ integrity-$(CONFIG_LOAD_IPL_KEYS) += platform_certs/load_ipl_s390.o integrity-$(CONFIG_LOAD_PPC_KEYS) += platform_certs/efi_parser.o \ platform_certs/load_powerpc.o \ platform_certs/keyring_handler.o +integrity-$(CONFIG_EFI) += efi_secureboot.o # The relative order of the 'ima' and 'evm' LSMs depends on the order below. obj-$(CONFIG_IMA) += ima/ obj-$(CONFIG_EVM) += evm/ diff --git a/security/integrity/efi_secureboot.c b/security/integrity/efi_secureboot.c new file mode 100644 index 000000000000..bfd4260a83a3 --- /dev/null +++ b/security/integrity/efi_secureboot.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-1.0+ +/* + * Copyright (C) 2018 IBM Corporation + */ +#include +#include +#include + +#ifndef arch_efi_boot_mode +#define arch_efi_boot_mode efi_secureboot_mode_unset +#endif + +static enum efi_secureboot_mode get_sb_mode(void) +{ + enum efi_secureboot_mode mode; + + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { + pr_info("integrity: secureboot mode unknown, no efi\n"); + return efi_secureboot_mode_unknown; + } + + mode = efi_get_secureboot_mode(efi.get_variable); + if (mode == efi_secureboot_mode_disabled) + pr_info("integrity: secureboot mode disabled\n"); + else if (mode == efi_secureboot_mode_unknown) + pr_info("integrity: secureboot mode unknown\n"); + else + pr_info("integrity: secureboot mode enabled\n"); + return mode; +} + +/* + * Query secure boot status + * + * Note don't 
call this function too early e.g. in __setup hook otherwise the + * kernel may hang when calling efi_get_secureboot_mode. + * + */ +bool arch_get_secureboot(void) +{ + static enum efi_secureboot_mode sb_mode; + static bool initialized; + + if (!initialized && efi_enabled(EFI_BOOT)) { + sb_mode = arch_efi_boot_mode; + + if (sb_mode == efi_secureboot_mode_unset) + sb_mode = get_sb_mode(); + initialized = true; + } + + if (sb_mode == efi_secureboot_mode_enabled) + return true; + else + return false; +} diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 16c20c578ea8..ee2e0891febc 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -27,7 +27,7 @@ core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; - bool sb_state = arch_ima_get_secureboot(); + bool sb_state = arch_get_secureboot(); int appraisal_state = ima_appraise; if (!str) diff --git a/security/integrity/ima/ima_efi.c b/security/integrity/ima/ima_efi.c index 138029bfcce1..78191879dd98 100644 --- a/security/integrity/ima/ima_efi.c +++ b/security/integrity/ima/ima_efi.c @@ -2,52 +2,9 @@ /* * Copyright (C) 2018 IBM Corporation */ -#include #include #include -#include - -#ifndef arch_ima_efi_boot_mode -#define arch_ima_efi_boot_mode efi_secureboot_mode_unset -#endif - -static enum efi_secureboot_mode get_sb_mode(void) -{ - enum efi_secureboot_mode mode; - - if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { - pr_info("ima: secureboot mode unknown, no efi\n"); - return efi_secureboot_mode_unknown; - } - - mode = efi_get_secureboot_mode(efi.get_variable); - if (mode == efi_secureboot_mode_disabled) - pr_info("ima: secureboot mode disabled\n"); - else if (mode == efi_secureboot_mode_unknown) - pr_info("ima: secureboot mode unknown\n"); - else - pr_info("ima: secureboot mode enabled\n"); - return mode; -} - -bool 
arch_ima_get_secureboot(void) -{ - static enum efi_secureboot_mode sb_mode; - static bool initialized; - - if (!initialized && efi_enabled(EFI_BOOT)) { - sb_mode = arch_ima_efi_boot_mode; - - if (sb_mode == efi_secureboot_mode_unset) - sb_mode = get_sb_mode(); - initialized = true; - } - - if (sb_mode == efi_secureboot_mode_enabled) - return true; - else - return false; -} +#include /* secureboot arch rules */ static const char * const sb_arch_rules[] = { @@ -67,7 +24,7 @@ static const char * const sb_arch_rules[] = { const char * const *arch_get_ima_policy(void) { - if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) { + if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_get_secureboot()) { if (IS_ENABLED(CONFIG_MODULE_SIG)) set_module_sig_enforced(); if (IS_ENABLED(CONFIG_KEXEC_SIG)) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 1d6229b156fb..5808b52c8426 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -953,8 +953,7 @@ static int ima_load_data(enum kernel_load_data_id id, bool contents) switch (id) { case LOADING_KEXEC_IMAGE: - if (IS_ENABLED(CONFIG_KEXEC_SIG) - && arch_ima_get_secureboot()) { + if (IS_ENABLED(CONFIG_KEXEC_SIG) && arch_get_secureboot()) { pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n"); return -EACCES; } diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index 7b388b66cf80..4636629533af 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/security/integrity/platform_certs/load_uefi.c b/security/integrity/platform_certs/load_uefi.c index d1fdd113450a..c0d6948446c3 100644 --- a/security/integrity/platform_certs/load_uefi.c +++ b/security/integrity/platform_certs/load_uefi.c @@ -212,7 +212,7 @@ static int __init load_uefi_certs(void) } /* the MOK/MOKx 
can not be trusted when secure boot is disabled */ - if (!arch_ima_get_secureboot()) + if (!arch_get_secureboot()) return 0; mokx = get_cert_list(L"MokListXRT", &mok_var, &mokxsize, &status); diff --git a/security/integrity/secure_boot.c b/security/integrity/secure_boot.c new file mode 100644 index 000000000000..fc2693c286f8 --- /dev/null +++ b/security/integrity/secure_boot.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2026 Red Hat, Inc. All Rights Reserved. + * + * Author: Coiby Xu + */ +#include + +/* + * Default weak implementation. + * Architectures that support secure boot must override this. + */ +__weak bool arch_get_secureboot(void) +{ + return false; +} -- cgit v1.2.3 From 0ec959cf4b5a609d7f27bf84064ef5372e30ab80 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Tue, 30 Sep 2025 10:26:56 +0800 Subject: evm: fix security.evm for a file with IMA signature When both IMA and EVM fix modes are enabled, accessing a file with IMA signature but missing EVM HMAC won't cause security.evm to be fixed. Add a function evm_fix_hmac which will be explicitly called to fix EVM HMAC for this case. 
Suggested-by: Mimi Zohar Signed-off-by: Coiby Xu Signed-off-by: Mimi Zohar --- include/linux/evm.h | 8 ++++++++ security/integrity/evm/evm_main.c | 28 ++++++++++++++++++++++++++++ security/integrity/ima/ima_appraise.c | 5 +++++ 3 files changed, 41 insertions(+) (limited to 'include') diff --git a/include/linux/evm.h b/include/linux/evm.h index ddece4a6b25d..913f4573b203 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -18,6 +18,8 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, void *xattr_value, size_t xattr_value_len); +int evm_fix_hmac(struct dentry *dentry, const char *xattr_name, + const char *xattr_value, size_t xattr_value_len); int evm_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count); @@ -51,6 +53,12 @@ static inline enum integrity_status evm_verifyxattr(struct dentry *dentry, { return INTEGRITY_UNKNOWN; } + +static inline int evm_fix_hmac(struct dentry *dentry, const char *xattr_name, + const char *xattr_value, size_t xattr_value_len) +{ + return -EOPNOTSUPP; +} #endif static inline int evm_inode_init_security(struct inode *inode, struct inode *dir, diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index cfc3531cf53f..1b0089b4b796 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -795,6 +795,34 @@ bool evm_revalidate_status(const char *xattr_name) return true; } +/** + * evm_fix_hmac - Calculate the HMAC and add it to security.evm for fix mode + * @dentry: pointer to the affected dentry which doesn't yet have security.evm + * xattr + * @xattr_name: pointer to the affected extended attribute name + * @xattr_value: pointer to the new extended attribute value + * @xattr_value_len: pointer to the new extended attribute value length + * + * Expects to be called with i_mutex locked. 
+ * + * Return: 0 on success, -EPERM/-ENOMEM/-EOPNOTSUPP on failure + */ +int evm_fix_hmac(struct dentry *dentry, const char *xattr_name, + const char *xattr_value, size_t xattr_value_len) + +{ + if (!evm_fixmode || !evm_revalidate_status((xattr_name))) + return -EPERM; + + if (!(evm_initialized & EVM_INIT_HMAC)) + return -EPERM; + + if (is_unsupported_hmac_fs(dentry)) + return -EOPNOTSUPP; + + return evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); +} + /** * evm_inode_post_setxattr - update 'security.evm' to reflect the changes * @dentry: pointer to the affected dentry diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index ee2e0891febc..0d41d102626a 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -591,6 +591,11 @@ out: xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; + } else if (status == INTEGRITY_NOLABEL) { + if (!evm_fix_hmac(dentry, XATTR_NAME_IMA, + (const char *)xattr_value, + xattr_len)) + status = INTEGRITY_PASS; } /* -- cgit v1.2.3 From 8505bfb4e4eca28ef1b20d3369435ec2d6a125c6 Mon Sep 17 00:00:00 2001 From: Pengjie Zhang Date: Fri, 13 Feb 2026 18:09:35 +0800 Subject: ACPI: CPPC: Move reference performance to capabilities Currently, the `Reference Performance` register is read every time the CPU frequency is sampled in `cppc_get_perf_ctrs()`. This function is on the hot path of the cppc_cpufreq driver. Reference Performance indicates the performance level that corresponds to the Reference Counter incrementing and is not expected to change dynamically during runtime (unlike the Delivered and Reference counters). Reading this register in the hot path incurs unnecessary overhead, particularly on platforms where CPC registers are located in the PCC (Platform Communication Channel) subspace. 
This patch moves `reference_perf` from the dynamic feedback counters structure (`cppc_perf_fb_ctrs`) to the static capabilities structure (`cppc_perf_caps`). Signed-off-by: Pengjie Zhang [ rjw: Changelog adjustment ] Link: https://patch.msgid.link/20260213100935.19111-1-zhangpengjie2@huawei.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 55 ++++++++++++++++++------------------------ drivers/cpufreq/cppc_cpufreq.c | 21 +++++++++------- include/acpi/cppc_acpi.h | 2 +- 3 files changed, 37 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 53a6ffd995a1..07bbf5b366a4 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -177,12 +177,12 @@ __ATTR(_name, 0444, show_##_name, NULL) show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, highest_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, nominal_perf); +show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, reference_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_nonlinear_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, guaranteed_perf); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_freq); show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, nominal_freq); -show_cppc_data(cppc_get_perf_ctrs, cppc_perf_fb_ctrs, reference_perf); show_cppc_data(cppc_get_perf_ctrs, cppc_perf_fb_ctrs, wraparound_time); /* Check for valid access_width, otherwise, fallback to using bit_width */ @@ -1352,9 +1352,10 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) { struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum); struct cpc_register_resource *highest_reg, *lowest_reg, - *lowest_non_linear_reg, *nominal_reg, *guaranteed_reg, - *low_freq_reg = NULL, *nom_freq_reg = NULL; - u64 high, low, guaranteed, nom, min_nonlinear, low_f = 0, nom_f = 0; + *lowest_non_linear_reg, *nominal_reg, *reference_reg, + 
*guaranteed_reg, *low_freq_reg = NULL, *nom_freq_reg = NULL; + u64 high, low, guaranteed, nom, ref, min_nonlinear, + low_f = 0, nom_f = 0; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum); struct cppc_pcc_data *pcc_ss_data = NULL; int ret = 0, regs_in_pcc = 0; @@ -1368,6 +1369,7 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) lowest_reg = &cpc_desc->cpc_regs[LOWEST_PERF]; lowest_non_linear_reg = &cpc_desc->cpc_regs[LOW_NON_LINEAR_PERF]; nominal_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; + reference_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; low_freq_reg = &cpc_desc->cpc_regs[LOWEST_FREQ]; nom_freq_reg = &cpc_desc->cpc_regs[NOMINAL_FREQ]; guaranteed_reg = &cpc_desc->cpc_regs[GUARANTEED_PERF]; @@ -1375,6 +1377,7 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) /* Are any of the regs PCC ?*/ if (CPC_IN_PCC(highest_reg) || CPC_IN_PCC(lowest_reg) || CPC_IN_PCC(lowest_non_linear_reg) || CPC_IN_PCC(nominal_reg) || + (CPC_SUPPORTED(reference_reg) && CPC_IN_PCC(reference_reg)) || CPC_IN_PCC(low_freq_reg) || CPC_IN_PCC(nom_freq_reg) || CPC_IN_PCC(guaranteed_reg)) { if (pcc_ss_id < 0) { @@ -1400,6 +1403,17 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) cpc_read(cpunum, nominal_reg, &nom); perf_caps->nominal_perf = nom; + /* + * If reference perf register is not supported then we should + * use the nominal perf value + */ + if (CPC_SUPPORTED(reference_reg)) { + cpc_read(cpunum, reference_reg, &ref); + perf_caps->reference_perf = ref; + } else { + perf_caps->reference_perf = nom; + } + if (guaranteed_reg->type != ACPI_TYPE_BUFFER || IS_NULL_REG(&guaranteed_reg->cpc_entry.reg)) { perf_caps->guaranteed_perf = 0; @@ -1411,7 +1425,7 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) cpc_read(cpunum, lowest_non_linear_reg, &min_nonlinear); perf_caps->lowest_nonlinear_perf = min_nonlinear; - if (!high || !low || !nom || !min_nonlinear) + if (!high || !low || !nom || !ref || !min_nonlinear) 
ret = -EFAULT; /* Read optional lowest and nominal frequencies if present */ @@ -1441,20 +1455,10 @@ EXPORT_SYMBOL_GPL(cppc_get_perf_caps); bool cppc_perf_ctrs_in_pcc_cpu(unsigned int cpu) { struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); - struct cpc_register_resource *ref_perf_reg; - - /* - * If reference perf register is not supported then we should use the - * nominal perf value - */ - ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; - if (!CPC_SUPPORTED(ref_perf_reg)) - ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; return CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || - CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME]) || - CPC_IN_PCC(ref_perf_reg); + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME]); } EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc_cpu); @@ -1491,10 +1495,10 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) { struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum); struct cpc_register_resource *delivered_reg, *reference_reg, - *ref_perf_reg, *ctr_wrap_reg; + *ctr_wrap_reg; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum); struct cppc_pcc_data *pcc_ss_data = NULL; - u64 delivered, reference, ref_perf, ctr_wrap_time; + u64 delivered, reference, ctr_wrap_time; int ret = 0, regs_in_pcc = 0; if (!cpc_desc) { @@ -1504,19 +1508,11 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) delivered_reg = &cpc_desc->cpc_regs[DELIVERED_CTR]; reference_reg = &cpc_desc->cpc_regs[REFERENCE_CTR]; - ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; ctr_wrap_reg = &cpc_desc->cpc_regs[CTR_WRAP_TIME]; - /* - * If reference perf register is not supported then we should - * use the nominal perf value - */ - if (!CPC_SUPPORTED(ref_perf_reg)) - ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; - /* Are any of the regs PCC ?*/ if (CPC_IN_PCC(delivered_reg) || CPC_IN_PCC(reference_reg) || - CPC_IN_PCC(ctr_wrap_reg) || CPC_IN_PCC(ref_perf_reg)) { + 
CPC_IN_PCC(ctr_wrap_reg)) { if (pcc_ss_id < 0) { pr_debug("Invalid pcc_ss_id\n"); return -ENODEV; @@ -1533,8 +1529,6 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) cpc_read(cpunum, delivered_reg, &delivered); cpc_read(cpunum, reference_reg, &reference); - cpc_read(cpunum, ref_perf_reg, &ref_perf); - /* * Per spec, if ctr_wrap_time optional register is unsupported, then the * performance counters are assumed to never wrap during the lifetime of @@ -1544,14 +1538,13 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) if (CPC_SUPPORTED(ctr_wrap_reg)) cpc_read(cpunum, ctr_wrap_reg, &ctr_wrap_time); - if (!delivered || !reference || !ref_perf) { + if (!delivered || !reference) { ret = -EFAULT; goto out_err; } perf_fb_ctrs->delivered = delivered; perf_fb_ctrs->reference = reference; - perf_fb_ctrs->reference_perf = ref_perf; perf_fb_ctrs->wraparound_time = ctr_wrap_time; out_err: if (regs_in_pcc) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 94d489a4c90d..5dfb109cf1f4 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -50,7 +50,8 @@ struct cppc_freq_invariance { static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv); static struct kthread_worker *kworker_fie; -static int cppc_perf_from_fbctrs(struct cppc_perf_fb_ctrs *fb_ctrs_t0, +static int cppc_perf_from_fbctrs(u64 reference_perf, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, struct cppc_perf_fb_ctrs *fb_ctrs_t1); /** @@ -70,7 +71,7 @@ static void __cppc_scale_freq_tick(struct cppc_freq_invariance *cppc_fi) struct cppc_perf_fb_ctrs fb_ctrs = {0}; struct cppc_cpudata *cpu_data; unsigned long local_freq_scale; - u64 perf; + u64 perf, ref_perf; cpu_data = cppc_fi->cpu_data; @@ -79,7 +80,9 @@ static void __cppc_scale_freq_tick(struct cppc_freq_invariance *cppc_fi) return; } - perf = cppc_perf_from_fbctrs(&cppc_fi->prev_perf_fb_ctrs, &fb_ctrs); + ref_perf = cpu_data->perf_caps.reference_perf; + 
perf = cppc_perf_from_fbctrs(ref_perf, + &cppc_fi->prev_perf_fb_ctrs, &fb_ctrs); if (!perf) return; @@ -747,13 +750,11 @@ static inline u64 get_delta(u64 t1, u64 t0) return (u32)t1 - (u32)t0; } -static int cppc_perf_from_fbctrs(struct cppc_perf_fb_ctrs *fb_ctrs_t0, +static int cppc_perf_from_fbctrs(u64 reference_perf, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, struct cppc_perf_fb_ctrs *fb_ctrs_t1) { u64 delta_reference, delta_delivered; - u64 reference_perf; - - reference_perf = fb_ctrs_t0->reference_perf; delta_reference = get_delta(fb_ctrs_t1->reference, fb_ctrs_t0->reference); @@ -790,7 +791,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0}; struct cppc_cpudata *cpu_data; - u64 delivered_perf; + u64 delivered_perf, reference_perf; int ret; if (!policy) @@ -807,7 +808,9 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) return 0; } - delivered_perf = cppc_perf_from_fbctrs(&fb_ctrs_t0, &fb_ctrs_t1); + reference_perf = cpu_data->perf_caps.reference_perf; + delivered_perf = cppc_perf_from_fbctrs(reference_perf, + &fb_ctrs_t0, &fb_ctrs_t1); if (!delivered_perf) goto out_invalid_counters; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index f7afa20b8ad9..d8e405becdc3 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -121,6 +121,7 @@ struct cppc_perf_caps { u32 guaranteed_perf; u32 highest_perf; u32 nominal_perf; + u32 reference_perf; u32 lowest_perf; u32 lowest_nonlinear_perf; u32 lowest_freq; @@ -138,7 +139,6 @@ struct cppc_perf_ctrls { struct cppc_perf_fb_ctrs { u64 reference; u64 delivered; - u64 reference_perf; u64 wraparound_time; }; -- cgit v1.2.3 From fab0c75d500fd23de6ea1b30e44635418a6dae65 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Mar 2026 10:10:22 -0800 Subject: x86/cpu: Add platform ID to CPU matching structure The existing x86_match_cpu() 
infrastructure can be used to match a bunch of attributes of a CPU: vendor, family, model, steppings and CPU features. But, there's one more attribute that's missing and unable to be matched against: the platform ID, enumerated on Intel CPUs in MSR_IA32_PLATFORM_ID. It is a little more obscure and is only queried during microcode loading. This is because Intel sometimes has CPUs with identical family/model/stepping but which need different microcode. These CPUs are differentiated with the platform ID. Add a field in 'struct x86_cpu_id' for the platform ID. Similar to the stepping field, make the new field a mask of platform IDs. Some examples: 0x01: matches only platform ID 0x0 0x02: matches only platform ID 0x1 0x03: matches platform IDs 0x0 or 0x1 0x80: matches only platform ID 0x7 0xff: matches all 8 possible platform IDs Since the mask is only a byte wide, it nestles in next to another u8 and does not even increase the size of 'struct x86_cpu_id'. Reserve the all 0's value as the wildcard (X86_PLATFORM_ANY). This avoids forcing changes to existing 'struct x86_cpu_id' users. They can just continue to fill the field with 0's and their matching will work exactly as before. Note: If someone is ever looking for space in 'struct x86_cpu_id', this new field could probably get stuck over in ->driver_data for the one user that there is. 
Signed-off-by: Dave Hansen Reviewed-by: Sohil Mehta Link: https://patch.msgid.link/20260304181022.058DF07C@davehans-spike.ostc.intel.com --- arch/x86/kernel/cpu/match.c | 3 +++ include/linux/mod_devicetable.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 6af1e8baeb0f..4604802692da 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -76,6 +76,9 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) if (m->steppings != X86_STEPPING_ANY && !(BIT(c->x86_stepping) & m->steppings)) continue; + if (m->platform_mask != X86_PLATFORM_ANY && + !(BIT(c->intel_platform_id) & m->platform_mask)) + continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; if (!x86_match_vendor_cpu_type(c, m)) diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 5b1725fe9707..23ff24080dfd 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -691,6 +691,7 @@ struct x86_cpu_id { __u16 feature; /* bit index */ /* Solely for kernel-internal use: DO NOT EXPORT to userspace! */ __u16 flags; + __u8 platform_mask; __u8 type; kernel_ulong_t driver_data; }; @@ -702,6 +703,7 @@ struct x86_cpu_id { #define X86_STEPPING_ANY 0 #define X86_STEP_MIN 0 #define X86_STEP_MAX 0xf +#define X86_PLATFORM_ANY 0x0 #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ #define X86_CPU_TYPE_ANY 0 -- cgit v1.2.3 From c36218dc49f5e9ef9e3074670fdae7ac3a7e794f Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Wed, 4 Mar 2026 13:14:08 +0530 Subject: drm/ras: Introduce the DRM RAS infrastructure over generic netlink Introduces the DRM RAS infrastructure over generic netlink. The new interface allows drivers to expose RAS nodes and their associated error counters to userspace in a structured and extensible way. 
Each drm_ras node can register its own set of error counters, which are then discoverable and queryable through netlink operations. This lays the groundwork for reporting and managing hardware error states in a unified manner across different DRM drivers. Currently it only supports error-counter nodes. But it can be extended later. The registration is also not tied to any drm node, so it can be used by accel devices as well. It uses the new and mandatory YAML description format stored in Documentation/netlink/specs/. This forces a single generic netlink family namespace for the entire drm: "drm-ras". But multiple-endpoints are supported within the single family. Any modification to this API needs to be applied to Documentation/netlink/specs/drm_ras.yaml before regenerating the code: $ tools/net/ynl/pyynl/ynl_gen_c.py --spec \ Documentation/netlink/specs/drm_ras.yaml --mode uapi --header \ -o include/uapi/drm/drm_ras.h $ tools/net/ynl/pyynl/ynl_gen_c.py --spec \ Documentation/netlink/specs/drm_ras.yaml --mode kernel \ --header -o drivers/gpu/drm/drm_ras_nl.h $ tools/net/ynl/pyynl/ynl_gen_c.py --spec \ Documentation/netlink/specs/drm_ras.yaml \ --mode kernel --source -o drivers/gpu/drm/drm_ras_nl.c Cc: Zack McKevitt Cc: Lijo Lazar Cc: Hawking Zhang Cc: Jakub Kicinski Cc: David S. 
Miller Cc: Paolo Abeni Cc: Eric Dumazet Cc: netdev@vger.kernel.org Co-developed-by: Aravind Iddamsetty Signed-off-by: Aravind Iddamsetty Signed-off-by: Riana Tauro Reviewed-by: Zack McKevitt Acked-by: Jakub Kicinski Acked-by: Maarten Lankhorst Link: https://patch.msgid.link/20260304074412.464435-8-riana.tauro@intel.com Signed-off-by: Rodrigo Vivi --- Documentation/gpu/drm-ras.rst | 103 +++++++++ Documentation/gpu/index.rst | 1 + Documentation/netlink/specs/drm_ras.yaml | 115 ++++++++++ drivers/gpu/drm/Kconfig | 10 + drivers/gpu/drm/Makefile | 1 + drivers/gpu/drm/drm_drv.c | 6 + drivers/gpu/drm/drm_ras.c | 354 +++++++++++++++++++++++++++++++ drivers/gpu/drm/drm_ras_genl_family.c | 42 ++++ drivers/gpu/drm/drm_ras_nl.c | 56 +++++ drivers/gpu/drm/drm_ras_nl.h | 24 +++ include/drm/drm_ras.h | 75 +++++++ include/drm/drm_ras_genl_family.h | 17 ++ include/uapi/drm/drm_ras.h | 49 +++++ 13 files changed, 853 insertions(+) create mode 100644 Documentation/gpu/drm-ras.rst create mode 100644 Documentation/netlink/specs/drm_ras.yaml create mode 100644 drivers/gpu/drm/drm_ras.c create mode 100644 drivers/gpu/drm/drm_ras_genl_family.c create mode 100644 drivers/gpu/drm/drm_ras_nl.c create mode 100644 drivers/gpu/drm/drm_ras_nl.h create mode 100644 include/drm/drm_ras.h create mode 100644 include/drm/drm_ras_genl_family.h create mode 100644 include/uapi/drm/drm_ras.h (limited to 'include') diff --git a/Documentation/gpu/drm-ras.rst b/Documentation/gpu/drm-ras.rst new file mode 100644 index 000000000000..70b246a78fc8 --- /dev/null +++ b/Documentation/gpu/drm-ras.rst @@ -0,0 +1,103 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +============================ +DRM RAS over Generic Netlink +============================ + +The DRM RAS (Reliability, Availability, Serviceability) interface provides a +standardized way for GPU/accelerator drivers to expose error counters and +other reliability nodes to user space via Generic Netlink. 
This allows +diagnostic tools, monitoring daemons, or test infrastructure to query hardware +health in a uniform way across different DRM drivers. + +Key Goals: + +* Provide a standardized RAS solution for GPU and accelerator drivers, enabling + data center monitoring and reliability operations. +* Implement a single drm-ras Generic Netlink family to meet modern Netlink YAML + specifications and centralize all RAS-related communication in one namespace. +* Support a basic error counter interface, addressing the immediate, essential + monitoring needs. +* Offer a flexible, future-proof interface that can be extended to support + additional types of RAS data in the future. +* Allow multiple nodes per driver, enabling drivers to register separate + nodes for different IP blocks, sub-blocks, or other logical subdivisions + as applicable. + +Nodes +===== + +Nodes are logical abstractions representing an error type or error source within +the device. Currently, only error counter nodes are supported. + +Drivers are responsible for registering and unregistering nodes via the +`drm_ras_node_register()` and `drm_ras_node_unregister()` APIs. + +Node Management +------------------- + +.. kernel-doc:: drivers/gpu/drm/drm_ras.c + :doc: DRM RAS Node Management +.. kernel-doc:: drivers/gpu/drm/drm_ras.c + :internal: + +Generic Netlink Usage +===================== + +The interface is implemented as a Generic Netlink family named ``drm-ras``. +User space tools can: + +* List registered nodes with the ``list-nodes`` command. +* List all error counters in a node with the ``get-error-counter`` command with ``node-id`` + as a parameter. +* Query specific error counter values with the ``get-error-counter`` command, using both + ``node-id`` and ``error-id`` as parameters. 
+ +YAML-based Interface +-------------------- + +The interface is described in a YAML specification ``Documentation/netlink/specs/drm_ras.yaml`` + +This YAML is used to auto-generate user space bindings via +``tools/net/ynl/pyynl/ynl_gen_c.py``, and drives the structure of netlink +attributes and operations. + +Usage Notes +----------- + +* User space must first enumerate nodes to obtain their IDs. +* Node IDs or Node names can be used for all further queries, such as error counters. +* Error counters can be queried by either the Error ID or Error name. +* Query Parameters should be defined as part of the uAPI to ensure user interface stability. +* The interface supports future extension by adding new node types and + additional attributes. + +Example: List nodes using ynl + +.. code-block:: bash + + sudo ynl --family drm_ras --dump list-nodes + [{'device-name': '0000:03:00.0', + 'node-id': 0, + 'node-name': 'correctable-errors', + 'node-type': 'error-counter'}, + {'device-name': '0000:03:00.0', + 'node-id': 1, + 'node-name': 'uncorrectable-errors', + 'node-type': 'error-counter'}] + +Example: List all error counters using ynl + +.. code-block:: bash + + sudo ynl --family drm_ras --dump get-error-counter --json '{"node-id":0}' + [{'error-id': 1, 'error-name': 'error_name1', 'error-value': 0}, + {'error-id': 2, 'error-name': 'error_name2', 'error-value': 0}] + +Example: Query an error counter for a given node + +.. 
code-block:: bash + + sudo ynl --family drm_ras --do get-error-counter --json '{"node-id":0, "error-id":1}' + {'error-id': 1, 'error-name': 'error_name1', 'error-value': 0} + diff --git a/Documentation/gpu/index.rst b/Documentation/gpu/index.rst index 7dcb15850afd..60c73fdcfeed 100644 --- a/Documentation/gpu/index.rst +++ b/Documentation/gpu/index.rst @@ -9,6 +9,7 @@ GPU Driver Developer's Guide drm-mm drm-kms drm-kms-helpers + drm-ras drm-uapi drm-usage-stats driver-uapi diff --git a/Documentation/netlink/specs/drm_ras.yaml b/Documentation/netlink/specs/drm_ras.yaml new file mode 100644 index 000000000000..79af25dac3c5 --- /dev/null +++ b/Documentation/netlink/specs/drm_ras.yaml @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +--- +name: drm-ras +protocol: genetlink +uapi-header: drm/drm_ras.h + +doc: >- + DRM RAS (Reliability, Availability, Serviceability) over Generic Netlink. + Provides a standardized mechanism for DRM drivers to register "nodes" + representing hardware/software components capable of reporting error counters. + Userspace tools can query the list of nodes or individual error counters + via the Generic Netlink interface. + +definitions: + - + type: enum + name: node-type + value-start: 1 + entries: [error-counter] + doc: >- + Type of the node. Currently, only error-counter nodes are + supported, which expose reliability counters for a hardware/software + component. + +attribute-sets: + - + name: node-attrs + attributes: + - + name: node-id + type: u32 + doc: >- + Unique identifier for the node. + Assigned dynamically by the DRM RAS core upon registration. + - + name: device-name + type: string + doc: >- + Device name chosen by the driver at registration. + Can be a PCI BDF, UUID, or module name if unique. + - + name: node-name + type: string + doc: >- + Node name chosen by the driver at registration. + Can be an IP block name, or any name that identifies the + RAS node inside the device. 
+ - + name: node-type + type: u32 + doc: Type of this node, identifying its function. + enum: node-type + - + name: error-counter-attrs + attributes: + - + name: node-id + type: u32 + doc: Node ID targeted by this error counter operation. + - + name: error-id + type: u32 + doc: Unique identifier for a specific error counter within an node. + - + name: error-name + type: string + doc: Name of the error. + - + name: error-value + type: u32 + doc: Current value of the requested error counter. + +operations: + list: + - + name: list-nodes + doc: >- + Retrieve the full list of currently registered DRM RAS nodes. + Each node includes its dynamically assigned ID, name, and type. + **Important:** User space must call this operation first to obtain + the node IDs. These IDs are required for all subsequent + operations on nodes, such as querying error counters. + attribute-set: node-attrs + flags: [admin-perm] + dump: + reply: + attributes: + - node-id + - device-name + - node-name + - node-type + - + name: get-error-counter + doc: >- + Retrieve error counter for a given node. + The response includes the id, the name, and even the current + value of each counter. + attribute-set: error-counter-attrs + flags: [admin-perm] + do: + request: + attributes: + - node-id + - error-id + reply: + attributes: &errorinfo + - error-id + - error-name + - error-value + dump: + request: + attributes: + - node-id + reply: + attributes: *errorinfo diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index d3d52310c9cc..18da88e050ce 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -130,6 +130,16 @@ config DRM_PANIC_SCREEN_QR_VERSION Smaller QR code are easier to read, but will contain less debugging data. Default is 40. +config DRM_RAS + bool "DRM RAS support" + depends on DRM + depends on NET + help + Enables the DRM RAS (Reliability, Availability and Serviceability) + support for DRM drivers. 
This provides a Generic Netlink interface + for error reporting and queries. + If in doubt, say "N". + config DRM_DEBUG_DP_MST_TOPOLOGY_REFS bool "Enable refcount backtrace history in the DP MST helpers" depends on STACKTRACE_SUPPORT diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile index 0c21029c446f..d1ad4ce873a3 100644 --- a/drivers/gpu/drm/Makefile +++ b/drivers/gpu/drm/Makefile @@ -95,6 +95,7 @@ drm-$(CONFIG_DRM_ACCEL) += ../../accel/drm_accel.o drm-$(CONFIG_DRM_PANIC) += drm_panic.o drm-$(CONFIG_DRM_DRAW) += drm_draw.o drm-$(CONFIG_DRM_PANIC_SCREEN_QR_CODE) += drm_panic_qr.o +drm-$(CONFIG_DRM_RAS) += drm_ras.o drm_ras_nl.o drm_ras_genl_family.o obj-$(CONFIG_DRM) += drm.o obj-$(CONFIG_DRM_PANEL_ORIENTATION_QUIRKS) += drm_panel_orientation_quirks.o diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index 2915118436ce..6b965c3d3307 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "drm_crtc_internal.h" #include "drm_internal.h" @@ -1223,6 +1224,7 @@ static const struct file_operations drm_stub_fops = { static void drm_core_exit(void) { + drm_ras_genl_family_unregister(); drm_privacy_screen_lookup_exit(); drm_panic_exit(); accel_core_exit(); @@ -1261,6 +1263,10 @@ static int __init drm_core_init(void) drm_privacy_screen_lookup_init(); + ret = drm_ras_genl_family_register(); + if (ret < 0) + goto error; + drm_core_init_complete = true; DRM_DEBUG("Initialized\n"); diff --git a/drivers/gpu/drm/drm_ras.c b/drivers/gpu/drm/drm_ras.c new file mode 100644 index 000000000000..b2fa5ab86d87 --- /dev/null +++ b/drivers/gpu/drm/drm_ras.c @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2026 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include + +#include "drm_ras_nl.h" + +/** + * DOC: DRM RAS Node Management + * + * This module provides the infrastructure to manage RAS (Reliability, + * Availability, and 
Serviceability) nodes for DRM drivers. Each + * DRM driver may register one or more RAS nodes, which represent + * logical components capable of reporting error counters and other + * reliability metrics. + * + * The nodes are stored in a global xarray `drm_ras_xa` to allow + * efficient lookup by ID. Nodes can be registered or unregistered + * dynamically at runtime. + * + * A Generic Netlink family `drm_ras` exposes two main operations to + * userspace: + * + * 1. LIST_NODES: Dump all currently registered RAS nodes. + * The user receives an array of node IDs, names, and types. + * + * 2. GET_ERROR_COUNTER: Get error counters of a given node. + * Userspace must provide Node ID, Error ID (Optional for specific counter). + * Returns all counters of a node if only Node ID is provided or specific + * error counters. + * + * Node registration: + * + * - drm_ras_node_register(): Registers a new node and assigns + * it a unique ID in the xarray. + * - drm_ras_node_unregister(): Removes a previously registered + * node from the xarray. + * + * Node type: + * + * - ERROR_COUNTER: + * + Currently, only error counters are supported. + * + The driver must implement the query_error_counter() callback to provide + * the name and the value of the error counter. + * + The driver must provide an error_counter_range.last value informing the + * last valid error ID. + * + The driver can provide an error_counter_range.first value informing the + * first valid error ID. + * + The error counters in the driver don't need to be contiguous, but the + * driver must return -ENOENT to the query_error_counter as an indication + * that the ID should be skipped and not listed in the netlink API. + * + * Netlink handlers: + * + * - drm_ras_nl_list_nodes_dumpit(): Implements the LIST_NODES + * operation, iterating over the xarray. + * - drm_ras_nl_get_error_counter_dumpit(): Implements the GET_ERROR_COUNTER dumpit + * operation, fetching all counters from a specific node. 
+ * - drm_ras_nl_get_error_counter_doit(): Implements the GET_ERROR_COUNTER doit + * operation, fetching a counter value from a specific node. + */ + +static DEFINE_XARRAY_ALLOC(drm_ras_xa); + +/* + * The netlink callback context carries dump state across multiple dumpit calls + */ +struct drm_ras_ctx { + /* Which xarray id to restart the dump from */ + unsigned long restart; +}; + +/** + * drm_ras_nl_list_nodes_dumpit() - Dump all registered RAS nodes + * @skb: Netlink message buffer + * @cb: Callback context for multi-part dumps + * + * Iterates over all registered RAS nodes in the global xarray and appends + * their attributes (ID, name, type) to the given netlink message buffer. + * Uses @cb->ctx to track progress in case the message buffer fills up, allowing + * multi-part dump support. On buffer overflow, updates the context to resume + * from the last node on the next invocation. + * + * Return: 0 if all nodes fit in @skb, number of bytes added to @skb if + * the buffer filled up (requires multi-part continuation), or + * a negative error code on failure. 
+ */ +int drm_ras_nl_list_nodes_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct genl_info *info = genl_info_dump(cb); + struct drm_ras_ctx *ctx = (void *)cb->ctx; + struct drm_ras_node *node; + struct nlattr *hdr; + unsigned long id; + int ret; + + xa_for_each_start(&drm_ras_xa, id, node, ctx->restart) { + hdr = genlmsg_iput(skb, info); + if (!hdr) { + ret = -EMSGSIZE; + break; + } + + ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_ID, node->id); + if (ret) { + genlmsg_cancel(skb, hdr); + break; + } + + ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_DEVICE_NAME, + node->device_name); + if (ret) { + genlmsg_cancel(skb, hdr); + break; + } + + ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_NODE_NAME, + node->node_name); + if (ret) { + genlmsg_cancel(skb, hdr); + break; + } + + ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_TYPE, + node->type); + if (ret) { + genlmsg_cancel(skb, hdr); + break; + } + + genlmsg_end(skb, hdr); + } + + if (ret == -EMSGSIZE) + ctx->restart = id; + + return ret; +} + +static int get_node_error_counter(u32 node_id, u32 error_id, + const char **name, u32 *value) +{ + struct drm_ras_node *node; + + node = xa_load(&drm_ras_xa, node_id); + if (!node || !node->query_error_counter) + return -ENOENT; + + if (error_id < node->error_counter_range.first || + error_id > node->error_counter_range.last) + return -EINVAL; + + return node->query_error_counter(node, error_id, name, value); +} + +static int msg_reply_value(struct sk_buff *msg, u32 error_id, + const char *error_name, u32 value) +{ + int ret; + + ret = nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, error_id); + if (ret) + return ret; + + ret = nla_put_string(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME, + error_name); + if (ret) + return ret; + + return nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE, + value); +} + +static int doit_reply_value(struct genl_info *info, u32 node_id, + u32 error_id) +{ + struct sk_buff *msg; + struct nlattr 
*hdr; + const char *error_name; + u32 value; + int ret; + + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_iput(msg, info); + if (!hdr) { + nlmsg_free(msg); + return -EMSGSIZE; + } + + ret = get_node_error_counter(node_id, error_id, + &error_name, &value); + if (ret) + return ret; + + ret = msg_reply_value(msg, error_id, error_name, value); + if (ret) { + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); + return ret; + } + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); +} + +/** + * drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters + * @skb: Netlink message buffer + * @cb: Callback context for multi-part dumps + * + * Iterates over all error counters in a given Node and appends + * their attributes (ID, name, value) to the given netlink message buffer. + * Uses @cb->ctx to track progress in case the message buffer fills up, allowing + * multi-part dump support. On buffer overflow, updates the context to resume + * from the last node on the next invocation. + * + * Return: 0 if all errors fit in @skb, number of bytes added to @skb if + * the buffer filled up (requires multi-part continuation), or + * a negative error code on failure. 
+ */ +int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct genl_info *info = genl_info_dump(cb); + struct drm_ras_ctx *ctx = (void *)cb->ctx; + struct drm_ras_node *node; + struct nlattr *hdr; + const char *error_name; + u32 node_id, error_id, value; + int ret; + + if (!info->attrs || GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID)) + return -EINVAL; + + node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); + + node = xa_load(&drm_ras_xa, node_id); + if (!node) + return -ENOENT; + + for (error_id = max(node->error_counter_range.first, ctx->restart); + error_id <= node->error_counter_range.last; + error_id++) { + ret = get_node_error_counter(node_id, error_id, + &error_name, &value); + /* + * For non-contiguous range, driver return -ENOENT as indication + * to skip this ID when listing all errors. + */ + if (ret == -ENOENT) + continue; + if (ret) + return ret; + + hdr = genlmsg_iput(skb, info); + + if (!hdr) { + ret = -EMSGSIZE; + break; + } + + ret = msg_reply_value(skb, error_id, error_name, value); + if (ret) { + genlmsg_cancel(skb, hdr); + break; + } + + genlmsg_end(skb, hdr); + } + + if (ret == -EMSGSIZE) + ctx->restart = error_id; + + return ret; +} + +/** + * drm_ras_nl_get_error_counter_doit() - Query an error counter of an node + * @skb: Netlink message buffer + * @info: Generic Netlink info containing attributes of the request + * + * Extracts the node ID and error ID from the netlink attributes and + * retrieves the current value of the corresponding error counter. Sends the + * result back to the requesting user via the standard Genl reply. + * + * Return: 0 on success, or negative errno on failure. 
+ */ +int drm_ras_nl_get_error_counter_doit(struct sk_buff *skb, + struct genl_info *info) +{ + u32 node_id, error_id; + + if (!info->attrs || + GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) || + GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID)) + return -EINVAL; + + node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); + error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]); + + return doit_reply_value(info, node_id, error_id); +} + +/** + * drm_ras_node_register() - Register a new RAS node + * @node: Node structure to register + * + * Adds the given RAS node to the global node xarray and assigns it + * a unique ID. Both @node->name and @node->type must be valid. + * + * Return: 0 on success, or negative errno on failure: + */ +int drm_ras_node_register(struct drm_ras_node *node) +{ + if (!node->device_name || !node->node_name) + return -EINVAL; + + /* Currently, only Error Counter Endpoints are supported */ + if (node->type != DRM_RAS_NODE_TYPE_ERROR_COUNTER) + return -EINVAL; + + /* Mandatory entries for Error Counter Node */ + if (node->type == DRM_RAS_NODE_TYPE_ERROR_COUNTER && + (!node->error_counter_range.last || !node->query_error_counter)) + return -EINVAL; + + return xa_alloc(&drm_ras_xa, &node->id, node, xa_limit_32b, GFP_KERNEL); +} +EXPORT_SYMBOL(drm_ras_node_register); + +/** + * drm_ras_node_unregister() - Unregister a previously registered node + * @node: Node structure to unregister + * + * Removes the given node from the global node xarray using its ID. 
+ */ +void drm_ras_node_unregister(struct drm_ras_node *node) +{ + xa_erase(&drm_ras_xa, node->id); +} +EXPORT_SYMBOL(drm_ras_node_unregister); diff --git a/drivers/gpu/drm/drm_ras_genl_family.c b/drivers/gpu/drm/drm_ras_genl_family.c new file mode 100644 index 000000000000..6f406d3d48c5 --- /dev/null +++ b/drivers/gpu/drm/drm_ras_genl_family.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2026 Intel Corporation + */ + +#include +#include "drm_ras_nl.h" + +/* Track family registration so the drm_exit can be called at any time */ +static bool registered; + +/** + * drm_ras_genl_family_register() - Register drm-ras genl family + * + * Only to be called one at drm_drv_init() + */ +int drm_ras_genl_family_register(void) +{ + int ret; + + registered = false; + + ret = genl_register_family(&drm_ras_nl_family); + if (ret) + return ret; + + registered = true; + return 0; +} + +/** + * drm_ras_genl_family_unregister() - Unregister drm-ras genl family + * + * To be called one at drm_drv_exit() at any moment, but only once. 
+ */ +void drm_ras_genl_family_unregister(void) +{ + if (registered) { + genl_unregister_family(&drm_ras_nl_family); + registered = false; + } +} diff --git a/drivers/gpu/drm/drm_ras_nl.c b/drivers/gpu/drm/drm_ras_nl.c new file mode 100644 index 000000000000..16803d0c4a44 --- /dev/null +++ b/drivers/gpu/drm/drm_ras_nl.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/drm_ras.yaml */ +/* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#include +#include + +#include "drm_ras_nl.h" + +#include + +/* DRM_RAS_CMD_GET_ERROR_COUNTER - do */ +static const struct nla_policy drm_ras_get_error_counter_do_nl_policy[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID + 1] = { + [DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID] = { .type = NLA_U32, }, + [DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID] = { .type = NLA_U32, }, +}; + +/* DRM_RAS_CMD_GET_ERROR_COUNTER - dump */ +static const struct nla_policy drm_ras_get_error_counter_dump_nl_policy[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID + 1] = { + [DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID] = { .type = NLA_U32, }, +}; + +/* Ops table for drm_ras */ +static const struct genl_split_ops drm_ras_nl_ops[] = { + { + .cmd = DRM_RAS_CMD_LIST_NODES, + .dumpit = drm_ras_nl_list_nodes_dumpit, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP, + }, + { + .cmd = DRM_RAS_CMD_GET_ERROR_COUNTER, + .doit = drm_ras_nl_get_error_counter_doit, + .policy = drm_ras_get_error_counter_do_nl_policy, + .maxattr = DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = DRM_RAS_CMD_GET_ERROR_COUNTER, + .dumpit = drm_ras_nl_get_error_counter_dumpit, + .policy = drm_ras_get_error_counter_dump_nl_policy, + .maxattr = DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP, + }, +}; + +struct genl_family drm_ras_nl_family __ro_after_init = { + .name = 
DRM_RAS_FAMILY_NAME, + .version = DRM_RAS_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = drm_ras_nl_ops, + .n_split_ops = ARRAY_SIZE(drm_ras_nl_ops), +}; diff --git a/drivers/gpu/drm/drm_ras_nl.h b/drivers/gpu/drm/drm_ras_nl.h new file mode 100644 index 000000000000..06ccd9342773 --- /dev/null +++ b/drivers/gpu/drm/drm_ras_nl.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/drm_ras.yaml */ +/* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#ifndef _LINUX_DRM_RAS_GEN_H +#define _LINUX_DRM_RAS_GEN_H + +#include +#include + +#include + +int drm_ras_nl_list_nodes_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int drm_ras_nl_get_error_counter_doit(struct sk_buff *skb, + struct genl_info *info); +int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); + +extern struct genl_family drm_ras_nl_family; + +#endif /* _LINUX_DRM_RAS_GEN_H */ diff --git a/include/drm/drm_ras.h b/include/drm/drm_ras.h new file mode 100644 index 000000000000..5d50209e51db --- /dev/null +++ b/include/drm/drm_ras.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2026 Intel Corporation + */ + +#ifndef __DRM_RAS_H__ +#define __DRM_RAS_H__ + +#include + +/** + * struct drm_ras_node - A DRM RAS Node + */ +struct drm_ras_node { + /** @id: Unique identifier for the node. Dynamically assigned. */ + u32 id; + /** + * @device_name: Human-readable name of the device. Given by the driver. + */ + const char *device_name; + /** @node_name: Human-readable name of the node. Given by the driver. */ + const char *node_name; + /** @type: Type of the node (enum drm_ras_node_type). 
*/ + enum drm_ras_node_type type; + + /* Error-Counter Related Callback and Variables */ + + /** @error_counter_range: Range of valid Error IDs for this node. */ + struct { + /** @first: First valid Error ID. */ + u32 first; + /** @last: Last valid Error ID. Mandatory entry. */ + u32 last; + } error_counter_range; + + /** + * @query_error_counter: + * + * This callback is used by drm-ras to query a specific error counter. + * Used for input check and to iterate all error counters in a node. + * + * Driver should expect query_error_counter() to be called with + * error_id from `error_counter_range.first` to + * `error_counter_range.last`. + * + * The @query_error_counter is a mandatory callback for + * error_counter_node. + * + * Returns: 0 on success, + * -ENOENT when error_id is not supported as an indication that + * drm_ras should silently skip this entry. Used for + * supporting non-contiguous error ranges. + * Driver is responsible for maintaining the list of + * supported error IDs in the range of first to last. + * Other negative values on errors that should terminate the + * netlink query. 
+ */ + int (*query_error_counter)(struct drm_ras_node *node, u32 error_id, + const char **name, u32 *val); + + /** @priv: Driver private data */ + void *priv; +}; + +struct drm_device; + +#if IS_ENABLED(CONFIG_DRM_RAS) +int drm_ras_node_register(struct drm_ras_node *node); +void drm_ras_node_unregister(struct drm_ras_node *node); +#else +static inline int drm_ras_node_register(struct drm_ras_node *node) { return 0; } +static inline void drm_ras_node_unregister(struct drm_ras_node *node) { } +#endif + +#endif diff --git a/include/drm/drm_ras_genl_family.h b/include/drm/drm_ras_genl_family.h new file mode 100644 index 000000000000..910fb3943a75 --- /dev/null +++ b/include/drm/drm_ras_genl_family.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2026 Intel Corporation + */ + +#ifndef __DRM_RAS_GENL_FAMILY_H__ +#define __DRM_RAS_GENL_FAMILY_H__ + +#if IS_ENABLED(CONFIG_DRM_RAS) +int drm_ras_genl_family_register(void); +void drm_ras_genl_family_unregister(void); +#else +static inline int drm_ras_genl_family_register(void) { return 0; } +static inline void drm_ras_genl_family_unregister(void) { } +#endif + +#endif diff --git a/include/uapi/drm/drm_ras.h b/include/uapi/drm/drm_ras.h new file mode 100644 index 000000000000..5f40fa5b869d --- /dev/null +++ b/include/uapi/drm/drm_ras.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/drm_ras.yaml */ +/* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#ifndef _UAPI_LINUX_DRM_RAS_H +#define _UAPI_LINUX_DRM_RAS_H + +#define DRM_RAS_FAMILY_NAME "drm-ras" +#define DRM_RAS_FAMILY_VERSION 1 + +/* + * Type of the node. Currently, only error-counter nodes are supported, which + * expose reliability counters for a hardware/software component. 
+ */ +enum drm_ras_node_type { + DRM_RAS_NODE_TYPE_ERROR_COUNTER = 1, +}; + +enum { + DRM_RAS_A_NODE_ATTRS_NODE_ID = 1, + DRM_RAS_A_NODE_ATTRS_DEVICE_NAME, + DRM_RAS_A_NODE_ATTRS_NODE_NAME, + DRM_RAS_A_NODE_ATTRS_NODE_TYPE, + + __DRM_RAS_A_NODE_ATTRS_MAX, + DRM_RAS_A_NODE_ATTRS_MAX = (__DRM_RAS_A_NODE_ATTRS_MAX - 1) +}; + +enum { + DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID = 1, + DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, + DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME, + DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE, + + __DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX, + DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX = (__DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX - 1) +}; + +enum { + DRM_RAS_CMD_LIST_NODES = 1, + DRM_RAS_CMD_GET_ERROR_COUNTER, + + __DRM_RAS_CMD_MAX, + DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1) +}; + +#endif /* _UAPI_LINUX_DRM_RAS_H */ -- cgit v1.2.3 From b40db12b542f503b5ec689d18d473299d49eeb60 Mon Sep 17 00:00:00 2001 From: Riana Tauro Date: Wed, 4 Mar 2026 13:14:09 +0530 Subject: drm/xe/xe_drm_ras: Add support for XE DRM RAS Allocate correctable, uncorrectable nodes for every xe device. Each node contains error component, counters and respective query counter functions. Add basic functionality to create and register drm nodes. 
Below operations can be performed using Generic netlink DRM RAS interface: 1) List Nodes: $ sudo ynl --family drm_ras --dump list-nodes [{'device-name': '0000:03:00.0', 'node-id': 0, 'node-name': 'correctable-errors', 'node-type': 'error-counter'}, {'device-name': '0000:03:00.0', 'node-id': 1, 'node-name': 'uncorrectable-errors', 'node-type': 'error-counter'}] 2) Get Error counters: $ sudo ynl --family drm_ras --dump get-error-counter --json '{"node-id":0}' [{'error-id': 1, 'error-name': 'core-compute', 'error-value': 0}, {'error-id': 2, 'error-name': 'soc-internal', 'error-value': 0}] 3) Get specific Error counter: $ sudo ynl --family drm_ras --do get-error-counter --json '{"node-id":0, "error-id":1}' {'error-id': 1, 'error-name': 'core-compute', 'error-value': 0} Signed-off-by: Riana Tauro Reviewed-by: Raag Jadav Link: https://patch.msgid.link/20260304074412.464435-9-riana.tauro@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/Makefile | 1 + drivers/gpu/drm/xe/xe_device_types.h | 4 + drivers/gpu/drm/xe/xe_drm_ras.c | 186 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/xe/xe_drm_ras.h | 15 +++ drivers/gpu/drm/xe/xe_drm_ras_types.h | 40 ++++++++ include/uapi/drm/xe_drm.h | 79 +++++++++++++++ 6 files changed, 325 insertions(+) create mode 100644 drivers/gpu/drm/xe/xe_drm_ras.c create mode 100644 drivers/gpu/drm/xe/xe_drm_ras.h create mode 100644 drivers/gpu/drm/xe/xe_drm_ras_types.h (limited to 'include') diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 4a7eaeeca293..a32d370c3d30 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -41,6 +41,7 @@ xe-y += xe_bb.o \ xe_device_sysfs.o \ xe_dma_buf.o \ xe_drm_client.o \ + xe_drm_ras.o \ xe_eu_stall.o \ xe_exec.o \ xe_exec_queue.o \ diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index e9032014923d..3e04e80e0815 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -13,6 
+13,7 @@ #include #include "xe_devcoredump_types.h" +#include "xe_drm_ras_types.h" #include "xe_heci_gsc.h" #include "xe_late_bind_fw_types.h" #include "xe_oa_types.h" @@ -511,6 +512,9 @@ struct xe_device { /** @pmu: performance monitoring unit */ struct xe_pmu pmu; + /** @ras: RAS structure for device */ + struct xe_drm_ras ras; + /** @i2c: I2C host controller */ struct xe_i2c *i2c; diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c new file mode 100644 index 000000000000..e07dc23a155e --- /dev/null +++ b/drivers/gpu/drm/xe/xe_drm_ras.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2026 Intel Corporation + */ + +#include + +#include +#include +#include + +#include "xe_device_types.h" +#include "xe_drm_ras.h" + +static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES; +static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES; + +static int hw_query_error_counter(struct xe_drm_ras_counter *info, + u32 error_id, const char **name, u32 *val) +{ + if (!info || !info[error_id].name) + return -ENOENT; + + *name = info[error_id].name; + *val = atomic_read(&info[error_id].counter); + + return 0; +} + +static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id, + const char **name, u32 *val) +{ + struct xe_device *xe = ep->priv; + struct xe_drm_ras *ras = &xe->ras; + struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE]; + + return hw_query_error_counter(info, error_id, name, val); +} + +static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id, + const char **name, u32 *val) +{ + struct xe_device *xe = ep->priv; + struct xe_drm_ras *ras = &xe->ras; + struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE]; + + return hw_query_error_counter(info, error_id, name, val); +} + +static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe) +{ + struct xe_drm_ras_counter 
*counter; + int i; + + counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL); + if (!counter) + return ERR_PTR(-ENOMEM); + + for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) { + if (!error_components[i]) + continue; + + counter[i].name = error_components[i]; + atomic_set(&counter[i].counter, 0); + } + + return counter; +} + +static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node, + const enum drm_xe_ras_error_severity severity) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + struct xe_drm_ras *ras = &xe->ras; + const char *device_name; + + device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d", + pci_domain_nr(pdev->bus), pdev->bus->number, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + + if (!device_name) + return -ENOMEM; + + node->device_name = device_name; + node->node_name = error_severity[severity]; + node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER; + node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; + node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1; + node->priv = xe; + + ras->info[severity] = allocate_and_copy_counters(xe); + if (IS_ERR(ras->info[severity])) + return PTR_ERR(ras->info[severity]); + + if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) + node->query_error_counter = query_correctable_error_counter; + else + node->query_error_counter = query_uncorrectable_error_counter; + + return 0; +} + +static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity) +{ + struct drm_ras_node *node = &ras->node[severity]; + + kfree(ras->info[severity]); + ras->info[severity] = NULL; + + kfree(node->device_name); + node->device_name = NULL; +} + +static int register_nodes(struct xe_device *xe) +{ + struct xe_drm_ras *ras = &xe->ras; + int i; + + for_each_error_severity(i) { + struct drm_ras_node *node = &ras->node[i]; + int ret; + + ret = assign_node_params(xe, node, i); + if (ret) { + cleanup_node_param(ras, i); + 
return ret; + } + + ret = drm_ras_node_register(node); + if (ret) { + cleanup_node_param(ras, i); + return ret; + } + } + + return 0; +} + +static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg) +{ + struct xe_device *xe = arg; + struct xe_drm_ras *ras = &xe->ras; + int i; + + for_each_error_severity(i) { + struct drm_ras_node *node = &ras->node[i]; + + drm_ras_node_unregister(node); + cleanup_node_param(ras, i); + } +} + +/** + * xe_drm_ras_init() - Initialize DRM RAS + * @xe: xe device instance + * + * Allocate and register DRM RAS nodes per device + * + * Return: 0 on success, negative error code otherwise. + */ +int xe_drm_ras_init(struct xe_device *xe) +{ + struct xe_drm_ras *ras = &xe->ras; + struct drm_ras_node *node; + int err; + + node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + ras->node = node; + + err = register_nodes(xe); + if (err) { + drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err)); + return err; + } + + err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe); + if (err) { + drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err)); + return err; + } + + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_drm_ras.h b/drivers/gpu/drm/xe/xe_drm_ras.h new file mode 100644 index 000000000000..5cc8f0124411 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_drm_ras.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2026 Intel Corporation + */ +#ifndef XE_DRM_RAS_H_ +#define XE_DRM_RAS_H_ + +struct xe_device; + +#define for_each_error_severity(i) \ + for (i = 0; i < DRM_XE_RAS_ERR_SEV_MAX; i++) + +int xe_drm_ras_init(struct xe_device *xe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_drm_ras_types.h b/drivers/gpu/drm/xe/xe_drm_ras_types.h new file mode 100644 index 000000000000..7acc5e7377b2 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_drm_ras_types.h @@ -0,0 +1,40 @@ +/* 
SPDX-License-Identifier: MIT */ +/* + * Copyright © 2026 Intel Corporation + */ + +#ifndef _XE_DRM_RAS_TYPES_H_ +#define _XE_DRM_RAS_TYPES_H_ + +#include +#include + +struct drm_ras_node; + +/** + * struct xe_drm_ras_counter - XE RAS counter + * + * This structure contains error component and counter information + */ +struct xe_drm_ras_counter { + /** @name: error component name */ + const char *name; + + /** @counter: count of error */ + atomic_t counter; +}; + +/** + * struct xe_drm_ras - XE DRM RAS structure + * + * This structure has details of error counters + */ +struct xe_drm_ras { + /** @node: DRM RAS node */ + struct drm_ras_node *node; + + /** @info: info array for all types of errors */ + struct xe_drm_ras_counter *info[DRM_XE_RAS_ERR_SEV_MAX]; +}; + +#endif diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index ef2565048bdf..b0264c32ceb2 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -2357,6 +2357,85 @@ struct drm_xe_exec_queue_set_property { __u64 reserved[2]; }; +/** + * DOC: Xe DRM RAS + * + * The enums and strings defined below map to the attributes of the DRM RAS Netlink Interface. + * Refer to Documentation/netlink/specs/drm_ras.yaml for complete interface specification. + * + * Node Registration + * ================= + * + * The driver registers DRM RAS nodes for each error severity level. + * enum drm_xe_ras_error_severity defines the node-id, while DRM_XE_RAS_ERROR_SEVERITY_NAMES maps + * node-id to node-name. + * + * Error Classification + * ==================== + * + * Each node contains a list of error counters. Each error is identified by a error-id and + * an error-name. enum drm_xe_ras_error_component defines the error-id, while + * DRM_XE_RAS_ERROR_COMPONENT_NAMES maps error-id to error-name. + * + * User Interface + * ============== + * + * To retrieve error values of a error counter, userspace applications should + * follow the below steps: + * + * 1. 
Use command LIST_NODES to enumerate all available nodes + * 2. Select node by node-id or node-name + * 3. Use command GET_ERROR_COUNTERS to list errors of specific node + * 4. Query specific error values using either error-id or error-name + * + * .. code-block:: C + * + * // Lookup tables for ID-to-name resolution + * static const char *nodes[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES; + * static const char *errors[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES; + * + */ + +/** + * enum drm_xe_ras_error_severity - DRM RAS error severity. + */ +enum drm_xe_ras_error_severity { + /** @DRM_XE_RAS_ERR_SEV_CORRECTABLE: Correctable Error */ + DRM_XE_RAS_ERR_SEV_CORRECTABLE = 0, + /** @DRM_XE_RAS_ERR_SEV_UNCORRECTABLE: Uncorrectable Error */ + DRM_XE_RAS_ERR_SEV_UNCORRECTABLE, + /** @DRM_XE_RAS_ERR_SEV_MAX: Max severity */ + DRM_XE_RAS_ERR_SEV_MAX /* non-ABI */ +}; + +/** + * enum drm_xe_ras_error_component - DRM RAS error component. + */ +enum drm_xe_ras_error_component { + /** @DRM_XE_RAS_ERR_COMP_CORE_COMPUTE: Core Compute Error */ + DRM_XE_RAS_ERR_COMP_CORE_COMPUTE = 1, + /** @DRM_XE_RAS_ERR_COMP_SOC_INTERNAL: SoC Internal Error */ + DRM_XE_RAS_ERR_COMP_SOC_INTERNAL, + /** @DRM_XE_RAS_ERR_COMP_MAX: Max Error */ + DRM_XE_RAS_ERR_COMP_MAX /* non-ABI */ +}; + +/* + * Error severity to name mapping. + */ +#define DRM_XE_RAS_ERROR_SEVERITY_NAMES { \ + [DRM_XE_RAS_ERR_SEV_CORRECTABLE] = "correctable-errors", \ + [DRM_XE_RAS_ERR_SEV_UNCORRECTABLE] = "uncorrectable-errors", \ +} + +/* + * Error component to name mapping. 
+ */ +#define DRM_XE_RAS_ERROR_COMPONENT_NAMES { \ + [DRM_XE_RAS_ERR_COMP_CORE_COMPUTE] = "core-compute", \ + [DRM_XE_RAS_ERR_COMP_SOC_INTERNAL] = "soc-internal" \ +} + #if defined(__cplusplus) } #endif -- cgit v1.2.3 From de70eef32e10883fe74f6df635c616785b24b867 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:13:09 -0800 Subject: ARM: omap: fix all kernel-doc warnings Use the correct struct member names to avoid kernel-doc warnings: Warning: include/linux/platform_data/voltage-omap.h:27 struct member 'volt_nominal' not described in 'omap_volt_data' Warning: include/linux/platform_data/voltage-omap.h:27 struct member 'vp_errgain' not described in 'omap_volt_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260226051309.556228-1-rdunlap@infradead.org Signed-off-by: Kevin Hilman --- include/linux/platform_data/voltage-omap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/platform_data/voltage-omap.h b/include/linux/platform_data/voltage-omap.h index 6d74e507dbd2..2b48f2b0135d 100644 --- a/include/linux/platform_data/voltage-omap.h +++ b/include/linux/platform_data/voltage-omap.h @@ -10,14 +10,14 @@ /** * struct omap_volt_data - Omap voltage specific data. - * @voltage_nominal: The possible voltage value in uV + * @volt_nominal: The possible voltage value in uV * @sr_efuse_offs: The offset of the efuse register(from system * control module base address) from where to read * the n-target value for the smartreflex module. * @sr_errminlimit: Error min limit value for smartreflex. This value * differs at differnet opp and thus is linked * with voltage. - * @vp_errorgain: Error gain value for the voltage processor. This + * @vp_errgain: Error gain value for the voltage processor. This * field also differs according to the voltage/opp. 
*/ struct omap_volt_data { -- cgit v1.2.3 From 46cb1fcdb75b2dab2f3ed62caad04fe939549943 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Mar 2026 02:27:06 +0000 Subject: tcp: move tcp_v6_early_demux() to net/ipv6/ip6_input.c tcp_v6_early_demux() has a single caller : ip6_rcv_finish_core(). Move it to net/ipv6/ip6_input.c and mark it static, for possible compiler/linker optimizations. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260304022706.1062459-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 1 - net/ipv6/ip6_input.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 38 -------------------------------------- 3 files changed, 40 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3f1fe954e6aa..a64641423806 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1142,7 +1142,6 @@ static inline int tcp_v6_sdif(const struct sk_buff *skb) extern const struct inet_connection_sock_af_ops ipv6_specific; INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); -void tcp_v6_early_demux(struct sk_buff *skb); #endif diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 2bcb981c91aa..967b07aeb683 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -44,6 +44,46 @@ #include #include #include +#include + +static void tcp_v6_early_demux(struct sk_buff *skb) +{ + struct net *net = dev_net_rcu(skb->dev); + const struct ipv6hdr *hdr; + const struct tcphdr *th; + struct sock *sk; + + if (skb->pkt_type != PACKET_HOST) + return; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct tcphdr))) + return; + + hdr = ipv6_hdr(skb); + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr) / 4) + return; + + /* Note : We use inet6_iif() here, not tcp_v6_iif() */ + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, + &hdr->daddr, ntohs(th->dest), + inet6_iif(skb), inet6_sdif(skb)); + if (sk) { + skb->sk 
= sk; + skb->destructor = sock_edemux; + if (sk_fullsock(sk)) { + struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); + + if (dst) + dst = dst_check(dst, sk->sk_rx_dst_cookie); + if (dst && + sk->sk_rx_dst_ifindex == skb->skb_iif) + skb_dst_set_noref(skb, dst); + } + } +} static void ip6_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a62dd2999aec..164dceb842af 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1972,44 +1972,6 @@ do_time_wait: goto discard_it; } -void tcp_v6_early_demux(struct sk_buff *skb) -{ - struct net *net = dev_net_rcu(skb->dev); - const struct ipv6hdr *hdr; - const struct tcphdr *th; - struct sock *sk; - - if (skb->pkt_type != PACKET_HOST) - return; - - if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return; - - hdr = ipv6_hdr(skb); - th = tcp_hdr(skb); - - if (th->doff < sizeof(struct tcphdr) / 4) - return; - - /* Note : We use inet6_iif() here, not tcp_v6_iif() */ - sk = __inet6_lookup_established(net, &hdr->saddr, th->source, - &hdr->daddr, ntohs(th->dest), - inet6_iif(skb), inet6_sdif(skb)); - if (sk) { - skb->sk = sk; - skb->destructor = sock_edemux; - if (sk_fullsock(sk)) { - struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); - - if (dst) - dst = dst_check(dst, sk->sk_rx_dst_cookie); - if (dst && - sk->sk_rx_dst_ifindex == skb->skb_iif) - skb_dst_set_noref(skb, dst); - } - } -} - static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), }; -- cgit v1.2.3 From d4d8c6e6fd2a1c5144339884ca5f66e654ad54a5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 3 Mar 2026 23:54:16 +0000 Subject: tcp: Initialise ehash secrets during connect() and listen(). inet_ehashfn() and inet6_ehashfn() initialise random secrets on the first call by net_get_random_once(). 
While the init part is patched out using static keys, with CONFIG_STACKPROTECTOR_STRONG=y, this causes a compiler to generate a stack canary due to an automatic variable, unsigned long ___flags, in the DO_ONCE() macro being passed to __do_once_start(). With FDO, this is visible in __inet_lookup_established() and __inet6_lookup_established() too. Let's initialise the secrets by get_random_sleepable_once() in the slow paths: inet_hash() for listen(), and inet_hash_connect() and inet6_hash_connect() for connect(). Note that IPv6 listener will initialise both IPv4 & IPv6 secrets in inet_hash() for IPv4-mapped IPv6 address. With the patch, the stack size is reduced by 16 bytes (___flags + a stack canary) and NOPs for the static key go away. Before: __inet6_lookup_established() ... push %rbx sub $0x38,%rsp # stack is 56 bytes mov %edx,%ebx # sport mov %gs:0x299419f(%rip),%rax # load stack canary mov %rax,0x30(%rsp) and store it onto stack mov 0x440(%rdi),%r15 # net->ipv4.tcp_death_row.hashinfo nop 32: mov %r8d,%ebp # hnum shl $0x10,%ebp # hnum << 16 nop 3d: mov 0x70(%rsp),%r14d # sdif or %ebx,%ebp # INET_COMBINED_PORTS(sport, hnum) mov 0x11a8382(%rip),%eax # inet6_ehashfn() ... After: __inet6_lookup_established() ... push %rbx sub $0x28,%rsp # stack is 40 bytes mov 0x60(%rsp),%ebp # sdif mov %r8d,%r14d # hnum shl $0x10,%r14d # hnum << 16 or %edx,%r14d # INET_COMBINED_PORTS(sport, hnum) mov 0x440(%rdi),%rax # net->ipv4.tcp_death_row.hashinfo mov 0x1194f09(%rip),%r10d # inet6_ehashfn() ... 
Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260303235424.3877267-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/net.h | 2 ++ include/net/inet6_hashtables.h | 2 ++ net/ipv4/inet_hashtables.c | 17 +++++++++++++++-- net/ipv6/inet6_hashtables.c | 13 ++++++++++--- 4 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index f58b38ab37f8..a8e818de95b3 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -304,6 +304,8 @@ do { \ #define net_get_random_once(buf, nbytes) \ get_random_once((buf), (nbytes)) +#define net_get_random_sleepable_once(buf, nbytes) \ + get_random_sleepable_once((buf), (nbytes)) /* * E.g. XFS meta- & log-data is in slab pages, or bcache meta diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index c16de5b7963f..2cc5d416bbb5 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -24,6 +24,8 @@ struct inet_hashinfo; +void inet6_init_ehash_secret(void); + static inline unsigned int __inet6_ehashfn(const u32 lhash, const u16 lport, const u32 fhash, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 61e654b395be..ac7b67c603b5 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -30,12 +30,16 @@ #include #include +static void inet_init_ehash_secret(void) +{ + net_get_random_sleepable_once(&inet_ehash_secret, + sizeof(inet_ehash_secret)); +} + u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { - net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); - return lport + __inet_ehashfn(laddr, 0, faddr, fport, inet_ehash_secret + net_hash_mix(net)); } @@ -793,6 +797,13 @@ int inet_hash(struct sock *sk) local_bh_enable(); return 0; } + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + 
inet6_init_ehash_secret(); +#endif + inet_init_ehash_secret(); + WARN_ON(!sk_unhashed(sk)); ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); @@ -1239,6 +1250,8 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); + inet_init_ehash_secret(); + hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, inet->inet_daddr, inet->inet_dport); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 182d38e6d6d8..72bc68fef48a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -23,15 +23,20 @@ #include #include +void inet6_init_ehash_secret(void) +{ + net_get_random_sleepable_once(&inet6_ehash_secret, + sizeof(inet6_ehash_secret)); + net_get_random_sleepable_once(&tcp_ipv6_hash_secret, + sizeof(tcp_ipv6_hash_secret)); +} + u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { u32 lhash, fhash; - net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); - net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret)); - lhash = (__force u32)laddr->s6_addr32[3]; fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret); @@ -363,6 +368,8 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); + inet6_init_ehash_secret(); + hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport); return __inet_hash_connect(death_row, sk, port_offset, hash_port0, -- cgit v1.2.3 From f3e334fb7f82cd63734faeb395419ab713b4bb5c Mon Sep 17 00:00:00 2001 From: Ricardo Robaina Date: Tue, 3 Mar 2026 10:35:28 -0300 Subject: audit: fix coding style issues Fix various coding style issues across the audit subsystem flagged by checkpatch.pl script to adhere to kernel coding standards. 
Specific changes include: - kernel/auditfilter.c: Move the open brace '{' to the previous line for the audit_ops array declaration. - lib/audit.c: Add a required space before the open parenthesis '('. - include/uapi/linux/audit.h: Enclose the complex macro value for AUDIT_UID_UNSET in parentheses. Signed-off-by: Ricardo Robaina Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 2 +- kernel/auditfilter.c | 3 +-- lib/audit.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 71cbdc542ce9..e8f5ce677df7 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -508,7 +508,7 @@ struct audit_tty_status { __u32 log_passwd; /* 1 = enabled, 0 = disabled */ }; -#define AUDIT_UID_UNSET (unsigned int)-1 +#define AUDIT_UID_UNSET ((unsigned int)-1) #define AUDIT_SID_UNSET ((unsigned int)-1) /* audit_rule_data supports filter rules with both integer and string diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6e3abbf08e3d..093425123f6c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -303,8 +303,7 @@ exit_err: return ERR_PTR(err); } -static u32 audit_ops[] = -{ +static u32 audit_ops[] = { [Audit_equal] = AUDIT_EQUAL, [Audit_not_equal] = AUDIT_NOT_EQUAL, [Audit_bitmask] = AUDIT_BIT_MASK, diff --git a/lib/audit.c b/lib/audit.c index 738bda22dd39..bc07fbd3a698 100644 --- a/lib/audit.c +++ b/lib/audit.c @@ -42,7 +42,7 @@ int audit_classify_syscall(int abi, unsigned syscall) if (audit_is_compat(abi)) return audit_classify_compat_syscall(abi, syscall); - switch(syscall) { + switch (syscall) { #ifdef __NR_open case __NR_open: return AUDITSC_OPEN; -- cgit v1.2.3 From 5b30afc20b3fea29b9beb83c6415c4ff06f774aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Mar 2026 11:26:47 -1000 Subject: cgroup: Expose some cgroup helpers Expose the following through cgroup.h: - cgroup_on_dfl() - cgroup_is_dead() - cgroup_for_each_live_child() - 
cgroup_for_each_live_descendant_pre() - cgroup_for_each_live_descendant_post() Until now, these didn't need to be exposed because controllers only cared about the css hierarchy. The planned sched_ext hierarchical scheduler support will be based on the default cgroup hierarchy, which is in line with the existing BPF cgroup support, and thus needs these exposed. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 65 +++++++++++++++++++++++++++++++++++++++-- kernel/cgroup/cgroup-internal.h | 6 ---- kernel/cgroup/cgroup.c | 55 ---------------------------------- 3 files changed, 63 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bc892e3b37ee..e52160e85af4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -42,6 +42,14 @@ struct kernel_clone_args; #ifdef CONFIG_CGROUPS +/* + * To avoid confusing the compiler (and generating warnings) with code + * that attempts to access what would be a 0-element array (i.e. sized + * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this + * constant expression can be added. 
+ */ +#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) + enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ @@ -76,6 +84,7 @@ enum cgroup_lifetime_events { extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; +extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct blocking_notifier_head cgroup_lifetime_notifier; @@ -103,6 +112,8 @@ extern struct blocking_notifier_head cgroup_lifetime_notifier; #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) +bool cgroup_on_dfl(const struct cgroup *cgrp); + bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, @@ -274,6 +285,32 @@ void css_task_iter_end(struct css_task_iter *it); for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + +/* walk live descendants in pre order */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + 
cgroup_is_dead(dsct); })) \ + ; \ + else + /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor @@ -336,6 +373,27 @@ static inline u64 cgroup_id(const struct cgroup *cgrp) return cgrp->kn->id; } +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. + */ +static inline struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + if (CGROUP_HAS_SUBSYS_CONFIG && ss) + return rcu_dereference_check(cgrp->subsys[ss->id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->self; +} + /** * css_is_dying - test whether the specified css is dying * @css: target css @@ -372,6 +430,11 @@ static inline bool css_is_self(struct cgroup_subsys_state *css) return false; } +static inline bool cgroup_is_dead(const struct cgroup *cgrp) +{ + return !(cgrp->self.flags & CSS_ONLINE); +} + static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); @@ -387,8 +450,6 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); } -extern struct mutex cgroup_mutex; - static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 3bfe37693d68..58797123b752 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -184,11 +184,6 @@ extern bool cgrp_dfl_visible; for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return 
!(cgrp->self.flags & CSS_ONLINE); -} - static inline bool notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -222,7 +217,6 @@ static inline void get_css_set(struct css_set *cset) } bool cgroup_ssid_enabled(int ssid); -bool cgroup_on_dfl(const struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index be1d71dda317..cdc63be63f2c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -68,14 +68,6 @@ /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) -/* - * To avoid confusing the compiler (and generating warnings) with code - * that attempts to access what would be a 0-element array (i.e. sized - * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this - * constant expression can be added. - */ -#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) - /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -509,27 +501,6 @@ static u32 cgroup_ss_mask(struct cgroup *cgrp) return cgrp->root->subsys_mask; } -/** - * cgroup_css - obtain a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns @cgrp->self) - * - * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This - * function must be called either under cgroup_mutex or rcu_read_lock() and - * the caller is responsible for pinning the returned css if it wants to - * keep accessing it outside the said locks. This function may return - * %NULL if @cgrp doesn't have @subsys_id enabled. 
- */ -static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - if (CGROUP_HAS_SUBSYS_CONFIG && ss) - return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_mutex)); - else - return &cgrp->self; -} - /** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest @@ -741,32 +712,6 @@ EXPORT_SYMBOL_GPL(of_css); } \ } while (false) -/* iterate over child cgrps, lock should be held throughout iteration */ -#define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - cgroup_is_dead(child); })) \ - ; \ - else - -/* walk live descendants in pre order */ -#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ - css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - -/* walk live descendants in postorder */ -#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ - css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state -- cgit v1.2.3 From 336faf5d9115ca6982b82cf122e68738ea8c4173 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Feb 2026 09:16:53 +1100 Subject: VFS: make lookup_one_qstr_excl() static. lookup_one_qstr_excl() is no longer used outside of namei.c, so make it static. 
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20260224222542.3458677-9-neilb@ownmail.net Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 7 +++++++ fs/namei.c | 5 ++--- include/linux/namei.h | 3 --- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 52ff1d19405b..1dd31ab417a2 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1361,3 +1361,10 @@ to match what strlen() would return if it was ran on the string. However, if the string is freely accessible for the duration of inode's lifetime, consider using inode_set_cached_link() instead. + +--- + +**mandatory** + +lookup_one_qstr_excl() is no longer exported - use start_creating() or +similar. diff --git a/fs/namei.c b/fs/namei.c index 11c9a4a6c396..a5daa62399d7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1782,8 +1782,8 @@ static struct dentry *lookup_dcache(const struct qstr *name, * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. 
*/ -struct dentry *lookup_one_qstr_excl(const struct qstr *name, - struct dentry *base, unsigned int flags) +static struct dentry *lookup_one_qstr_excl(const struct qstr *name, + struct dentry *base, unsigned int flags) { struct dentry *dentry; struct dentry *old; @@ -1820,7 +1820,6 @@ found: } return dentry; } -EXPORT_SYMBOL(lookup_one_qstr_excl); /** * lookup_fast - do fast lockless (but racy) lookup of a dentry diff --git a/include/linux/namei.h b/include/linux/namei.h index 58600cf234bc..c7a7288cdd25 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -54,9 +54,6 @@ extern int path_pts(struct path *path); extern int user_path_at(int, const char __user *, unsigned, struct path *); -struct dentry *lookup_one_qstr_excl(const struct qstr *name, - struct dentry *base, - unsigned int flags); extern int kern_path(const char *, unsigned, struct path *); struct dentry *kern_path_parent(const char *name, struct path *parent); -- cgit v1.2.3 From 08e6183ed2568e733e05e7e1c9de737d91c21155 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Feb 2026 18:36:07 +0100 Subject: wifi: move action code from per-type frame structs The action code actually serves to identify the type of action frame, so it really isn't part of the per-type structure. Pull it out and have it in the general action frame format. In theory, whether or not the action code is present in this way is up to each category, but all categories that are defined right now all have that value. While at it, and since this change requires changing all users, remove the 'u' and make it an anonymous union in this case, so that all code using this changes. Change IEEE80211_MIN_ACTION_SIZE to take an argument which says how much of the frame is needed, e.g. category, action_code or the specific frame type that's defined in the union. Again this also ensures that all code is updated. 
In some cases, fix bugs where the SKB length was checked after having accessed beyond the checked length, in particular in FTM code, e.g. ieee80211_is_ftm(). Link: https://patch.msgid.link/20260226183607.67e71846b59e.I9a24328e3ffcaae179466a935f1c3345029f9961@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 4 +- drivers/net/wireless/ath/ath12k/mac.c | 4 +- drivers/net/wireless/ath/ath12k/wifi7/hw.c | 2 +- drivers/net/wireless/intel/iwlwifi/mld/time_sync.c | 6 +- .../net/wireless/intel/iwlwifi/mvm/ftm-initiator.c | 7 +- drivers/net/wireless/intel/iwlwifi/mvm/time-sync.c | 6 +- drivers/net/wireless/marvell/mwifiex/tdls.c | 12 +- drivers/net/wireless/marvell/mwl8k.c | 4 +- .../net/wireless/mediatek/mt76/mt76_connac_mac.c | 6 +- drivers/net/wireless/mediatek/mt76/mt7925/mac.c | 4 +- drivers/net/wireless/mediatek/mt76/mt7996/mac.c | 4 +- drivers/net/wireless/realtek/rtl8xxxu/core.c | 14 +-- drivers/net/wireless/realtek/rtlwifi/base.c | 28 ++--- drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +- drivers/net/wireless/silabs/wfx/data_rx.c | 8 +- include/linux/ieee80211.h | 83 +++++--------- net/mac80211/agg-rx.c | 27 ++--- net/mac80211/agg-tx.c | 28 ++--- net/mac80211/eht.c | 21 ++-- net/mac80211/ht.c | 31 +++--- net/mac80211/ibss.c | 18 +-- net/mac80211/iface.c | 18 +-- net/mac80211/mesh.c | 14 +-- net/mac80211/mesh_hwmp.c | 20 ++-- net/mac80211/mesh_plink.c | 21 ++-- net/mac80211/mlme.c | 82 ++++++-------- net/mac80211/rx.c | 123 +++++++++------------ net/mac80211/s1g.c | 28 ++--- net/mac80211/spectmgmt.c | 31 +++--- net/mac80211/tdls.c | 29 ++--- net/mac80211/util.c | 5 +- net/mac80211/vht.c | 10 +- 32 files changed, 308 insertions(+), 392 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index d1c121e943cb..a48b6bf1f29a 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -6288,10 +6288,10 @@ static int 
ath11k_mac_mgmt_action_frame_fill_elem_data(struct ath11k_vif *arvif, lockdep_assert_held(&ar->conf_mutex); /* make sure category field is present */ - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return -EINVAL; - remaining_len = skb->len - IEEE80211_MIN_ACTION_SIZE; + remaining_len = skb->len - IEEE80211_MIN_ACTION_SIZE(category); has_protected = ieee80211_has_protected(hdr->frame_control); /* In case of SW crypto and hdr protected (PMF), packet will already be encrypted, diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index af7590639dbf..a03881c73d68 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -9119,10 +9119,10 @@ static int ath12k_mac_mgmt_action_frame_fill_elem_data(struct ath12k_link_vif *a lockdep_assert_wiphy(wiphy); /* make sure category field is present */ - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return -EINVAL; - remaining_len = skb->len - IEEE80211_MIN_ACTION_SIZE; + remaining_len = skb->len - IEEE80211_MIN_ACTION_SIZE(category); has_protected = ieee80211_has_protected(hdr->frame_control); /* In case of SW crypto and hdr protected (PMF), packet will already be encrypted, diff --git a/drivers/net/wireless/ath/ath12k/wifi7/hw.c b/drivers/net/wireless/ath/ath12k/wifi7/hw.c index 27acdfc35459..ec6dba96640b 100644 --- a/drivers/net/wireless/ath/ath12k/wifi7/hw.c +++ b/drivers/net/wireless/ath/ath12k/wifi7/hw.c @@ -104,7 +104,7 @@ static bool ath12k_is_addba_resp_action_code(struct ieee80211_mgmt *mgmt) if (mgmt->u.action.category != WLAN_CATEGORY_BACK) return false; - if (mgmt->u.action.u.addba_resp.action_code != WLAN_ACTION_ADDBA_RESP) + if (mgmt->u.action.action_code != WLAN_ACTION_ADDBA_RESP) return false; return true; diff --git a/drivers/net/wireless/intel/iwlwifi/mld/time_sync.c b/drivers/net/wireless/intel/iwlwifi/mld/time_sync.c index 
897ab65b71aa..474dd555e70b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/time_sync.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/time_sync.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (C) 2025 Intel Corporation + * Copyright (C) 2025-2026 Intel Corporation */ #include "mld.h" @@ -116,9 +116,9 @@ static bool iwl_mld_is_skb_match(struct sk_buff *skb, u8 *addr, u8 dialog_token) u8 skb_dialog_token; if (ieee80211_is_timing_measurement(skb)) - skb_dialog_token = mgmt->u.action.u.wnm_timing_msr.dialog_token; + skb_dialog_token = mgmt->u.action.wnm_timing_msr.dialog_token; else - skb_dialog_token = mgmt->u.action.u.ftm.dialog_token; + skb_dialog_token = mgmt->u.action.ftm.dialog_token; if ((ether_addr_equal(mgmt->sa, addr) || ether_addr_equal(mgmt->da, addr)) && diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c index ebc569e94f55..1b67836b1fac 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include #include @@ -1409,8 +1409,7 @@ void iwl_mvm_ftm_lc_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb) struct iwl_mvm_loc_entry *entry; const u8 *ies, *lci, *civic, *msr_ie; size_t ies_len, lci_len = 0, civic_len = 0; - size_t baselen = IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.ftm); + size_t baselen = IEEE80211_MIN_ACTION_SIZE(ftm); static const u8 rprt_type_lci = IEEE80211_SPCT_MSR_RPRT_TYPE_LCI; static const u8 rprt_type_civic = IEEE80211_SPCT_MSR_RPRT_TYPE_CIVIC; @@ -1419,7 +1418,7 @@ void iwl_mvm_ftm_lc_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb) lockdep_assert_held(&mvm->mutex); - ies = mgmt->u.action.u.ftm.variable; + ies = 
mgmt->u.action.ftm.variable; ies_len = len - baselen; msr_ie = cfg80211_find_ie_match(WLAN_EID_MEASURE_REPORT, ies, ies_len, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-sync.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-sync.c index edae3e24192b..039b4daac73f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-sync.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-sync.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2022, 2026 Intel Corporation */ #include "mvm.h" @@ -18,9 +18,9 @@ static bool iwl_mvm_is_skb_match(struct sk_buff *skb, u8 *addr, u8 dialog_token) u8 skb_dialog_token; if (ieee80211_is_timing_measurement(skb)) - skb_dialog_token = mgmt->u.action.u.wnm_timing_msr.dialog_token; + skb_dialog_token = mgmt->u.action.wnm_timing_msr.dialog_token; else - skb_dialog_token = mgmt->u.action.u.ftm.dialog_token; + skb_dialog_token = mgmt->u.action.ftm.dialog_token; if ((ether_addr_equal(mgmt->sa, addr) || ether_addr_equal(mgmt->da, addr)) && diff --git a/drivers/net/wireless/marvell/mwifiex/tdls.c b/drivers/net/wireless/marvell/mwifiex/tdls.c index a4cf323e704b..845f2a22e071 100644 --- a/drivers/net/wireless/marvell/mwifiex/tdls.c +++ b/drivers/net/wireless/marvell/mwifiex/tdls.c @@ -755,16 +755,12 @@ mwifiex_construct_tdls_action_frame(struct mwifiex_private *priv, switch (action_code) { case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: /* See the layout of 'struct ieee80211_mgmt'. 
*/ - extra = sizeof(mgmt->u.action.u.tdls_discover_resp) + - sizeof(mgmt->u.action.category); + extra = IEEE80211_MIN_ACTION_SIZE(tdls_discover_resp) - 24; skb_put(skb, extra); mgmt->u.action.category = WLAN_CATEGORY_PUBLIC; - mgmt->u.action.u.tdls_discover_resp.action_code = - WLAN_PUB_ACTION_TDLS_DISCOVER_RES; - mgmt->u.action.u.tdls_discover_resp.dialog_token = - dialog_token; - mgmt->u.action.u.tdls_discover_resp.capability = - cpu_to_le16(capab); + mgmt->u.action.action_code = WLAN_PUB_ACTION_TDLS_DISCOVER_RES; + mgmt->u.action.tdls_discover_resp.dialog_token = dialog_token; + mgmt->u.action.tdls_discover_resp.capability = cpu_to_le16(capab); /* move back for addr4 */ memmove(pos + ETH_ALEN, &mgmt->u.action, extra); /* init address 4 */ diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c index 99321d180f34..b1af02180341 100644 --- a/drivers/net/wireless/marvell/mwl8k.c +++ b/drivers/net/wireless/marvell/mwl8k.c @@ -1985,9 +1985,9 @@ mwl8k_txq_xmit(struct ieee80211_hw *hw, */ if (unlikely(ieee80211_is_action(wh->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_BACK && - mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ && + mgmt->u.action.action_code == WLAN_ACTION_ADDBA_REQ && priv->ap_fw)) { - u16 capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + u16 capab = le16_to_cpu(mgmt->u.action.addba_req.capab); tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; index = mwl8k_tid_queue_mapping(tid); } diff --git a/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c b/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c index b41ca1410da9..f946ddc20a47 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c @@ -413,10 +413,10 @@ mt76_connac2_mac_write_txwi_80211(struct mt76_dev *dev, __le32 *txwi, u32 val; if (ieee80211_is_action(fc) && - skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + 1 + 2 && + skb->len >= 
IEEE80211_MIN_ACTION_SIZE(addba_req.capab) && mgmt->u.action.category == WLAN_CATEGORY_BACK && - mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ) { - u16 capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + mgmt->u.action.action_code == WLAN_ACTION_ADDBA_REQ) { + u16 capab = le16_to_cpu(mgmt->u.action.addba_req.capab); txwi[5] |= cpu_to_le32(MT_TXD5_ADD_BA); tid = (capab >> 2) & IEEE80211_QOS_CTL_TID_MASK; diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c index 0d9435900423..caaf71c31480 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c @@ -668,9 +668,9 @@ mt7925_mac_write_txwi_80211(struct mt76_dev *dev, __le32 *txwi, u32 val; if (ieee80211_is_action(fc) && - skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 && + skb->len >= IEEE80211_MIN_ACTION_SIZE(action_code) && mgmt->u.action.category == WLAN_CATEGORY_BACK && - mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ) + mgmt->u.action.action_code == WLAN_ACTION_ADDBA_REQ) tid = MT_TX_ADDBA; else if (ieee80211_is_mgmt(hdr->frame_control)) tid = MT_TX_NORMAL; diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c index d4f3ee943b47..84cbf36b493c 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c @@ -800,9 +800,9 @@ mt7996_mac_write_txwi_80211(struct mt7996_dev *dev, __le32 *txwi, u32 val; if (ieee80211_is_action(fc) && - skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 && + skb->len >= IEEE80211_MIN_ACTION_SIZE(action_code) && mgmt->u.action.category == WLAN_CATEGORY_BACK && - mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ) { + mgmt->u.action.action_code == WLAN_ACTION_ADDBA_REQ) { if (is_mt7990(&dev->mt76)) txwi[6] |= cpu_to_le32(FIELD_PREP(MT_TXD6_TID_ADDBA, tid)); else diff --git a/drivers/net/wireless/realtek/rtl8xxxu/core.c 
b/drivers/net/wireless/realtek/rtl8xxxu/core.c index 794187d28caa..804fc604e5f8 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/core.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/core.c @@ -5146,10 +5146,10 @@ static void rtl8xxxu_dump_action(struct device *dev, if (!(rtl8xxxu_debug & RTL8XXXU_DEBUG_ACTION)) return; - switch (mgmt->u.action.u.addba_resp.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_RESP: - cap = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); - timeout = le16_to_cpu(mgmt->u.action.u.addba_resp.timeout); + cap = le16_to_cpu(mgmt->u.action.addba_resp.capab); + timeout = le16_to_cpu(mgmt->u.action.addba_resp.timeout); dev_info(dev, "WLAN_ACTION_ADDBA_RESP: " "timeout %i, tid %02x, buf_size %02x, policy %02x, " "status %02x\n", @@ -5157,11 +5157,11 @@ static void rtl8xxxu_dump_action(struct device *dev, (cap & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2, (cap & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6, (cap >> 1) & 0x1, - le16_to_cpu(mgmt->u.action.u.addba_resp.status)); + le16_to_cpu(mgmt->u.action.addba_resp.status)); break; case WLAN_ACTION_ADDBA_REQ: - cap = le16_to_cpu(mgmt->u.action.u.addba_req.capab); - timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout); + cap = le16_to_cpu(mgmt->u.action.addba_req.capab); + timeout = le16_to_cpu(mgmt->u.action.addba_req.timeout); dev_info(dev, "WLAN_ACTION_ADDBA_REQ: " "timeout %i, tid %02x, buf_size %02x, policy %02x\n", timeout, @@ -5171,7 +5171,7 @@ static void rtl8xxxu_dump_action(struct device *dev, break; default: dev_info(dev, "action frame %02x\n", - mgmt->u.action.u.addba_resp.action_code); + mgmt->u.action.action_code); break; } } diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c index 0ac9cf0937aa..aad377864e73 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.c +++ b/drivers/net/wireless/realtek/rtlwifi/base.c @@ -1409,7 +1409,7 @@ bool rtl_action_proc(struct ieee80211_hw *hw, struct sk_buff *skb, u8 is_tx) 
sta_entry = (struct rtl_sta_info *)sta->drv_priv; capab = - le16_to_cpu(mgmt->u.action.u.addba_req.capab); + le16_to_cpu(mgmt->u.action.addba_req.capab); tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; if (tid >= MAX_TID_COUNT) { @@ -2392,35 +2392,35 @@ static struct sk_buff *rtl_make_smps_action(struct ieee80211_hw *hw, struct sk_buff *skb; struct ieee80211_mgmt *action_frame; - /* 27 = header + category + action + smps mode */ - skb = dev_alloc_skb(27 + hw->extra_tx_headroom); + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(ht_smps) + + hw->extra_tx_headroom); if (!skb) return NULL; skb_reserve(skb, hw->extra_tx_headroom); - action_frame = skb_put_zero(skb, 27); + action_frame = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(ht_smps)); memcpy(action_frame->da, da, ETH_ALEN); memcpy(action_frame->sa, rtlefuse->dev_addr, ETH_ALEN); memcpy(action_frame->bssid, bssid, ETH_ALEN); action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); action_frame->u.action.category = WLAN_CATEGORY_HT; - action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS; + action_frame->u.action.action_code = WLAN_HT_ACTION_SMPS; switch (smps) { case IEEE80211_SMPS_AUTOMATIC:/* 0 */ case IEEE80211_SMPS_NUM_MODES:/* 4 */ WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF:/* 1 */ /*MIMO_PS_NOLIMIT*/ - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DISABLED;/* 0 */ break; case IEEE80211_SMPS_STATIC:/* 2 */ /*MIMO_PS_STATIC*/ - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_STATIC;/* 1 */ break; case IEEE80211_SMPS_DYNAMIC:/* 3 */ /*MIMO_PS_DYNAMIC*/ - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DYNAMIC;/* 3 */ break; } @@ -2519,25 +2519,25 @@ struct sk_buff *rtl_make_del_ba(struct ieee80211_hw *hw, struct ieee80211_mgmt *action_frame; u16 params; - 
/* 27 = header + category + action + smps mode */ - skb = dev_alloc_skb(34 + hw->extra_tx_headroom); + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(delba) + + hw->extra_tx_headroom); if (!skb) return NULL; skb_reserve(skb, hw->extra_tx_headroom); - action_frame = skb_put_zero(skb, 34); + action_frame = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(delba)); memcpy(action_frame->sa, sa, ETH_ALEN); memcpy(action_frame->da, rtlefuse->dev_addr, ETH_ALEN); memcpy(action_frame->bssid, bssid, ETH_ALEN); action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); action_frame->u.action.category = WLAN_CATEGORY_BACK; - action_frame->u.action.u.delba.action_code = WLAN_ACTION_DELBA; + action_frame->u.action.action_code = WLAN_ACTION_DELBA; params = (u16)(1 << 11); /* bit 11 initiator */ params |= (u16)(tid << 12); /* bit 15:12 TID number */ - action_frame->u.action.u.delba.params = cpu_to_le16(params); - action_frame->u.action.u.delba.reason_code = + action_frame->u.action.delba.params = cpu_to_le16(params); + action_frame->u.action.delba.reason_code = cpu_to_le16(WLAN_REASON_QSTA_TIMEOUT); return skb; diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index d080469264cf..19e2ff62d9f1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -507,7 +507,7 @@ static void _rtl_pci_tx_isr(struct ieee80211_hw *hw, int prio) if (ieee80211_is_action(fc)) { struct ieee80211_mgmt *action_frame = (struct ieee80211_mgmt *)skb->data; - if (action_frame->u.action.u.ht_smps.action == + if (action_frame->u.action.action_code == WLAN_HT_ACTION_SMPS) { dev_kfree_skb(skb); goto tx_status_ok; diff --git a/drivers/net/wireless/silabs/wfx/data_rx.c b/drivers/net/wireless/silabs/wfx/data_rx.c index e099a9e65bae..15c06b2b8633 100644 --- a/drivers/net/wireless/silabs/wfx/data_rx.c +++ b/drivers/net/wireless/silabs/wfx/data_rx.c @@ -21,14 +21,14 @@ static void 
wfx_rx_handle_ba(struct wfx_vif *wvif, struct ieee80211_mgmt *mgmt) if (wfx_api_older_than(wvif->wdev, 3, 6)) return; - switch (mgmt->u.action.u.addba_req.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_REQ: - params = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + params = le16_to_cpu(mgmt->u.action.addba_req.capab); tid = (params & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; ieee80211_start_rx_ba_session_offl(vif, mgmt->sa, tid); break; case WLAN_ACTION_DELBA: - params = le16_to_cpu(mgmt->u.action.u.delba.params); + params = le16_to_cpu(mgmt->u.action.delba.params); tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12; ieee80211_stop_rx_ba_session_offl(vif, mgmt->sa, tid); break; @@ -80,7 +80,7 @@ void wfx_rx_cb(struct wfx_vif *wvif, const struct wfx_hif_ind_rx *arg, struct sk */ if (ieee80211_is_action(frame->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_BACK && - skb->len > IEEE80211_MIN_ACTION_SIZE) { + skb->len > IEEE80211_MIN_ACTION_SIZE(action_code)) { wfx_rx_handle_ba(wvif, mgmt); goto drop; } diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 3651b2e6c518..aea360e90cb1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1046,31 +1046,28 @@ struct ieee80211_mgmt { } __packed probe_resp; struct { u8 category; + u8 action_code; union { struct { - u8 action_code; u8 dialog_token; u8 status_code; u8 variable[]; } __packed wme_action; struct{ - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed chan_switch; struct{ - u8 action_code; struct ieee80211_ext_chansw_ie data; u8 variable[]; } __packed ext_chan_switch; struct{ - u8 action_code; u8 dialog_token; u8 element_id; u8 length; struct ieee80211_msrment_ie msr_elem; } __packed measurement; struct{ - u8 action_code; u8 dialog_token; __le16 capab; __le16 timeout; @@ -1079,7 +1076,6 @@ struct ieee80211_mgmt { u8 variable[]; } __packed addba_req; struct{ - u8 action_code; u8 dialog_token; __le16 status; __le16 
capab; @@ -1088,54 +1084,45 @@ struct ieee80211_mgmt { u8 variable[]; } __packed addba_resp; struct{ - u8 action_code; __le16 params; __le16 reason_code; } __packed delba; struct { - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed self_prot; struct{ - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed mesh_action; struct { - u8 action; u8 trans_id[WLAN_SA_QUERY_TR_ID_LEN]; } __packed sa_query; struct { - u8 action; u8 smps_control; } __packed ht_smps; struct { - u8 action_code; u8 chanwidth; } __packed ht_notify_cw; struct { - u8 action_code; u8 dialog_token; __le16 capability; u8 variable[]; } __packed tdls_discover_resp; struct { - u8 action_code; u8 operating_mode; } __packed vht_opmode_notif; struct { - u8 action_code; u8 membership[WLAN_MEMBERSHIP_LEN]; u8 position[WLAN_USER_POSITION_LEN]; } __packed vht_group_notif; struct { - u8 action_code; u8 dialog_token; u8 tpc_elem_id; u8 tpc_elem_length; struct ieee80211_tpc_report_ie tpc; } __packed tpc_report; struct { - u8 action_code; u8 dialog_token; u8 follow_up; u8 tod[6]; @@ -1145,11 +1132,10 @@ struct ieee80211_mgmt { u8 variable[]; } __packed ftm; struct { - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed s1g; struct { - u8 action_code; u8 dialog_token; u8 follow_up; u32 tod; @@ -1158,41 +1144,37 @@ struct ieee80211_mgmt { u8 max_toa_error; } __packed wnm_timing_msr; struct { - u8 action_code; u8 dialog_token; u8 variable[]; } __packed ttlm_req; struct { - u8 action_code; u8 dialog_token; __le16 status_code; u8 variable[]; } __packed ttlm_res; struct { - u8 action_code; + u8 no_fixed_fields[0]; + /* no variable fields either */ } __packed ttlm_tear_down; struct { - u8 action_code; u8 dialog_token; u8 variable[]; } __packed ml_reconf_req; struct { - u8 action_code; u8 dialog_token; u8 count; u8 variable[]; } __packed ml_reconf_resp; struct { - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed epcs; struct { - u8 action_code; u8 
dialog_token; u8 control; u8 variable[]; } __packed eml_omn; - } u; + }; } __packed action; DECLARE_FLEX_ARRAY(u8, body); /* Generic frame body */ } u; @@ -1210,8 +1192,7 @@ struct ieee80211_mgmt { #define BSS_MEMBERSHIP_SELECTOR_MIN BSS_MEMBERSHIP_SELECTOR_UHR_PHY -/* mgmt header + 1 byte category code */ -#define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) +#define IEEE80211_MIN_ACTION_SIZE(type) offsetofend(struct ieee80211_mgmt, u.action.type) /* Management MIC information element (IEEE 802.11w) for CMAC */ @@ -2391,7 +2372,7 @@ static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb) if (!ieee80211_is_action(fc)) return false; - if (skb->len < offsetofend(typeof(*mgmt), u.action.u.ftm.action_code)) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(action_code)) return true; /* action frame - additionally check for non-bufferable FTM */ @@ -2400,8 +2381,8 @@ static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb) mgmt->u.action.category != WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION) return true; - if (mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_REQUEST || - mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_RESPONSE) + if (mgmt->u.action.action_code == WLAN_PUB_ACTION_FTM_REQUEST || + mgmt->u.action.action_code == WLAN_PUB_ACTION_FTM_RESPONSE) return false; return true; @@ -2451,7 +2432,7 @@ static inline bool _ieee80211_is_robust_mgmt_frame(struct ieee80211_hdr *hdr) */ static inline bool ieee80211_is_robust_mgmt_frame(struct sk_buff *skb) { - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return false; return _ieee80211_is_robust_mgmt_frame((void *)skb->data); } @@ -2467,7 +2448,7 @@ static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr, { struct ieee80211_mgmt *mgmt = (void *)hdr; - if (len < IEEE80211_MIN_ACTION_SIZE) + if (len < IEEE80211_MIN_ACTION_SIZE(category)) return false; if (!ieee80211_is_action(hdr->frame_control)) return 
false; @@ -2485,13 +2466,14 @@ static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr, static inline bool ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb) { + struct ieee80211_mgmt *mgmt = (void *)skb->data; u8 action; if (!ieee80211_is_public_action((void *)skb->data, skb->len) || - skb->len < IEEE80211_MIN_ACTION_SIZE + 1) + skb->len < IEEE80211_MIN_ACTION_SIZE(action_code)) return false; - action = *(u8 *)(skb->data + IEEE80211_MIN_ACTION_SIZE); + action = mgmt->u.action.action_code; return action != WLAN_PUB_ACTION_20_40_BSS_COEX && action != WLAN_PUB_ACTION_DSE_REG_LOC_ANN && @@ -2530,7 +2512,7 @@ static inline bool _ieee80211_is_group_privacy_action(struct ieee80211_hdr *hdr) */ static inline bool ieee80211_is_group_privacy_action(struct sk_buff *skb) { - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return false; return _ieee80211_is_group_privacy_action((void *)skb->data); } @@ -2626,8 +2608,7 @@ static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb) if (!ieee80211_is_action(mgmt->frame_control)) return false; - if (skb->len < IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.tpc_report)) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(tpc_report)) return false; /* @@ -2646,12 +2627,11 @@ static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb) return false; /* both spectrum mgmt and link measurement have same action code */ - if (mgmt->u.action.u.tpc_report.action_code != - WLAN_ACTION_SPCT_TPC_RPRT) + if (mgmt->u.action.action_code != WLAN_ACTION_SPCT_TPC_RPRT) return false; - if (mgmt->u.action.u.tpc_report.tpc_elem_id != WLAN_EID_TPC_REPORT || - mgmt->u.action.u.tpc_report.tpc_elem_length != + if (mgmt->u.action.tpc_report.tpc_elem_id != WLAN_EID_TPC_REPORT || + mgmt->u.action.tpc_report.tpc_elem_length != sizeof(struct ieee80211_tpc_report_ie)) return false; @@ -2667,16 +2647,15 @@ static inline bool ieee80211_is_timing_measurement(struct 
sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(wnm_timing_msr)) return false; if (!ieee80211_is_action(mgmt->frame_control)) return false; if (mgmt->u.action.category == WLAN_CATEGORY_WNM_UNPROTECTED && - mgmt->u.action.u.wnm_timing_msr.action_code == - WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE && - skb->len >= offsetofend(typeof(*mgmt), u.action.u.wnm_timing_msr)) + mgmt->u.action.action_code == + WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE) return true; return false; @@ -2691,15 +2670,13 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; - if (!ieee80211_is_public_action((void *)mgmt, skb->len)) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(ftm)) return false; - if (mgmt->u.action.u.ftm.action_code == - WLAN_PUB_ACTION_FTM_RESPONSE && - skb->len >= offsetofend(typeof(*mgmt), u.action.u.ftm)) - return true; + if (!ieee80211_is_public_action((void *)mgmt, skb->len)) + return false; - return false; + return mgmt->u.action.action_code == WLAN_PUB_ACTION_FTM_RESPONSE; } struct element { diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index f301a8622bee..0a2be8cb600f 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ /** @@ -251,19 +251,20 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = ieee80211_mgmt_ba(skb, da, sdata); - skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp)); + skb_put(skb, 2 + sizeof(mgmt->u.action.addba_resp)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP; - 
mgmt->u.action.u.addba_resp.dialog_token = dialog_token; + mgmt->u.action.action_code = WLAN_ACTION_ADDBA_RESP; + + mgmt->u.action.addba_resp.dialog_token = dialog_token; capab = u16_encode_bits(amsdu, IEEE80211_ADDBA_PARAM_AMSDU_MASK); capab |= u16_encode_bits(policy, IEEE80211_ADDBA_PARAM_POLICY_MASK); capab |= u16_encode_bits(tid, IEEE80211_ADDBA_PARAM_TID_MASK); capab |= u16_encode_bits(buf_size, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK); - mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab); - mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); - mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); + mgmt->u.action.addba_resp.capab = cpu_to_le16(capab); + mgmt->u.action.addba_resp.timeout = cpu_to_le16(timeout); + mgmt->u.action.addba_resp.status = cpu_to_le16(status); if (sta->sta.valid_links || sta->sta.deflink.he_cap.has_he) ieee80211_add_addbaext(skb, req_addba_ext_data, buf_size); @@ -477,22 +478,22 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, u8 dialog_token, addba_ext_data; /* extract session parameters from addba request frame */ - dialog_token = mgmt->u.action.u.addba_req.dialog_token; - timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout); + dialog_token = mgmt->u.action.addba_req.dialog_token; + timeout = le16_to_cpu(mgmt->u.action.addba_req.timeout); start_seq_num = - le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4; + le16_to_cpu(mgmt->u.action.addba_req.start_seq_num) >> 4; - capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + capab = le16_to_cpu(mgmt->u.action.addba_req.capab); ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1; tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; addba_ext_data = ieee80211_retrieve_addba_ext_data(sta, - mgmt->u.action.u.addba_req.variable, + mgmt->u.action.addba_req.variable, len - offsetof(typeof(*mgmt), - u.action.u.addba_req.variable), + u.action.addba_req.variable), &buf_size); 
__ieee80211_start_rx_ba_session(sta, dialog_token, timeout, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 93b47a7ba9c4..d5a62b8d5a80 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include @@ -68,7 +68,7 @@ static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid, struct ieee80211_mgmt *mgmt; u16 capab; - skb = dev_alloc_skb(sizeof(*mgmt) + + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(addba_req) + 2 + sizeof(struct ieee80211_addba_ext_ie) + local->hw.extra_tx_headroom); if (!skb) @@ -77,21 +77,21 @@ static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid, skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = ieee80211_mgmt_ba(skb, sta->sta.addr, sdata); - skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req)); + skb_put(skb, 2 + sizeof(mgmt->u.action.addba_req)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ; + mgmt->u.action.action_code = WLAN_ACTION_ADDBA_REQ; - mgmt->u.action.u.addba_req.dialog_token = dialog_token; + mgmt->u.action.addba_req.dialog_token = dialog_token; capab = IEEE80211_ADDBA_PARAM_AMSDU_MASK; capab |= IEEE80211_ADDBA_PARAM_POLICY_MASK; capab |= u16_encode_bits(tid, IEEE80211_ADDBA_PARAM_TID_MASK); capab |= u16_encode_bits(agg_size, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK); - mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab); + mgmt->u.action.addba_req.capab = cpu_to_le16(capab); - mgmt->u.action.u.addba_req.timeout = cpu_to_le16(timeout); - mgmt->u.action.u.addba_req.start_seq_num = + mgmt->u.action.addba_req.timeout = cpu_to_le16(timeout); + mgmt->u.action.addba_req.start_seq_num = cpu_to_le16(start_seq_num << 4); if (sta->sta.deflink.he_cap.has_he) @@ -978,15 +978,15 @@ void 
ieee80211_process_addba_resp(struct ieee80211_local *local, lockdep_assert_wiphy(sta->local->hw.wiphy); - capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); + capab = le16_to_cpu(mgmt->u.action.addba_resp.capab); amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK; tid = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_TID_MASK); buf_size = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK); ieee80211_retrieve_addba_ext_data(sta, - mgmt->u.action.u.addba_resp.variable, + mgmt->u.action.addba_resp.variable, len - offsetof(typeof(*mgmt), - u.action.u.addba_resp.variable), + u.action.addba_resp.variable), &buf_size); buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes); @@ -999,7 +999,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, if (!tid_tx) return; - if (mgmt->u.action.u.addba_resp.dialog_token != tid_tx->dialog_token) { + if (mgmt->u.action.addba_resp.dialog_token != tid_tx->dialog_token) { ht_dbg(sta->sdata, "wrong addBA response token, %pM tid %d\n", sta->sta.addr, tid); return; @@ -1029,7 +1029,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, * is set to 0, the Buffer Size subfield is set to a value * of at least 1. 
*/ - if (le16_to_cpu(mgmt->u.action.u.addba_resp.status) + if (le16_to_cpu(mgmt->u.action.addba_resp.status) == WLAN_STATUS_SUCCESS && buf_size) { if (test_and_set_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) { @@ -1046,7 +1046,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, sta->ampdu_mlme.addba_req_num[tid] = 0; tid_tx->timeout = - le16_to_cpu(mgmt->u.action.u.addba_resp.timeout); + le16_to_cpu(mgmt->u.action.addba_resp.timeout); if (tid_tx->timeout) { mod_timer(&tid_tx->session_timer, diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c index 078e1e23d8d1..768bfc4e737d 100644 --- a/net/mac80211/eht.c +++ b/net/mac80211/eht.c @@ -108,7 +108,7 @@ static void ieee80211_send_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *req, int opt_len) { - int len = offsetofend(struct ieee80211_mgmt, u.action.u.eml_omn); + int len = IEEE80211_MIN_ACTION_SIZE(eml_omn); struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; @@ -127,16 +127,15 @@ ieee80211_send_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; - mgmt->u.action.u.eml_omn.action_code = - WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF; - mgmt->u.action.u.eml_omn.dialog_token = - req->u.action.u.eml_omn.dialog_token; - mgmt->u.action.u.eml_omn.control = req->u.action.u.eml_omn.control & + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF; + mgmt->u.action.eml_omn.dialog_token = + req->u.action.eml_omn.dialog_token; + mgmt->u.action.eml_omn.control = req->u.action.eml_omn.control & ~(IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE | IEEE80211_EML_CTRL_INDEV_COEX_ACT); /* Copy optional fields from the received notification frame */ - memcpy(mgmt->u.action.u.eml_omn.variable, - req->u.action.u.eml_omn.variable, opt_len); + memcpy(mgmt->u.action.eml_omn.variable, + req->u.action.eml_omn.variable, opt_len); 
ieee80211_tx_skb(sdata, skb); } @@ -144,14 +143,14 @@ ieee80211_send_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, void ieee80211_rx_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - int len = offsetofend(struct ieee80211_mgmt, u.action.u.eml_omn); + int len = IEEE80211_MIN_ACTION_SIZE(eml_omn); enum nl80211_iftype type = ieee80211_vif_type_p2p(&sdata->vif); struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); const struct wiphy_iftype_ext_capab *ift_ext_capa; struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_local *local = sdata->local; - u8 control = mgmt->u.action.u.eml_omn.control; - u8 *ptr = mgmt->u.action.u.eml_omn.variable; + u8 control = mgmt->u.action.eml_omn.control; + u8 *ptr = mgmt->u.action.eml_omn.variable; struct ieee80211_eml_params eml_params = { .link_id = status->link_id, .control = control, diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 1c82a28b03de..9e2469a8ce64 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright 2017 Intel Deutschland GmbH - * Copyright(c) 2020-2025 Intel Corporation + * Copyright(c) 2020-2026 Intel Corporation */ #include @@ -462,22 +462,23 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; u16 params; - skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(delba) + + local->hw.extra_tx_headroom); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = ieee80211_mgmt_ba(skb, da, sdata); - skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba)); + skb_put(skb, 2 + sizeof(mgmt->u.action.delba)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.u.delba.action_code = WLAN_ACTION_DELBA; + mgmt->u.action.action_code = WLAN_ACTION_DELBA; params = (u16)(initiator << 11); /* bit 11 initiator */ params |= (u16)(tid << 12); /* bit 15:12 
TID number */ - mgmt->u.action.u.delba.params = cpu_to_le16(params); - mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code); + mgmt->u.action.delba.params = cpu_to_le16(params); + mgmt->u.action.delba.reason_code = cpu_to_le16(reason_code); ieee80211_tx_skb(sdata, skb); } @@ -489,14 +490,14 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, u16 tid, params; u16 initiator; - params = le16_to_cpu(mgmt->u.action.u.delba.params); + params = le16_to_cpu(mgmt->u.action.delba.params); tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12; initiator = (params & IEEE80211_DELBA_PARAM_INITIATOR_MASK) >> 11; ht_dbg_ratelimited(sdata, "delba from %pM (%s) tid %d reason code %d\n", mgmt->sa, initiator ? "initiator" : "recipient", tid, - le16_to_cpu(mgmt->u.action.u.delba.reason_code)); + le16_to_cpu(mgmt->u.action.delba.reason_code)); if (initiator == WLAN_BACK_INITIATOR) __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0, @@ -530,20 +531,20 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info; u8 status_link_id = link_id < 0 ? 
0 : link_id; - /* 27 = header + category + action + smps mode */ - skb = dev_alloc_skb(27 + local->hw.extra_tx_headroom); + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(ht_smps) + + local->hw.extra_tx_headroom); if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); - action_frame = skb_put(skb, 27); + action_frame = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(ht_smps)); memcpy(action_frame->da, da, ETH_ALEN); memcpy(action_frame->sa, sdata->dev->dev_addr, ETH_ALEN); memcpy(action_frame->bssid, bssid, ETH_ALEN); action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); action_frame->u.action.category = WLAN_CATEGORY_HT; - action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS; + action_frame->u.action.action_code = WLAN_HT_ACTION_SMPS; switch (smps) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: @@ -551,15 +552,15 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, smps = IEEE80211_SMPS_OFF; fallthrough; case IEEE80211_SMPS_OFF: - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DISABLED; break; case IEEE80211_SMPS_STATIC: - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_STATIC; break; case IEEE80211_SMPS_DYNAMIC: - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DYNAMIC; break; } diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 168f84a1353b..0298272c37ec 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2025 Intel Corporation + * Copyright(c) 2018-2026 Intel Corporation */ #include @@ -888,19 +888,11 @@ ieee80211_rx_mgmt_spectrum_mgmt(struct ieee80211_sub_if_data *sdata, struct 
ieee80211_rx_status *rx_status, struct ieee802_11_elems *elems) { - int required_len; - - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(chan_switch)) return; /* CSA is the only action we handle for now */ - if (mgmt->u.action.u.measurement.action_code != - WLAN_ACTION_SPCT_CHL_SWITCH) - return; - - required_len = IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.chan_switch); - if (len < required_len) + if (mgmt->u.action.action_code != WLAN_ACTION_SPCT_CHL_SWITCH) return; if (!sdata->vif.bss_conf.csa_active) @@ -1613,12 +1605,12 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, case WLAN_CATEGORY_SPECTRUM_MGMT: ies_len = skb->len - offsetof(struct ieee80211_mgmt, - u.action.u.chan_switch.variable); + u.action.chan_switch.variable); if (ies_len < 0) break; - elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable, + elems = ieee802_11_parse_elems(mgmt->u.action.chan_switch.variable, ies_len, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 676b2a43c9f2..2e391cec73a0 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1579,7 +1579,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, sta = sta_info_get_bss(sdata, mgmt->sa); if (sta) { - switch (mgmt->u.action.u.addba_req.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_REQ: ieee80211_process_addba_request(local, sta, mgmt, len); @@ -1599,9 +1599,9 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_HT) { - switch (mgmt->u.action.u.ht_smps.action) { + switch (mgmt->u.action.action_code) { case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { - u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth; + u8 chanwidth = mgmt->u.action.ht_notify_cw.chanwidth; struct ieee80211_rx_status *status; struct link_sta_info 
*link_sta; struct sta_info *sta; @@ -1628,7 +1628,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_VHT) { - switch (mgmt->u.action.u.vht_group_notif.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_VHT_ACTION_OPMODE_NOTIF: { struct ieee80211_rx_status *status; enum nl80211_band band; @@ -1637,7 +1637,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, status = IEEE80211_SKB_RXCB(skb); band = status->band; - opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; + opmode = mgmt->u.action.vht_opmode_notif.operating_mode; sta = sta_info_get_bss(sdata, mgmt->sa); @@ -1658,7 +1658,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_S1G) { - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_TEARDOWN: case WLAN_S1G_TWT_SETUP: ieee80211_s1g_rx_twt_action(sdata, skb); @@ -1669,7 +1669,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_EHT) { if (sdata->vif.type == NL80211_IFTYPE_AP) { - switch (mgmt->u.action.u.eml_omn.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF: ieee80211_rx_eml_op_mode_notif(sdata, skb); break; @@ -1677,7 +1677,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, break; } } else if (sdata->vif.type == NL80211_IFTYPE_STATION) { - switch (mgmt->u.action.u.ttlm_req.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ: ieee80211_process_neg_ttlm_req(sdata, mgmt, skb->len); @@ -1765,7 +1765,7 @@ static void ieee80211_iface_process_status(struct ieee80211_sub_if_data 
*sdata, if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_S1G) { - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_TEARDOWN: case WLAN_S1G_TWT_SETUP: ieee80211_s1g_status_twt_action(sdata, skb); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 28624e57aa49..6696c611dfa4 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2018 - 2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation * Authors: Luis Carlos Cobo * Javier Cardona */ @@ -19,8 +19,7 @@ static struct kmem_cache *rm_cache; bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt) { - return (mgmt->u.action.u.mesh_action.action_code == - WLAN_MESH_ACTION_HWMP_PATH_SELECTION); + return mgmt->u.action.action_code == WLAN_MESH_ACTION_HWMP_PATH_SELECTION; } void ieee80211s_init(void) @@ -1618,13 +1617,12 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata, size_t baselen; u8 *pos; - if (mgmt->u.action.u.measurement.action_code != - WLAN_ACTION_SPCT_CHL_SWITCH) + if (mgmt->u.action.action_code != WLAN_ACTION_SPCT_CHL_SWITCH) return; - pos = mgmt->u.action.u.chan_switch.variable; + pos = mgmt->u.action.chan_switch.variable; baselen = offsetof(struct ieee80211_mgmt, - u.action.u.chan_switch.variable); + u.action.chan_switch.variable); elems = ieee802_11_parse_elems(pos, len - baselen, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, @@ -1670,7 +1668,7 @@ static void ieee80211_mesh_rx_mgmt_action(struct ieee80211_sub_if_data *sdata, { switch (mgmt->u.action.category) { case WLAN_CATEGORY_SELF_PROTECTED: - switch (mgmt->u.action.u.self_prot.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_SP_MESH_PEERING_OPEN: case WLAN_SP_MESH_PEERING_CLOSE: case WLAN_SP_MESH_PEERING_CONFIRM: diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c 
index 98d5aaa36d00..9d89ebcce1c1 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2023, 2025 Intel Corporation + * Copyright (C) 2019, 2021-2023, 2025-2026 Intel Corporation * Author: Luis Carlos Cobo */ @@ -105,12 +105,11 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, u32 lifetime, u32 metric, u32 preq_id, struct ieee80211_sub_if_data *sdata) { + int hdr_len = IEEE80211_MIN_ACTION_SIZE(mesh_action); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u8 *pos, ie_len; - int hdr_len = offsetofend(struct ieee80211_mgmt, - u.action.u.mesh_action); skb = dev_alloc_skb(local->tx_headroom + hdr_len + @@ -127,8 +126,7 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, /* BSSID == SA */ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_MESH_ACTION; - mgmt->u.action.u.mesh_action.action_code = - WLAN_MESH_ACTION_HWMP_PATH_SELECTION; + mgmt->u.action.action_code = WLAN_MESH_ACTION_HWMP_PATH_SELECTION; switch (action) { case MPATH_PREQ: @@ -237,13 +235,12 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, u8 ttl, const u8 *target, u32 target_sn, u16 target_rcode, const u8 *ra) { + int hdr_len = IEEE80211_MIN_ACTION_SIZE(mesh_action); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_mgmt *mgmt; u8 *pos, ie_len; - int hdr_len = offsetofend(struct ieee80211_mgmt, - u.action.u.mesh_action); if (time_before(jiffies, ifmsh->next_perr)) return -EAGAIN; @@ -265,8 +262,7 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, /* BSSID == SA */ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_MESH_ACTION; - mgmt->u.action.u.mesh_action.action_code = - 
WLAN_MESH_ACTION_HWMP_PATH_SELECTION; + mgmt->u.action.action_code = WLAN_MESH_ACTION_HWMP_PATH_SELECTION; ie_len = 15; pos = skb_put(skb, 2 + ie_len); *pos++ = WLAN_EID_PERR; @@ -938,7 +934,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, struct sta_info *sta; /* need action_code */ - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(mesh_action)) return; rcu_read_lock(); @@ -949,8 +945,8 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, } rcu_read_unlock(); - baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt; - elems = ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable, + baselen = mgmt->u.action.mesh_action.variable - (u8 *)mgmt; + elems = ieee802_11_parse_elems(mgmt->u.action.mesh_action.variable, len - baselen, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 04c931cd2063..7d823a55636f 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. 
- * Copyright (C) 2019, 2021-2025 Intel Corporation + * Copyright (C) 2019, 2021-2026 Intel Corporation * Author: Luis Carlos Cobo */ #include @@ -13,7 +13,7 @@ #include "rate.h" #include "mesh.h" -#define PLINK_CNF_AID(mgmt) ((mgmt)->u.action.u.self_prot.variable + 2) +#define PLINK_CNF_AID(mgmt) ((mgmt)->u.action.self_prot.variable + 2) #define PLINK_GET_LLID(p) (p + 2) #define PLINK_GET_PLID(p) (p + 4) @@ -215,6 +215,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, enum ieee80211_self_protected_actioncode action, u8 *da, u16 llid, u16 plid, u16 reason) { + int hdr_len = IEEE80211_MIN_ACTION_SIZE(self_prot); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_tx_info *info; @@ -223,7 +224,6 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, u16 peering_proto = 0; u8 *pos, ie_len = 4; u8 ie_len_he_cap, ie_len_eht_cap; - int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.self_prot); int err = -ENOMEM; ie_len_he_cap = ieee80211_ie_len_he_cap(sdata); @@ -260,7 +260,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_SELF_PROTECTED; - mgmt->u.action.u.self_prot.action_code = action; + mgmt->u.action.action_code = action; if (action != WLAN_SP_MESH_PEERING_CLOSE) { struct ieee80211_supported_band *sband; @@ -1141,7 +1141,7 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, return; } - ftype = mgmt->u.action.u.self_prot.action_code; + ftype = mgmt->u.action.action_code; if ((ftype == WLAN_SP_MESH_PEERING_OPEN && ie_len != 4) || (ftype == WLAN_SP_MESH_PEERING_CONFIRM && ie_len != 6) || (ftype == WLAN_SP_MESH_PEERING_CLOSE && ie_len != 6 @@ -1224,8 +1224,8 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, size_t baselen; u8 *baseaddr; - /* need action_code, aux */ - if (len < IEEE80211_MIN_ACTION_SIZE + 3) + 
/* need aux */ + if (len < IEEE80211_MIN_ACTION_SIZE(self_prot) + 1) return; if (sdata->u.mesh.user_mpm) @@ -1238,10 +1238,9 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, return; } - baseaddr = mgmt->u.action.u.self_prot.variable; - baselen = (u8 *) mgmt->u.action.u.self_prot.variable - (u8 *) mgmt; - if (mgmt->u.action.u.self_prot.action_code == - WLAN_SP_MESH_PEERING_CONFIRM) { + baseaddr = mgmt->u.action.self_prot.variable; + baselen = mgmt->u.action.self_prot.variable - (u8 *)mgmt; + if (mgmt->u.action.action_code == WLAN_SP_MESH_PEERING_CONFIRM) { baseaddr += 4; baselen += 4; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 170330d924a3..da79df92994d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7957,7 +7957,7 @@ ieee80211_send_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; - int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_req); + int hdr_len = IEEE80211_MIN_ACTION_SIZE(ttlm_req); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; @@ -7974,9 +7974,8 @@ ieee80211_send_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; - mgmt->u.action.u.ttlm_req.action_code = - WLAN_PROTECTED_EHT_ACTION_TTLM_REQ; - mgmt->u.action.u.ttlm_req.dialog_token = dialog_token; + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_REQ; + mgmt->u.action.ttlm_req.dialog_token = dialog_token; ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); ieee80211_tx_skb(sdata, skb); } @@ -8026,7 +8025,7 @@ ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; - int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_res); + int hdr_len = 
IEEE80211_MIN_ACTION_SIZE(ttlm_res); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; u16 status_code; @@ -8044,9 +8043,8 @@ ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; - mgmt->u.action.u.ttlm_res.action_code = - WLAN_PROTECTED_EHT_ACTION_TTLM_RES; - mgmt->u.action.u.ttlm_res.dialog_token = dialog_token; + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_RES; + mgmt->u.action.ttlm_res.dialog_token = dialog_token; switch (ttlm_res) { default: WARN_ON(1); @@ -8063,7 +8061,7 @@ ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, break; } - mgmt->u.action.u.ttlm_res.status_code = cpu_to_le16(status_code); + mgmt->u.action.ttlm_res.status_code = cpu_to_le16(status_code); ieee80211_tx_skb(sdata, skb); } @@ -8163,10 +8161,9 @@ void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, if (!ieee80211_vif_is_mld(&sdata->vif)) return; - dialog_token = mgmt->u.action.u.ttlm_req.dialog_token; - ies_len = len - offsetof(struct ieee80211_mgmt, - u.action.u.ttlm_req.variable); - elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable, + dialog_token = mgmt->u.action.ttlm_req.dialog_token; + ies_len = len - IEEE80211_MIN_ACTION_SIZE(ttlm_req); + elems = ieee802_11_parse_elems(mgmt->u.action.ttlm_req.variable, ies_len, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, @@ -8217,8 +8214,7 @@ void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { if (!ieee80211_vif_is_mld(&sdata->vif) || - mgmt->u.action.u.ttlm_req.dialog_token != - sdata->u.mgd.dialog_token_alloc) + mgmt->u.action.ttlm_res.dialog_token != sdata->u.mgd.dialog_token_alloc) return; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, @@ -8232,7 +8228,7 @@ void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, * This can 
be better implemented in the future, to handle request * rejections. */ - if (le16_to_cpu(mgmt->u.action.u.ttlm_res.status_code) != WLAN_STATUS_SUCCESS) + if (le16_to_cpu(mgmt->u.action.ttlm_res.status_code) != WLAN_STATUS_SUCCESS) __ieee80211_disconnect(sdata); } @@ -8265,12 +8261,11 @@ static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif) { + int frame_len = IEEE80211_MIN_ACTION_SIZE(ttlm_tear_down); struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; - int frame_len = offsetofend(struct ieee80211_mgmt, - u.action.u.ttlm_tear_down); struct ieee80211_tx_info *info; skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); @@ -8286,8 +8281,7 @@ void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif) memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; - mgmt->u.action.u.ttlm_tear_down.action_code = - WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN; + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; @@ -8370,13 +8364,13 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, case WLAN_CATEGORY_SPECTRUM_MGMT: ies_len = skb->len - offsetof(struct ieee80211_mgmt, - u.action.u.chan_switch.variable); + u.action.chan_switch.variable); if (ies_len < 0) break; /* CSA IE cannot be overridden, no need for BSSID */ - elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable, + elems = ieee802_11_parse_elems(mgmt->u.action.chan_switch.variable, ies_len, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, @@ -8398,7 +8392,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, case WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION: ies_len = skb->len - offsetof(struct ieee80211_mgmt, - 
u.action.u.ext_chan_switch.variable); + u.action.ext_chan_switch.variable); if (ies_len < 0) break; @@ -8407,7 +8401,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, * extended CSA IE can't be overridden, no need for * BSSID */ - elems = ieee802_11_parse_elems(mgmt->u.action.u.ext_chan_switch.variable, + elems = ieee802_11_parse_elems(mgmt->u.action.ext_chan_switch.variable, ies_len, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, @@ -8424,7 +8418,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, /* for the handling code pretend it was an IE */ elems->ext_chansw_ie = - &mgmt->u.action.u.ext_chan_switch.data; + &mgmt->u.action.ext_chan_switch.data; ieee80211_sta_process_chanswitch(link, rx_status->mactime, @@ -10426,25 +10420,25 @@ void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, u8 *pos; if (!ieee80211_vif_is_mld(&sdata->vif) || - len < offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp) || - mgmt->u.action.u.ml_reconf_resp.dialog_token != - sdata->u.mgd.reconf.dialog_token || + len < IEEE80211_MIN_ACTION_SIZE(ml_reconf_resp) || + mgmt->u.action.ml_reconf_resp.dialog_token != + sdata->u.mgd.reconf.dialog_token || !sta_changed_links) return; - pos = mgmt->u.action.u.ml_reconf_resp.variable; - len -= offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp); + pos = mgmt->u.action.ml_reconf_resp.variable; + len -= offsetofend(typeof(*mgmt), u.action.ml_reconf_resp); /* each status duple is 3 octets */ - if (len < mgmt->u.action.u.ml_reconf_resp.count * 3) { + if (len < mgmt->u.action.ml_reconf_resp.count * 3) { sdata_info(sdata, "mlo: reconf: unexpected len=%zu, count=%u\n", - len, mgmt->u.action.u.ml_reconf_resp.count); + len, mgmt->u.action.ml_reconf_resp.count); goto disconnect; } link_mask = sta_changed_links; - for (i = 0; i < mgmt->u.action.u.ml_reconf_resp.count; i++) { + for (i = 0; i < mgmt->u.action.ml_reconf_resp.count; i++) { u16 status = get_unaligned_le16(pos + 1); link_id = 
*pos; @@ -10729,8 +10723,7 @@ ieee80211_build_ml_reconf_req(struct ieee80211_sub_if_data *sdata, return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = skb_put_zero(skb, offsetofend(struct ieee80211_mgmt, - u.action.u.ml_reconf_req)); + mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(ml_reconf_req)); /* Add the MAC header */ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | @@ -10741,12 +10734,11 @@ ieee80211_build_ml_reconf_req(struct ieee80211_sub_if_data *sdata, /* Add the action frame fixed fields */ mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; - mgmt->u.action.u.ml_reconf_req.action_code = - WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ; + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ; /* allocate a dialog token and store it */ sdata->u.mgd.reconf.dialog_token = ++sdata->u.mgd.dialog_token_alloc; - mgmt->u.action.u.ml_reconf_req.dialog_token = + mgmt->u.action.ml_reconf_req.dialog_token = sdata->u.mgd.reconf.dialog_token; /* Add the ML reconfiguration element and the common information */ @@ -11116,11 +11108,10 @@ static bool ieee80211_mgd_epcs_supp(struct ieee80211_sub_if_data *sdata) int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable) { + int frame_len = IEEE80211_MIN_ACTION_SIZE(epcs) + (enable ? 1 : 0); struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; - int frame_len = offsetofend(struct ieee80211_mgmt, - u.action.u.epcs) + (enable ? 
1 : 0); if (!ieee80211_mgd_epcs_supp(sdata)) return -EINVAL; @@ -11149,15 +11140,15 @@ int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable) mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; if (enable) { - u8 *pos = mgmt->u.action.u.epcs.variable; + u8 *pos = mgmt->u.action.epcs.variable; - mgmt->u.action.u.epcs.action_code = + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ; *pos = ++sdata->u.mgd.dialog_token_alloc; sdata->u.mgd.epcs.dialog_token = *pos; } else { - mgmt->u.action.u.epcs.action_code = + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN; ieee80211_epcs_teardown(sdata); @@ -11246,7 +11237,7 @@ void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, return; /* Handle dialog token and status code */ - pos = mgmt->u.action.u.epcs.variable; + pos = mgmt->u.action.epcs.variable; dialog_token = *pos; status_code = get_unaligned_le16(pos + 1); @@ -11268,8 +11259,7 @@ void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, return; pos += IEEE80211_EPCS_ENA_RESP_BODY_LEN; - ies_len = len - offsetof(struct ieee80211_mgmt, - u.action.u.epcs.variable) - + ies_len = len - IEEE80211_MIN_ACTION_SIZE(epcs) - IEEE80211_EPCS_ENA_RESP_BODY_LEN; elems = ieee802_11_parse_elems(pos, ies_len, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6c4b549444c6..3bd046bebf9e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -274,7 +274,7 @@ static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, if (!sdata) return; - BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1); + BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE(action_code)); if (skb->len < rtap_space + sizeof(action) + VHT_MUMIMO_GROUPS_DATA_LEN) @@ -1162,7 +1162,7 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) u8 category; /* make sure category field is present */ - if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE) + if 
(rx->skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return RX_DROP_U_RUNT_ACTION; mgmt = (struct ieee80211_mgmt *)hdr; @@ -3422,7 +3422,7 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, return; } - if (len < 24 + 1 + sizeof(resp->u.action.u.sa_query)) { + if (len < IEEE80211_MIN_ACTION_SIZE(sa_query)) { /* Too short SA Query request frame */ return; } @@ -3432,17 +3432,16 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, return; skb_reserve(skb, local->hw.extra_tx_headroom); - resp = skb_put_zero(skb, 24); + resp = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(sa_query)); memcpy(resp->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(resp->sa, sdata->vif.addr, ETH_ALEN); memcpy(resp->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); - skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query)); resp->u.action.category = WLAN_CATEGORY_SA_QUERY; - resp->u.action.u.sa_query.action = WLAN_ACTION_SA_QUERY_RESPONSE; - memcpy(resp->u.action.u.sa_query.trans_id, - mgmt->u.action.u.sa_query.trans_id, + resp->u.action.action_code = WLAN_ACTION_SA_QUERY_RESPONSE; + memcpy(resp->u.action.sa_query.trans_id, + mgmt->u.action.sa_query.trans_id, WLAN_SA_QUERY_TR_ID_LEN); ieee80211_tx_skb(sdata, skb); @@ -3516,7 +3515,7 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) /* drop too small action frames */ if (ieee80211_is_action(mgmt->frame_control) && - rx->skb->len < IEEE80211_MIN_ACTION_SIZE) + rx->skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return RX_DROP_U_RUNT_ACTION; /* Drop non-broadcast Beacon frames */ @@ -3565,29 +3564,28 @@ ieee80211_process_rx_twt_action(struct ieee80211_rx_data *rx) if (!rx->sta) return false; - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_SETUP: { struct ieee80211_twt_setup *twt; - if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + - 1 + /* action code */ + if 
(rx->skb->len < IEEE80211_MIN_ACTION_SIZE(action_code) + sizeof(struct ieee80211_twt_setup) + 2 /* TWT req_type agrt */) break; - twt = (void *)mgmt->u.action.u.s1g.variable; + twt = (void *)mgmt->u.action.s1g.variable; if (twt->element_id != WLAN_EID_S1G_TWT) break; - if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + - 4 + /* action code + token + tlv */ + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE(action_code) + + 3 + /* token + tlv */ twt->length) break; return true; /* queue the frame */ } case WLAN_S1G_TWT_TEARDOWN: - if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + 2) + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE(action_code) + 1) break; return true; /* queue the frame */ @@ -3632,10 +3630,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; /* verify action & smps_control/chanwidth are present */ - if (len < IEEE80211_MIN_ACTION_SIZE + 2) + if (len < IEEE80211_MIN_ACTION_SIZE(ht_smps)) goto invalid; - switch (mgmt->u.action.u.ht_smps.action) { + switch (mgmt->u.action.action_code) { case WLAN_HT_ACTION_SMPS: { struct ieee80211_supported_band *sband; enum ieee80211_smps_mode smps_mode; @@ -3646,7 +3644,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto handled; /* convert to HT capability */ - switch (mgmt->u.action.u.ht_smps.smps_control) { + switch (mgmt->u.action.ht_smps.smps_control) { case WLAN_HT_SMPS_CONTROL_DISABLED: smps_mode = IEEE80211_SMPS_OFF; break; @@ -3679,7 +3677,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto handled; } case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { - u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth; + u8 chanwidth = mgmt->u.action.ht_notify_cw.chanwidth; if (chanwidth != IEEE80211_HT_CHANWIDTH_20MHZ && chanwidth != IEEE80211_HT_CHANWIDTH_ANY) @@ -3699,7 +3697,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; case WLAN_CATEGORY_PUBLIC: case WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION: - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) goto 
invalid; if (sdata->vif.type != NL80211_IFTYPE_STATION) break; @@ -3707,11 +3705,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; if (!ether_addr_equal(mgmt->bssid, sdata->deflink.u.mgd.bssid)) break; - if (mgmt->u.action.u.ext_chan_switch.action_code != + if (mgmt->u.action.action_code != WLAN_PUB_ACTION_EXT_CHANSW_ANN) break; - if (len < offsetof(struct ieee80211_mgmt, - u.action.u.ext_chan_switch.variable)) + if (len < IEEE80211_MIN_ACTION_SIZE(ext_chan_switch)) goto invalid; goto queue; case WLAN_CATEGORY_VHT: @@ -3723,18 +3720,18 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; /* verify action code is present */ - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) goto invalid; - switch (mgmt->u.action.u.vht_opmode_notif.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_VHT_ACTION_OPMODE_NOTIF: { /* verify opmode is present */ - if (len < IEEE80211_MIN_ACTION_SIZE + 2) + if (len < IEEE80211_MIN_ACTION_SIZE(vht_opmode_notif)) goto invalid; goto queue; } case WLAN_VHT_ACTION_GROUPID_MGMT: { - if (len < IEEE80211_MIN_ACTION_SIZE + 25) + if (len < IEEE80211_MIN_ACTION_SIZE(vht_group_notif)) goto invalid; goto queue; } @@ -3751,23 +3748,20 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; /* verify action_code is present */ - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) break; - switch (mgmt->u.action.u.addba_req.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_REQ: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.addba_req))) + if (len < IEEE80211_MIN_ACTION_SIZE(addba_req)) goto invalid; break; case WLAN_ACTION_ADDBA_RESP: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.addba_resp))) + if (len < IEEE80211_MIN_ACTION_SIZE(addba_resp)) goto invalid; break; case WLAN_ACTION_DELBA: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.delba))) + 
if (len < IEEE80211_MIN_ACTION_SIZE(delba)) goto invalid; break; default: @@ -3777,16 +3771,15 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto queue; case WLAN_CATEGORY_SPECTRUM_MGMT: /* verify action_code is present */ - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) break; - switch (mgmt->u.action.u.measurement.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_SPCT_MSR_REQ: if (status->band != NL80211_BAND_5GHZ) break; - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.measurement))) + if (len < IEEE80211_MIN_ACTION_SIZE(measurement)) break; if (sdata->vif.type != NL80211_IFTYPE_STATION) @@ -3796,8 +3789,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto handled; case WLAN_ACTION_SPCT_CHL_SWITCH: { u8 *bssid; - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.chan_switch))) + if (len < IEEE80211_MIN_ACTION_SIZE(chan_switch)) break; if (sdata->vif.type != NL80211_IFTYPE_STATION && @@ -3822,11 +3814,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } break; case WLAN_CATEGORY_SELF_PROTECTED: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.self_prot.action_code))) + if (len < IEEE80211_MIN_ACTION_SIZE(self_prot)) break; - switch (mgmt->u.action.u.self_prot.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_SP_MESH_PEERING_OPEN: case WLAN_SP_MESH_PEERING_CLOSE: case WLAN_SP_MESH_PEERING_CONFIRM: @@ -3844,8 +3835,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } break; case WLAN_CATEGORY_MESH_ACTION: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.mesh_action.action_code))) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) break; if (!ieee80211_vif_is_mesh(&sdata->vif)) @@ -3855,11 +3845,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; goto queue; case WLAN_CATEGORY_S1G: - if (len < offsetofend(typeof(*mgmt), - u.action.u.s1g.action_code)) + if (len < 
IEEE80211_MIN_ACTION_SIZE(action_code)) break; - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_SETUP: case WLAN_S1G_TWT_TEARDOWN: if (ieee80211_process_rx_twt_action(rx)) @@ -3870,33 +3859,29 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } break; case WLAN_CATEGORY_PROTECTED_EHT: - if (len < offsetofend(typeof(*mgmt), - u.action.u.ttlm_req.action_code)) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) break; - switch (mgmt->u.action.u.ttlm_req.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.ttlm_req)) + if (len < IEEE80211_MIN_ACTION_SIZE(ttlm_req)) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_TTLM_RES: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.ttlm_res)) + if (len < IEEE80211_MIN_ACTION_SIZE(ttlm_res)) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.ttlm_tear_down)) + if (len < IEEE80211_MIN_ACTION_SIZE(ttlm_tear_down)) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP: @@ -3906,34 +3891,29 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) /* The reconfiguration response action frame must * least one 'Status Duple' entry (3 octets) */ - if (len < - offsetofend(typeof(*mgmt), - u.action.u.ml_reconf_resp) + 3) + if (len < IEEE80211_MIN_ACTION_SIZE(ml_reconf_resp) + 3) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.epcs) + - IEEE80211_EPCS_ENA_RESP_BODY_LEN) + if (len < IEEE80211_MIN_ACTION_SIZE(epcs) + + IEEE80211_EPCS_ENA_RESP_BODY_LEN) goto invalid; goto queue; case 
WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.epcs)) + if (len < IEEE80211_MIN_ACTION_SIZE(epcs)) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF: if (sdata->vif.type != NL80211_IFTYPE_AP) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.eml_omn)) + if (len < IEEE80211_MIN_ACTION_SIZE(eml_omn)) goto invalid; goto queue; default: @@ -4015,11 +3995,10 @@ ieee80211_rx_h_action_post_userspace(struct ieee80211_rx_data *rx) switch (mgmt->u.action.category) { case WLAN_CATEGORY_SA_QUERY: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.sa_query))) + if (len < IEEE80211_MIN_ACTION_SIZE(sa_query)) break; - switch (mgmt->u.action.u.sa_query.action) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_SA_QUERY_REQUEST: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c index 1f68df6e8067..297abaa6fecf 100644 --- a/net/mac80211/s1g.c +++ b/net/mac80211/s1g.c @@ -2,7 +2,7 @@ /* * S1G handling * Copyright(c) 2020 Adapt-IP - * Copyright (C) 2023 Intel Corporation + * Copyright (C) 2023, 2026 Intel Corporation */ #include #include @@ -27,14 +27,14 @@ bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb) if (likely(mgmt->u.action.category != WLAN_CATEGORY_S1G)) return false; - return mgmt->u.action.u.s1g.action_code == WLAN_S1G_TWT_SETUP; + return mgmt->u.action.action_code == WLAN_S1G_TWT_SETUP; } static void ieee80211_s1g_send_twt_setup(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, struct ieee80211_twt_setup *twt) { - int len = IEEE80211_MIN_ACTION_SIZE + 4 + twt->length; + int len = IEEE80211_MIN_ACTION_SIZE(s1g) + 3 + twt->length; struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; @@ -52,8 +52,8 @@ ieee80211_s1g_send_twt_setup(struct ieee80211_sub_if_data *sdata, const u8 *da, 
memcpy(mgmt->bssid, bssid, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_S1G; - mgmt->u.action.u.s1g.action_code = WLAN_S1G_TWT_SETUP; - memcpy(mgmt->u.action.u.s1g.variable, twt, 3 + twt->length); + mgmt->u.action.action_code = WLAN_S1G_TWT_SETUP; + memcpy(mgmt->u.action.s1g.variable, twt, 3 + twt->length); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_INTFL_MLME_CONN_TX | @@ -71,12 +71,12 @@ ieee80211_s1g_send_twt_teardown(struct ieee80211_sub_if_data *sdata, u8 *id; skb = dev_alloc_skb(local->hw.extra_tx_headroom + - IEEE80211_MIN_ACTION_SIZE + 2); + IEEE80211_MIN_ACTION_SIZE(s1g) + 1); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE + 2); + mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(s1g) + 1); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, da, ETH_ALEN); @@ -84,8 +84,8 @@ ieee80211_s1g_send_twt_teardown(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->bssid, bssid, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_S1G; - mgmt->u.action.u.s1g.action_code = WLAN_S1G_TWT_TEARDOWN; - id = (u8 *)mgmt->u.action.u.s1g.variable; + mgmt->u.action.action_code = WLAN_S1G_TWT_TEARDOWN; + id = (u8 *)mgmt->u.action.s1g.variable; *id = flowid; IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | @@ -98,7 +98,7 @@ ieee80211_s1g_rx_twt_setup(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; - struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.u.s1g.variable; + struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.s1g.variable; struct ieee80211_twt_params *twt_agrt = (void *)twt->params; twt_agrt->req_type &= cpu_to_le16(~IEEE80211_TWT_REQTYPE_REQUEST); @@ -128,7 +128,7 @@ ieee80211_s1g_rx_twt_teardown(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; 
drv_twt_teardown_request(sdata->local, sdata, &sta->sta, - mgmt->u.action.u.s1g.variable[0]); + mgmt->u.action.s1g.variable[0]); } static void @@ -136,7 +136,7 @@ ieee80211_s1g_tx_twt_setup_fail(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; - struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.u.s1g.variable; + struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.s1g.variable; struct ieee80211_twt_params *twt_agrt = (void *)twt->params; u8 flowid = le16_get_bits(twt_agrt->req_type, IEEE80211_TWT_REQTYPE_FLOWID); @@ -160,7 +160,7 @@ void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, if (!sta) return; - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_SETUP: ieee80211_s1g_rx_twt_setup(sdata, sta, skb); break; @@ -185,7 +185,7 @@ void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, if (!sta) return; - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_SETUP: /* process failed twt setup frames */ ieee80211_s1g_tx_twt_setup_fail(sdata, sta, skb); diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 7422888d3640..e2eaf8d8d7ff 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2008, Intel Corporation * Copyright 2008, Johannes Berg - * Copyright (C) 2018, 2020, 2022-2024 Intel Corporation + * Copyright (C) 2018, 2020, 2022-2024, 2026 Intel Corporation */ #include @@ -409,35 +409,30 @@ static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_da struct sk_buff *skb; struct ieee80211_mgmt *msr_report; - skb = dev_alloc_skb(sizeof(*msr_report) + local->hw.extra_tx_headroom + - sizeof(struct ieee80211_msrment_ie)); + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(measurement) + + local->hw.extra_tx_headroom); 
if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); - msr_report = skb_put_zero(skb, 24); + msr_report = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(measurement)); memcpy(msr_report->da, da, ETH_ALEN); memcpy(msr_report->sa, sdata->vif.addr, ETH_ALEN); memcpy(msr_report->bssid, bssid, ETH_ALEN); msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); - skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement)); msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; - msr_report->u.action.u.measurement.action_code = - WLAN_ACTION_SPCT_MSR_RPRT; - msr_report->u.action.u.measurement.dialog_token = dialog_token; + msr_report->u.action.action_code = WLAN_ACTION_SPCT_MSR_RPRT; - msr_report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT; - msr_report->u.action.u.measurement.length = + msr_report->u.action.measurement.dialog_token = dialog_token; + msr_report->u.action.measurement.element_id = WLAN_EID_MEASURE_REPORT; + msr_report->u.action.measurement.length = sizeof(struct ieee80211_msrment_ie); - - memset(&msr_report->u.action.u.measurement.msr_elem, 0, - sizeof(struct ieee80211_msrment_ie)); - msr_report->u.action.u.measurement.msr_elem.token = request_ie->token; - msr_report->u.action.u.measurement.msr_elem.mode |= + msr_report->u.action.measurement.msr_elem.token = request_ie->token; + msr_report->u.action.measurement.msr_elem.mode |= IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED; - msr_report->u.action.u.measurement.msr_elem.type = request_ie->type; + msr_report->u.action.measurement.msr_elem.type = request_ie->type; ieee80211_tx_skb(sdata, skb); } @@ -454,7 +449,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, * TODO: Answer basic measurement as unmeasured */ ieee80211_send_refuse_measurement_request(sdata, - &mgmt->u.action.u.measurement.msr_elem, + &mgmt->u.action.measurement.msr_elem, mgmt->sa, mgmt->bssid, - mgmt->u.action.u.measurement.dialog_token); + 
mgmt->u.action.measurement.dialog_token); } diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index dbbfe2d6842f..1f30a4eda374 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -6,7 +6,7 @@ * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH * Copyright 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2019, 2021-2025 Intel Corporation + * Copyright (C) 2019, 2021-2026 Intel Corporation */ #include @@ -879,28 +879,23 @@ ieee80211_prep_tdls_direct(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_mgmt *mgmt; - mgmt = skb_put_zero(skb, 24); + if (action_code != WLAN_PUB_ACTION_TDLS_DISCOVER_RES) + return -EINVAL; + + mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(tdls_discover_resp)); memcpy(mgmt->da, peer, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, link->u.mgd.bssid, ETH_ALEN); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); - switch (action_code) { - case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: - skb_put(skb, 1 + sizeof(mgmt->u.action.u.tdls_discover_resp)); - mgmt->u.action.category = WLAN_CATEGORY_PUBLIC; - mgmt->u.action.u.tdls_discover_resp.action_code = - WLAN_PUB_ACTION_TDLS_DISCOVER_RES; - mgmt->u.action.u.tdls_discover_resp.dialog_token = - dialog_token; - mgmt->u.action.u.tdls_discover_resp.capability = - cpu_to_le16(ieee80211_get_tdls_sta_capab(link, - status_code)); - break; - default: - return -EINVAL; - } + mgmt->u.action.category = WLAN_CATEGORY_PUBLIC; + mgmt->u.action.action_code = WLAN_PUB_ACTION_TDLS_DISCOVER_RES; + + mgmt->u.action.tdls_discover_resp.dialog_token = dialog_token; + mgmt->u.action.tdls_discover_resp.capability = + cpu_to_le16(ieee80211_get_tdls_sta_capab(link, + status_code)); return 0; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b2e6c8b98381..55054de62508 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ 
-3766,12 +3766,11 @@ again: int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings) { + int hdr_len = IEEE80211_MIN_ACTION_SIZE(chan_switch); struct sk_buff *skb; struct ieee80211_mgmt *mgmt; struct ieee80211_local *local = sdata->local; int freq; - int hdr_len = offsetofend(struct ieee80211_mgmt, - u.action.u.chan_switch); u8 *pos; if (sdata->vif.type != NL80211_IFTYPE_ADHOC && @@ -3800,7 +3799,7 @@ int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); } mgmt->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; - mgmt->u.action.u.chan_switch.action_code = WLAN_ACTION_SPCT_CHL_SWITCH; + mgmt->u.action.action_code = WLAN_ACTION_SPCT_CHL_SWITCH; pos = skb_put(skb, 5); *pos++ = WLAN_EID_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index b099d79e8fbb..80120f9f17b6 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -4,7 +4,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include @@ -723,17 +723,17 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, if (!link_conf->mu_mimo_owner) return; - if (!memcmp(mgmt->u.action.u.vht_group_notif.position, + if (!memcmp(mgmt->u.action.vht_group_notif.position, link_conf->mu_group.position, WLAN_USER_POSITION_LEN) && - !memcmp(mgmt->u.action.u.vht_group_notif.membership, + !memcmp(mgmt->u.action.vht_group_notif.membership, link_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN)) return; memcpy(link_conf->mu_group.membership, - mgmt->u.action.u.vht_group_notif.membership, + mgmt->u.action.vht_group_notif.membership, WLAN_MEMBERSHIP_LEN); memcpy(link_conf->mu_group.position, - mgmt->u.action.u.vht_group_notif.position, + mgmt->u.action.vht_group_notif.position, WLAN_USER_POSITION_LEN); 
ieee80211_link_info_change_notify(sdata, link, -- cgit v1.2.3 From 9aa84d5c6c99480c523aeb7a6ce93b6635f3e290 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 4 Mar 2026 14:41:48 +0100 Subject: wifi: ieee80211: fix UHR operation DBE vs. P-EDCA order Draft P802.11bn_D1.3 switched the order here to align with the order of the fields. Adjust the code accordingly. Link: https://patch.msgid.link/20260304144148.ce45942294e1.I22ab3f16e6376a19c3953cf81dd67105ea8e529d@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-uhr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211-uhr.h b/include/linux/ieee80211-uhr.h index 9729d23e4766..d199f3ebdba0 100644 --- a/include/linux/ieee80211-uhr.h +++ b/include/linux/ieee80211-uhr.h @@ -12,8 +12,8 @@ #define IEEE80211_UHR_OPER_PARAMS_DPS_ENA 0x0001 #define IEEE80211_UHR_OPER_PARAMS_NPCA_ENA 0x0002 -#define IEEE80211_UHR_OPER_PARAMS_DBE_ENA 0x0004 -#define IEEE80211_UHR_OPER_PARAMS_PEDCA_ENA 0x0008 +#define IEEE80211_UHR_OPER_PARAMS_PEDCA_ENA 0x0004 +#define IEEE80211_UHR_OPER_PARAMS_DBE_ENA 0x0008 struct ieee80211_uhr_operation { __le16 params; -- cgit v1.2.3 From 98acd4c1d9f7dc9c426e840c16e81b57315ff84b Mon Sep 17 00:00:00 2001 From: Ria Thomas Date: Thu, 5 Mar 2026 14:43:04 +0530 Subject: wifi: mac80211: add support for NDP ADDBA/DELBA for S1G S1G defines use of NDP Block Ack (BA) for aggregation, requiring negotiation of NDP ADDBA/DELBA action frames. If the S1G recipient supports HT-immediate block ack, the sender must send an NDP ADDBA Request indicating it expects only NDP BlockAck frames for the agreement. Introduce support for NDP ADDBA and DELBA exchange in mac80211. The implementation negotiates the BA mechanism during setup based on station capabilities and driver support (IEEE80211_HW_SUPPORTS_NDP_BLOCKACK). 
If negotiation fails due to mismatched expectations, a rejection with status code WLAN_STATUS_REJECTED_NDP_BLOCK_ACK_SUGGESTED is returned as per IEEE 802.11-2024. Trace sample: IEEE 802.11 Wireless Management Fixed parameters Category code: Block Ack (3) Action code: NDP ADDBA Request (0x80) Dialog token: 0x01 Block Ack Parameters: 0x1003, A-MSDUs, Block Ack Policy .... .... .... ...1 = A-MSDUs: Permitted in QoS Data MPDUs .... .... .... ..1. = Block Ack Policy: Immediate Block Ack .... .... ..00 00.. = Traffic Identifier: 0x0 0001 0000 00.. .... = Number of Buffers (1 Buffer = 2304 Bytes): 64 Block Ack Timeout: 0x0000 Block Ack Starting Sequence Control (SSC): 0x0010 .... .... .... 0000 = Fragment: 0 0000 0000 0001 .... = Starting Sequence Number: 1 IEEE 802.11 Wireless Management Fixed parameters Category code: Block Ack (3) Action code: NDP ADDBA Response (0x81) Dialog token: 0x02 Status code: BlockAck negotiation refused because, due to buffer constraints and other unspecified reasons, the recipient prefers to generate only NDP BlockAck frames (0x006d) Block Ack Parameters: 0x1002, Block Ack Policy .... .... .... ...0 = A-MSDUs: Not Permitted .... .... .... ..1. = Block Ack Policy: Immediate Block Ack .... .... ..00 00.. = Traffic Identifier: 0x0 0001 0000 00.. .... 
= Number of Buffers (1 Buffer = 2304 Bytes): 64 Block Ack Timeout: 0x0000 Signed-off-by: Ria Thomas Link: https://patch.msgid.link/20260305091304.310990-1-ria.thomas@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211-ht.h | 3 +++ include/linux/ieee80211.h | 2 ++ include/net/mac80211.h | 4 ++++ net/mac80211/agg-rx.c | 24 +++++++++++++++++++++--- net/mac80211/agg-tx.c | 13 +++++++++---- net/mac80211/debugfs.c | 1 + net/mac80211/ht.c | 8 +++++--- net/mac80211/ieee80211_i.h | 6 +++++- net/mac80211/iface.c | 3 +++ net/mac80211/rx.c | 11 +++++++++-- net/mac80211/s1g.c | 8 ++++++++ net/mac80211/sta_info.h | 2 ++ 12 files changed, 72 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211-ht.h b/include/linux/ieee80211-ht.h index 21bbf470540f..7612b72f9c7c 100644 --- a/include/linux/ieee80211-ht.h +++ b/include/linux/ieee80211-ht.h @@ -281,6 +281,9 @@ enum ieee80211_back_actioncode { WLAN_ACTION_ADDBA_REQ = 0, WLAN_ACTION_ADDBA_RESP = 1, WLAN_ACTION_DELBA = 2, + WLAN_ACTION_NDP_ADDBA_REQ = 128, + WLAN_ACTION_NDP_ADDBA_RESP = 129, + WLAN_ACTION_NDP_DELBA = 130, }; /* BACK (block-ack) parties */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index aea360e90cb1..52db36120314 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1482,6 +1482,8 @@ enum ieee80211_statuscode { WLAN_STATUS_REJECT_DSE_BAND = 96, WLAN_STATUS_DENIED_WITH_SUGGESTED_BAND_AND_CHANNEL = 99, WLAN_STATUS_DENIED_DUE_TO_SPECTRUM_MANAGEMENT = 103, + /* 802.11ah */ + WLAN_STATUS_REJECTED_NDP_BLOCK_ACK_SUGGESTED = 109, /* 802.11ai */ WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 112, WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 113, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 9f8251fb9832..9cc482191ab9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2913,6 +2913,9 @@ struct ieee80211_txq { * HW flag so drivers can opt in according to their own control, e.g. in * testing. 
* + * @IEEE80211_HW_SUPPORTS_NDP_BLOCKACK: HW can transmit/receive S1G NDP + * BlockAck frames. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2973,6 +2976,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_DISALLOW_PUNCTURING, IEEE80211_HW_HANDLES_QUIET_CSA, IEEE80211_HW_STRICT, + IEEE80211_HW_SUPPORTS_NDP_BLOCKACK, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 0a2be8cb600f..0140ac826b23 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -94,7 +94,8 @@ void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, /* check if this is a self generated aggregation halt */ if (initiator == WLAN_BACK_RECIPIENT && tx) ieee80211_send_delba(sta->sdata, sta->sta.addr, - tid, WLAN_BACK_RECIPIENT, reason); + tid, WLAN_BACK_RECIPIENT, reason, + ieee80211_s1g_use_ndp_ba(sta->sdata, sta)); /* * return here in case tid_rx is not assigned - which will happen if @@ -240,6 +241,7 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, struct sk_buff *skb; struct ieee80211_mgmt *mgmt; bool amsdu = ieee80211_hw_check(&local->hw, SUPPORTS_AMSDU_IN_AMPDU); + bool use_ndp = ieee80211_s1g_use_ndp_ba(sdata, sta); u16 capab; skb = dev_alloc_skb(sizeof(*mgmt) + @@ -253,7 +255,8 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, skb_put(skb, 2 + sizeof(mgmt->u.action.addba_resp)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.action_code = WLAN_ACTION_ADDBA_RESP; + mgmt->u.action.action_code = use_ndp ? 
+ WLAN_ACTION_NDP_ADDBA_RESP : WLAN_ACTION_ADDBA_RESP; mgmt->u.action.addba_resp.dialog_token = dialog_token; @@ -276,6 +279,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq, + bool req_ndp, const u8 addba_ext_data) { struct ieee80211_local *local = sta->sdata->local; @@ -301,6 +305,18 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } + if (tx && ieee80211_s1g_use_ndp_ba(sta->sdata, sta) && !req_ndp) { + /* + * According to IEEE 802.11-2024: Inform S1G originator + * ADDBA rejected as NDP BlockAck is preferred + */ + status = WLAN_STATUS_REJECTED_NDP_BLOCK_ACK_SUGGESTED; + ht_dbg(sta->sdata, + "Rejecting AddBA Req from %pM tid %u - require NDP BlockAck\n", + sta->sta.addr, tid); + goto end; + } + if (!sta->sta.valid_links && !sta->sta.deflink.ht_cap.ht_supported && !sta->sta.deflink.he_cap.has_he && @@ -474,6 +490,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, struct ieee80211_mgmt *mgmt, size_t len) { + bool req_ndp = mgmt->u.action.action_code == WLAN_ACTION_NDP_ADDBA_REQ; u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num; u8 dialog_token, addba_ext_data; @@ -498,7 +515,8 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, __ieee80211_start_rx_ba_session(sta, dialog_token, timeout, start_seq_num, ba_policy, tid, - buf_size, true, false, addba_ext_data); + buf_size, true, false, + req_ndp, addba_ext_data); } void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index d5a62b8d5a80..01d927b88264 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -60,7 +60,7 @@ static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid, u8 dialog_token, u16 start_seq_num, - u16 agg_size, u16 timeout) + u16 agg_size, u16 timeout, bool ndp) { struct ieee80211_sub_if_data *sdata = sta->sdata; 
struct ieee80211_local *local = sdata->local; @@ -80,7 +80,8 @@ static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid, skb_put(skb, 2 + sizeof(mgmt->u.action.addba_req)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.action_code = WLAN_ACTION_ADDBA_REQ; + mgmt->u.action.action_code = ndp ? + WLAN_ACTION_NDP_ADDBA_REQ : WLAN_ACTION_ADDBA_REQ; mgmt->u.action.addba_req.dialog_token = dialog_token; capab = IEEE80211_ADDBA_PARAM_AMSDU_MASK; @@ -484,7 +485,8 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta, /* send AddBA request */ ieee80211_send_addba_request(sta, tid, tid_tx->dialog_token, - tid_tx->ssn, buf_size, tid_tx->timeout); + tid_tx->ssn, buf_size, tid_tx->timeout, + tid_tx->ndp); WARN_ON(test_and_set_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)); } @@ -521,6 +523,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) */ synchronize_net(); + tid_tx->ndp = ieee80211_s1g_use_ndp_ba(sdata, sta); params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, ¶ms); tid_tx->ssn = params.ssn; @@ -940,7 +943,9 @@ void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid, if (send_delba) ieee80211_send_delba(sdata, sta->sta.addr, tid, - WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); + WLAN_BACK_INITIATOR, + WLAN_REASON_QSTA_NOT_USE, + tid_tx->ndp); } void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index d02f07368c51..e8d0a8b71d59 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -490,6 +490,7 @@ static const char *hw_flag_names[] = { FLAG(DISALLOW_PUNCTURING), FLAG(HANDLES_QUIET_CSA), FLAG(STRICT), + FLAG(SUPPORTS_NDP_BLOCKACK), #undef FLAG }; diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 9e2469a8ce64..33f1e1b235e9 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -379,7 +379,7 @@ void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work) 
sta->ampdu_mlme.tid_rx_manage_offl)) __ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, IEEE80211_MAX_AMPDU_BUF_HT, - false, true, 0); + false, true, false, 0); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) @@ -455,7 +455,8 @@ void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work) void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, - u16 initiator, u16 reason_code) + u16 initiator, u16 reason_code, + bool use_ndp) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; @@ -473,7 +474,8 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, skb_put(skb, 2 + sizeof(mgmt->u.action.delba)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.action_code = WLAN_ACTION_DELBA; + mgmt->u.action.action_code = use_ndp ? + WLAN_ACTION_NDP_DELBA : WLAN_ACTION_DELBA; params = (u16)(initiator << 11); /* bit 11 initiator */ params |= (u16)(tid << 12); /* bit 15:12 TID number */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a4babf7624e5..d71e0c6d2165 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2190,7 +2190,8 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, struct link_sta_info *link_sta); void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, - u16 initiator, u16 reason_code); + u16 initiator, u16 reason_code, + bool use_ndp); int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid, int link_id); @@ -2206,6 +2207,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq, + bool req_ndp, const u8 addba_ext_data); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); @@ -2331,6 +2333,8 @@ void 
ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata, const struct ieee80211_s1g_cap *s1g_cap_ie, struct link_sta_info *link_sta); +bool ieee80211_s1g_use_ndp_ba(const struct ieee80211_sub_if_data *sdata, + const struct sta_info *sta); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 2e391cec73a0..40ce0bb72726 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1581,14 +1581,17 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, if (sta) { switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_REQ: + case WLAN_ACTION_NDP_ADDBA_REQ: ieee80211_process_addba_request(local, sta, mgmt, len); break; case WLAN_ACTION_ADDBA_RESP: + case WLAN_ACTION_NDP_ADDBA_RESP: ieee80211_process_addba_resp(local, sta, mgmt, len); break; case WLAN_ACTION_DELBA: + case WLAN_ACTION_NDP_DELBA: ieee80211_process_delba(sdata, sta, mgmt, len); break; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3bd046bebf9e..19c33f7a8193 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1475,7 +1475,9 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg)) ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_REQUIRE_SETUP); + WLAN_REASON_QSTA_REQUIRE_SETUP, + ieee80211_s1g_use_ndp_ba(rx->sdata, + rx->sta)); goto dont_reorder; } @@ -3372,7 +3374,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg)) ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_REQUIRE_SETUP); + WLAN_REASON_QSTA_REQUIRE_SETUP, + ieee80211_s1g_use_ndp_ba(rx->sdata, + rx->sta)); tid_agg_rx = 
rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) @@ -3753,14 +3757,17 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_REQ: + case WLAN_ACTION_NDP_ADDBA_REQ: if (len < IEEE80211_MIN_ACTION_SIZE(addba_req)) goto invalid; break; case WLAN_ACTION_ADDBA_RESP: + case WLAN_ACTION_NDP_ADDBA_RESP: if (len < IEEE80211_MIN_ACTION_SIZE(addba_resp)) goto invalid; break; case WLAN_ACTION_DELBA: + case WLAN_ACTION_NDP_DELBA: if (len < IEEE80211_MIN_ACTION_SIZE(delba)) goto invalid; break; diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c index 297abaa6fecf..5af4a0c6c642 100644 --- a/net/mac80211/s1g.c +++ b/net/mac80211/s1g.c @@ -220,3 +220,11 @@ void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata, ieee80211_sta_recalc_aggregates(&link_sta->sta->sta); } + +bool ieee80211_s1g_use_ndp_ba(const struct ieee80211_sub_if_data *sdata, + const struct sta_info *sta) +{ + return sdata->vif.cfg.s1g && + ieee80211_hw_check(&sdata->local->hw, SUPPORTS_NDP_BLOCKACK) && + (sta && sta->sta.deflink.s1g_cap.s1g); +} diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index f1b1bbf2a2d4..58ccbea7f6f6 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -171,6 +171,7 @@ struct sta_info; * @bar_pending: BAR needs to be re-sent * @amsdu: support A-MSDU within A-MDPU * @ssn: starting sequence number of the session + * @ndp: this session is using NDP Block ACKs * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. 
@@ -199,6 +200,7 @@ struct tid_ampdu_tx { u16 failed_bar_ssn; bool bar_pending; bool amsdu; + bool ndp; u8 tid; }; -- cgit v1.2.3 From 8300438dc424f367875acce54c7e91fa819e5ab1 Mon Sep 17 00:00:00 2001 From: Odelu Kukatla Date: Tue, 24 Feb 2026 13:43:06 +0200 Subject: dt-bindings: interconnect: document the RPMh Network-On-Chip interconnect in Eliza SoC Document the RPMh Network-On-Chip Interconnect of the Eliza platform. Signed-off-by: Odelu Kukatla Reviewed-by: Krzysztof Kozlowski Signed-off-by: Abel Vesa Link: https://msgid.link/20260224-eliza-interconnect-v4-1-ad75855d5018@oss.qualcomm.com Signed-off-by: Georgi Djakov --- .../bindings/interconnect/qcom,eliza-rpmh.yaml | 142 +++++++++++++++++++++ include/dt-bindings/interconnect/qcom,eliza-rpmh.h | 136 ++++++++++++++++++++ 2 files changed, 278 insertions(+) create mode 100644 Documentation/devicetree/bindings/interconnect/qcom,eliza-rpmh.yaml create mode 100644 include/dt-bindings/interconnect/qcom,eliza-rpmh.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/interconnect/qcom,eliza-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,eliza-rpmh.yaml new file mode 100644 index 000000000000..9a926a97e7bf --- /dev/null +++ b/Documentation/devicetree/bindings/interconnect/qcom,eliza-rpmh.yaml @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/interconnect/qcom,eliza-rpmh.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm RPMh Network-On-Chip Interconnect on Eliza SoC + +maintainers: + - Odelu Kukatla + +description: | + RPMh interconnect providers support system bandwidth requirements through + RPMh hardware accelerators known as Bus Clock Manager (BCM). The provider is + able to communicate with the BCM through the Resource State Coordinator (RSC) + associated with each execution environment. 
Provider nodes must point to at + least one RPMh device child node pertaining to their RSC and each provider + can map to multiple RPMh resources. + + See also: include/dt-bindings/interconnect/qcom,eliza-rpmh.h + +properties: + compatible: + enum: + - qcom,eliza-aggre1-noc + - qcom,eliza-aggre2-noc + - qcom,eliza-clk-virt + - qcom,eliza-cnoc-cfg + - qcom,eliza-cnoc-main + - qcom,eliza-gem-noc + - qcom,eliza-lpass-ag-noc + - qcom,eliza-lpass-lpiaon-noc + - qcom,eliza-lpass-lpicx-noc + - qcom,eliza-mc-virt + - qcom,eliza-mmss-noc + - qcom,eliza-nsp-noc + - qcom,eliza-pcie-anoc + - qcom,eliza-system-noc + + reg: + maxItems: 1 + + clocks: + minItems: 1 + maxItems: 2 + +required: + - compatible + +allOf: + - $ref: qcom,rpmh-common.yaml# + - if: + properties: + compatible: + contains: + enum: + - qcom,eliza-clk-virt + - qcom,eliza-mc-virt + then: + properties: + reg: false + else: + required: + - reg + + - if: + properties: + compatible: + contains: + enum: + - qcom,eliza-aggre1-noc + then: + properties: + clocks: + items: + - description: aggre UFS PHY AXI clock + - description: aggre USB3 PRIM AXI clock + + - if: + properties: + compatible: + contains: + enum: + - qcom,eliza-aggre2-noc + then: + properties: + clocks: + items: + - description: RPMH CC IPA clock + + - if: + properties: + compatible: + contains: + enum: + - qcom,eliza-pcie-anoc + then: + properties: + clocks: + items: + - description: aggre-NOC PCIe AXI clock + - description: cfg-NOC PCIe a-NOC AHB clock + + - if: + properties: + compatible: + contains: + enum: + - qcom,eliza-aggre1-noc + - qcom,eliza-aggre2-noc + - qcom,eliza-pcie-anoc + then: + required: + - clocks + else: + properties: + clocks: false + +unevaluatedProperties: false + +examples: + - | + gem_noc: interconnect@24100000 { + compatible = "qcom,eliza-gem-noc"; + reg = <0x24100000 0x163080>; + #interconnect-cells = <2>; + qcom,bcm-voters = <&apps_bcm_voter>; + }; + + mc_virt: interconnect-2 { + compatible = "qcom,eliza-mc-virt"; + 
#interconnect-cells = <2>; + qcom,bcm-voters = <&apps_bcm_voter>; + }; + + aggre1_noc: interconnect@16e0000 { + compatible = "qcom,eliza-aggre1-noc"; + reg = <0x16e0000 0x16400>; + #interconnect-cells = <2>; + clocks = <&gcc_phy_axi_clk>, <&gcc_prim_axi_clk>; + qcom,bcm-voters = <&apps_bcm_voter>; + }; diff --git a/include/dt-bindings/interconnect/qcom,eliza-rpmh.h b/include/dt-bindings/interconnect/qcom,eliza-rpmh.h new file mode 100644 index 000000000000..95db2fe647de --- /dev/null +++ b/include/dt-bindings/interconnect/qcom,eliza-rpmh.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef __DT_BINDINGS_INTERCONNECT_QCOM_ELIZA_H +#define __DT_BINDINGS_INTERCONNECT_QCOM_ELIZA_H + +#define MASTER_QSPI_0 0 +#define MASTER_QUP_1 1 +#define MASTER_UFS_MEM 2 +#define MASTER_USB3_0 3 +#define SLAVE_A1NOC_SNOC 4 + +#define MASTER_QUP_2 0 +#define MASTER_CRYPTO 1 +#define MASTER_IPA 2 +#define MASTER_SOCCP_AGGR_NOC 3 +#define MASTER_QDSS_ETR 4 +#define MASTER_QDSS_ETR_1 5 +#define MASTER_SDCC_1 6 +#define MASTER_SDCC_2 7 +#define SLAVE_A2NOC_SNOC 8 + +#define MASTER_QUP_CORE_1 0 +#define MASTER_QUP_CORE_2 1 +#define SLAVE_QUP_CORE_1 2 +#define SLAVE_QUP_CORE_2 3 + +#define MASTER_CNOC_CFG 0 +#define SLAVE_AHB2PHY_SOUTH 1 +#define SLAVE_AHB2PHY_NORTH 2 +#define SLAVE_CAMERA_CFG 3 +#define SLAVE_CLK_CTL 4 +#define SLAVE_CRYPTO_0_CFG 5 +#define SLAVE_DISPLAY_CFG 6 +#define SLAVE_GFX3D_CFG 7 +#define SLAVE_I3C_IBI0_CFG 8 +#define SLAVE_I3C_IBI1_CFG 9 +#define SLAVE_IMEM_CFG 10 +#define SLAVE_CNOC_MSS 11 +#define SLAVE_PCIE_0_CFG 12 +#define SLAVE_PRNG 13 +#define SLAVE_QDSS_CFG 14 +#define SLAVE_QSPI_0 15 +#define SLAVE_QUP_1 16 +#define SLAVE_QUP_2 17 +#define SLAVE_SDCC_2 18 +#define SLAVE_TCSR 19 +#define SLAVE_TLMM 20 +#define SLAVE_UFS_MEM_CFG 21 +#define SLAVE_USB3_0 22 +#define SLAVE_VENUS_CFG 23 +#define SLAVE_VSENSE_CTRL_CFG 24 +#define 
SLAVE_CNOC_MNOC_HF_CFG 25 +#define SLAVE_CNOC_MNOC_SF_CFG 26 +#define SLAVE_PCIE_ANOC_CFG 27 +#define SLAVE_QDSS_STM 28 +#define SLAVE_TCU 29 + +#define MASTER_GEM_NOC_CNOC 0 +#define MASTER_GEM_NOC_PCIE_SNOC 1 +#define SLAVE_AOSS 2 +#define SLAVE_IPA_CFG 3 +#define SLAVE_IPC_ROUTER_CFG 4 +#define SLAVE_SOCCP 5 +#define SLAVE_TME_CFG 6 +#define SLAVE_APPSS 7 +#define SLAVE_CNOC_CFG 8 +#define SLAVE_DDRSS_CFG 9 +#define SLAVE_BOOT_IMEM 10 +#define SLAVE_IMEM 11 +#define SLAVE_BOOT_IMEM_2 12 +#define SLAVE_SERVICE_CNOC 13 +#define SLAVE_PCIE_0 14 +#define SLAVE_PCIE_1 15 + +#define MASTER_GPU_TCU 0 +#define MASTER_SYS_TCU 1 +#define MASTER_APPSS_PROC 2 +#define MASTER_GFX3D 3 +#define MASTER_LPASS_GEM_NOC 4 +#define MASTER_MSS_PROC 5 +#define MASTER_MNOC_HF_MEM_NOC 6 +#define MASTER_MNOC_SF_MEM_NOC 7 +#define MASTER_COMPUTE_NOC 8 +#define MASTER_ANOC_PCIE_GEM_NOC 9 +#define MASTER_SNOC_SF_MEM_NOC 10 +#define MASTER_WLAN_Q6 11 +#define MASTER_GIC 12 +#define SLAVE_GEM_NOC_CNOC 13 +#define SLAVE_LLCC 14 +#define SLAVE_MEM_NOC_PCIE_SNOC 15 + +#define MASTER_LPIAON_NOC 0 +#define SLAVE_LPASS_GEM_NOC 1 + +#define MASTER_LPASS_LPINOC 0 +#define SLAVE_LPIAON_NOC_LPASS_AG_NOC 1 + +#define MASTER_LPASS_PROC 0 +#define SLAVE_LPICX_NOC_LPIAON_NOC 1 + +#define MASTER_LLCC 0 +#define SLAVE_EBI1 1 + +#define MASTER_CAMNOC_NRT_ICP_SF 0 +#define MASTER_CAMNOC_RT_CDM_SF 1 +#define MASTER_CAMNOC_SF 2 +#define MASTER_VIDEO_MVP 3 +#define MASTER_VIDEO_V_PROC 4 +#define MASTER_CNOC_MNOC_SF_CFG 5 +#define MASTER_CAMNOC_HF 6 +#define MASTER_MDP 7 +#define MASTER_CNOC_MNOC_HF_CFG 8 +#define SLAVE_MNOC_SF_MEM_NOC 9 +#define SLAVE_SERVICE_MNOC_SF 10 +#define SLAVE_MNOC_HF_MEM_NOC 11 +#define SLAVE_SERVICE_MNOC_HF 12 + +#define MASTER_CDSP_PROC 0 +#define SLAVE_CDSP_MEM_NOC 1 + +#define MASTER_PCIE_ANOC_CFG 0 +#define MASTER_PCIE_0 1 +#define MASTER_PCIE_1 2 +#define SLAVE_ANOC_PCIE_GEM_NOC 3 +#define SLAVE_SERVICE_PCIE_ANOC 4 + +#define MASTER_A1NOC_SNOC 0 +#define MASTER_A2NOC_SNOC 1 
+#define MASTER_CNOC_SNOC 2 +#define MASTER_NSINOC_SNOC 3 +#define SLAVE_SNOC_GEM_NOC_SF 4 + +#endif -- cgit v1.2.3 From 96fefcabf340fcf8b3208dcd8685961955a66040 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:31 -0500 Subject: vfs: widen inode hash/lookup functions to u64 Change the inode hash/lookup VFS API functions to accept u64 parameters instead of unsigned long for inode numbers and hash values. This is preparation for widening i_ino itself to u64, which will allow filesystems to store full 64-bit inode numbers on 32-bit architectures. Since unsigned long implicitly widens to u64 on all architectures, this change is backward-compatible with all existing callers. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-1-2257ad83d372@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/f2fs/node.c | 2 +- fs/inode.c | 36 ++++++++++++++++++------------------ include/linux/fs.h | 46 ++++++++++++++++++++++------------------------ 3 files changed, 41 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 74992fd9c9b6..5ca6f518cfa1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1997,7 +1997,7 @@ out: return ret; } -static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) +static int f2fs_match_ino(struct inode *inode, u64 ino, void *data) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); bool clean; diff --git a/fs/inode.c b/fs/inode.c index cc12b68e021b..62df5dda0589 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -672,7 +672,7 @@ static inline void inode_sb_list_del(struct inode *inode) } } -static unsigned long hash(struct super_block *sb, unsigned long hashval) +static unsigned long hash(struct super_block *sb, u64 hashval) { unsigned long tmp; @@ -685,12 +685,12 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) /** * __insert_inode_hash - hash an inode * @inode: unhashed 
inode - * @hashval: unsigned long value used to locate this object in the + * @hashval: u64 value used to locate this object in the * inode_hashtable. * * Add an inode to the inode hash for this superblock. */ -void __insert_inode_hash(struct inode *inode, unsigned long hashval) +void __insert_inode_hash(struct inode *inode, u64 hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); @@ -1087,7 +1087,7 @@ repeat: * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino, + struct hlist_head *head, u64 ino, bool hash_locked, bool *isnew) { struct inode *inode = NULL; @@ -1301,7 +1301,7 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ -struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +struct inode *inode_insert5(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { @@ -1378,7 +1378,7 @@ EXPORT_SYMBOL(inode_insert5); * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ -struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, +struct inode *iget5_locked(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { @@ -1408,7 +1408,7 @@ EXPORT_SYMBOL(iget5_locked); * This is equivalent to iget5_locked, except the @test callback must * tolerate the inode not being stable, including being mid-teardown. */ -struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, +struct inode *iget5_locked_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { @@ -1455,7 +1455,7 @@ EXPORT_SYMBOL_GPL(iget5_locked_rcu); * hashed, and with the I_NEW flag set. 
The file system gets to fill it in * before unlocking it via unlock_new_inode(). */ -struct inode *iget_locked(struct super_block *sb, unsigned long ino) +struct inode *iget_locked(struct super_block *sb, u64 ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; @@ -1527,7 +1527,7 @@ EXPORT_SYMBOL(iget_locked); * * Returns 1 if the inode number is unique, 0 if it is not. */ -static int test_inode_iunique(struct super_block *sb, unsigned long ino) +static int test_inode_iunique(struct super_block *sb, u64 ino) { struct hlist_head *b = inode_hashtable + hash(sb, ino); struct inode *inode; @@ -1616,7 +1616,7 @@ EXPORT_SYMBOL(igrab); * * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ -struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, +struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data, bool *isnew) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); @@ -1647,7 +1647,7 @@ EXPORT_SYMBOL(ilookup5_nowait); * * Note: @test is called with the inode_hash_lock held, so can't sleep. */ -struct inode *ilookup5(struct super_block *sb, unsigned long hashval, +struct inode *ilookup5(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; @@ -1677,7 +1677,7 @@ EXPORT_SYMBOL(ilookup5); * Search for the inode @ino in the inode cache, and if the inode is in the * cache, the inode is returned with an incremented reference count. */ -struct inode *ilookup(struct super_block *sb, unsigned long ino) +struct inode *ilookup(struct super_block *sb, u64 ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; @@ -1726,8 +1726,8 @@ EXPORT_SYMBOL(ilookup); * very carefully implemented. 
*/ struct inode *find_inode_nowait(struct super_block *sb, - unsigned long hashval, - int (*match)(struct inode *, unsigned long, + u64 hashval, + int (*match)(struct inode *, u64, void *), void *data) { @@ -1773,7 +1773,7 @@ EXPORT_SYMBOL(find_inode_nowait); * * The caller must hold the RCU read lock. */ -struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, +struct inode *find_inode_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); @@ -1812,7 +1812,7 @@ EXPORT_SYMBOL(find_inode_rcu); * The caller must hold the RCU read lock. */ struct inode *find_inode_by_ino_rcu(struct super_block *sb, - unsigned long ino) + u64 ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; @@ -1833,7 +1833,7 @@ EXPORT_SYMBOL(find_inode_by_ino_rcu); int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; - ino_t ino = inode->i_ino; + u64 ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); bool isnew; @@ -1884,7 +1884,7 @@ repeat: } EXPORT_SYMBOL(insert_inode_locked); -int insert_inode_locked4(struct inode *inode, unsigned long hashval, +int insert_inode_locked4(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), void *data) { struct inode *old; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b3dd145b25e..b4f5e5fdbe4b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2934,33 +2934,31 @@ static inline int inode_generic_drop(struct inode *inode) } extern void d_mark_dontcache(struct inode *inode); -extern struct inode *ilookup5_nowait(struct super_block *sb, - unsigned long hashval, int (*test)(struct inode *, void *), - void *data, bool *isnew); -extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), void *data); -extern struct inode *ilookup(struct super_block *sb, unsigned 
long ino); - -extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), - void *data); -struct inode *iget5_locked(struct super_block *, unsigned long, +struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, + int (*test)(struct inode *, void *), void *data, + bool *isnew); +struct inode *ilookup5(struct super_block *sb, u64 hashval, + int (*test)(struct inode *, void *), void *data); +struct inode *ilookup(struct super_block *sb, u64 ino); + +struct inode *inode_insert5(struct inode *inode, u64 hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data); +struct inode *iget5_locked(struct super_block *, u64, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); -struct inode *iget5_locked_rcu(struct super_block *, unsigned long, +struct inode *iget5_locked_rcu(struct super_block *, u64, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); -extern struct inode * iget_locked(struct super_block *, unsigned long); -extern struct inode *find_inode_nowait(struct super_block *, - unsigned long, - int (*match)(struct inode *, - unsigned long, void *), - void *data); -extern struct inode *find_inode_rcu(struct super_block *, unsigned long, - int (*)(struct inode *, void *), void *); -extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); -extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); +struct inode *iget_locked(struct super_block *, u64); +struct inode *find_inode_nowait(struct super_block *, u64, + int (*match)(struct inode *, u64, void *), + void *data); +struct inode *find_inode_rcu(struct super_block *, u64, + int (*)(struct inode *, void *), void *); +struct inode *find_inode_by_ino_rcu(struct super_block *, u64); +int insert_inode_locked4(struct inode *, u64, + int (*test)(struct 
inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void lockdep_annotate_inode_mutex_key(struct inode *inode); @@ -3015,7 +3013,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) -extern void __insert_inode_hash(struct inode *, unsigned long hashval); +void __insert_inode_hash(struct inode *, u64 hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); -- cgit v1.2.3 From 125dfa218134df7cc112667e92984de9d8cd0bf6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:32 -0500 Subject: audit: widen ino fields to u64 inode->i_ino is being widened from unsigned long to u64. The audit subsystem uses unsigned long ino in struct fields, function parameters, and local variables that store inode numbers from arbitrary filesystems. On 32-bit platforms this truncates inode numbers that exceed 32 bits, which will cause incorrect audit log entries and broken watch/mark comparisons. Widen all audit ino fields, parameters, and locals to u64, and update the inode format string from %lu to %llu to match. 
Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-2-2257ad83d372@kernel.org Acked-by: Paul Moore Signed-off-by: Christian Brauner --- include/linux/audit.h | 2 +- kernel/audit.h | 13 ++++++------- kernel/audit_fsnotify.c | 4 ++-- kernel/audit_watch.c | 12 ++++++------ kernel/auditsc.c | 4 ++-- 5 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index b642b5faca65..b915aaa7ed73 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -15,7 +15,7 @@ #include #include -#define AUDIT_INO_UNSET ((unsigned long)-1) +#define AUDIT_INO_UNSET ((u64)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) struct audit_sig_info { diff --git a/kernel/audit.h b/kernel/audit.h index 7c401729e21b..ac81fa02bcd7 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -76,7 +76,7 @@ struct audit_names { int name_len; /* number of chars to log */ bool hidden; /* don't log this record */ - unsigned long ino; + u64 ino; dev_t dev; umode_t mode; kuid_t uid; @@ -225,9 +225,9 @@ extern int auditd_test_task(struct task_struct *task); #define AUDIT_INODE_BUCKETS 32 extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; -static inline int audit_hash_ino(u32 ino) +static inline int audit_hash_ino(u64 ino) { - return (ino & (AUDIT_INODE_BUCKETS-1)); + return ((u32)ino & (AUDIT_INODE_BUCKETS-1)); } /* Indicates that audit should log the full pathname. 
*/ @@ -277,16 +277,15 @@ extern int audit_to_watch(struct audit_krule *krule, char *path, int len, extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, - dev_t dev); +extern int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev); extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); extern char *audit_mark_path(struct audit_fsnotify_mark *mark); extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); extern void audit_remove_mark_rule(struct audit_krule *krule); -extern int audit_mark_compare(struct audit_fsnotify_mark *mark, - unsigned long ino, dev_t dev); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino, + dev_t dev); extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index a4401f651060..711454f9f724 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -25,7 +25,7 @@ */ struct audit_fsnotify_mark { dev_t dev; /* associated superblock device */ - unsigned long ino; /* associated inode number */ + u64 ino; /* associated inode number */ char *path; /* insertion path */ struct fsnotify_mark mark; /* fsnotify mark on the inode */ struct audit_krule *rule; @@ -57,7 +57,7 @@ char *audit_mark_path(struct audit_fsnotify_mark *mark) return mark->path; } -int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev) +int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino, dev_t dev) { if (mark->ino == AUDIT_INO_UNSET) return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 096faac2435c..33577f0f54ef 
100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -37,7 +37,7 @@ struct audit_watch { refcount_t count; /* reference count */ dev_t dev; /* associated superblock device */ char *path; /* insertion path */ - unsigned long ino; /* associated inode number */ + u64 ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ struct list_head rules; /* anchor for krule->rlist */ @@ -125,7 +125,7 @@ char *audit_watch_path(struct audit_watch *watch) return watch->path; } -int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) +int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev) { return (watch->ino != AUDIT_INO_UNSET) && (watch->ino == ino) && @@ -244,7 +244,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc /* Update inode info in audit rules based on filesystem event. */ static void audit_update_watch(struct audit_parent *parent, const struct qstr *dname, dev_t dev, - unsigned long ino, unsigned invalidating) + u64 ino, unsigned invalidating) { struct audit_watch *owatch, *nwatch, *nextw; struct audit_krule *r, *nextr; @@ -285,7 +285,7 @@ static void audit_update_watch(struct audit_parent *parent, list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); } else { - int h = audit_hash_ino((u32)ino); + int h = audit_hash_ino(ino); /* * nentry->rule.watch == oentry->rule.watch so @@ -439,7 +439,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) audit_add_to_parent(krule, parent); - h = audit_hash_ino((u32)watch->ino); + h = audit_hash_ino(watch->ino); *list = &audit_inode_hash[h]; error: path_put(&parent_path); @@ -527,7 +527,7 @@ int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) { struct file *exe_file; - unsigned long ino; + u64 ino; dev_t dev; /* 
only do exe filtering if we are recording @current events/records */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f6af6a8f68c4..ab54fccba215 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -886,7 +886,7 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, struct audit_context *ctx) { - int h = audit_hash_ino((u32)n->ino); + int h = audit_hash_ino(n->ino); struct list_head *list = &audit_inode_hash[h]; return __audit_filter_op(tsk, ctx, list, n, ctx->major); @@ -1534,7 +1534,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, audit_log_format(ab, " name=(null)"); if (n->ino != AUDIT_INO_UNSET) - audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", + audit_log_format(ab, " inode=%llu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", n->ino, MAJOR(n->dev), MINOR(n->dev), -- cgit v1.2.3 From 0fe27e5985925de8f0243f91658af7a6fea05725 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:33 -0500 Subject: net: change sock.sk_ino and sock_i_ino() to u64 inode->i_ino is being converted to a u64. sock.sk_ino (which caches the inode number) must also be widened to avoid truncation on 32-bit architectures where unsigned long is only 32 bits. Change sk_ino from unsigned long to u64, and update the return type of sock_i_ino() to match. Fix all format strings that print the result of sock_i_ino() (%lu -> %llu), and widen the intermediate variables and function parameters in the diag modules that were using int to hold the inode number. Note that the UAPI socket diag structures (inet_diag_msg.idiag_inode, unix_diag_msg.udiag_ino, etc.) are all __u32 and cannot be changed without breaking the ABI. The assignments to those fields will silently truncate, which is the existing behavior. 
Acked-by: Marc Kleine-Budde # for net/can Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-3-2257ad83d372@kernel.org Signed-off-by: Christian Brauner --- include/net/sock.h | 4 ++-- net/ax25/af_ax25.c | 2 +- net/bluetooth/af_bluetooth.c | 4 ++-- net/can/bcm.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/datagram.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/key/af_key.c | 2 +- net/netlink/af_netlink.c | 2 +- net/netlink/diag.c | 2 +- net/packet/af_packet.c | 2 +- net/packet/diag.c | 2 +- net/phonet/socket.c | 4 ++-- net/sctp/proc.c | 4 ++-- net/unix/af_unix.c | 2 +- net/unix/diag.c | 6 +++--- net/xdp/xsk_diag.c | 2 +- 20 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 66b56288c1d3..b08cb0347e5e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -537,7 +537,7 @@ struct sock { rwlock_t sk_callback_lock; u32 sk_ack_backlog; u32 sk_max_ack_backlog; - unsigned long sk_ino; + u64 sk_ino; spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; @@ -2140,7 +2140,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } -static inline unsigned long sock_i_ino(const struct sock *sk) +static inline u64 sock_i_ino(const struct sock *sk) { /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */ return READ_ONCE(sk->sk_ino); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index a76f4793aed2..9d236e64f5f5 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1986,7 +1986,7 @@ static int ax25_info_show(struct seq_file *seq, void *v) ax25->paclen); if (ax25->sk != NULL) { - seq_printf(seq, " %d %d %lu\n", + seq_printf(seq, " %d %d %llu\n", sk_wmem_alloc_get(ax25->sk), sk_rmem_alloc_get(ax25->sk), sock_i_ino(ax25->sk)); diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 
2b94e2077203..33d053d63407 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -817,14 +817,14 @@ static int bt_seq_show(struct seq_file *seq, void *v) struct bt_sock *bt = bt_sk(sk); seq_printf(seq, - "%pK %-6d %-6u %-6u %-6u %-6lu %-6lu", + "%pK %-6d %-6u %-6u %-6u %-6llu %-6llu", sk, refcount_read(&sk->sk_refcnt), sk_rmem_alloc_get(sk), sk_wmem_alloc_get(sk), from_kuid(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), - bt->parent ? sock_i_ino(bt->parent) : 0LU); + bt->parent ? sock_i_ino(bt->parent) : 0ULL); if (l->custom_seq_show) { seq_putc(seq, ' '); diff --git a/net/can/bcm.c b/net/can/bcm.c index b7324e9c955b..30aac2f903d5 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1712,7 +1712,7 @@ static int bcm_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int #if IS_ENABLED(CONFIG_PROC_FS) if (net->can.bcmproc_dir) { /* unique socket address as filename */ - sprintf(bo->procname, "%lu", sock_i_ino(sk)); + sprintf(bo->procname, "%llu", sock_i_ino(sk)); bo->bcm_proc_read = proc_create_net_single(bo->procname, 0644, net->can.bcmproc_dir, bcm_proc_show, sk); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 71d5e17719de..bc4b43e52303 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1111,7 +1111,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %llu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e20c41206e29..bcc99ced1ade 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -1041,7 +1041,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", + " %02X %08X:%08X 
%02X:%08lX %08X %5u %8d %llu %d %pK %u\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 63a8b174cf99..b772dceee395 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2883,7 +2883,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) READ_ONCE(tp->copied_seq), 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " - "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", + "%08X %5u %8d %llu %d %pK %lu %lu %u %u %d", i, src, srcp, dest, destp, state, READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6c6b68a66dcd..db58bf786c4a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3428,7 +3428,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", + " %02X %08X:%08X %02X:%08lX %08X %5u %8d %llu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), udp_rqueue_get(sp), diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c564b68a0562..611fddb90c79 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -1055,7 +1055,7 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, src = &sp->sk_v6_rcv_saddr; seq_printf(seq, "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %llu %d %pK %u\n", bucket, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d10487b4e5bf..cedfecc5c200 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2189,7 +2189,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X 
" - "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %llu %d %pK %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, diff --git a/net/key/af_key.c b/net/key/af_key.c index 0756bac62f7c..522308ec934e 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3783,7 +3783,7 @@ static int pfkey_seq_show(struct seq_file *f, void *v) if (v == SEQ_START_TOKEN) seq_printf(f ,"sk RefCnt Rmem Wmem User Inode\n"); else - seq_printf(f, "%pK %-6d %-6u %-6u %-6u %-6lu\n", + seq_printf(f, "%pK %-6d %-6u %-6u %-6u %-6llu\n", s, refcount_read(&s->sk_refcnt), sk_rmem_alloc_get(s), diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4d609d5cf406..aba847902be5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2700,7 +2700,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v) struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); - seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n", + seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8llu\n", s, s->sk_protocol, nlk->portid, diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 1dfc340736b8..0b3e021bd0ed 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -43,7 +43,7 @@ static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb) static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_diag_req *req, - u32 portid, u32 seq, u32 flags, int sk_ino) + u32 portid, u32 seq, u32 flags, u64 sk_ino) { struct nlmsghdr *nlh; struct netlink_diag_msg *rep; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 72d0935139f0..f2af2e0a8530 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4721,7 +4721,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) const struct packet_sock *po = pkt_sk(s); seq_printf(seq, - "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", + 
"%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6llu\n", s, refcount_read(&s->sk_refcnt), s->sk_type, diff --git a/net/packet/diag.c b/net/packet/diag.c index c8f43e0c1925..cee773f46571 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -130,7 +130,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct packet_diag_req *req, bool may_report_filterinfo, struct user_namespace *user_ns, - u32 portid, u32 seq, u32 flags, int sk_ino) + u32 portid, u32 seq, u32 flags, u64 sk_ino) { struct nlmsghdr *nlh; struct packet_diag_msg *rp; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 4423d483c630..c4af26357144 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -579,7 +579,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) struct sock *sk = v; struct pn_sock *pn = pn_sk(sk); - seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu " + seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %llu " "%d %pK %u", sk->sk_protocol, pn->sobject, pn->dobject, pn->resource, sk->sk_state, @@ -754,7 +754,7 @@ static int pn_res_seq_show(struct seq_file *seq, void *v) struct sock *sk = rcu_dereference_protected(*psk, lockdep_is_held(&resource_mutex)); - seq_printf(seq, "%02X %5u %lu", + seq_printf(seq, "%02X %5u %llu", (int) (psk - pnres.sk), from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk)); diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 1ed281f3c355..43433d7e2acd 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -174,7 +174,7 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v) sk = ep->base.sk; if (!net_eq(sock_net(sk), seq_file_net(seq))) continue; - seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5u %5lu ", ep, sk, + seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5u %5llu ", ep, sk, sctp_sk(sk)->type, sk->sk_state, hash, ep->base.bind_addr.port, from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), @@ -261,7 +261,7 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) seq_printf(seq, 
"%8pK %8pK %-3d %-3d %-2d %-4d " - "%4d %8d %8d %7u %5lu %-5d %5d ", + "%4d %8d %8d %7u %5llu %-5d %5d ", assoc, sk, sctp_sk(sk)->type, sk->sk_state, assoc->state, 0, assoc->assoc_id, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3756a93dc63a..a6c5015f3f0a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3537,7 +3537,7 @@ static int unix_seq_show(struct seq_file *seq, void *v) struct unix_sock *u = unix_sk(s); unix_state_lock(s); - seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", + seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5llu", s, refcount_read(&s->sk_refcnt), 0, diff --git a/net/unix/diag.c b/net/unix/diag.c index ca3473026151..410f6c8745b7 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -45,7 +45,7 @@ static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb) static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb) { struct sock *peer; - int ino; + u64 ino; peer = unix_peer_get(sk); if (peer) { @@ -112,7 +112,7 @@ static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb, static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, struct user_namespace *user_ns, - u32 portid, u32 seq, u32 flags, int sk_ino) + u32 portid, u32 seq, u32 flags, u64 sk_ino) { struct nlmsghdr *nlh; struct unix_diag_msg *rep; @@ -186,7 +186,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) num = 0; spin_lock(&net->unx.table.locks[slot]); sk_for_each(sk, &net->unx.table.buckets[slot]) { - int sk_ino; + u64 sk_ino; if (num < s_num) goto next; diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 0e0bca031c03..0170363eb542 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -92,7 +92,7 @@ static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb) static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, struct xdp_diag_req *req, struct user_namespace *user_ns, - u32 portid, u32 seq, u32 flags, int sk_ino) + u32 
portid, u32 seq, u32 flags, u64 sk_ino) { struct xdp_sock *xs = xdp_sk(sk); struct xdp_diag_msg *msg; -- cgit v1.2.3 From 5a3d5928b1a4196affc5f504fdf95f67f169bc57 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:34 -0500 Subject: vfs: widen trace event i_ino fields to u64 Update VFS-layer trace event definitions to use u64 instead of ino_t/unsigned long for inode number fields. Update TP_printk format strings to use %llu/%llx to match the widened field type. Remove now-unnecessary (unsigned long) casts since __entry->ino is already u64. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-4-2257ad83d372@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/iomap/trace.h | 8 +- include/trace/events/filelock.h | 34 ++++---- include/trace/events/filemap.h | 20 ++--- include/trace/events/fs_dax.h | 20 ++--- include/trace/events/fsverity.h | 30 ++++---- include/trace/events/netfs.h | 8 +- include/trace/events/readahead.h | 18 ++--- include/trace/events/timestamp.h | 16 ++-- include/trace/events/writeback.h | 162 +++++++++++++++++++-------------------- 9 files changed, 158 insertions(+), 158 deletions(-) (limited to 'include') diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 532787277b16..097773c6db80 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -257,7 +257,7 @@ TRACE_EVENT(iomap_dio_rw_begin, TP_ARGS(iocb, iter, dio_flags, done_before), TP_STRUCT__entry( __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, isize) __field(loff_t, pos) __field(size_t, count) @@ -277,7 +277,7 @@ TRACE_EVENT(iomap_dio_rw_begin, __entry->dio_flags = dio_flags; __entry->aio = !is_sync_kiocb(iocb); ), - TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx length 0x%zx done_before 0x%zx flags %s dio_flags %s aio %d", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx length 0x%zx done_before 0x%zx flags %s dio_flags %s aio %d", MAJOR(__entry->dev), MINOR(__entry->dev), 
__entry->ino, __entry->isize, @@ -294,7 +294,7 @@ TRACE_EVENT(iomap_dio_complete, TP_ARGS(iocb, error, ret), TP_STRUCT__entry( __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, isize) __field(loff_t, pos) __field(int, ki_flags) @@ -312,7 +312,7 @@ TRACE_EVENT(iomap_dio_complete, __entry->error = error; __entry->ret = ret; ), - TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx flags %s aio %d error %d ret %zd", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx flags %s aio %d error %d ret %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index 370016c38a5b..116774886244 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -42,10 +42,10 @@ TRACE_EVENT(locks_get_lock_context, TP_ARGS(inode, type, ctx), TP_STRUCT__entry( - __field(unsigned long, i_ino) + __field(u64, i_ino) + __field(struct file_lock_context *, ctx) __field(dev_t, s_dev) __field(unsigned char, type) - __field(struct file_lock_context *, ctx) ), TP_fast_assign( @@ -55,7 +55,7 @@ TRACE_EVENT(locks_get_lock_context, __entry->ctx = ctx; ), - TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p", + TP_printk("dev=0x%x:0x%x ino=0x%llx type=%s ctx=%p", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, show_fl_type(__entry->type), __entry->ctx) ); @@ -66,16 +66,16 @@ DECLARE_EVENT_CLASS(filelock_lock, TP_ARGS(inode, fl, ret), TP_STRUCT__entry( + __field(u64, i_ino) + __field(loff_t, fl_start) + __field(loff_t, fl_end) __field(struct file_lock *, fl) - __field(unsigned long, i_ino) - __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) + __field(dev_t, s_dev) __field(unsigned int, pid) __field(unsigned int, flags) __field(unsigned char, type) - __field(loff_t, fl_start) - __field(loff_t, fl_end) __field(int, ret) ), @@ -93,7 +93,7 @@ DECLARE_EVENT_CLASS(filelock_lock, __entry->ret = 
ret; ), - TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", + TP_printk("fl=%p dev=0x%x:0x%x ino=0x%llx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, __entry->pid, show_fl_flags(__entry->flags), @@ -123,15 +123,15 @@ DECLARE_EVENT_CLASS(filelock_lease, TP_ARGS(inode, fl), TP_STRUCT__entry( + __field(u64, i_ino) __field(struct file_lease *, fl) - __field(unsigned long, i_ino) - __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) - __field(unsigned int, flags) - __field(unsigned char, type) __field(unsigned long, break_time) __field(unsigned long, downgrade_time) + __field(dev_t, s_dev) + __field(unsigned int, flags) + __field(unsigned char, type) ), TP_fast_assign( @@ -146,7 +146,7 @@ DECLARE_EVENT_CLASS(filelock_lease, __entry->downgrade_time = fl ? 
fl->fl_downgrade_time : 0; ), - TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", + TP_printk("fl=%p dev=0x%x:0x%x ino=0x%llx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, show_fl_flags(__entry->flags), @@ -175,12 +175,12 @@ TRACE_EVENT(generic_add_lease, TP_ARGS(inode, fl), TP_STRUCT__entry( - __field(unsigned long, i_ino) + __field(u64, i_ino) + __field(fl_owner_t, owner) + __field(dev_t, s_dev) __field(int, wcount) __field(int, rcount) __field(int, icount) - __field(dev_t, s_dev) - __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) ), @@ -196,7 +196,7 @@ TRACE_EVENT(generic_add_lease, __entry->type = fl->c.flc_type; ), - TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", + TP_printk("dev=0x%x:0x%x ino=0x%llx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->wcount, __entry->rcount, __entry->icount, __entry->owner, diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h index f48fe637bfd2..4dcf8e9e2e0d 100644 --- a/include/trace/events/filemap.h +++ b/include/trace/events/filemap.h @@ -20,8 +20,8 @@ DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, TP_ARGS(folio), TP_STRUCT__entry( + __field(u64, i_ino) __field(unsigned long, pfn) - __field(unsigned long, i_ino) __field(unsigned long, index) __field(dev_t, s_dev) __field(unsigned char, order) @@ -38,7 +38,7 @@ DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, __entry->order = folio_order(folio); ), - TP_printk("dev %d:%d ino %lx pfn=0x%lx ofs=%lu order=%u", + TP_printk("dev %d:%d ino %llx pfn=0x%lx ofs=%lu order=%u", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->pfn, @@ 
-67,7 +67,7 @@ DECLARE_EVENT_CLASS(mm_filemap_op_page_cache_range, TP_ARGS(mapping, index, last_index), TP_STRUCT__entry( - __field(unsigned long, i_ino) + __field(u64, i_ino) __field(dev_t, s_dev) __field(unsigned long, index) __field(unsigned long, last_index) @@ -85,7 +85,7 @@ DECLARE_EVENT_CLASS(mm_filemap_op_page_cache_range, ), TP_printk( - "dev=%d:%d ino=%lx ofs=%lld-%lld", + "dev=%d:%d ino=%llx ofs=%lld-%lld", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, ((loff_t)__entry->index) << PAGE_SHIFT, @@ -117,7 +117,7 @@ TRACE_EVENT(mm_filemap_fault, TP_ARGS(mapping, index), TP_STRUCT__entry( - __field(unsigned long, i_ino) + __field(u64, i_ino) __field(dev_t, s_dev) __field(unsigned long, index) ), @@ -133,7 +133,7 @@ TRACE_EVENT(mm_filemap_fault, ), TP_printk( - "dev=%d:%d ino=%lx ofs=%lld", + "dev=%d:%d ino=%llx ofs=%lld", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, ((loff_t)__entry->index) << PAGE_SHIFT @@ -146,7 +146,7 @@ TRACE_EVENT(filemap_set_wb_err, TP_ARGS(mapping, eseq), TP_STRUCT__entry( - __field(unsigned long, i_ino) + __field(u64, i_ino) __field(dev_t, s_dev) __field(errseq_t, errseq) ), @@ -160,7 +160,7 @@ TRACE_EVENT(filemap_set_wb_err, __entry->s_dev = mapping->host->i_rdev; ), - TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x", + TP_printk("dev=%d:%d ino=0x%llx errseq=0x%x", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->errseq) ); @@ -171,8 +171,8 @@ TRACE_EVENT(file_check_and_advance_wb_err, TP_ARGS(file, old), TP_STRUCT__entry( + __field(u64, i_ino) __field(struct file *, file) - __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(errseq_t, old) __field(errseq_t, new) @@ -191,7 +191,7 @@ TRACE_EVENT(file_check_and_advance_wb_err, __entry->new = file->f_wb_err; ), - TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x", + TP_printk("file=%p dev=%d:%d ino=0x%llx old=0x%x new=0x%x", __entry->file, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->old, 
__entry->new) diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 50ebc1290ab0..11121baa8ece 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -12,7 +12,7 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, pgoff_t max_pgoff, int result), TP_ARGS(inode, vmf, max_pgoff, result), TP_STRUCT__entry( - __field(unsigned long, ino) + __field(u64, ino) __field(unsigned long, vm_start) __field(unsigned long, vm_end) __field(vm_flags_t, vm_flags) @@ -35,7 +35,7 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, __entry->max_pgoff = max_pgoff; __entry->result = result; ), - TP_printk("dev %d:%d ino %#lx %s %s address %#lx vm_start " + TP_printk("dev %d:%d ino %#llx %s %s address %#lx vm_start " "%#lx vm_end %#lx pgoff %#lx max_pgoff %#lx %s", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -66,7 +66,7 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, void *radix_entry), TP_ARGS(inode, vmf, zero_folio, radix_entry), TP_STRUCT__entry( - __field(unsigned long, ino) + __field(u64, ino) __field(vm_flags_t, vm_flags) __field(unsigned long, address) __field(struct folio *, zero_folio) @@ -81,7 +81,7 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, __entry->zero_folio = zero_folio; __entry->radix_entry = radix_entry; ), - TP_printk("dev %d:%d ino %#lx %s address %#lx zero_folio %p " + TP_printk("dev %d:%d ino %#llx %s address %#lx zero_folio %p " "radix_entry %#lx", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -106,7 +106,7 @@ DECLARE_EVENT_CLASS(dax_pte_fault_class, TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), TP_ARGS(inode, vmf, result), TP_STRUCT__entry( - __field(unsigned long, ino) + __field(u64, ino) __field(vm_flags_t, vm_flags) __field(unsigned long, address) __field(pgoff_t, pgoff) @@ -123,7 +123,7 @@ DECLARE_EVENT_CLASS(dax_pte_fault_class, __entry->pgoff = vmf->pgoff; __entry->result = result; ), - TP_printk("dev %d:%d ino %#lx %s %s address %#lx pgoff %#lx %s", + TP_printk("dev %d:%d ino %#llx %s %s address %#lx 
pgoff %#lx %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -150,7 +150,7 @@ DECLARE_EVENT_CLASS(dax_writeback_range_class, TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index), TP_ARGS(inode, start_index, end_index), TP_STRUCT__entry( - __field(unsigned long, ino) + __field(u64, ino) __field(pgoff_t, start_index) __field(pgoff_t, end_index) __field(dev_t, dev) @@ -161,7 +161,7 @@ DECLARE_EVENT_CLASS(dax_writeback_range_class, __entry->start_index = start_index; __entry->end_index = end_index; ), - TP_printk("dev %d:%d ino %#lx pgoff %#lx-%#lx", + TP_printk("dev %d:%d ino %#llx pgoff %#lx-%#lx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -182,7 +182,7 @@ TRACE_EVENT(dax_writeback_one, TP_PROTO(struct inode *inode, pgoff_t pgoff, pgoff_t pglen), TP_ARGS(inode, pgoff, pglen), TP_STRUCT__entry( - __field(unsigned long, ino) + __field(u64, ino) __field(pgoff_t, pgoff) __field(pgoff_t, pglen) __field(dev_t, dev) @@ -193,7 +193,7 @@ TRACE_EVENT(dax_writeback_one, __entry->pgoff = pgoff; __entry->pglen = pglen; ), - TP_printk("dev %d:%d ino %#lx pgoff %#lx pglen %#lx", + TP_printk("dev %d:%d ino %#llx pgoff %#lx pglen %#lx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, diff --git a/include/trace/events/fsverity.h b/include/trace/events/fsverity.h index a8c52f21cbd5..4477c17e0574 100644 --- a/include/trace/events/fsverity.h +++ b/include/trace/events/fsverity.h @@ -16,7 +16,7 @@ TRACE_EVENT(fsverity_enable, const struct merkle_tree_params *params), TP_ARGS(inode, params), TP_STRUCT__entry( - __field(ino_t, ino) + __field(u64, ino) __field(u64, data_size) __field(u64, tree_size) __field(unsigned int, merkle_block) @@ -29,8 +29,8 @@ TRACE_EVENT(fsverity_enable, __entry->merkle_block = params->block_size; __entry->num_levels = params->num_levels; ), - TP_printk("ino %lu data_size %llu tree_size %llu merkle_block %u levels %u", - (unsigned long) __entry->ino, + TP_printk("ino %llu data_size %llu tree_size %llu 
merkle_block %u levels %u", + __entry->ino, __entry->data_size, __entry->tree_size, __entry->merkle_block, @@ -42,7 +42,7 @@ TRACE_EVENT(fsverity_tree_done, const struct merkle_tree_params *params), TP_ARGS(inode, vi, params), TP_STRUCT__entry( - __field(ino_t, ino) + __field(u64, ino) __field(u64, data_size) __field(u64, tree_size) __field(unsigned int, merkle_block) @@ -59,8 +59,8 @@ TRACE_EVENT(fsverity_tree_done, memcpy(__get_dynamic_array(root_hash), vi->root_hash, __get_dynamic_array_len(root_hash)); memcpy(__get_dynamic_array(file_digest), vi->file_digest, __get_dynamic_array_len(file_digest)); ), - TP_printk("ino %lu data_size %llu tree_size %lld merkle_block %u levels %u root_hash %s digest %s", - (unsigned long) __entry->ino, + TP_printk("ino %llu data_size %llu tree_size %lld merkle_block %u levels %u root_hash %s digest %s", + __entry->ino, __entry->data_size, __entry->tree_size, __entry->merkle_block, @@ -75,7 +75,7 @@ TRACE_EVENT(fsverity_verify_data_block, u64 data_pos), TP_ARGS(inode, params, data_pos), TP_STRUCT__entry( - __field(ino_t, ino) + __field(u64, ino) __field(u64, data_pos) __field(unsigned int, merkle_block) ), @@ -84,8 +84,8 @@ TRACE_EVENT(fsverity_verify_data_block, __entry->data_pos = data_pos; __entry->merkle_block = params->block_size; ), - TP_printk("ino %lu data_pos %llu merkle_block %u", - (unsigned long) __entry->ino, + TP_printk("ino %llu data_pos %llu merkle_block %u", + __entry->ino, __entry->data_pos, __entry->merkle_block) ); @@ -96,7 +96,7 @@ TRACE_EVENT(fsverity_merkle_hit, unsigned int hidx), TP_ARGS(inode, data_pos, hblock_idx, level, hidx), TP_STRUCT__entry( - __field(ino_t, ino) + __field(u64, ino) __field(u64, data_pos) __field(unsigned long, hblock_idx) __field(unsigned int, level) @@ -109,8 +109,8 @@ TRACE_EVENT(fsverity_merkle_hit, __entry->level = level; __entry->hidx = hidx; ), - TP_printk("ino %lu data_pos %llu hblock_idx %lu level %u hidx %u", - (unsigned long) __entry->ino, + TP_printk("ino %llu data_pos %llu 
hblock_idx %lu level %u hidx %u", + __entry->ino, __entry->data_pos, __entry->hblock_idx, __entry->level, @@ -122,7 +122,7 @@ TRACE_EVENT(fsverity_verify_merkle_block, unsigned int level, unsigned int hidx), TP_ARGS(inode, hblock_idx, level, hidx), TP_STRUCT__entry( - __field(ino_t, ino) + __field(u64, ino) __field(unsigned long, hblock_idx) __field(unsigned int, level) __field(unsigned int, hidx) @@ -133,8 +133,8 @@ TRACE_EVENT(fsverity_verify_merkle_block, __entry->level = level; __entry->hidx = hidx; ), - TP_printk("ino %lu hblock_idx %lu level %u hidx %u", - (unsigned long) __entry->ino, + TP_printk("ino %llu hblock_idx %lu level %u hidx %u", + __entry->ino, __entry->hblock_idx, __entry->level, __entry->hidx) diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 64a382fbc31a..153c60e86d11 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -298,7 +298,7 @@ TRACE_EVENT(netfs_read, __field(loff_t, start) __field(size_t, len) __field(enum netfs_read_trace, what) - __field(unsigned int, netfs_inode) + __field(u64, netfs_inode) ), TP_fast_assign( @@ -311,7 +311,7 @@ TRACE_EVENT(netfs_read, __entry->netfs_inode = rreq->inode->i_ino; ), - TP_printk("R=%08x %s c=%08x ni=%x s=%llx l=%zx sz=%llx", + TP_printk("R=%08x %s c=%08x ni=%llx s=%llx l=%zx sz=%llx", __entry->rreq, __print_symbolic(__entry->what, netfs_read_traces), __entry->cookie, @@ -484,7 +484,7 @@ TRACE_EVENT(netfs_folio, TP_ARGS(folio, why), TP_STRUCT__entry( - __field(ino_t, ino) + __field(u64, ino) __field(pgoff_t, index) __field(unsigned int, nr) __field(enum netfs_folio_trace, why) @@ -498,7 +498,7 @@ TRACE_EVENT(netfs_folio, __entry->nr = folio_nr_pages(folio); ), - TP_printk("i=%05lx ix=%05lx-%05lx %s", + TP_printk("i=%05llx ix=%05lx-%05lx %s", __entry->ino, __entry->index, __entry->index + __entry->nr - 1, __print_symbolic(__entry->why, netfs_folio_traces)) ); diff --git a/include/trace/events/readahead.h b/include/trace/events/readahead.h index 
0997ac5eceab..087f171e2b02 100644 --- a/include/trace/events/readahead.h +++ b/include/trace/events/readahead.h @@ -18,7 +18,7 @@ TRACE_EVENT(page_cache_ra_unbounded, TP_ARGS(inode, index, nr_to_read, lookahead_size), TP_STRUCT__entry( - __field(unsigned long, i_ino) + __field(u64, i_ino) __field(dev_t, s_dev) __field(pgoff_t, index) __field(unsigned long, nr_to_read) @@ -34,7 +34,7 @@ TRACE_EVENT(page_cache_ra_unbounded, ), TP_printk( - "dev=%d:%d ino=%lx index=%lu nr_to_read=%lu lookahead_size=%lu", + "dev=%d:%d ino=%llx index=%lu nr_to_read=%lu lookahead_size=%lu", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->index, __entry->nr_to_read, __entry->lookahead_size ) @@ -46,7 +46,7 @@ TRACE_EVENT(page_cache_ra_order, TP_ARGS(inode, index, ra), TP_STRUCT__entry( - __field(unsigned long, i_ino) + __field(u64, i_ino) __field(dev_t, s_dev) __field(pgoff_t, index) __field(unsigned int, order) @@ -66,7 +66,7 @@ TRACE_EVENT(page_cache_ra_order, ), TP_printk( - "dev=%d:%d ino=%lx index=%lu order=%u size=%u async_size=%u ra_pages=%u", + "dev=%d:%d ino=%llx index=%lu order=%u size=%u async_size=%u ra_pages=%u", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->index, __entry->order, __entry->size, __entry->async_size, __entry->ra_pages @@ -80,16 +80,16 @@ DECLARE_EVENT_CLASS(page_cache_ra_op, TP_ARGS(inode, index, ra, req_count), TP_STRUCT__entry( - __field(unsigned long, i_ino) - __field(dev_t, s_dev) + __field(u64, i_ino) + __field(loff_t, prev_pos) __field(pgoff_t, index) + __field(unsigned long, req_count) + __field(dev_t, s_dev) __field(unsigned int, order) __field(unsigned int, size) __field(unsigned int, async_size) __field(unsigned int, ra_pages) __field(unsigned int, mmap_miss) - __field(loff_t, prev_pos) - __field(unsigned long, req_count) ), TP_fast_assign( @@ -106,7 +106,7 @@ DECLARE_EVENT_CLASS(page_cache_ra_op, ), TP_printk( - "dev=%d:%d ino=%lx index=%lu req_count=%lu order=%u size=%u async_size=%u ra_pages=%u 
mmap_miss=%u prev_pos=%lld", + "dev=%d:%d ino=%llx index=%lu req_count=%lu order=%u size=%u async_size=%u ra_pages=%u mmap_miss=%u prev_pos=%lld", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->index, __entry->req_count, __entry->order, __entry->size, __entry->async_size, __entry->ra_pages, diff --git a/include/trace/events/timestamp.h b/include/trace/events/timestamp.h index c9e5ec930054..d6503612dddf 100644 --- a/include/trace/events/timestamp.h +++ b/include/trace/events/timestamp.h @@ -18,9 +18,9 @@ DECLARE_EVENT_CLASS(ctime, TP_ARGS(inode, ctime), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(time64_t, ctime_s) + __field(dev_t, dev) __field(u32, ctime_ns) __field(u32, gen) ), @@ -33,7 +33,7 @@ DECLARE_EVENT_CLASS(ctime, __entry->ctime_ns = ctime->tv_nsec; ), - TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u", + TP_printk("ino=%d:%d:%llu:%u ctime=%lld.%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->ctime_s, __entry->ctime_ns ) @@ -58,8 +58,8 @@ TRACE_EVENT(ctime_ns_xchg, TP_ARGS(inode, old, new, cur), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __field(u32, gen) __field(u32, old) __field(u32, new) @@ -75,7 +75,7 @@ TRACE_EVENT(ctime_ns_xchg, __entry->cur = cur; ), - TP_printk("ino=%d:%d:%ld:%u old=%u:%s new=%u cur=%u:%s", + TP_printk("ino=%d:%d:%llu:%u old=%u:%s new=%u cur=%u:%s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->old & ~I_CTIME_QUERIED, __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS), @@ -93,10 +93,10 @@ TRACE_EVENT(fill_mg_cmtime, TP_ARGS(inode, ctime, mtime), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(time64_t, ctime_s) __field(time64_t, mtime_s) + __field(dev_t, dev) __field(u32, ctime_ns) __field(u32, mtime_ns) __field(u32, gen) @@ -112,7 +112,7 @@ TRACE_EVENT(fill_mg_cmtime, __entry->mtime_ns = mtime->tv_nsec; 
), - TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u mtime=%lld.%u", + TP_printk("ino=%d:%d:%llu:%u ctime=%lld.%u mtime=%lld.%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->ctime_s, __entry->ctime_ns, __entry->mtime_s, __entry->mtime_ns diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 4d3d8c8f3a1b..e5cd2b80fd29 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -67,7 +67,7 @@ DECLARE_EVENT_CLASS(writeback_folio_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(ino_t, ino) + __field(u64, ino) __field(pgoff_t, index) ), @@ -79,9 +79,9 @@ DECLARE_EVENT_CLASS(writeback_folio_template, __entry->index = folio->index; ), - TP_printk("bdi %s: ino=%lu index=%lu", + TP_printk("bdi %s: ino=%llu index=%lu", __entry->name, - (unsigned long)__entry->ino, + __entry->ino, __entry->index ) ); @@ -108,7 +108,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(ino_t, ino) + __field(u64, ino) __field(unsigned long, state) __field(unsigned long, flags) ), @@ -123,9 +123,9 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, __entry->flags = flags; ), - TP_printk("bdi %s: ino=%lu state=%s flags=%s", + TP_printk("bdi %s: ino=%llu state=%s flags=%s", __entry->name, - (unsigned long)__entry->ino, + __entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) @@ -155,12 +155,12 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK -static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) +static inline u64 __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } -static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) +static inline u64 __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return 
__trace_wb_assign_cgroup(wbc->wb); @@ -169,12 +169,12 @@ static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) } #else /* CONFIG_CGROUP_WRITEBACK */ -static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) +static inline u64 __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } -static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) +static inline u64 __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } @@ -192,8 +192,8 @@ TRACE_EVENT(inode_foreign_history, TP_STRUCT__entry( __array(char, name, 32) - __field(ino_t, ino) - __field(ino_t, cgroup_ino) + __field(u64, ino) + __field(u64, cgroup_ino) __field(unsigned int, history) ), @@ -204,10 +204,10 @@ TRACE_EVENT(inode_foreign_history, __entry->history = history; ), - TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", + TP_printk("bdi %s: ino=%llu cgroup_ino=%llu history=0x%x", __entry->name, - (unsigned long)__entry->ino, - (unsigned long)__entry->cgroup_ino, + __entry->ino, + __entry->cgroup_ino, __entry->history ) ); @@ -221,8 +221,8 @@ TRACE_EVENT(inode_switch_wbs_queue, TP_STRUCT__entry( __array(char, name, 32) - __field(ino_t, old_cgroup_ino) - __field(ino_t, new_cgroup_ino) + __field(u64, old_cgroup_ino) + __field(u64, new_cgroup_ino) __field(unsigned int, count) ), @@ -233,10 +233,10 @@ TRACE_EVENT(inode_switch_wbs_queue, __entry->count = count; ), - TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u", + TP_printk("bdi %s: old_cgroup_ino=%llu new_cgroup_ino=%llu count=%u", __entry->name, - (unsigned long)__entry->old_cgroup_ino, - (unsigned long)__entry->new_cgroup_ino, + __entry->old_cgroup_ino, + __entry->new_cgroup_ino, __entry->count ) ); @@ -250,9 +250,9 @@ TRACE_EVENT(inode_switch_wbs, TP_STRUCT__entry( __array(char, name, 32) - __field(ino_t, ino) - __field(ino_t, old_cgroup_ino) - __field(ino_t, new_cgroup_ino) + __field(u64, ino) + __field(u64, old_cgroup_ino) + __field(u64, new_cgroup_ino) 
), TP_fast_assign( @@ -262,11 +262,11 @@ TRACE_EVENT(inode_switch_wbs, __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), - TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", + TP_printk("bdi %s: ino=%llu old_cgroup_ino=%llu new_cgroup_ino=%llu", __entry->name, - (unsigned long)__entry->ino, - (unsigned long)__entry->old_cgroup_ino, - (unsigned long)__entry->new_cgroup_ino + __entry->ino, + __entry->old_cgroup_ino, + __entry->new_cgroup_ino ) ); @@ -279,10 +279,10 @@ TRACE_EVENT(track_foreign_dirty, TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) - __field(ino_t, ino) + __field(u64, ino) + __field(u64, cgroup_ino) + __field(u64, page_cgroup_ino) __field(unsigned int, memcg_id) - __field(ino_t, cgroup_ino) - __field(ino_t, page_cgroup_ino) ), TP_fast_assign( @@ -297,13 +297,13 @@ TRACE_EVENT(track_foreign_dirty, __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); ), - TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", + TP_printk("bdi %s[%llu]: ino=%llu memcg_id=%u cgroup_ino=%llu page_cgroup_ino=%llu", __entry->name, __entry->bdi_id, - (unsigned long)__entry->ino, + __entry->ino, __entry->memcg_id, - (unsigned long)__entry->cgroup_ino, - (unsigned long)__entry->page_cgroup_ino + __entry->cgroup_ino, + __entry->page_cgroup_ino ) ); @@ -316,7 +316,7 @@ TRACE_EVENT(flush_foreign, TP_STRUCT__entry( __array(char, name, 32) - __field(ino_t, cgroup_ino) + __field(u64, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), @@ -328,9 +328,9 @@ TRACE_EVENT(flush_foreign, __entry->frn_memcg_id = frn_memcg_id; ), - TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", + TP_printk("bdi %s: cgroup_ino=%llu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, - (unsigned long)__entry->cgroup_ino, + __entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) @@ -345,9 +345,9 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_STRUCT__entry ( 
__array(char, name, 32) - __field(ino_t, ino) + __field(u64, ino) + __field(u64, cgroup_ino) __field(int, sync_mode) - __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -358,11 +358,11 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), - TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", + TP_printk("bdi %s: ino=%llu sync_mode=%d cgroup_ino=%llu", __entry->name, - (unsigned long)__entry->ino, + __entry->ino, __entry->sync_mode, - (unsigned long)__entry->cgroup_ino + __entry->cgroup_ino ) ); @@ -385,6 +385,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) + __field(u64, cgroup_ino) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) @@ -392,7 +393,6 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, range_cyclic) __field(int, for_background) __field(int, reason) - __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); @@ -406,7 +406,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " - "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", + "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%llu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, @@ -415,7 +415,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), - (unsigned long)__entry->cgroup_ino + __entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -445,15 +445,15 @@ DECLARE_EVENT_CLASS(writeback_class, TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) - __field(ino_t, cgroup_ino) + __field(u64, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), - 
TP_printk("bdi %s: cgroup_ino=%lu", + TP_printk("bdi %s: cgroup_ino=%llu", __entry->name, - (unsigned long)__entry->cgroup_ino + __entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ @@ -482,15 +482,15 @@ DECLARE_EVENT_CLASS(wbc_class, TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) + __field(u64, cgroup_ino) __field(long, nr_to_write) __field(long, pages_skipped) + __field(long, range_start) + __field(long, range_end) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, range_cyclic) - __field(long, range_start) - __field(long, range_end) - __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -507,7 +507,7 @@ DECLARE_EVENT_CLASS(wbc_class, ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d bgrd=%d " - "cyclic=%d start=0x%lx end=0x%lx cgroup_ino=%lu", + "cyclic=%d start=0x%lx end=0x%lx cgroup_ino=%llu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, @@ -517,7 +517,7 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->range_cyclic, __entry->range_start, __entry->range_end, - (unsigned long)__entry->cgroup_ino + __entry->cgroup_ino ) ) @@ -535,11 +535,11 @@ TRACE_EVENT(writeback_queue_io, TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) + __field(u64, cgroup_ino) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) - __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); @@ -549,13 +549,13 @@ TRACE_EVENT(writeback_queue_io, __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%llu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), - (unsigned long)__entry->cgroup_ino 
+ __entry->cgroup_ino ) ); @@ -614,13 +614,13 @@ TRACE_EVENT(bdi_dirty_ratelimit, TP_STRUCT__entry( __array(char, bdi, 32) + __field(u64, cgroup_ino) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) - __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -638,7 +638,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " - "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", + "balanced_dirty_ratelimit=%lu cgroup_ino=%llu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ @@ -646,7 +646,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ - (unsigned long)__entry->cgroup_ino + __entry->cgroup_ino ) ); @@ -667,6 +667,7 @@ TRACE_EVENT(balance_dirty_pages, TP_STRUCT__entry( __array( char, bdi, 32) + __field(u64, cgroup_ino) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) @@ -674,13 +675,12 @@ TRACE_EVENT(balance_dirty_pages, __field(unsigned long, wb_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) - __field(unsigned int, dirtied) - __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) - __field(ino_t, cgroup_ino) + __field(unsigned int, dirtied) + __field(unsigned int, dirtied_pause) ), TP_fast_assign( @@ -711,7 +711,7 @@ TRACE_EVENT(balance_dirty_pages, "wb_setpoint=%lu wb_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", + "paused=%lu 
pause=%ld period=%lu think=%ld cgroup_ino=%llu", __entry->bdi, __entry->limit, __entry->setpoint, @@ -726,7 +726,7 @@ TRACE_EVENT(balance_dirty_pages, __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ - (unsigned long)__entry->cgroup_ino + __entry->cgroup_ino ) ); @@ -737,10 +737,10 @@ TRACE_EVENT(writeback_sb_inodes_requeue, TP_STRUCT__entry( __array(char, name, 32) - __field(ino_t, ino) + __field(u64, ino) + __field(u64, cgroup_ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) - __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -752,13 +752,13 @@ TRACE_EVENT(writeback_sb_inodes_requeue, __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), - TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", + TP_printk("bdi %s: ino=%llu state=%s dirtied_when=%lu age=%lu cgroup_ino=%llu", __entry->name, - (unsigned long)__entry->ino, + __entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, - (unsigned long)__entry->cgroup_ino + __entry->cgroup_ino ) ); @@ -773,13 +773,13 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_STRUCT__entry( __array(char, name, 32) - __field(ino_t, ino) + __field(u64, ino) + __field(u64, cgroup_ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) - __field(long, nr_to_write) __field(unsigned long, wrote) - __field(ino_t, cgroup_ino) + __field(long, nr_to_write) ), TP_fast_assign( @@ -794,17 +794,17 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), - TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " - "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", + TP_printk("bdi %s: ino=%llu state=%s dirtied_when=%lu age=%lu " + "index=%lu to_write=%ld wrote=%lu cgroup_ino=%llu", __entry->name, - (unsigned long)__entry->ino, + __entry->ino, show_inode_state(__entry->state), 
__entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, - (unsigned long)__entry->cgroup_ino + __entry->cgroup_ino ) ); @@ -828,11 +828,11 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_ARGS(inode), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field(unsigned long, state ) - __field( __u16, mode ) __field(unsigned long, dirtied_when ) + __field( dev_t, dev ) + __field( __u16, mode ) ), TP_fast_assign( @@ -843,9 +843,9 @@ DECLARE_EVENT_CLASS(writeback_inode_template, __entry->dirtied_when = inode->dirtied_when; ), - TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", + TP_printk("dev %d,%d ino %llu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long)__entry->ino, __entry->dirtied_when, + __entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); -- cgit v1.2.3 From 7e35c9d7039504399f0dd573568d2b3a13a3e406 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:35 -0500 Subject: cachefiles: widen trace event i_ino fields to u64 Update cachefiles trace event definitions to use u64 instead of ino_t/unsigned long for inode number fields. 
Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-5-2257ad83d372@kernel.org Signed-off-by: Christian Brauner --- include/trace/events/cachefiles.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index a743b2a35ea7..6e3b1424eea4 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -249,10 +249,10 @@ TRACE_EVENT(cachefiles_lookup, TP_ARGS(obj, dir, de), TP_STRUCT__entry( + __field(u64, dino) + __field(u64, ino) __field(unsigned int, obj) __field(short, error) - __field(unsigned long, dino) - __field(unsigned long, ino) ), TP_fast_assign( @@ -263,7 +263,7 @@ TRACE_EVENT(cachefiles_lookup, __entry->error = IS_ERR(de) ? PTR_ERR(de) : 0; ), - TP_printk("o=%08x dB=%lx B=%lx e=%d", + TP_printk("o=%08x dB=%llx B=%llx e=%d", __entry->obj, __entry->dino, __entry->ino, __entry->error) ); @@ -578,8 +578,8 @@ TRACE_EVENT(cachefiles_mark_active, /* Note that obj may be NULL */ TP_STRUCT__entry( + __field(u64, inode) __field(unsigned int, obj) - __field(ino_t, inode) ), TP_fast_assign( @@ -587,7 +587,7 @@ TRACE_EVENT(cachefiles_mark_active, __entry->inode = inode->i_ino; ), - TP_printk("o=%08x B=%lx", + TP_printk("o=%08x B=%llx", __entry->obj, __entry->inode) ); @@ -599,8 +599,8 @@ TRACE_EVENT(cachefiles_mark_failed, /* Note that obj may be NULL */ TP_STRUCT__entry( + __field(u64, inode) __field(unsigned int, obj) - __field(ino_t, inode) ), TP_fast_assign( @@ -608,7 +608,7 @@ TRACE_EVENT(cachefiles_mark_failed, __entry->inode = inode->i_ino; ), - TP_printk("o=%08x B=%lx", + TP_printk("o=%08x B=%llx", __entry->obj, __entry->inode) ); @@ -620,8 +620,8 @@ TRACE_EVENT(cachefiles_mark_inactive, /* Note that obj may be NULL */ TP_STRUCT__entry( + __field(u64, inode) __field(unsigned int, obj) - __field(ino_t, inode) ), TP_fast_assign( @@ -629,7 +629,7 @@ TRACE_EVENT(cachefiles_mark_inactive, 
__entry->inode = inode->i_ino; ), - TP_printk("o=%08x B=%lx", + TP_printk("o=%08x B=%llx", __entry->obj, __entry->inode) ); -- cgit v1.2.3 From aeb11603158aea8c2f34e17c445883fce8ef4e86 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:37 -0500 Subject: hugetlbfs: widen trace event i_ino fields to u64 Update hugetlbfs trace event definitions to use u64 instead of ino_t/unsigned long for inode number fields. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-7-2257ad83d372@kernel.org Signed-off-by: Christian Brauner --- include/trace/events/hugetlbfs.h | 42 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/trace/events/hugetlbfs.h b/include/trace/events/hugetlbfs.h index 59605dfaeeb4..8ba72c1d4f4d 100644 --- a/include/trace/events/hugetlbfs.h +++ b/include/trace/events/hugetlbfs.h @@ -14,9 +14,9 @@ TRACE_EVENT(hugetlbfs_alloc_inode, TP_ARGS(inode, dir, mode), TP_STRUCT__entry( + __field(u64, ino) + __field(u64, dir) __field(dev_t, dev) - __field(ino_t, ino) - __field(ino_t, dir) __field(__u16, mode) ), @@ -27,10 +27,10 @@ TRACE_EVENT(hugetlbfs_alloc_inode, __entry->mode = mode; ), - TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", + TP_printk("dev %d,%d ino %llu dir %llu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned long) __entry->dir, __entry->mode) + __entry->ino, + __entry->dir, __entry->mode) ); DECLARE_EVENT_CLASS(hugetlbfs__inode, @@ -40,13 +40,13 @@ DECLARE_EVENT_CLASS(hugetlbfs__inode, TP_ARGS(inode), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) - __field(__u16, mode) + __field(u64, ino) __field(loff_t, size) + __field(blkcnt_t, blocks) + __field(dev_t, dev) __field(unsigned int, nlink) __field(unsigned int, seals) - __field(blkcnt_t, blocks) + __field(__u16, mode) ), TP_fast_assign( @@ -59,8 +59,8 @@ DECLARE_EVENT_CLASS(hugetlbfs__inode, __entry->blocks = 
inode->i_blocks; ), - TP_printk("dev %d,%d ino %lu mode 0%o size %lld nlink %u seals %u blocks %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %llu mode 0%o size %lld nlink %u seals %u blocks %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->mode, __entry->size, __entry->nlink, __entry->seals, (unsigned long long)__entry->blocks) ); @@ -87,14 +87,14 @@ TRACE_EVENT(hugetlbfs_setattr, TP_ARGS(inode, dentry, attr), TP_STRUCT__entry( + __field(u64, ino) + __field(loff_t, old_size) + __field(loff_t, ia_size) __field(dev_t, dev) - __field(ino_t, ino) __field(unsigned int, d_len) __string(d_name, dentry->d_name.name) __field(unsigned int, ia_valid) __field(unsigned int, ia_mode) - __field(loff_t, old_size) - __field(loff_t, ia_size) ), TP_fast_assign( @@ -108,8 +108,8 @@ TRACE_EVENT(hugetlbfs_setattr, __entry->ia_size = attr->ia_size; ), - TP_printk("dev %d,%d ino %lu name %.*s valid %#x mode 0%o old_size %lld size %lld", - MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, + TP_printk("dev %d,%d ino %llu name %.*s valid %#x mode 0%o old_size %lld size %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->d_len, __get_str(d_name), __entry->ia_valid, __entry->ia_mode, __entry->old_size, __entry->ia_size) ); @@ -122,12 +122,12 @@ TRACE_EVENT(hugetlbfs_fallocate, TP_ARGS(inode, mode, offset, len, ret), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) - __field(int, mode) + __field(u64, ino) __field(loff_t, offset) __field(loff_t, len) __field(loff_t, size) + __field(dev_t, dev) + __field(int, mode) __field(int, ret) ), @@ -141,9 +141,9 @@ TRACE_EVENT(hugetlbfs_fallocate, __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu mode 0%o offset %lld len %lld size %lld ret %d", + TP_printk("dev %d,%d ino %llu mode 0%o offset %lld len %lld size %lld ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long)__entry->ino, __entry->mode, + 
__entry->ino, __entry->mode, (unsigned long long)__entry->offset, (unsigned long long)__entry->len, (unsigned long long)__entry->size, -- cgit v1.2.3 From d352871478616ac63d0edf90e78194ec0a19b988 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:39 -0500 Subject: ext4: widen trace event i_ino fields to u64 In trace events, change __field(ino_t, ...) to __field(u64, ...) and update TP_printk format strings to %llu/%llx to match the widened field type. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-9-2257ad83d372@kernel.org Signed-off-by: Christian Brauner --- include/trace/events/ext4.h | 544 ++++++++++++++++++++++---------------------- 1 file changed, 272 insertions(+), 272 deletions(-) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index a3e8fe414df8..84ef091af2d3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -138,14 +138,14 @@ TRACE_DEFINE_ENUM(CR_ANY_FREE); { CR_ANY_FREE, "CR_ANY_FREE" }) TRACE_EVENT(ext4_other_inode_update_time, - TP_PROTO(struct inode *inode, ino_t orig_ino), + TP_PROTO(struct inode *inode, u64 orig_ino), TP_ARGS(inode, orig_ino), TP_STRUCT__entry( + __field( u64, ino ) + __field( u64, orig_ino ) __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ino_t, orig_ino ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u16, mode ) @@ -160,10 +160,10 @@ TRACE_EVENT(ext4_other_inode_update_time, __entry->mode = inode->i_mode; ), - TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u", + TP_printk("dev %d,%d orig_ino %llu ino %llu mode 0%o uid %u gid %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->orig_ino, - (unsigned long) __entry->ino, __entry->mode, + __entry->orig_ino, + __entry->ino, __entry->mode, __entry->uid, __entry->gid) ); @@ -173,11 +173,11 @@ TRACE_EVENT(ext4_free_inode, TP_ARGS(inode), TP_STRUCT__entry( + __field( u64, ino ) + __field( 
__u64, blocks ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( uid_t, uid ) __field( gid_t, gid ) - __field( __u64, blocks ) __field( __u16, mode ) ), @@ -190,9 +190,9 @@ TRACE_EVENT(ext4_free_inode, __entry->mode = inode->i_mode; ), - TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", + TP_printk("dev %d,%d ino %llu mode 0%o uid %u gid %u blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->mode, + __entry->ino, __entry->mode, __entry->uid, __entry->gid, __entry->blocks) ); @@ -202,8 +202,8 @@ TRACE_EVENT(ext4_request_inode, TP_ARGS(dir, mode), TP_STRUCT__entry( + __field( u64, dir ) __field( dev_t, dev ) - __field( ino_t, dir ) __field( __u16, mode ) ), @@ -213,9 +213,9 @@ TRACE_EVENT(ext4_request_inode, __entry->mode = mode; ), - TP_printk("dev %d,%d dir %lu mode 0%o", + TP_printk("dev %d,%d dir %llu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->dir, __entry->mode) + __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_allocate_inode, @@ -224,9 +224,9 @@ TRACE_EVENT(ext4_allocate_inode, TP_ARGS(inode, dir, mode), TP_STRUCT__entry( + __field( u64, ino ) + __field( u64, dir ) __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ino_t, dir ) __field( __u16, mode ) ), @@ -237,10 +237,10 @@ TRACE_EVENT(ext4_allocate_inode, __entry->mode = mode; ), - TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", + TP_printk("dev %d,%d ino %llu dir %llu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned long) __entry->dir, __entry->mode) + __entry->ino, + __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_evict_inode, @@ -249,8 +249,8 @@ TRACE_EVENT(ext4_evict_inode, TP_ARGS(inode), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( int, nlink ) ), @@ -260,9 +260,9 @@ TRACE_EVENT(ext4_evict_inode, __entry->nlink = inode->i_nlink; ), - TP_printk("dev %d,%d ino %lu nlink %d", + TP_printk("dev %d,%d 
ino %llu nlink %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->nlink) + __entry->ino, __entry->nlink) ); TRACE_EVENT(ext4_drop_inode, @@ -271,8 +271,8 @@ TRACE_EVENT(ext4_drop_inode, TP_ARGS(inode, drop), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( int, drop ) ), @@ -282,9 +282,9 @@ TRACE_EVENT(ext4_drop_inode, __entry->drop = drop; ), - TP_printk("dev %d,%d ino %lu drop %d", + TP_printk("dev %d,%d ino %llu drop %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->drop) + __entry->ino, __entry->drop) ); TRACE_EVENT(ext4_nfs_commit_metadata, @@ -293,8 +293,8 @@ TRACE_EVENT(ext4_nfs_commit_metadata, TP_ARGS(inode), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) ), TP_fast_assign( @@ -302,9 +302,9 @@ TRACE_EVENT(ext4_nfs_commit_metadata, __entry->ino = inode->i_ino; ), - TP_printk("dev %d,%d ino %lu", + TP_printk("dev %d,%d ino %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino) + __entry->ino) ); TRACE_EVENT(ext4_mark_inode_dirty, @@ -313,9 +313,9 @@ TRACE_EVENT(ext4_mark_inode_dirty, TP_ARGS(inode, IP), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field(unsigned long, ip ) + __field( dev_t, dev ) ), TP_fast_assign( @@ -324,9 +324,9 @@ TRACE_EVENT(ext4_mark_inode_dirty, __entry->ip = IP; ), - TP_printk("dev %d,%d ino %lu caller %pS", + TP_printk("dev %d,%d ino %llu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, (void *)__entry->ip) + __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_begin_ordered_truncate, @@ -335,9 +335,9 @@ TRACE_EVENT(ext4_begin_ordered_truncate, TP_ARGS(inode, new_size), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( loff_t, new_size ) + __field( dev_t, dev ) ), TP_fast_assign( @@ -346,9 +346,9 @@ 
TRACE_EVENT(ext4_begin_ordered_truncate, __entry->new_size = new_size; ), - TP_printk("dev %d,%d ino %lu new_size %lld", + TP_printk("dev %d,%d ino %llu new_size %lld", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->new_size) ); @@ -359,9 +359,9 @@ DECLARE_EVENT_CLASS(ext4__write_begin, TP_ARGS(inode, pos, len), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( loff_t, pos ) + __field( dev_t, dev ) __field( unsigned int, len ) ), @@ -372,9 +372,9 @@ DECLARE_EVENT_CLASS(ext4__write_begin, __entry->len = len; ), - TP_printk("dev %d,%d ino %lu pos %lld len %u", + TP_printk("dev %d,%d ino %llu pos %lld len %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->pos, __entry->len) ); @@ -399,9 +399,9 @@ DECLARE_EVENT_CLASS(ext4__write_end, TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( loff_t, pos ) + __field( dev_t, dev ) __field( unsigned int, len ) __field( unsigned int, copied ) ), @@ -414,9 +414,9 @@ DECLARE_EVENT_CLASS(ext4__write_end, __entry->copied = copied; ), - TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u", + TP_printk("dev %d,%d ino %llu pos %lld len %u copied %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->pos, __entry->len, __entry->copied) ); @@ -450,13 +450,13 @@ TRACE_EVENT(ext4_writepages, TP_ARGS(inode, wbc), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( long, nr_to_write ) - __field( long, pages_skipped ) + __field( u64, ino ) __field( loff_t, range_start ) __field( loff_t, range_end ) + __field( long, nr_to_write ) + __field( long, pages_skipped ) __field( pgoff_t, writeback_index ) + __field( dev_t, dev ) __field( int, sync_mode ) __field( char, for_kupdate ) __field( char, range_cyclic ) @@ -475,11 +475,11 @@ 
TRACE_EVENT(ext4_writepages, __entry->range_cyclic = wbc->range_cyclic; ), - TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " + TP_printk("dev %d,%d ino %llu nr_to_write %ld pages_skipped %ld " "range_start %lld range_end %lld sync_mode %d " "for_kupdate %d range_cyclic %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->nr_to_write, + __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, @@ -493,11 +493,11 @@ TRACE_EVENT(ext4_da_write_folios_start, TP_ARGS(inode, start_pos, next_pos, wbc), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( loff_t, start_pos ) __field( loff_t, next_pos ) __field( long, nr_to_write ) + __field( dev_t, dev ) __field( int, sync_mode ) ), @@ -510,9 +510,9 @@ TRACE_EVENT(ext4_da_write_folios_start, __entry->sync_mode = wbc->sync_mode; ), - TP_printk("dev %d,%d ino %lu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld sync_mode %d", + TP_printk("dev %d,%d ino %llu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld sync_mode %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->start_pos, __entry->next_pos, + __entry->ino, __entry->start_pos, __entry->next_pos, __entry->nr_to_write, __entry->sync_mode) ); @@ -523,11 +523,11 @@ TRACE_EVENT(ext4_da_write_folios_end, TP_ARGS(inode, start_pos, next_pos, wbc, ret), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( loff_t, start_pos ) __field( loff_t, next_pos ) __field( long, nr_to_write ) + __field( dev_t, dev ) __field( int, ret ) ), @@ -540,9 +540,9 @@ TRACE_EVENT(ext4_da_write_folios_end, __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld ret %d", + TP_printk("dev %d,%d ino %llu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld ret %d", 
MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->start_pos, __entry->next_pos, + __entry->ino, __entry->start_pos, __entry->next_pos, __entry->nr_to_write, __entry->ret) ); @@ -552,9 +552,9 @@ TRACE_EVENT(ext4_da_write_pages_extent, TP_ARGS(inode, map), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( __u64, lblk ) + __field( dev_t, dev ) __field( __u32, len ) __field( __u32, flags ) ), @@ -567,9 +567,9 @@ TRACE_EVENT(ext4_da_write_pages_extent, __entry->flags = map->m_flags; ), - TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s", + TP_printk("dev %d,%d ino %llu lblk %llu len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->lblk, __entry->len, + __entry->ino, __entry->lblk, __entry->len, show_mflags(__entry->flags)) ); @@ -580,12 +580,12 @@ TRACE_EVENT(ext4_writepages_result, TP_ARGS(inode, wbc, ret, pages_written), TP_STRUCT__entry( + __field( u64, ino ) + __field( long, pages_skipped ) + __field( pgoff_t, writeback_index ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( int, ret ) __field( int, pages_written ) - __field( long, pages_skipped ) - __field( pgoff_t, writeback_index ) __field( int, sync_mode ) ), @@ -599,10 +599,10 @@ TRACE_EVENT(ext4_writepages_result, __entry->sync_mode = wbc->sync_mode; ), - TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " + TP_printk("dev %d,%d ino %llu ret %d pages_written %d pages_skipped %ld " "sync_mode %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->ret, + __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->sync_mode, (unsigned long) __entry->writeback_index) @@ -614,9 +614,9 @@ DECLARE_EVENT_CLASS(ext4__folio_op, TP_ARGS(inode, folio), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( pgoff_t, index ) + __field( dev_t, dev 
) ), @@ -626,9 +626,9 @@ DECLARE_EVENT_CLASS(ext4__folio_op, __entry->index = folio->index; ), - TP_printk("dev %d,%d ino %lu folio_index %lu", + TP_printk("dev %d,%d ino %llu folio_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, (unsigned long) __entry->index) ); @@ -652,11 +652,11 @@ DECLARE_EVENT_CLASS(ext4_invalidate_folio_op, TP_ARGS(folio, offset, length), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( pgoff_t, index ) __field( size_t, offset ) __field( size_t, length ) + __field( dev_t, dev ) ), TP_fast_assign( @@ -667,9 +667,9 @@ DECLARE_EVENT_CLASS(ext4_invalidate_folio_op, __entry->length = length; ), - TP_printk("dev %d,%d ino %lu folio_index %lu offset %zu length %zu", + TP_printk("dev %d,%d ino %llu folio_index %lu offset %zu length %zu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, (unsigned long) __entry->index, __entry->offset, __entry->length) ); @@ -717,10 +717,10 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, TP_ARGS(ac, pa), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( __u64, pa_pstart ) __field( __u64, pa_lstart ) + __field( dev_t, dev ) __field( __u32, pa_len ) ), @@ -733,9 +733,9 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, __entry->pa_len = pa->pa_len; ), - TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", + TP_printk("dev %d,%d ino %llu pstart %llu len %u lstart %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); @@ -762,9 +762,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa, TP_ARGS(pa, block, count), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( __u64, block ) + __field( dev_t, dev ) __field( __u32, count ) ), @@ -776,9 +776,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa, __entry->count = count; ), 
- TP_printk("dev %d,%d ino %lu block %llu count %u", + TP_printk("dev %d,%d ino %llu block %llu count %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->block, __entry->count) ); @@ -811,8 +811,8 @@ TRACE_EVENT(ext4_discard_preallocations, TP_ARGS(inode, len), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( unsigned int, len ) ), @@ -823,9 +823,9 @@ TRACE_EVENT(ext4_discard_preallocations, __entry->len = len; ), - TP_printk("dev %d,%d ino %lu len: %u", + TP_printk("dev %d,%d ino %llu len: %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->len) + __entry->ino, __entry->len) ); TRACE_EVENT(ext4_mb_discard_preallocations, @@ -855,15 +855,15 @@ TRACE_EVENT(ext4_request_blocks, TP_ARGS(ar), TP_STRUCT__entry( + __field( u64, ino ) + __field( __u64, goal ) + __field( __u64, pleft ) + __field( __u64, pright ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) - __field( __u64, goal ) - __field( __u64, pleft ) - __field( __u64, pright ) __field( unsigned int, flags ) ), @@ -880,10 +880,10 @@ TRACE_EVENT(ext4_request_blocks, __entry->flags = ar->flags; ), - TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu " + TP_printk("dev %d,%d ino %llu flags %s len %u lblk %u goal %llu " "lleft %u lright %u pleft %llu pright %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), + __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) @@ -895,16 +895,16 @@ TRACE_EVENT(ext4_allocate_blocks, TP_ARGS(ar, block), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( __u64, block ) + __field( __u64, goal ) + __field( __u64, pleft ) + 
__field( __u64, pright ) + __field( dev_t, dev ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) - __field( __u64, goal ) - __field( __u64, pleft ) - __field( __u64, pright ) __field( unsigned int, flags ) ), @@ -922,10 +922,10 @@ TRACE_EVENT(ext4_allocate_blocks, __entry->flags = ar->flags; ), - TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u " + TP_printk("dev %d,%d ino %llu flags %s len %u block %llu lblk %u " "goal %llu lleft %u lright %u pleft %llu pright %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), + __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->block, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) @@ -938,10 +938,10 @@ TRACE_EVENT(ext4_free_blocks, TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( __u64, block ) __field( unsigned long, count ) + __field( dev_t, dev ) __field( int, flags ) __field( __u16, mode ) ), @@ -955,9 +955,9 @@ TRACE_EVENT(ext4_free_blocks, __entry->mode = inode->i_mode; ), - TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s", + TP_printk("dev %d,%d ino %llu mode 0%o block %llu count %lu flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->mode, __entry->block, __entry->count, show_free_flags(__entry->flags)) ); @@ -968,9 +968,9 @@ TRACE_EVENT(ext4_sync_file_enter, TP_ARGS(file, datasync), TP_STRUCT__entry( + __field( u64, ino ) + __field( u64, parent ) __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ino_t, parent ) __field( int, datasync ) ), @@ -983,10 +983,10 @@ TRACE_EVENT(ext4_sync_file_enter, __entry->parent = d_inode(dentry->d_parent)->i_ino; ), - TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", + TP_printk("dev %d,%d ino %llu parent %llu 
datasync %d ", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned long) __entry->parent, __entry->datasync) + __entry->ino, + __entry->parent, __entry->datasync) ); TRACE_EVENT(ext4_sync_file_exit, @@ -995,8 +995,8 @@ TRACE_EVENT(ext4_sync_file_exit, TP_ARGS(inode, ret), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( int, ret ) ), @@ -1006,9 +1006,9 @@ TRACE_EVENT(ext4_sync_file_exit, __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu ret %d", + TP_printk("dev %d,%d ino %llu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->ret) ); @@ -1039,8 +1039,8 @@ TRACE_EVENT(ext4_alloc_da_blocks, TP_ARGS(inode), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( unsigned int, data_blocks ) ), @@ -1050,9 +1050,9 @@ TRACE_EVENT(ext4_alloc_da_blocks, __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; ), - TP_printk("dev %d,%d ino %lu reserved_data_blocks %u", + TP_printk("dev %d,%d ino %llu reserved_data_blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->data_blocks) ); @@ -1062,8 +1062,8 @@ TRACE_EVENT(ext4_mballoc_alloc, TP_ARGS(ac), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) @@ -1107,11 +1107,11 @@ TRACE_EVENT(ext4_mballoc_alloc, __entry->cr = ac->ac_criteria; ), - TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " + TP_printk("dev %d,%d inode %llu orig %u/%d/%u@%u goal %u/%d/%u@%u " "result %u/%d/%u@%u blks %u grps %u cr %s flags %s " "tail %u broken %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->goal_group, __entry->goal_start, @@ 
-1129,8 +1129,8 @@ TRACE_EVENT(ext4_mballoc_prealloc, TP_ARGS(ac), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) @@ -1154,9 +1154,9 @@ TRACE_EVENT(ext4_mballoc_prealloc, __entry->result_len = ac->ac_b_ex.fe_len; ), - TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", + TP_printk("dev %d,%d inode %llu orig %u/%d/%u@%u result %u/%d/%u@%u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->result_group, __entry->result_start, @@ -1173,8 +1173,8 @@ DECLARE_EVENT_CLASS(ext4__mballoc, TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) @@ -1188,9 +1188,9 @@ DECLARE_EVENT_CLASS(ext4__mballoc, __entry->result_len = len; ), - TP_printk("dev %d,%d inode %lu extent %u/%d/%d ", + TP_printk("dev %d,%d inode %llu extent %u/%d/%d ", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->result_group, __entry->result_start, __entry->result_len) ); @@ -1223,9 +1223,9 @@ TRACE_EVENT(ext4_forget, TP_ARGS(inode, is_metadata, block), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( __u64, block ) + __field( dev_t, dev ) __field( int, is_metadata ) __field( __u16, mode ) ), @@ -1238,9 +1238,9 @@ TRACE_EVENT(ext4_forget, __entry->mode = inode->i_mode; ), - TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", + TP_printk("dev %d,%d ino %llu mode 0%o is_metadata %d block %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->mode, __entry->is_metadata, __entry->block) ); @@ -1250,9 +1250,9 @@ 
TRACE_EVENT(ext4_da_update_reserve_space, TP_ARGS(inode, used_blocks, quota_claim), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( __u64, i_blocks ) + __field( dev_t, dev ) __field( int, used_blocks ) __field( int, reserved_data_blocks ) __field( int, quota_claim ) @@ -1270,10 +1270,10 @@ TRACE_EVENT(ext4_da_update_reserve_space, __entry->mode = inode->i_mode; ), - TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " + TP_printk("dev %d,%d ino %llu mode 0%o i_blocks %llu used_blocks %d " "reserved_data_blocks %d quota_claim %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->mode, __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, __entry->quota_claim) @@ -1285,9 +1285,9 @@ TRACE_EVENT(ext4_da_reserve_space, TP_ARGS(inode, nr_resv), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( __u64, i_blocks ) + __field( dev_t, dev ) __field( int, reserve_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) @@ -1302,10 +1302,10 @@ TRACE_EVENT(ext4_da_reserve_space, __entry->mode = inode->i_mode; ), - TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu reserve_blocks %d" + TP_printk("dev %d,%d ino %llu mode 0%o i_blocks %llu reserve_blocks %d" "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->mode, __entry->i_blocks, __entry->reserve_blocks, __entry->reserved_data_blocks) ); @@ -1316,9 +1316,9 @@ TRACE_EVENT(ext4_da_release_space, TP_ARGS(inode, freed_blocks), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( __u64, i_blocks ) + __field( dev_t, dev ) __field( int, freed_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) @@ -1333,10 +1333,10 @@ TRACE_EVENT(ext4_da_release_space, __entry->mode = inode->i_mode; ), - TP_printk("dev %d,%d ino 
%lu mode 0%o i_blocks %llu freed_blocks %d " + TP_printk("dev %d,%d ino %llu mode 0%o i_blocks %llu freed_blocks %d " "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->mode, __entry->i_blocks, __entry->freed_blocks, __entry->reserved_data_blocks) ); @@ -1412,10 +1412,10 @@ DECLARE_EVENT_CLASS(ext4__fallocate_mode, TP_ARGS(inode, offset, len, mode), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( loff_t, offset ) __field( loff_t, len ) + __field( dev_t, dev ) __field( int, mode ) ), @@ -1427,9 +1427,9 @@ DECLARE_EVENT_CLASS(ext4__fallocate_mode, __entry->mode = mode; ), - TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s", + TP_printk("dev %d,%d ino %llu offset %lld len %lld mode %s", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->offset, __entry->len, show_falloc_mode(__entry->mode)) ); @@ -1462,9 +1462,9 @@ TRACE_EVENT(ext4_fallocate_exit, TP_ARGS(inode, offset, max_blocks, ret), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( loff_t, pos ) + __field( dev_t, dev ) __field( unsigned int, blocks ) __field( int, ret ) ), @@ -1477,9 +1477,9 @@ TRACE_EVENT(ext4_fallocate_exit, __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d", + TP_printk("dev %d,%d ino %llu pos %lld blocks %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->pos, __entry->blocks, __entry->ret) ); @@ -1490,10 +1490,10 @@ TRACE_EVENT(ext4_unlink_enter, TP_ARGS(parent, dentry), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ino_t, parent ) + __field( u64, ino ) + __field( u64, parent ) __field( loff_t, size ) + __field( dev_t, dev ) ), TP_fast_assign( @@ -1503,10 +1503,10 @@ TRACE_EVENT(ext4_unlink_enter, __entry->size = d_inode(dentry)->i_size; ), - 
TP_printk("dev %d,%d ino %lu size %lld parent %lu", + TP_printk("dev %d,%d ino %llu size %lld parent %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->size, - (unsigned long) __entry->parent) + __entry->ino, __entry->size, + __entry->parent) ); TRACE_EVENT(ext4_unlink_exit, @@ -1515,8 +1515,8 @@ TRACE_EVENT(ext4_unlink_exit, TP_ARGS(dentry, ret), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( int, ret ) ), @@ -1526,9 +1526,9 @@ TRACE_EVENT(ext4_unlink_exit, __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu ret %d", + TP_printk("dev %d,%d ino %llu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->ret) ); @@ -1538,9 +1538,9 @@ DECLARE_EVENT_CLASS(ext4__truncate, TP_ARGS(inode), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( __u64, blocks ) + __field( dev_t, dev ) ), TP_fast_assign( @@ -1549,9 +1549,9 @@ DECLARE_EVENT_CLASS(ext4__truncate, __entry->blocks = inode->i_blocks; ), - TP_printk("dev %d,%d ino %lu blocks %llu", + TP_printk("dev %d,%d ino %llu blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->blocks) + __entry->ino, __entry->blocks) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, @@ -1576,13 +1576,13 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, TP_ARGS(inode, map, ux), TP_STRUCT__entry( + __field( u64, ino ) + __field( ext4_fsblk_t, u_pblk ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) - __field( ext4_fsblk_t, u_pblk ) ), TP_fast_assign( @@ -1595,10 +1595,10 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, __entry->u_pblk = ext4_ext_pblock(ux); ), - TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u " + TP_printk("dev %d,%d ino %llu m_lblk %u m_len %u u_lblk %u u_len 
%u " "u_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk) ); @@ -1614,16 +1614,16 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, TP_ARGS(inode, map, ux, ix), TP_STRUCT__entry( + __field( u64, ino ) + __field( ext4_fsblk_t, u_pblk ) + __field( ext4_fsblk_t, i_pblk ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) - __field( ext4_fsblk_t, u_pblk ) __field( ext4_lblk_t, i_lblk ) __field( unsigned, i_len ) - __field( ext4_fsblk_t, i_pblk ) ), TP_fast_assign( @@ -1639,11 +1639,11 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, __entry->i_pblk = ext4_ext_pblock(ix); ), - TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u " + TP_printk("dev %d,%d ino %llu m_lblk %u m_len %u " "u_lblk %u u_len %u u_pblk %llu " "i_lblk %u i_len %u i_pblk %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk, __entry->i_lblk, __entry->i_len, __entry->i_pblk) @@ -1656,8 +1656,8 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_ARGS(inode, lblk, len, flags), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, flags ) @@ -1671,9 +1671,9 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_enter, __entry->flags = flags; ), - TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s", + TP_printk("dev %d,%d ino %llu lblk %u len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->lblk, __entry->len, show_map_flags(__entry->flags)) ); @@ -1698,10 +1698,10 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit, TP_ARGS(inode, flags, map, ret), TP_STRUCT__entry( + 
__field( u64, ino ) + __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( unsigned int, flags ) - __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, mflags ) @@ -1719,10 +1719,10 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit, __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u " + TP_printk("dev %d,%d ino %llu flags %s lblk %u pblk %llu len %u " "mflags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, show_map_flags(__entry->flags), __entry->lblk, __entry->pblk, __entry->len, show_mflags(__entry->mflags), __entry->ret) ); @@ -1747,9 +1747,9 @@ TRACE_EVENT(ext4_ext_load_extent, TP_ARGS(inode, lblk, pblk), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( ext4_fsblk_t, pblk ) + __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) ), @@ -1760,9 +1760,9 @@ TRACE_EVENT(ext4_ext_load_extent, __entry->lblk = lblk; ), - TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", + TP_printk("dev %d,%d ino %llu lblk %u pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->lblk, __entry->pblk) ); @@ -1772,8 +1772,8 @@ TRACE_EVENT(ext4_load_inode, TP_ARGS(sb, ino), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) ), TP_fast_assign( @@ -1781,9 +1781,9 @@ TRACE_EVENT(ext4_load_inode, __entry->ino = ino; ), - TP_printk("dev %d,%d ino %ld", + TP_printk("dev %d,%d ino %lld", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino) + __entry->ino) ); TRACE_EVENT(ext4_journal_start_sb, @@ -1823,7 +1823,7 @@ TRACE_EVENT(ext4_journal_start_inode, TP_ARGS(inode, blocks, rsv_blocks, revoke_creds, type, IP), TP_STRUCT__entry( - __field( unsigned long, ino ) + __field( u64, ino ) __field( dev_t, dev ) __field( unsigned long, ip ) __field( int, blocks ) @@ -1843,10 
+1843,10 @@ TRACE_EVENT(ext4_journal_start_inode, ), TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d," - " type %d, ino %lu, caller %pS", MAJOR(__entry->dev), + " type %d, ino %llu, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, - __entry->revoke_creds, __entry->type, __entry->ino, - (void *)__entry->ip) + __entry->revoke_creds, __entry->type, + __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_reserved, @@ -1927,14 +1927,14 @@ TRACE_EVENT(ext4_ext_handle_unwritten_extents, TP_ARGS(inode, map, flags, allocated, newblock), TP_STRUCT__entry( + __field( u64, ino ) + __field( ext4_fsblk_t, pblk ) + __field( ext4_fsblk_t, newblk ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( int, flags ) __field( ext4_lblk_t, lblk ) - __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) __field( unsigned int, allocated ) - __field( ext4_fsblk_t, newblk ) ), TP_fast_assign( @@ -1948,10 +1948,10 @@ TRACE_EVENT(ext4_ext_handle_unwritten_extents, __entry->newblk = newblock; ), - TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s " + TP_printk("dev %d,%d ino %llu m_lblk %u m_pblk %llu m_len %u flags %s " "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_map_flags(__entry->flags), (unsigned int) __entry->allocated, @@ -1994,9 +1994,9 @@ TRACE_EVENT(ext4_ext_show_extent, TP_ARGS(inode, lblk, pblk, len), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( ext4_fsblk_t, pblk ) + __field( dev_t, dev ) __field( ext4_lblk_t, lblk ) __field( unsigned short, len ) ), @@ -2009,9 +2009,9 @@ TRACE_EVENT(ext4_ext_show_extent, __entry->len = len; ), - TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u", + TP_printk("dev %d,%d ino %llu lblk %u pblk %llu len %u", MAJOR(__entry->dev), 
MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, (unsigned short) __entry->len) @@ -2025,14 +2025,14 @@ TRACE_EVENT(ext4_remove_blocks, TP_ARGS(inode, ex, from, to, pc), TP_STRUCT__entry( + __field( u64, ino ) + __field( ext4_fsblk_t, ee_pblk ) + __field( ext4_fsblk_t, pc_pclu ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) - __field( ext4_fsblk_t, ee_pblk ) __field( ext4_lblk_t, ee_lblk ) __field( unsigned short, ee_len ) - __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), @@ -2050,10 +2050,10 @@ TRACE_EVENT(ext4_remove_blocks, __entry->pc_state = pc->state; ), - TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" + TP_printk("dev %d,%d ino %llu extent [%u(%llu), %u]" "from %u to %u partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, @@ -2072,13 +2072,13 @@ TRACE_EVENT(ext4_ext_rm_leaf, TP_ARGS(inode, start, ex, pc), TP_STRUCT__entry( + __field( u64, ino ) + __field( ext4_fsblk_t, ee_pblk ) + __field( ext4_fsblk_t, pc_pclu ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) - __field( ext4_fsblk_t, ee_pblk ) __field( short, ee_len ) - __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), @@ -2095,10 +2095,10 @@ TRACE_EVENT(ext4_ext_rm_leaf, __entry->pc_state = pc->state; ), - TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" + TP_printk("dev %d,%d ino %llu start_lblk %u last_extent [%u(%llu), %u]" "partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, (unsigned) __entry->start, (unsigned) __entry->ee_lblk, (unsigned long 
long) __entry->ee_pblk, @@ -2114,9 +2114,9 @@ TRACE_EVENT(ext4_ext_rm_idx, TP_ARGS(inode, pblk), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( ext4_fsblk_t, pblk ) + __field( dev_t, dev ) ), TP_fast_assign( @@ -2125,9 +2125,9 @@ TRACE_EVENT(ext4_ext_rm_idx, __entry->pblk = pblk; ), - TP_printk("dev %d,%d ino %lu index_pblk %llu", + TP_printk("dev %d,%d ino %llu index_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, (unsigned long long) __entry->pblk) ); @@ -2138,8 +2138,8 @@ TRACE_EVENT(ext4_ext_remove_space, TP_ARGS(inode, start, end, depth), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) @@ -2153,9 +2153,9 @@ TRACE_EVENT(ext4_ext_remove_space, __entry->depth = depth; ), - TP_printk("dev %d,%d ino %lu since %u end %u depth %d", + TP_printk("dev %d,%d ino %llu since %u end %u depth %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth) @@ -2168,12 +2168,12 @@ TRACE_EVENT(ext4_ext_remove_space_done, TP_ARGS(inode, start, end, depth, pc, eh_entries), TP_STRUCT__entry( + __field( u64, ino ) + __field( ext4_fsblk_t, pc_pclu ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) - __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state ) __field( unsigned short, eh_entries ) @@ -2191,11 +2191,11 @@ TRACE_EVENT(ext4_ext_remove_space_done, __entry->eh_entries = le16_to_cpu(eh_entries); ), - TP_printk("dev %d,%d ino %lu since %u end %u depth %d " + TP_printk("dev %d,%d ino %llu since %u end %u depth %d " "partial [pclu %lld lblk %u state %d] " "remaining_entries %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + 
__entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth, @@ -2211,13 +2211,13 @@ DECLARE_EVENT_CLASS(ext4__es_extent, TP_ARGS(inode, es), TP_STRUCT__entry( + __field( u64, ino ) + __field( u64, seq ) + __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) - __field( ext4_fsblk_t, pblk ) __field( char, status ) - __field( u64, seq ) ), TP_fast_assign( @@ -2230,9 +2230,9 @@ DECLARE_EVENT_CLASS(ext4__es_extent, __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %llu", + TP_printk("dev %d,%d ino %llu es [%u/%u) mapped %llu status %s seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), __entry->seq) @@ -2256,11 +2256,11 @@ TRACE_EVENT(ext4_es_remove_extent, TP_ARGS(inode, lblk, len), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) + __field( u64, ino ) __field( loff_t, lblk ) __field( loff_t, len ) __field( u64, seq ) + __field( dev_t, dev ) ), TP_fast_assign( @@ -2271,9 +2271,9 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %llu", + TP_printk("dev %d,%d ino %llu es [%lld/%lld) seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->lblk, __entry->len, __entry->seq) ); @@ -2283,8 +2283,8 @@ TRACE_EVENT(ext4_es_find_extent_range_enter, TP_ARGS(inode, lblk), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) ), @@ -2294,9 +2294,9 @@ TRACE_EVENT(ext4_es_find_extent_range_enter, __entry->lblk = lblk; ), - TP_printk("dev %d,%d ino %lu lblk %u", + TP_printk("dev %d,%d ino %llu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, 
__entry->lblk) + __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_find_extent_range_exit, @@ -2305,11 +2305,11 @@ TRACE_EVENT(ext4_es_find_extent_range_exit, TP_ARGS(inode, es), TP_STRUCT__entry( + __field( u64, ino ) + __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) - __field( ext4_fsblk_t, pblk ) __field( char, status ) ), @@ -2322,9 +2322,9 @@ TRACE_EVENT(ext4_es_find_extent_range_exit, __entry->status = ext4_es_status(es); ), - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", + TP_printk("dev %d,%d ino %llu es [%u/%u) mapped %llu status %s", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status)) ); @@ -2335,8 +2335,8 @@ TRACE_EVENT(ext4_es_lookup_extent_enter, TP_ARGS(inode, lblk), TP_STRUCT__entry( + __field( u64, ino ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) ), @@ -2346,9 +2346,9 @@ TRACE_EVENT(ext4_es_lookup_extent_enter, __entry->lblk = lblk; ), - TP_printk("dev %d,%d ino %lu lblk %u", + TP_printk("dev %d,%d ino %llu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->lblk) + __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_lookup_extent_exit, @@ -2358,13 +2358,13 @@ TRACE_EVENT(ext4_es_lookup_extent_exit, TP_ARGS(inode, es, found), TP_STRUCT__entry( + __field( u64, ino ) + __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) - __field( ext4_fsblk_t, pblk ) - __field( char, status ) __field( int, found ) + __field( char, status ) ), TP_fast_assign( @@ -2377,9 +2377,9 @@ TRACE_EVENT(ext4_es_lookup_extent_exit, __entry->found = found; ), - TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s", + TP_printk("dev %d,%d ino %llu found %d [%u/%u) %llu %s", MAJOR(__entry->dev), 
MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->found, + __entry->ino, __entry->found, __entry->lblk, __entry->len, __entry->found ? __entry->pblk : 0, show_extent_status(__entry->found ? __entry->status : 0)) @@ -2447,10 +2447,10 @@ TRACE_EVENT(ext4_collapse_range, TP_ARGS(inode, offset, len), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, offset) __field(loff_t, len) + __field(dev_t, dev) ), TP_fast_assign( @@ -2460,9 +2460,9 @@ TRACE_EVENT(ext4_collapse_range, __entry->len = len; ), - TP_printk("dev %d,%d ino %lu offset %lld len %lld", + TP_printk("dev %d,%d ino %llu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->offset, __entry->len) ); @@ -2472,10 +2472,10 @@ TRACE_EVENT(ext4_insert_range, TP_ARGS(inode, offset, len), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, offset) __field(loff_t, len) + __field(dev_t, dev) ), TP_fast_assign( @@ -2485,9 +2485,9 @@ TRACE_EVENT(ext4_insert_range, __entry->len = len; ), - TP_printk("dev %d,%d ino %lu offset %lld len %lld", + TP_printk("dev %d,%d ino %llu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->offset, __entry->len) ); @@ -2526,15 +2526,15 @@ TRACE_EVENT(ext4_es_insert_delayed_extent, TP_ARGS(inode, es, lclu_allocated, end_allocated), TP_STRUCT__entry( + __field( u64, ino ) + __field( u64, seq ) + __field( ext4_fsblk_t, pblk ) __field( dev_t, dev ) - __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) - __field( ext4_fsblk_t, pblk ) __field( char, status ) __field( bool, lclu_allocated ) __field( bool, end_allocated ) - __field( u64, seq ) ), TP_fast_assign( @@ -2549,9 +2549,9 @@ TRACE_EVENT(ext4_es_insert_delayed_extent, __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s 
allocated %d %d seq %llu", + TP_printk("dev %d,%d ino %llu es [%u/%u) mapped %llu status %s allocated %d %d seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, + __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), __entry->lclu_allocated, __entry->end_allocated, @@ -2875,9 +2875,9 @@ DECLARE_EVENT_CLASS(ext4_fc_track_dentry, TP_ARGS(handle, inode, dentry, ret), TP_STRUCT__entry( + __field(u64, i_ino) __field(dev_t, dev) __field(tid_t, t_tid) - __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(int, error) ), @@ -2892,7 +2892,7 @@ DECLARE_EVENT_CLASS(ext4_fc_track_dentry, __entry->error = ret; ), - TP_printk("dev %d,%d, t_tid %u, ino %lu, i_sync_tid %u, error %d", + TP_printk("dev %d,%d, t_tid %u, ino %llu, i_sync_tid %u, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error @@ -2916,9 +2916,9 @@ TRACE_EVENT(ext4_fc_track_inode, TP_ARGS(handle, inode, ret), TP_STRUCT__entry( + __field(u64, i_ino) __field(dev_t, dev) __field(tid_t, t_tid) - __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(int, error) ), @@ -2933,7 +2933,7 @@ TRACE_EVENT(ext4_fc_track_inode, __entry->error = ret; ), - TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d", + TP_printk("dev %d:%d, t_tid %u, inode %llu, i_sync_tid %u, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error) @@ -2946,12 +2946,12 @@ TRACE_EVENT(ext4_fc_track_range, TP_ARGS(handle, inode, start, end, ret), TP_STRUCT__entry( + __field(u64, i_ino) + __field(long, start) + __field(long, end) __field(dev_t, dev) __field(tid_t, t_tid) - __field(ino_t, i_ino) __field(tid_t, i_sync_tid) - __field(long, start) - __field(long, end) __field(int, error) ), @@ -2967,7 +2967,7 @@ TRACE_EVENT(ext4_fc_track_range, __entry->error = ret; ), - TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d, 
start %ld, end %ld", + TP_printk("dev %d:%d, t_tid %u, inode %llu, i_sync_tid %u, error %d, start %ld, end %ld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error, __entry->start, __entry->end) @@ -3029,11 +3029,11 @@ TRACE_EVENT(ext4_move_extent_enter, TP_ARGS(orig_inode, orig_map, donor_inode, donor_lblk), TP_STRUCT__entry( + __field(u64, orig_ino) + __field(u64, donor_ino) __field(dev_t, dev) - __field(ino_t, orig_ino) __field(ext4_lblk_t, orig_lblk) __field(unsigned int, orig_flags) - __field(ino_t, donor_ino) __field(ext4_lblk_t, donor_lblk) __field(unsigned int, len) ), @@ -3048,11 +3048,11 @@ TRACE_EVENT(ext4_move_extent_enter, __entry->len = orig_map->m_len; ), - TP_printk("dev %d,%d origin ino %lu lblk %u flags %s donor ino %lu lblk %u len %u", + TP_printk("dev %d,%d origin ino %llu lblk %u flags %s donor ino %llu lblk %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->orig_ino, __entry->orig_lblk, + __entry->orig_ino, __entry->orig_lblk, show_mflags(__entry->orig_flags), - (unsigned long) __entry->donor_ino, __entry->donor_lblk, + __entry->donor_ino, __entry->donor_lblk, __entry->len) ); @@ -3065,13 +3065,13 @@ TRACE_EVENT(ext4_move_extent_exit, move_len, move_type, ret), TP_STRUCT__entry( + __field(u64, orig_ino) + __field(u64, donor_ino) + __field(u64, move_len) __field(dev_t, dev) - __field(ino_t, orig_ino) __field(ext4_lblk_t, orig_lblk) - __field(ino_t, donor_ino) __field(ext4_lblk_t, donor_lblk) __field(unsigned int, m_len) - __field(u64, move_len) __field(int, move_type) __field(int, ret) ), @@ -3088,10 +3088,10 @@ TRACE_EVENT(ext4_move_extent_exit, __entry->ret = ret; ), - TP_printk("dev %d,%d origin ino %lu lblk %u donor ino %lu lblk %u m_len %u, move_len %llu type %d ret %d", + TP_printk("dev %d,%d origin ino %llu lblk %u donor ino %llu lblk %u m_len %u, move_len %llu type %d ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) 
__entry->orig_ino, __entry->orig_lblk, - (unsigned long) __entry->donor_ino, __entry->donor_lblk, + __entry->orig_ino, __entry->orig_lblk, + __entry->donor_ino, __entry->donor_lblk, __entry->m_len, __entry->move_len, __entry->move_type, __entry->ret) ); -- cgit v1.2.3 From a232ef7831048448c3a7a93a80b1521ec9106b6a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:40 -0500 Subject: f2fs: widen trace event i_ino fields to u64 In trace events, change __field(ino_t, ...) to __field(u64, ...) and update TP_printk format strings to %llu/%llx to match the widened field type. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-10-2257ad83d372@kernel.org Signed-off-by: Christian Brauner --- include/trace/events/f2fs.h | 242 ++++++++++++++++++++++---------------------- 1 file changed, 121 insertions(+), 121 deletions(-) (limited to 'include') diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 9364e6775562..05a46908acd9 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -9,7 +9,7 @@ #include #define show_dev(dev) MAJOR(dev), MINOR(dev) -#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino +#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long long)entry->ino TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); @@ -206,13 +206,13 @@ DECLARE_EVENT_CLASS(f2fs__inode, TP_ARGS(inode), TP_STRUCT__entry( + __field(u64, ino) + __field(u64, pino) + __field(loff_t, size) + __field(blkcnt_t, blocks) __field(dev_t, dev) - __field(ino_t, ino) - __field(ino_t, pino) __field(umode_t, mode) - __field(loff_t, size) __field(unsigned int, nlink) - __field(blkcnt_t, blocks) __field(__u8, advise) ), @@ -227,10 +227,10 @@ DECLARE_EVENT_CLASS(f2fs__inode, __entry->advise = F2FS_I(inode)->i_advise; ), - TP_printk("dev = (%d,%d), ino = %lu, pino = %lu, i_mode = 0x%hx, " + TP_printk("dev = (%d,%d), ino = %llu, pino = %llu, i_mode = 0x%hx, " "i_size = %lld, i_nlink = %u, 
i_blocks = %llu, i_advise = 0x%x", show_dev_ino(__entry), - (unsigned long)__entry->pino, + __entry->pino, __entry->mode, __entry->size, (unsigned int)__entry->nlink, @@ -245,8 +245,8 @@ DECLARE_EVENT_CLASS(f2fs__inode_exit, TP_ARGS(inode, ret), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __field(umode_t, mode) __field(int, ret) ), @@ -258,7 +258,7 @@ DECLARE_EVENT_CLASS(f2fs__inode_exit, __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, type: %s, mode = 0%o, ret = %d", + TP_printk("dev = (%d,%d), ino = %llu, type: %s, mode = 0%o, ret = %d", show_dev_ino(__entry), show_inode_type(__entry->mode & S_IFMT), __entry->mode & S_ALL_PERM, @@ -279,8 +279,8 @@ TRACE_EVENT(f2fs_sync_file_exit, TP_ARGS(inode, cp_reason, datasync, ret), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __field(int, cp_reason) __field(int, datasync) __field(int, ret) @@ -294,7 +294,7 @@ TRACE_EVENT(f2fs_sync_file_exit, __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, cp_reason: %s, " + TP_printk("dev = (%d,%d), ino = %llu, cp_reason: %s, " "datasync = %d, ret = %d", show_dev_ino(__entry), show_fsync_cpreason(__entry->cp_reason), @@ -361,10 +361,10 @@ TRACE_EVENT(f2fs_unlink_enter, TP_ARGS(dir, dentry), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, size) __field(blkcnt_t, blocks) + __field(dev_t, dev) __string(name, dentry->d_name.name) ), @@ -376,7 +376,7 @@ TRACE_EVENT(f2fs_unlink_enter, __assign_str(name); ), - TP_printk("dev = (%d,%d), dir ino = %lu, i_size = %lld, " + TP_printk("dev = (%d,%d), dir ino = %llu, i_size = %lld, " "i_blocks = %llu, name = %s", show_dev_ino(__entry), __entry->size, @@ -412,8 +412,8 @@ TRACE_EVENT(f2fs_truncate_data_blocks_range, TP_ARGS(inode, nid, ofs, free), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __field(nid_t, nid) __field(unsigned int, ofs) __field(int, free) @@ -427,7 +427,7 @@ 
TRACE_EVENT(f2fs_truncate_data_blocks_range, __entry->free = free; ), - TP_printk("dev = (%d,%d), ino = %lu, nid = %u, offset = %u, freed = %d", + TP_printk("dev = (%d,%d), ino = %llu, nid = %u, offset = %u, freed = %d", show_dev_ino(__entry), (unsigned int)__entry->nid, __entry->ofs, @@ -441,11 +441,11 @@ DECLARE_EVENT_CLASS(f2fs__truncate_op, TP_ARGS(inode, from), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, size) __field(blkcnt_t, blocks) __field(u64, from) + __field(dev_t, dev) ), TP_fast_assign( @@ -456,7 +456,7 @@ DECLARE_EVENT_CLASS(f2fs__truncate_op, __entry->from = from; ), - TP_printk("dev = (%d,%d), ino = %lu, i_size = %lld, i_blocks = %llu, " + TP_printk("dev = (%d,%d), ino = %llu, i_size = %lld, i_blocks = %llu, " "start file offset = %llu", show_dev_ino(__entry), __entry->size, @@ -499,8 +499,8 @@ DECLARE_EVENT_CLASS(f2fs__truncate_node, TP_ARGS(inode, nid, blk_addr), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __field(nid_t, nid) __field(block_t, blk_addr) ), @@ -512,7 +512,7 @@ DECLARE_EVENT_CLASS(f2fs__truncate_node, __entry->blk_addr = blk_addr; ), - TP_printk("dev = (%d,%d), ino = %lu, nid = %u, block_address = 0x%llx", + TP_printk("dev = (%d,%d), ino = %llu, nid = %u, block_address = 0x%llx", show_dev_ino(__entry), (unsigned int)__entry->nid, (unsigned long long)__entry->blk_addr) @@ -546,8 +546,8 @@ TRACE_EVENT(f2fs_truncate_partial_nodes, TP_ARGS(inode, nid, depth, err), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __array(nid_t, nid, 3) __field(int, depth) __field(int, err) @@ -563,7 +563,7 @@ TRACE_EVENT(f2fs_truncate_partial_nodes, __entry->err = err; ), - TP_printk("dev = (%d,%d), ino = %lu, " + TP_printk("dev = (%d,%d), ino = %llu, " "nid[0] = %u, nid[1] = %u, nid[2] = %u, depth = %d, err = %d", show_dev_ino(__entry), (unsigned int)__entry->nid[0], @@ -581,11 +581,11 @@ TRACE_EVENT(f2fs_file_write_iter, 
TP_ARGS(inode, offset, length, ret), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, offset) __field(size_t, length) __field(ssize_t, ret) + __field(dev_t, dev) ), TP_fast_assign( @@ -596,7 +596,7 @@ TRACE_EVENT(f2fs_file_write_iter, __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, " + TP_printk("dev = (%d,%d), ino = %llu, " "offset = %lld, length = %zu, written(err) = %zd", show_dev_ino(__entry), __entry->offset, @@ -611,11 +611,11 @@ TRACE_EVENT(f2fs_fadvise, TP_ARGS(inode, offset, len, advice), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, size) __field(loff_t, offset) __field(loff_t, len) + __field(dev_t, dev) __field(int, advice) ), @@ -628,7 +628,7 @@ TRACE_EVENT(f2fs_fadvise, __entry->advice = advice; ), - TP_printk("dev = (%d,%d), ino = %lu, i_size = %lld offset:%llu, len:%llu, advise:%d", + TP_printk("dev = (%d,%d), ino = %llu, i_size = %lld offset:%llu, len:%llu, advise:%d", show_dev_ino(__entry), (unsigned long long)__entry->size, __entry->offset, @@ -643,8 +643,8 @@ TRACE_EVENT(f2fs_map_blocks, TP_ARGS(inode, map, flag, ret), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __field(block_t, m_lblk) __field(block_t, m_pblk) __field(unsigned int, m_len) @@ -670,7 +670,7 @@ TRACE_EVENT(f2fs_map_blocks, __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, " + TP_printk("dev = (%d,%d), ino = %llu, file offset = %llu, " "start blkaddr = 0x%llx, len = 0x%llx, flags = %u, " "seg_type = %d, may_create = %d, multidevice = %d, " "flag = %d, err = %d", @@ -885,8 +885,8 @@ TRACE_EVENT(f2fs_lookup_start, TP_ARGS(dir, dentry, flags), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __string(name, dentry->d_name.name) __field(unsigned int, flags) ), @@ -898,7 +898,7 @@ TRACE_EVENT(f2fs_lookup_start, __entry->flags = flags; ), - TP_printk("dev = (%d,%d), pino = %lu, 
name:%s, flags:%u", + TP_printk("dev = (%d,%d), pino = %llu, name:%s, flags:%u", show_dev_ino(__entry), __get_str(name), __entry->flags) @@ -912,8 +912,8 @@ TRACE_EVENT(f2fs_lookup_end, TP_ARGS(dir, dentry, ino, err), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __string(name, dentry->d_name.name) __field(nid_t, cino) __field(int, err) @@ -927,7 +927,7 @@ TRACE_EVENT(f2fs_lookup_end, __entry->err = err; ), - TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d", + TP_printk("dev = (%d,%d), pino = %llu, name:%s, ino:%u, err:%d", show_dev_ino(__entry), __get_str(name), __entry->cino, @@ -943,10 +943,10 @@ TRACE_EVENT(f2fs_rename_start, TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, flags), TP_STRUCT__entry( + __field(u64, ino) + __field(u64, new_pino) __field(dev_t, dev) - __field(ino_t, ino) __string(old_name, old_dentry->d_name.name) - __field(ino_t, new_pino) __string(new_name, new_dentry->d_name.name) __field(unsigned int, flags) ), @@ -960,8 +960,8 @@ TRACE_EVENT(f2fs_rename_start, __entry->flags = flags; ), - TP_printk("dev = (%d,%d), old_dir = %lu, old_name: %s, " - "new_dir = %lu, new_name: %s, flags = %u", + TP_printk("dev = (%d,%d), old_dir = %llu, old_name: %s, " + "new_dir = %llu, new_name: %s, flags = %u", show_dev_ino(__entry), __get_str(old_name), __entry->new_pino, @@ -977,8 +977,8 @@ TRACE_EVENT(f2fs_rename_end, TP_ARGS(old_dentry, new_dentry, flags, ret), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __string(old_name, old_dentry->d_name.name) __string(new_name, new_dentry->d_name.name) __field(unsigned int, flags) @@ -994,7 +994,7 @@ TRACE_EVENT(f2fs_rename_end, __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, old_name: %s, " + TP_printk("dev = (%d,%d), ino = %llu, old_name: %s, " "new_name: %s, flags = %u, ret = %d", show_dev_ino(__entry), __get_str(old_name), @@ -1010,10 +1010,10 @@ TRACE_EVENT(f2fs_readdir, TP_ARGS(dir, start_pos, end_pos, err), 
TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, start) __field(loff_t, end) + __field(dev_t, dev) __field(int, err) ), @@ -1025,7 +1025,7 @@ TRACE_EVENT(f2fs_readdir, __entry->err = err; ), - TP_printk("dev = (%d,%d), ino = %lu, start_pos:%llu, end_pos:%llu, err:%d", + TP_printk("dev = (%d,%d), ino = %llu, start_pos:%llu, end_pos:%llu, err:%d", show_dev_ino(__entry), __entry->start, __entry->end, @@ -1040,13 +1040,13 @@ TRACE_EVENT(f2fs_fallocate, TP_ARGS(inode, mode, offset, len, ret), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) - __field(int, mode) + __field(u64, ino) __field(loff_t, offset) __field(loff_t, len) __field(loff_t, size) __field(blkcnt_t, blocks) + __field(dev_t, dev) + __field(int, mode) __field(int, ret) ), @@ -1061,7 +1061,7 @@ TRACE_EVENT(f2fs_fallocate, __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, mode = %x, offset = %lld, " + TP_printk("dev = (%d,%d), ino = %llu, mode = %x, offset = %lld, " "len = %lld, i_size = %lld, i_blocks = %llu, ret = %d", show_dev_ino(__entry), __entry->mode, @@ -1079,12 +1079,12 @@ TRACE_EVENT(f2fs_direct_IO_enter, TP_ARGS(inode, iocb, len, rw), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, ki_pos) + __field(unsigned long, len) + __field(dev_t, dev) __field(int, ki_flags) __field(u16, ki_ioprio) - __field(unsigned long, len) __field(int, rw) ), @@ -1098,7 +1098,7 @@ TRACE_EVENT(f2fs_direct_IO_enter, __entry->rw = rw; ), - TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu ki_flags = %x ki_ioprio = %x rw = %d", + TP_printk("dev = (%d,%d), ino = %llu pos = %lld len = %lu ki_flags = %x ki_ioprio = %x rw = %d", show_dev_ino(__entry), __entry->ki_pos, __entry->len, @@ -1115,10 +1115,10 @@ TRACE_EVENT(f2fs_direct_IO_exit, TP_ARGS(inode, offset, len, rw, ret), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, pos) __field(unsigned long, 
len) + __field(dev_t, dev) __field(int, rw) __field(int, ret) ), @@ -1132,7 +1132,7 @@ TRACE_EVENT(f2fs_direct_IO_exit, __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu " + TP_printk("dev = (%d,%d), ino = %llu pos = %lld len = %lu " "rw = %d ret = %d", show_dev_ino(__entry), __entry->pos, @@ -1176,9 +1176,9 @@ DECLARE_EVENT_CLASS(f2fs__submit_folio_bio, TP_ARGS(folio, fio), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(pgoff_t, index) + __field(dev_t, dev) __field(block_t, old_blkaddr) __field(block_t, new_blkaddr) __field(enum req_op, op) @@ -1199,7 +1199,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_folio_bio, __entry->type = fio->type; ), - TP_printk("dev = (%d,%d), ino = %lu, folio_index = 0x%lx, " + TP_printk("dev = (%d,%d), ino = %llu, folio_index = 0x%lx, " "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, @@ -1306,9 +1306,9 @@ TRACE_EVENT(f2fs_write_begin, TP_ARGS(inode, pos, len), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, pos) + __field(dev_t, dev) __field(unsigned int, len) ), @@ -1319,7 +1319,7 @@ TRACE_EVENT(f2fs_write_begin, __entry->len = len; ), - TP_printk("dev = (%d,%d), ino = %lu, pos = %llu, len = %u", + TP_printk("dev = (%d,%d), ino = %llu, pos = %llu, len = %u", show_dev_ino(__entry), (unsigned long long)__entry->pos, __entry->len) @@ -1333,9 +1333,9 @@ TRACE_EVENT(f2fs_write_end, TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, pos) + __field(dev_t, dev) __field(unsigned int, len) __field(unsigned int, copied) ), @@ -1348,7 +1348,7 @@ TRACE_EVENT(f2fs_write_end, __entry->copied = copied; ), - TP_printk("dev = (%d,%d), ino = %lu, pos = %llu, len = %u, copied = %u", + TP_printk("dev = (%d,%d), ino = %llu, pos = %llu, len = %u, copied = %u", show_dev_ino(__entry), (unsigned 
long long)__entry->pos, __entry->len, @@ -1362,12 +1362,12 @@ DECLARE_EVENT_CLASS(f2fs__folio, TP_ARGS(folio, type), TP_STRUCT__entry( + __field(u64, ino) + __field(pgoff_t, index) + __field(pgoff_t, nrpages) __field(dev_t, dev) - __field(ino_t, ino) __field(int, type) __field(int, dir) - __field(pgoff_t, index) - __field(pgoff_t, nrpages) __field(int, dirty) __field(int, uptodate) ), @@ -1383,7 +1383,7 @@ DECLARE_EVENT_CLASS(f2fs__folio, __entry->uptodate = folio_test_uptodate(folio); ), - TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, nr_pages = %lu, " + TP_printk("dev = (%d,%d), ino = %llu, %s, %s, index = %lu, nr_pages = %lu, " "dirty = %d, uptodate = %d", show_dev_ino(__entry), show_block_type(__entry->type), @@ -1437,10 +1437,10 @@ TRACE_EVENT(f2fs_replace_atomic_write_block, TP_ARGS(inode, cow_inode, index, old_addr, new_addr, recovery), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) - __field(ino_t, cow_ino) + __field(u64, ino) + __field(u64, cow_ino) __field(pgoff_t, index) + __field(dev_t, dev) __field(block_t, old_addr) __field(block_t, new_addr) __field(bool, recovery) @@ -1456,7 +1456,7 @@ TRACE_EVENT(f2fs_replace_atomic_write_block, __entry->recovery = recovery; ), - TP_printk("dev = (%d,%d), ino = %lu, cow_ino = %lu, index = %lu, " + TP_printk("dev = (%d,%d), ino = %llu, cow_ino = %llu, index = %lu, " "old_addr = 0x%llx, new_addr = 0x%llx, recovery = %d", show_dev_ino(__entry), __entry->cow_ino, @@ -1474,10 +1474,10 @@ DECLARE_EVENT_CLASS(f2fs_mmap, TP_ARGS(inode, index, flags, ret), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(pgoff_t, index) __field(vm_flags_t, flags) + __field(dev_t, dev) __field(vm_fault_t, ret) ), @@ -1489,7 +1489,7 @@ DECLARE_EVENT_CLASS(f2fs_mmap, __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, index = %lu, flags: %s, ret: %s", + TP_printk("dev = (%d,%d), ino = %llu, index = %lu, flags: %s, ret: %s", show_dev_ino(__entry), (unsigned 
long)__entry->index, __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE), @@ -1519,15 +1519,15 @@ TRACE_EVENT(f2fs_writepages, TP_ARGS(inode, wbc, type), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) - __field(int, type) - __field(int, dir) - __field(long, nr_to_write) - __field(long, pages_skipped) + __field(u64, ino) __field(loff_t, range_start) __field(loff_t, range_end) + __field(long, nr_to_write) + __field(long, pages_skipped) __field(pgoff_t, writeback_index) + __field(dev_t, dev) + __field(int, type) + __field(int, dir) __field(int, sync_mode) __field(char, for_kupdate) __field(char, for_background) @@ -1554,7 +1554,7 @@ TRACE_EVENT(f2fs_writepages, __entry->for_sync = wbc->for_sync; ), - TP_printk("dev = (%d,%d), ino = %lu, %s, %s, nr_to_write %ld, " + TP_printk("dev = (%d,%d), ino = %llu, %s, %s, nr_to_write %ld, " "skipped %ld, start %lld, end %lld, wb_idx %lu, sync_mode %d, " "kupdate %u background %u tagged %u cyclic %u sync %u", show_dev_ino(__entry), @@ -1580,9 +1580,9 @@ TRACE_EVENT(f2fs_readpages, TP_ARGS(inode, start, nrpage), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(pgoff_t, start) + __field(dev_t, dev) __field(unsigned int, nrpage) ), @@ -1593,7 +1593,7 @@ TRACE_EVENT(f2fs_readpages, __entry->nrpage = nrpage; ), - TP_printk("dev = (%d,%d), ino = %lu, start = %lu nrpage = %u", + TP_printk("dev = (%d,%d), ino = %llu, start = %lu nrpage = %u", show_dev_ino(__entry), (unsigned long)__entry->start, __entry->nrpage) @@ -1738,8 +1738,8 @@ TRACE_EVENT(f2fs_lookup_extent_tree_start, TP_ARGS(inode, pgofs, type), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __field(unsigned int, pgofs) __field(enum extent_type, type) ), @@ -1751,7 +1751,7 @@ TRACE_EVENT(f2fs_lookup_extent_tree_start, __entry->type = type; ), - TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, type = %s", + TP_printk("dev = (%d,%d), ino = %llu, pgofs = %u, type = %s", show_dev_ino(__entry), 
__entry->pgofs, show_extent_type(__entry->type)) @@ -1767,8 +1767,8 @@ TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, TP_CONDITION(ei), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, fofs) __field(unsigned int, len) @@ -1784,7 +1784,7 @@ TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, __entry->blk = ei->blk; ), - TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + TP_printk("dev = (%d,%d), ino = %llu, pgofs = %u, " "read_ext_info(fofs: %u, len: %u, blk: %u)", show_dev_ino(__entry), __entry->pgofs, @@ -1803,13 +1803,13 @@ TRACE_EVENT_CONDITION(f2fs_lookup_age_extent_tree_end, TP_CONDITION(ei), TP_STRUCT__entry( + __field(u64, ino) + __field(unsigned long long, age) + __field(unsigned long long, blocks) __field(dev_t, dev) - __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, fofs) __field(unsigned int, len) - __field(unsigned long long, age) - __field(unsigned long long, blocks) ), TP_fast_assign( @@ -1822,7 +1822,7 @@ TRACE_EVENT_CONDITION(f2fs_lookup_age_extent_tree_end, __entry->blocks = ei->last_blocks; ), - TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + TP_printk("dev = (%d,%d), ino = %llu, pgofs = %u, " "age_ext_info(fofs: %u, len: %u, age: %llu, blocks: %llu)", show_dev_ino(__entry), __entry->pgofs, @@ -1841,8 +1841,8 @@ TRACE_EVENT(f2fs_update_read_extent_tree_range, TP_ARGS(inode, pgofs, len, blkaddr, c_len), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __field(unsigned int, pgofs) __field(u32, blk) __field(unsigned int, len) @@ -1858,7 +1858,7 @@ TRACE_EVENT(f2fs_update_read_extent_tree_range, __entry->c_len = c_len; ), - TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + TP_printk("dev = (%d,%d), ino = %llu, pgofs = %u, " "len = %u, blkaddr = %u, c_len = %u", show_dev_ino(__entry), __entry->pgofs, @@ -1876,12 +1876,12 @@ TRACE_EVENT(f2fs_update_age_extent_tree_range, TP_ARGS(inode, pgofs, len, 
age, last_blks), TP_STRUCT__entry( + __field(u64, ino) + __field(unsigned long long, age) + __field(unsigned long long, blocks) __field(dev_t, dev) - __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, len) - __field(unsigned long long, age) - __field(unsigned long long, blocks) ), TP_fast_assign( @@ -1893,7 +1893,7 @@ TRACE_EVENT(f2fs_update_age_extent_tree_range, __entry->blocks = last_blks; ), - TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + TP_printk("dev = (%d,%d), ino = %llu, pgofs = %u, " "len = %u, age = %llu, blocks = %llu", show_dev_ino(__entry), __entry->pgofs, @@ -1938,8 +1938,8 @@ TRACE_EVENT(f2fs_destroy_extent_tree, TP_ARGS(inode, node_cnt, type), TP_STRUCT__entry( + __field(u64, ino) __field(dev_t, dev) - __field(ino_t, ino) __field(unsigned int, node_cnt) __field(enum extent_type, type) ), @@ -1951,7 +1951,7 @@ TRACE_EVENT(f2fs_destroy_extent_tree, __entry->type = type; ), - TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u, type = %s", + TP_printk("dev = (%d,%d), ino = %llu, destroyed: node_cnt = %u, type = %s", show_dev_ino(__entry), __entry->node_cnt, show_extent_type(__entry->type)) @@ -2027,9 +2027,9 @@ DECLARE_EVENT_CLASS(f2fs_zip_start, TP_ARGS(inode, cluster_idx, cluster_size, algtype), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(pgoff_t, idx) + __field(dev_t, dev) __field(unsigned int, size) __field(unsigned int, algtype) ), @@ -2042,7 +2042,7 @@ DECLARE_EVENT_CLASS(f2fs_zip_start, __entry->algtype = algtype; ), - TP_printk("dev = (%d,%d), ino = %lu, cluster_idx:%lu, " + TP_printk("dev = (%d,%d), ino = %llu, cluster_idx:%lu, " "cluster_size = %u, algorithm = %s", show_dev_ino(__entry), __entry->idx, @@ -2058,9 +2058,9 @@ DECLARE_EVENT_CLASS(f2fs_zip_end, TP_ARGS(inode, cluster_idx, compressed_size, ret), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(pgoff_t, idx) + __field(dev_t, dev) __field(unsigned int, size) 
__field(unsigned int, ret) ), @@ -2073,7 +2073,7 @@ DECLARE_EVENT_CLASS(f2fs_zip_end, __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, cluster_idx:%lu, " + TP_printk("dev = (%d,%d), ino = %llu, cluster_idx:%lu, " "compressed_size = %u, ret = %d", show_dev_ino(__entry), __entry->idx, @@ -2311,10 +2311,10 @@ TRACE_EVENT(f2fs_bmap, TP_ARGS(inode, lblock, pblock), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(sector_t, lblock) __field(sector_t, pblock) + __field(dev_t, dev) ), TP_fast_assign( @@ -2324,7 +2324,7 @@ TRACE_EVENT(f2fs_bmap, __entry->pblock = pblock; ), - TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld", + TP_printk("dev = (%d,%d), ino = %llu, lblock:%lld, pblock:%lld", show_dev_ino(__entry), (unsigned long long)__entry->lblock, (unsigned long long)__entry->pblock) @@ -2338,11 +2338,11 @@ TRACE_EVENT(f2fs_fiemap, TP_ARGS(inode, lblock, pblock, len, flags, ret), TP_STRUCT__entry( - __field(dev_t, dev) - __field(ino_t, ino) + __field(u64, ino) __field(sector_t, lblock) __field(sector_t, pblock) __field(unsigned long long, len) + __field(dev_t, dev) __field(unsigned int, flags) __field(int, ret) ), @@ -2357,7 +2357,7 @@ TRACE_EVENT(f2fs_fiemap, __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld, " + TP_printk("dev = (%d,%d), ino = %llu, lblock:%lld, pblock:%lld, " "len:%llu, flags:%u, ret:%d", show_dev_ino(__entry), (unsigned long long)__entry->lblock, @@ -2375,13 +2375,13 @@ DECLARE_EVENT_CLASS(f2fs__rw_start, TP_ARGS(inode, offset, bytes, pid, pathname, command), TP_STRUCT__entry( - __string(pathbuf, pathname) + __field(u64, ino) __field(loff_t, offset) - __field(int, bytes) __field(loff_t, i_size) + __string(pathbuf, pathname) __string(cmdline, command) __field(pid_t, pid) - __field(ino_t, ino) + __field(int, bytes) ), TP_fast_assign( @@ -2402,10 +2402,10 @@ DECLARE_EVENT_CLASS(f2fs__rw_start, ), TP_printk("entry_name %s, offset %llu, bytes %d, cmdline 
%s," - " pid %d, i_size %llu, ino %lu", + " pid %d, i_size %llu, ino %llu", __get_str(pathbuf), __entry->offset, __entry->bytes, __get_str(cmdline), __entry->pid, __entry->i_size, - (unsigned long) __entry->ino) + __entry->ino) ); DECLARE_EVENT_CLASS(f2fs__rw_end, @@ -2415,7 +2415,7 @@ DECLARE_EVENT_CLASS(f2fs__rw_end, TP_ARGS(inode, offset, bytes), TP_STRUCT__entry( - __field(ino_t, ino) + __field(u64, ino) __field(loff_t, offset) __field(int, bytes) ), @@ -2426,8 +2426,8 @@ DECLARE_EVENT_CLASS(f2fs__rw_end, __entry->bytes = bytes; ), - TP_printk("ino %lu, offset %llu, bytes %d", - (unsigned long) __entry->ino, + TP_printk("ino %llu, offset %llu, bytes %d", + __entry->ino, __entry->offset, __entry->bytes) ); -- cgit v1.2.3 From d84c70c6eab10e56d22c394e3a250c1c6fde8d6e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:41 -0500 Subject: nilfs2: widen trace event i_ino fields to u64 In trace events, change __field(unsigned long, ...) to __field(u64, ...) and update TP_PROTO parameters and TP_printk format strings to match the widened field type. 
Reviewed-by: Viacheslav Dubeyko Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-11-2257ad83d372@kernel.org Signed-off-by: Christian Brauner --- include/trace/events/nilfs2.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h index 8880c11733dd..49efbd209585 100644 --- a/include/trace/events/nilfs2.h +++ b/include/trace/events/nilfs2.h @@ -165,14 +165,14 @@ TRACE_EVENT(nilfs2_segment_usage_freed, TRACE_EVENT(nilfs2_mdt_insert_new_block, TP_PROTO(struct inode *inode, - unsigned long ino, + u64 ino, unsigned long block), TP_ARGS(inode, ino, block), TP_STRUCT__entry( + __field(u64, ino) __field(struct inode *, inode) - __field(unsigned long, ino) __field(unsigned long, block) ), @@ -182,7 +182,7 @@ TRACE_EVENT(nilfs2_mdt_insert_new_block, __entry->block = block; ), - TP_printk("inode = %p ino = %lu block = %lu", + TP_printk("inode = %p ino = %llu block = %lu", __entry->inode, __entry->ino, __entry->block) @@ -190,15 +190,15 @@ TRACE_EVENT(nilfs2_mdt_insert_new_block, TRACE_EVENT(nilfs2_mdt_submit_block, TP_PROTO(struct inode *inode, - unsigned long ino, + u64 ino, unsigned long blkoff, enum req_op mode), TP_ARGS(inode, ino, blkoff, mode), TP_STRUCT__entry( + __field(u64, ino) __field(struct inode *, inode) - __field(unsigned long, ino) __field(unsigned long, blkoff) /* * Use field_struct() to avoid is_signed_type() on the @@ -214,7 +214,7 @@ TRACE_EVENT(nilfs2_mdt_submit_block, __entry->mode = mode; ), - TP_printk("inode = %p ino = %lu blkoff = %lu mode = %x", + TP_printk("inode = %p ino = %llu blkoff = %lu mode = %x", __entry->inode, __entry->ino, __entry->blkoff, -- cgit v1.2.3 From 0b2600f81cefcdfcda58d50df7be8fd48ada8ce2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:42 -0500 Subject: treewide: change inode->i_ino from unsigned long to u64 On 32-bit architectures, unsigned long is only 32 bits wide, 
which causes 64-bit inode numbers to be silently truncated. Several filesystems (NFS, XFS, BTRFS, etc.) can generate inode numbers that exceed 32 bits, and this truncation can lead to inode number collisions and other subtle bugs on 32-bit systems. Change the type of inode->i_ino from unsigned long to u64 to ensure that inode numbers are always represented as 64-bit values regardless of architecture. Update all format specifiers treewide from %lu/%lx to %llu/%llx to match the new type, along with corresponding local variable types. This is the bulk treewide conversion. Earlier patches in this series handled trace events separately to allow trace field reordering for better struct packing on 32-bit. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-12-2257ad83d372@kernel.org Acked-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- drivers/dma-buf/dma-buf.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 +-- fs/9p/vfs_addr.c | 4 +-- fs/9p/vfs_inode.c | 6 ++-- fs/9p/vfs_inode_dotl.c | 6 ++-- fs/affs/amigaffs.c | 10 +++---- fs/affs/bitmap.c | 2 +- fs/affs/dir.c | 2 +- fs/affs/file.c | 20 ++++++------- fs/affs/inode.c | 12 ++++---- fs/affs/namei.c | 14 ++++----- fs/affs/symlink.c | 2 +- fs/afs/dir.c | 10 +++---- fs/afs/dir_search.c | 2 +- fs/afs/dynroot.c | 2 +- fs/afs/inode.c | 2 +- fs/autofs/inode.c | 2 +- fs/befs/linuxvfs.c | 28 ++++++++--------- fs/bfs/dir.c | 4 +-- fs/cachefiles/io.c | 6 ++-- fs/cachefiles/namei.c | 12 ++++---- fs/cachefiles/xattr.c | 2 +- fs/ceph/crypto.c | 4 +-- fs/coda/dir.c | 2 +- fs/coda/inode.c | 2 +- fs/cramfs/inode.c | 2 +- fs/crypto/crypto.c | 2 +- fs/crypto/hooks.c | 2 +- fs/crypto/keyring.c | 4 +-- fs/crypto/keysetup.c | 2 +- fs/dcache.c | 4 +-- fs/ecryptfs/crypto.c | 6 ++-- fs/ecryptfs/file.c | 2 +- fs/efs/inode.c | 6 ++-- fs/eventpoll.c | 2 +- fs/exportfs/expfs.c | 4 +-- fs/ext2/dir.c | 10 +++---- fs/ext2/ialloc.c | 9 
+++--- fs/ext2/inode.c | 2 +- fs/ext2/xattr.c | 14 ++++----- fs/ext4/dir.c | 2 +- fs/ext4/ext4.h | 4 +-- fs/ext4/extents.c | 8 ++--- fs/ext4/extents_status.c | 28 ++++++++--------- fs/ext4/fast_commit.c | 8 ++--- fs/ext4/ialloc.c | 10 +++---- fs/ext4/indirect.c | 2 +- fs/ext4/inline.c | 14 ++++----- fs/ext4/inode.c | 22 +++++++------- fs/ext4/ioctl.c | 4 +-- fs/ext4/mballoc.c | 6 ++-- fs/ext4/migrate.c | 2 +- fs/ext4/move_extent.c | 20 ++++++------- fs/ext4/namei.c | 10 +++---- fs/ext4/orphan.c | 16 +++++----- fs/ext4/page-io.c | 10 +++---- fs/ext4/super.c | 22 +++++++------- fs/ext4/xattr.c | 10 +++---- fs/f2fs/compress.c | 4 +-- fs/f2fs/dir.c | 2 +- fs/f2fs/extent_cache.c | 8 ++--- fs/f2fs/f2fs.h | 6 ++-- fs/f2fs/file.c | 12 ++++---- fs/f2fs/gc.c | 2 +- fs/f2fs/inline.c | 4 +-- fs/f2fs/inode.c | 48 +++++++++++++++--------------- fs/f2fs/namei.c | 8 ++--- fs/f2fs/node.c | 10 +++---- fs/f2fs/recovery.c | 10 +++---- fs/f2fs/xattr.c | 10 +++---- fs/freevxfs/vxfs_bmap.c | 4 +-- fs/fserror.c | 2 +- fs/hfs/catalog.c | 2 +- fs/hfs/extent.c | 4 +-- fs/hfs/inode.c | 4 +-- fs/hfsplus/attributes.c | 10 +++---- fs/hfsplus/catalog.c | 2 +- fs/hfsplus/dir.c | 6 ++-- fs/hfsplus/extents.c | 6 ++-- fs/hfsplus/inode.c | 8 ++--- fs/hfsplus/super.c | 6 ++-- fs/hfsplus/xattr.c | 10 +++---- fs/hpfs/dir.c | 4 +-- fs/hpfs/dnode.c | 4 +-- fs/hpfs/ea.c | 4 +-- fs/hpfs/inode.c | 4 +-- fs/inode.c | 13 ++++---- fs/iomap/ioend.c | 2 +- fs/isofs/compress.c | 2 +- fs/isofs/dir.c | 2 +- fs/isofs/inode.c | 6 ++-- fs/isofs/namei.c | 2 +- fs/jbd2/journal.c | 4 +-- fs/jbd2/transaction.c | 2 +- fs/jffs2/dir.c | 4 +-- fs/jffs2/file.c | 4 +-- fs/jffs2/fs.c | 18 +++++------ fs/jfs/inode.c | 2 +- fs/jfs/jfs_imap.c | 2 +- fs/jfs/jfs_metapage.c | 2 +- fs/lockd/svclock.c | 8 ++--- fs/lockd/svcsubs.c | 2 +- fs/locks.c | 6 ++-- fs/minix/inode.c | 10 +++---- fs/nfs/dir.c | 20 ++++++------- fs/nfs/file.c | 8 ++--- fs/nfs/filelayout/filelayout.c | 8 ++--- fs/nfs/flexfilelayout/flexfilelayout.c | 8 ++--- 
fs/nfs/inode.c | 6 ++-- fs/nfs/nfs4proc.c | 4 +-- fs/nfs/pnfs.c | 12 ++++---- fs/nfsd/export.c | 2 +- fs/nfsd/nfs4state.c | 4 +-- fs/nfsd/nfsfh.c | 4 +-- fs/nfsd/vfs.c | 2 +- fs/nilfs2/alloc.c | 10 +++---- fs/nilfs2/bmap.c | 2 +- fs/nilfs2/btnode.c | 2 +- fs/nilfs2/btree.c | 12 ++++---- fs/nilfs2/dir.c | 12 ++++---- fs/nilfs2/direct.c | 4 +-- fs/nilfs2/gcinode.c | 2 +- fs/nilfs2/inode.c | 8 ++--- fs/nilfs2/mdt.c | 2 +- fs/nilfs2/namei.c | 2 +- fs/nilfs2/segment.c | 2 +- fs/notify/fdinfo.c | 4 +-- fs/nsfs.c | 4 +-- fs/ntfs3/super.c | 2 +- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/aops.c | 4 +-- fs/ocfs2/dir.c | 8 ++--- fs/ocfs2/dlmfs/dlmfs.c | 10 +++---- fs/ocfs2/extent_map.c | 12 ++++---- fs/ocfs2/inode.c | 2 +- fs/ocfs2/quota_local.c | 2 +- fs/ocfs2/refcounttree.c | 10 +++---- fs/ocfs2/xattr.c | 4 +-- fs/orangefs/inode.c | 2 +- fs/overlayfs/export.c | 2 +- fs/overlayfs/namei.c | 4 +-- fs/overlayfs/util.c | 2 +- fs/pipe.c | 2 +- fs/proc/fd.c | 2 +- fs/proc/task_mmu.c | 4 +-- fs/qnx4/inode.c | 4 +-- fs/qnx6/inode.c | 2 +- fs/ubifs/debug.c | 8 ++--- fs/ubifs/dir.c | 28 ++++++++--------- fs/ubifs/file.c | 28 ++++++++--------- fs/ubifs/journal.c | 6 ++-- fs/ubifs/super.c | 16 +++++----- fs/ubifs/tnc.c | 4 +-- fs/ubifs/xattr.c | 14 ++++----- fs/udf/directory.c | 18 +++++------ fs/udf/file.c | 2 +- fs/udf/inode.c | 12 ++++---- fs/udf/namei.c | 8 ++--- fs/udf/super.c | 2 +- fs/ufs/balloc.c | 6 ++-- fs/ufs/dir.c | 10 +++---- fs/ufs/ialloc.c | 6 ++-- fs/ufs/inode.c | 18 +++++------ fs/ufs/ufs_fs.h | 6 ++-- fs/ufs/util.c | 2 +- fs/verity/init.c | 2 +- fs/zonefs/super.c | 8 ++--- include/linux/fs.h | 2 +- kernel/events/uprobes.c | 4 +-- net/netrom/af_netrom.c | 4 +-- net/rose/af_rose.c | 4 +-- net/socket.c | 2 +- net/x25/x25_proc.c | 4 +-- security/apparmor/apparmorfs.c | 4 +-- security/integrity/integrity_audit.c | 2 +- security/ipe/audit.c | 2 +- security/lsm_audit.c | 10 +++---- security/selinux/hooks.c | 10 +++---- security/smack/smack_lsm.c | 12 ++++---- 179 files changed, 607 
insertions(+), 607 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 11711874a325..8c16c8c425cc 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1708,7 +1708,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) spin_lock(&buf_obj->name_lock); - seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n", + seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08llu\t%s\n", buf_obj->size, buf_obj->file->f_flags, buf_obj->file->f_mode, file_count(buf_obj->file), diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 1fb956400696..aaa8cdc122c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1676,9 +1676,9 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) attachment = READ_ONCE(bo->tbo.base.import_attach); if (attachment) - seq_printf(m, " imported from ino:%lu", file_inode(dma_buf->file)->i_ino); + seq_printf(m, " imported from ino:%llu", file_inode(dma_buf->file)->i_ino); else if (dma_buf) - seq_printf(m, " exported as ino:%lu", file_inode(dma_buf->file)->i_ino); + seq_printf(m, " exported as ino:%llu", file_inode(dma_buf->file)->i_ino); amdgpu_bo_print_flag(m, bo, CPU_ACCESS_REQUIRED); amdgpu_bo_print_flag(m, bo, NO_CPU_ACCESS); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 862164181bac..c21d33830f5f 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -36,7 +36,7 @@ static void v9fs_begin_writeback(struct netfs_io_request *wreq) fid = v9fs_fid_find_inode(wreq->inode, true, INVALID_UID, true); if (!fid) { - WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n", + WARN_ONCE(1, "folio expected an open fid inode->i_ino=%llx\n", wreq->inode->i_ino); return; } @@ -133,7 +133,7 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) return 0; no_fid: - WARN_ONCE(1, "folio expected an open fid 
inode->i_ino=%lx\n", + WARN_ONCE(1, "folio expected an open fid inode->i_ino=%llx\n", rreq->inode->i_ino); return -EINVAL; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 97abe65bf7c1..d1508b1fe109 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1245,7 +1245,7 @@ static int v9fs_vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { - p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n", + p9_debug(P9_DEBUG_VFS, " %llu,%pd,%s\n", dir->i_ino, dentry, symname); return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname); @@ -1269,7 +1269,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, char name[1 + U32_MAX_DIGITS + 2]; /* sign + number + \n + \0 */ struct p9_fid *oldfid; - p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n", + p9_debug(P9_DEBUG_VFS, " %llu,%pd,%pd\n", dir->i_ino, dentry, old_dentry); oldfid = v9fs_fid_clone(old_dentry); @@ -1305,7 +1305,7 @@ v9fs_vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1]; u32 perm; - p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n", + p9_debug(P9_DEBUG_VFS, " %llu,%pd mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry, mode, MAJOR(rdev), MINOR(rdev)); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 643e759eacb2..71796a89bcf4 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -691,7 +691,7 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, struct p9_fid *fid = NULL; name = dentry->d_name.name; - p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); + p9_debug(P9_DEBUG_VFS, "%llu,%s,%s\n", dir->i_ino, name, symname); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { @@ -734,7 +734,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; - p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n", + p9_debug(P9_DEBUG_VFS, "dir ino: %llu, 
old_name: %pd, new_name: %pd\n", dir->i_ino, old_dentry, dentry); v9ses = v9fs_inode2v9ses(dir); @@ -798,7 +798,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct p9_qid qid; struct posix_acl *dacl = NULL, *pacl = NULL; - p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n", + p9_debug(P9_DEBUG_VFS, " %llu,%pd mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index fd669daa4e7b..d8a96d8cc826 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -33,7 +33,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) ino = bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]); - pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino); + pr_debug("%s(dir=%llu, ino=%d)\n", __func__, dir->i_ino, ino); dir_bh = affs_bread(sb, dir->i_ino); if (!dir_bh) @@ -83,7 +83,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) sb = dir->i_sb; rem_ino = rem_bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]); - pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino, + pr_debug("%s(dir=%llu, ino=%d, hashval=%d)\n", __func__, dir->i_ino, rem_ino, offset); bh = affs_bread(sb, dir->i_ino); @@ -128,7 +128,7 @@ affs_fix_dcache(struct inode *inode, u32 entry_ino) spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { if (entry_ino == (u32)(long)dentry->d_fsdata) { - dentry->d_fsdata = (void *)inode->i_ino; + dentry->d_fsdata = (void *)(unsigned long)inode->i_ino; break; } } @@ -147,7 +147,7 @@ affs_remove_link(struct dentry *dentry) u32 link_ino, ino; int retval; - pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); + pr_debug("%s(key=%llu)\n", __func__, inode->i_ino); retval = -EIO; bh = affs_bread(sb, inode->i_ino); if (!bh) @@ -279,7 +279,7 @@ affs_remove_header(struct dentry *dentry) if 
(!inode) goto done; - pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); + pr_debug("%s(key=%llu)\n", __func__, inode->i_ino); retval = -EIO; bh = affs_bread(sb, (u32)(long)dentry->d_fsdata); if (!bh) diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 5ba9ef2742f6..40bc4ce6af4a 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -125,7 +125,7 @@ affs_alloc_block(struct inode *inode, u32 goal) sb = inode->i_sb; sbi = AFFS_SB(sb); - pr_debug("balloc(inode=%lu,goal=%u): ", inode->i_ino, goal); + pr_debug("balloc(inode=%llu,goal=%u): ", inode->i_ino, goal); if (AFFS_I(inode)->i_pa_cnt) { pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1); diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 5c8d83387a39..11e2bac2e391 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -90,7 +90,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) u32 ino; int error = 0; - pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); + pr_debug("%s(ino=%llu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); if (ctx->pos < 2) { data->ino = 0; diff --git a/fs/affs/file.c b/fs/affs/file.c index 6c9258359ddb..a51dee9d7d7e 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -24,7 +24,7 @@ static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); static int affs_file_open(struct inode *inode, struct file *filp) { - pr_debug("open(%lu,%d)\n", + pr_debug("open(%llu,%d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); atomic_inc(&AFFS_I(inode)->i_opencnt); return 0; @@ -33,7 +33,7 @@ affs_file_open(struct inode *inode, struct file *filp) static int affs_file_release(struct inode *inode, struct file *filp) { - pr_debug("release(%lu, %d)\n", + pr_debug("release(%llu, %d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { @@ -301,7 +301,7 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul struct buffer_head *ext_bh; u32 ext; - pr_debug("%s(%lu, %llu)\n", 
__func__, inode->i_ino, + pr_debug("%s(%llu, %llu)\n", __func__, inode->i_ino, (unsigned long long)block); BUG_ON(block > (sector_t)0x7fffffffUL); @@ -534,7 +534,7 @@ static int affs_do_read_folio_ofs(struct folio *folio, size_t to, int create) size_t bidx, boff, bsize; u32 tmp; - pr_debug("%s(%lu, %ld, 0, %zu)\n", __func__, inode->i_ino, + pr_debug("%s(%llu, %ld, 0, %zu)\n", __func__, inode->i_ino, folio->index, to); BUG_ON(to > folio_size(folio)); bsize = AFFS_SB(sb)->s_data_blksize; @@ -566,7 +566,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) u32 size, bsize; u32 tmp; - pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize); + pr_debug("%s(%llu, %d)\n", __func__, inode->i_ino, newsize); bsize = AFFS_SB(sb)->s_data_blksize; bh = NULL; size = AFFS_I(inode)->mmu_private; @@ -634,7 +634,7 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio) size_t to; int err; - pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, folio->index); + pr_debug("%s(%llu, %ld)\n", __func__, inode->i_ino, folio->index); to = folio_size(folio); if (folio_pos(folio) + to > inode->i_size) { to = inode->i_size - folio_pos(folio); @@ -658,7 +658,7 @@ static int affs_write_begin_ofs(const struct kiocb *iocb, pgoff_t index; int err = 0; - pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, + pr_debug("%s(%llu, %llu, %llu)\n", __func__, inode->i_ino, pos, pos + len); if (pos > AFFS_I(inode)->mmu_private) { /* XXX: this probably leaves a too-big i_size in case of @@ -710,7 +710,7 @@ static int affs_write_end_ofs(const struct kiocb *iocb, * due to write_begin. 
*/ - pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, + pr_debug("%s(%llu, %llu, %llu)\n", __func__, inode->i_ino, pos, pos + len); bsize = AFFS_SB(sb)->s_data_blksize; data = folio_address(folio); @@ -854,7 +854,7 @@ affs_free_prealloc(struct inode *inode) { struct super_block *sb = inode->i_sb; - pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino); + pr_debug("free_prealloc(ino=%llu)\n", inode->i_ino); while (AFFS_I(inode)->i_pa_cnt) { AFFS_I(inode)->i_pa_cnt--; @@ -874,7 +874,7 @@ affs_truncate(struct inode *inode) struct buffer_head *ext_bh; int i; - pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n", + pr_debug("truncate(inode=%llu, oldsize=%llu, newsize=%llu)\n", inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size); last_blk = 0; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 0bfc7d151dcd..561fc0185e89 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -32,7 +32,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) if (!(inode_state_read_once(inode) & I_NEW)) return inode; - pr_debug("affs_iget(%lu)\n", inode->i_ino); + pr_debug("affs_iget(%llu)\n", inode->i_ino); block = inode->i_ino; bh = affs_bread(sb, block); @@ -171,14 +171,14 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) uid_t uid; gid_t gid; - pr_debug("write_inode(%lu)\n", inode->i_ino); + pr_debug("write_inode(%llu)\n", inode->i_ino); if (!inode->i_nlink) // possibly free block return 0; bh = affs_bread(sb, inode->i_ino); if (!bh) { - affs_error(sb,"write_inode","Cannot read block %lu",inode->i_ino); + affs_error(sb, "write_inode", "Cannot read block %llu", inode->i_ino); return -EIO; } tail = AFFS_TAIL(sb, bh); @@ -219,7 +219,7 @@ affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode = d_inode(dentry); int error; - pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); + pr_debug("notify_change(%llu,0x%x)\n", inode->i_ino, attr->ia_valid); error = 
setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) @@ -260,7 +260,7 @@ void affs_evict_inode(struct inode *inode) { unsigned long cache_page; - pr_debug("evict_inode(ino=%lu, nlink=%u)\n", + pr_debug("evict_inode(ino=%llu, nlink=%u)\n", inode->i_ino, inode->i_nlink); truncate_inode_pages_final(&inode->i_data); @@ -353,7 +353,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 u32 block = 0; int retval; - pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__, + pr_debug("%s(dir=%llu, inode=%llu, \"%pd\", type=%d)\n", __func__, dir->i_ino, inode->i_ino, dentry, type); retval = -EIO; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index f883be50db12..870532192600 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -235,7 +235,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) int affs_unlink(struct inode *dir, struct dentry *dentry) { - pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, + pr_debug("%s(dir=%llu, %llu \"%pd\")\n", __func__, dir->i_ino, d_inode(dentry)->i_ino, dentry); return affs_remove_header(dentry); @@ -249,7 +249,7 @@ affs_create(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode; int error; - pr_debug("%s(%lu,\"%pd\",0%ho)\n", + pr_debug("%s(%llu,\"%pd\",0%ho)\n", __func__, dir->i_ino, dentry, mode); inode = affs_new_inode(dir); @@ -280,7 +280,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode; int error; - pr_debug("%s(%lu,\"%pd\",0%ho)\n", + pr_debug("%s(%llu,\"%pd\",0%ho)\n", __func__, dir->i_ino, dentry, mode); inode = affs_new_inode(dir); @@ -306,7 +306,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, int affs_rmdir(struct inode *dir, struct dentry *dentry) { - pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, + pr_debug("%s(dir=%llu, %llu \"%pd\")\n", __func__, dir->i_ino, d_inode(dentry)->i_ino, dentry); return affs_remove_header(dentry); @@ -323,7 +323,7 @@ affs_symlink(struct mnt_idmap 
*idmap, struct inode *dir, int i, maxlen, error; char c, lc; - pr_debug("%s(%lu,\"%pd\" -> \"%s\")\n", + pr_debug("%s(%llu,\"%pd\" -> \"%s\")\n", __func__, dir->i_ino, dentry, symname); maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1; @@ -395,7 +395,7 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino, + pr_debug("%s(%llu, %llu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino, dentry); return affs_add_entry(dir, inode, dentry, ST_LINKFILE); @@ -511,7 +511,7 @@ int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; - pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, + pr_debug("%s(old=%llu,\"%pd\" to new=%llu,\"%pd\")\n", __func__, old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); if (flags & RENAME_EXCHANGE) diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 094aec8d17b8..de31ed2e71df 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -21,7 +21,7 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio) char c; char lc; - pr_debug("get_link(ino=%lu)\n", inode->i_ino); + pr_debug("get_link(ino=%llu)\n", inode->i_ino); bh = affs_bread(inode->i_sb, inode->i_ino); if (!bh) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 78caef3f1338..aaaa55878ffd 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -148,7 +148,7 @@ static bool afs_dir_check_block(struct afs_vnode *dvnode, size_t progress, union afs_xdr_dir_block *block) { if (block->hdr.magic != AFS_DIR_MAGIC) { - pr_warn("%s(%lx): [%zx] bad magic %04x\n", + pr_warn("%s(%llx): [%zx] bad magic %04x\n", __func__, dvnode->netfs.inode.i_ino, progress, ntohs(block->hdr.magic)); trace_afs_dir_check_failed(dvnode, progress); @@ -214,7 +214,7 @@ static int afs_dir_check(struct afs_vnode *dvnode) */ static int afs_dir_open(struct inode *inode, struct file 
*file) { - _enter("{%lu}", inode->i_ino); + _enter("{%llu}", inode->i_ino); BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); @@ -523,7 +523,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, int retry_limit = 100; int ret; - _enter("{%lu},%llx,,", dir->i_ino, ctx->pos); + _enter("{%llu},%llx,,", dir->i_ino, ctx->pos); do { if (--retry_limit < 0) { @@ -610,7 +610,7 @@ static int afs_do_lookup_one(struct inode *dir, const struct qstr *name, }; int ret; - _enter("{%lu},{%.*s},", dir->i_ino, name->len, name->name); + _enter("{%llu},{%.*s},", dir->i_ino, name->len, name->name); /* search the directory */ ret = afs_dir_iterate(dir, &cookie.ctx, NULL, _dir_version); @@ -783,7 +783,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) long ret; int i; - _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + _enter("{%llu},%p{%pd},", dir->i_ino, dentry, dentry); cookie = kzalloc_obj(struct afs_lookup_cookie); if (!cookie) diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c index d2516e55b5ed..104411c0692f 100644 --- a/fs/afs/dir_search.c +++ b/fs/afs/dir_search.c @@ -194,7 +194,7 @@ int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name, struct afs_dir_iter iter = { .dvnode = dvnode, }; int ret, retry_limit = 3; - _enter("{%lu},,,", dvnode->netfs.inode.i_ino); + _enter("{%llu},,,", dvnode->netfs.inode.i_ino); if (!afs_dir_init_iter(&iter, name)) return -ENOENT; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index aa56e8951e03..1d5e33bc7502 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -59,7 +59,7 @@ static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino) return ERR_PTR(-ENOMEM); } - _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }", + _debug("GOT INODE %p { ino=%llu, vl=%llx, vn=%llx, u=%x }", inode, inode->i_ino, fid.vid, fid.vnode, fid.unique); vnode = AFS_FS_I(inode); diff --git a/fs/afs/inode.c 
b/fs/afs/inode.c index dde1857fcabb..a5173434f786 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -683,7 +683,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, struct key *key; int ret, seq; - _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); + _enter("{ ino=%llu v=%u }", inode->i_ino, inode->i_generation); if (vnode->volume && !(query_flags & AT_STATX_DONT_SYNC) && diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index c53dc551053b..c1e210cec436 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -92,7 +92,7 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",ignore"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) - seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); + seq_printf(m, ",pipe_ino=%llu", file_inode(sbi->pipe)->i_ino); else seq_puts(m, ",pipe_ino=-1"); #endif diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index cecbc92f959a..c12caae9a967 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -140,20 +140,20 @@ befs_get_block(struct inode *inode, sector_t block, int res; ulong disk_off; - befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", - (unsigned long)inode->i_ino, (long)block); + befs_debug(sb, "---> befs_get_block() for inode %llu, block %ld", + inode->i_ino, (long)block); if (create) { befs_error(sb, "befs_get_block() was asked to write to " - "block %ld in inode %lu", (long)block, - (unsigned long)inode->i_ino); + "block %ld in inode %llu", (long)block, + inode->i_ino); return -EPERM; } res = befs_fblock2brun(sb, ds, block, &run); if (res != BEFS_OK) { befs_error(sb, - "<--- %s for inode %lu, block %ld ERROR", - __func__, (unsigned long)inode->i_ino, + "<--- %s for inode %llu, block %ld ERROR", + __func__, inode->i_ino, (long)block); return -EFBIG; } @@ -162,8 +162,8 @@ befs_get_block(struct inode *inode, sector_t block, map_bh(bh_result, inode->i_sb, disk_off); - befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address 
%lu", - __func__, (unsigned long)inode->i_ino, (long)block, + befs_debug(sb, "<--- %s for inode %llu, block %ld, disk address %lu", + __func__, inode->i_ino, (long)block, (unsigned long)disk_off); return 0; @@ -181,7 +181,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) char *utfname; const char *name = dentry->d_name.name; - befs_debug(sb, "---> %s name %pd inode %ld", __func__, + befs_debug(sb, "---> %s name %pd inode %llu", __func__, dentry, dir->i_ino); /* Convert to UTF-8 */ @@ -224,7 +224,7 @@ befs_readdir(struct file *file, struct dir_context *ctx) size_t keysize; char keybuf[BEFS_NAME_LEN + 1]; - befs_debug(sb, "---> %s name %pD, inode %ld, ctx->pos %lld", + befs_debug(sb, "---> %s name %pD, inode %llu, ctx->pos %lld", __func__, file, inode->i_ino, ctx->pos); while (1) { @@ -233,7 +233,7 @@ befs_readdir(struct file *file, struct dir_context *ctx) if (result == BEFS_ERR) { befs_debug(sb, "<--- %s ERROR", __func__); - befs_error(sb, "IO error reading %pD (inode %lu)", + befs_error(sb, "IO error reading %pD (inode %llu)", file, inode->i_ino); return -EIO; @@ -324,7 +324,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) bh = sb_bread(sb, inode->i_ino); if (!bh) { befs_error(sb, "unable to read inode block - " - "inode = %lu", inode->i_ino); + "inode = %llu", inode->i_ino); goto unacquire_none; } @@ -333,7 +333,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) befs_dump_inode(sb, raw_inode); if (befs_check_inode(sb, raw_inode, inode->i_ino) != BEFS_OK) { - befs_error(sb, "Bad inode: %lu", inode->i_ino); + befs_error(sb, "Bad inode: %llu", inode->i_ino); goto unacquire_bh; } @@ -407,7 +407,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &simple_symlink_inode_operations; } } else { - befs_error(sb, "Inode %lu is not a regular file, " + befs_error(sb, "Inode %llu is not a regular file, " "directory or symlink. THAT IS WRONG! 
BeFS has no " "on disk special files", inode->i_ino); goto unacquire_bh; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index c375e22c4c0c..481514db4eae 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -35,7 +35,7 @@ static int bfs_readdir(struct file *f, struct dir_context *ctx) int block; if (ctx->pos & (BFS_DIRENT_SIZE - 1)) { - printf("Bad f_pos=%08lx for %s:%08lx\n", + printf("Bad f_pos=%08lx for %s:%08llx\n", (unsigned long)ctx->pos, dir->i_sb->s_id, dir->i_ino); return -EINVAL; @@ -180,7 +180,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) goto out_brelse; if (!inode->i_nlink) { - printf("unlinking non-existent file %s:%lu (nlink=%d)\n", + printf("unlinking non-existent file %s:%llu (nlink=%d)\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink); set_nlink(inode, 1); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index eaf47851c65f..d879b80a0bed 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -93,7 +93,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres, object = cachefiles_cres_object(cres); file = cachefiles_cres_file(cres); - _enter("%pD,%li,%llx,%zx/%llx", + _enter("%pD,%llu,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, i_size_read(file_inode(file))); @@ -214,7 +214,7 @@ static int cachefiles_query_occupancy(struct netfs_cache_resources *cres, file = cachefiles_cres_file(cres); granularity = max_t(size_t, object->volume->cache->bsize, granularity); - _enter("%pD,%li,%llx,%zx/%llx", + _enter("%pD,%llu,%llx,%zx/%llx", file, file_inode(file)->i_ino, start, len, i_size_read(file_inode(file))); @@ -294,7 +294,7 @@ int __cachefiles_write(struct cachefiles_object *object, fscache_count_write(); cache = object->volume->cache; - _enter("%pD,%li,%llx,%zx/%llx", + _enter("%pD,%llu,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, i_size_read(file_inode(file))); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index e5ec90dccc27..4fdf7687aacb 100644 --- a/fs/cachefiles/namei.c +++ 
b/fs/cachefiles/namei.c @@ -147,7 +147,7 @@ retry: } ASSERT(d_backing_inode(subdir)); - _debug("mkdir -> %pd{ino=%lu}", + _debug("mkdir -> %pd{ino=%llu}", subdir, d_backing_inode(subdir)->i_ino); if (_is_new) *_is_new = true; @@ -158,7 +158,7 @@ retry: end_creating_keep(subdir); if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { - pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + pr_notice("cachefiles: Inode already in use: %pd (B=%llx)\n", subdir, d_inode(subdir)->i_ino); goto mark_error; } @@ -183,7 +183,7 @@ retry: !d_backing_inode(subdir)->i_op->unlink) goto check_error; - _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); + _leave(" = [%llu]", d_backing_inode(subdir)->i_ino); return subdir; check_error: @@ -529,7 +529,7 @@ static bool cachefiles_create_file(struct cachefiles_object *object) set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags); set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); - _debug("create -> %pD{ino=%lu}", file, file_inode(file)->i_ino); + _debug("create -> %pD{ino=%llu}", file, file_inode(file)->i_ino); object->file = file; return true; } @@ -549,7 +549,7 @@ static bool cachefiles_open_file(struct cachefiles_object *object, _enter("%pd", dentry); if (!cachefiles_mark_inode_in_use(object, d_inode(dentry))) { - pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + pr_notice("cachefiles: Inode already in use: %pd (B=%llx)\n", dentry, d_inode(dentry)->i_ino); return false; } @@ -657,7 +657,7 @@ bool cachefiles_look_up_object(struct cachefiles_object *object) if (!ret) return false; - _leave(" = t [%lu]", file_inode(object->file)->i_ino); + _leave(" = t [%llu]", file_inode(object->file)->i_ino); return true; new_file: diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 52383b1d0ba6..f8ae78b3f7b6 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -179,7 +179,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, ret = 0; else if (ret != -ENOMEM) 
cachefiles_io_error(cache, - "Can't remove xattr from %lu" + "Can't remove xattr from %llu" " (error %d)", d_backing_inode(dentry)->i_ino, -ret); } diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index f3de43ccb470..64d240759277 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -272,7 +272,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen) /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */ WARN_ON(elen > 240); if (dir != parent) // leading _ is already there; append _ - elen += 1 + sprintf(p + elen, "_%ld", dir->i_ino); + elen += 1 + sprintf(p + elen, "_%llu", dir->i_ino); out: kfree(cryptbuf); @@ -377,7 +377,7 @@ int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, if (!ret && (dir != fname->dir)) { char tmp_buf[BASE64_CHARS(NAME_MAX)]; - name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", + name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%llu", oname->len, oname->name, dir->i_ino); memcpy(oname->name, tmp_buf, name_len); oname->len = name_len; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index c64b8cd81568..d6b9fc3cc1ca 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -533,7 +533,7 @@ int coda_revalidate_inode(struct inode *inode) coda_vattr_to_iattr(inode, &attr); if ((old_mode & S_IFMT) != (inode->i_mode & S_IFMT)) { - pr_warn("inode %ld, fid %s changed type!\n", + pr_warn("inode %llu, fid %s changed type!\n", inode->i_ino, coda_f2s(&(cii->c_fid))); } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index ad1654f3adf8..40b43866e6a5 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -257,7 +257,7 @@ static int coda_fill_super(struct super_block *sb, struct fs_context *fc) goto error; } - pr_info("%s: rootinode is %ld dev %s\n", + pr_info("%s: rootinode is %llu dev %s\n", __func__, root->i_ino, root->i_sb->s_id); sb->s_root = d_make_root(root); if (!sb->s_root) { diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index e0ba9cd640dc..4edbfccd0bbe 100644 --- 
a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -125,7 +125,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, old_decode_dev(cramfs_inode->size)); break; default: - printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", + printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %llu.\n", inode->i_mode, inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 07f9cbfe3ea4..570a2231c945 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -365,7 +365,7 @@ void fscrypt_msg(const struct inode *inode, const char *level, vaf.fmt = fmt; vaf.va = &args; if (inode && inode->i_ino) - printk("%sfscrypt (%s, inode %lu): %pV\n", + printk("%sfscrypt (%s, inode %llu): %pV\n", level, inode->i_sb->s_id, inode->i_ino, &vaf); else if (inode) printk("%sfscrypt (%s): %pV\n", level, inode->i_sb->s_id, &vaf); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index b97de0d1430f..a7a8a3f581a0 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -62,7 +62,7 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) dentry_parent = dget_parent(dentry); if (!fscrypt_has_permitted_context(d_inode(dentry_parent), inode)) { fscrypt_warn(inode, - "Inconsistent encryption context (parent directory: %lu)", + "Inconsistent encryption context (parent directory: %llu)", d_inode(dentry_parent)->i_ino); err = -EPERM; } diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 9ec6e5ef0947..be8e6e8011f2 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -969,8 +969,8 @@ static int check_for_busy_inodes(struct super_block *sb, { struct list_head *pos; size_t busy_count = 0; - unsigned long ino; char ino_str[50] = ""; + u64 ino; spin_lock(&mk->mk_decrypted_inodes_lock); @@ -994,7 +994,7 @@ static int check_for_busy_inodes(struct super_block *sb, /* If the inode is currently being created, ino may still be 0. 
*/ if (ino) - snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino); + snprintf(ino_str, sizeof(ino_str), ", including ino %llu", ino); fscrypt_warn(NULL, "%s: %zu inode(s) still busy after removing key with %s %*phN%s", diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 40fa05688d3a..df58ca4a5e3c 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -91,7 +91,7 @@ select_encryption_mode(const union fscrypt_policy *policy, if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_fnames_mode(policy)]; - WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %llu, which is not encryptable (file type %d)\n", inode->i_ino, (inode->i_mode & S_IFMT)); return ERR_PTR(-EINVAL); } diff --git a/fs/dcache.c b/fs/dcache.c index 7ba1801d8132..0a4ffda07360 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1637,11 +1637,11 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; - WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " + WARN(1, "BUG: Dentry %p{i=%llx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? 
- dentry->d_inode->i_ino : 0UL, + dentry->d_inode->i_ino : (u64)0, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 3b59346d68c5..f25c9a49e251 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1313,7 +1313,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " - "file header region or xattr region, inode %lu\n", + "file header region or xattr region, inode %llu\n", ecryptfs_inode->i_ino); rc = -EINVAL; goto out; @@ -1323,7 +1323,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) ECRYPTFS_DONT_VALIDATE_HEADER_SIZE); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " - "file xattr region either, inode %lu\n", + "file xattr region either, inode %llu\n", ecryptfs_inode->i_ino); rc = -EINVAL; } @@ -1335,7 +1335,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) "crypto metadata only in the extended attribute " "region, but eCryptfs was mounted without " "xattr support enabled. 
eCryptfs will not treat " - "this like an encrypted file, inode %lu\n", + "this like an encrypted file, inode %llu\n", ecryptfs_inode->i_ino); rc = -EINVAL; } diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 7929411837cf..49b0fbe0428a 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -253,7 +253,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) if (rc) goto out_put; ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = " - "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, + "[0x%.16llx] size: [0x%.16llx]\n", inode, inode->i_ino, (unsigned long long)i_size_read(inode)); goto out; out_put: diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 28407578f83a..4b132729e638 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -132,7 +132,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) for(i = 0; i < EFS_DIRECTEXTENTS; i++) { extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { - pr_warn("extent %d has bad magic number in inode %lu\n", + pr_warn("extent %d has bad magic number in inode %llu\n", i, inode->i_ino); brelse(bh); goto read_inode_error; @@ -140,7 +140,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) } brelse(bh); - pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n", + pr_debug("efs_iget(): inode %llu, extents %d, mode %o\n", inode->i_ino, in->numextents, inode->i_mode); switch (inode->i_mode & S_IFMT) { case S_IFDIR: @@ -171,7 +171,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) return inode; read_inode_error: - pr_warn("failed to read inode %lu\n", inode->i_ino); + pr_warn("failed to read inode %llu\n", inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a8c278c50083..a1731dafb1e1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1080,7 +1080,7 @@ static void ep_show_fdinfo(struct seq_file *m, 
struct file *f) struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " - " pos:%lli ino:%lx sdev:%x\n", + " pos:%lli ino:%llx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 6c9be60a3e48..5c3183ce350e 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -130,12 +130,12 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, parent = mnt->mnt_sb->s_export_op->get_parent(dentry); if (IS_ERR(parent)) { - dprintk("get_parent of %lu failed, err %ld\n", + dprintk("get_parent of %llu failed, err %ld\n", dentry->d_inode->i_ino, PTR_ERR(parent)); return parent; } - dprintk("%s: find name of %lu in %lu\n", __func__, + dprintk("%s: find name of %llu in %llu\n", __func__, dentry->d_inode->i_ino, parent->d_inode->i_ino); err = exportfs_get_name(mnt, parent, nbuf, dentry); if (err == -ENOENT) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 395fc36c089b..278d4be8ecbe 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -141,7 +141,7 @@ out: Ebadsize: if (!quiet) ext2_error(sb, __func__, - "size of directory #%lu is not a multiple " + "size of directory #%llu is not a multiple " "of chunk size", dir->i_ino); goto fail; Eshort: @@ -160,7 +160,7 @@ Einumber: error = "inode out of bounds"; bad_entry: if (!quiet) - ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " + ext2_error(sb, __func__, "bad entry in directory #%llu: : %s - " "offset=%llu, inode=%lu, rec_len=%d, name_len=%d", dir->i_ino, error, folio_pos(folio) + offs, (unsigned long) le32_to_cpu(p->inode), @@ -170,7 +170,7 @@ Eend: if (!quiet) { p = (ext2_dirent *)(kaddr + offs); ext2_error(sb, "ext2_check_folio", - "entry in directory #%lu spans the page boundary" + "entry in directory #%llu spans the page boundary" "offset=%llu, inode=%lu", dir->i_ino, folio_pos(folio) + offs, (unsigned long) le32_to_cpu(p->inode)); @@ -281,7 +281,7 @@ 
ext2_readdir(struct file *file, struct dir_context *ctx) if (IS_ERR(kaddr)) { ext2_error(sb, __func__, - "bad page in #%lu", + "bad page in #%llu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return PTR_ERR(kaddr); @@ -383,7 +383,7 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, /* next folio is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { ext2_error(dir->i_sb, __func__, - "dir %lu size %lld exceeds block count %llu", + "dir %llu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, (unsigned long long)dir->i_blocks); goto out; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index fdf63e9c6e7c..bf21b57cf98c 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -169,9 +169,10 @@ static void ext2_preread_inode(struct inode *inode) unsigned long block_group; unsigned long offset; unsigned long block; + unsigned int ino = inode->i_ino; struct ext2_group_desc * gdp; - block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); + block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL); if (gdp == NULL) return; @@ -179,7 +180,7 @@ static void ext2_preread_inode(struct inode *inode) /* * Figure out the offset within the block group inode table */ - offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) * + offset = ((ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) * EXT2_INODE_SIZE(inode->i_sb); block = le32_to_cpu(gdp->bg_inode_table) + (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb)); @@ -381,7 +382,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent) * * So add our directory's i_ino into the starting point for the hash. 
*/ - group = (group + parent->i_ino) % ngroups; + group = (group + (unsigned int)parent->i_ino) % ngroups; /* * Use a quadratic hash to find a group with a free inode and some @@ -589,7 +590,7 @@ got: goto fail_free_drop; mark_inode_dirty(inode); - ext2_debug("allocating inode %lu\n", inode->i_ino); + ext2_debug("allocating inode %llu\n", inode->i_ino); ext2_preread_inode(inode); return inode; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index dbfe9098a124..45286c0c3b6b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1152,7 +1152,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de */ if (!bh) { ext2_error(inode->i_sb, "ext2_free_branches", - "Read failure, inode=%ld, block=%ld", + "Read failure, inode=%llu, block=%ld", inode->i_ino, nr); continue; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index c885dcc3bd0d..14ada70db36a 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -227,7 +227,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, if (!ext2_xattr_header_valid(HDR(bh))) { bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", - "inode %ld: bad block %d", inode->i_ino, + "inode %llu: bad block %d", inode->i_ino, EXT2_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -313,7 +313,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (!ext2_xattr_header_valid(HDR(bh))) { bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", - "inode %ld: bad block %d", inode->i_ino, + "inode %llu: bad block %d", inode->i_ino, EXT2_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -454,7 +454,7 @@ ext2_xattr_set(struct inode *inode, int name_index, const char *name, if (!ext2_xattr_header_valid(header)) { bad_block: ext2_error(sb, "ext2_xattr_set", - "inode %ld: bad block %d", inode->i_ino, + "inode %llu: bad block %d", inode->i_ino, EXT2_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -833,7 +833,7 @@ ext2_xattr_delete_inode(struct inode *inode) if 
(!ext2_data_block_valid(sbi, EXT2_I(inode)->i_file_acl, 1)) { ext2_error(inode->i_sb, "ext2_xattr_delete_inode", - "inode %ld: xattr block %d is out of data blocks range", + "inode %llu: xattr block %d is out of data blocks range", inode->i_ino, EXT2_I(inode)->i_file_acl); goto cleanup; } @@ -841,14 +841,14 @@ ext2_xattr_delete_inode(struct inode *inode) bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_delete_inode", - "inode %ld: block %d read error", inode->i_ino, + "inode %llu: block %d read error", inode->i_ino, EXT2_I(inode)->i_file_acl); goto cleanup; } ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); if (!ext2_xattr_header_valid(HDR(bh))) { ext2_error(inode->i_sb, "ext2_xattr_delete_inode", - "inode %ld: bad block %d", inode->i_ino, + "inode %llu: bad block %d", inode->i_ino, EXT2_I(inode)->i_file_acl); goto cleanup; } @@ -952,7 +952,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) bh = sb_bread(inode->i_sb, ce->e_value); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_cache_find", - "inode %ld: block %ld read error", + "inode %llu: block %ld read error", inode->i_ino, (unsigned long) ce->e_value); } else { lock_buffer(bh); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 28b2a3deb954..17edd678fa87 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -535,7 +535,7 @@ static int call_filldir(struct file *file, struct dir_context *ctx, struct super_block *sb = inode->i_sb; if (!fname) { - ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: comm %s: " "called with null fname?!?", __func__, __LINE__, inode->i_ino, current->comm); return 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 293f698b7042..85e6c2b543a8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -92,7 +92,7 @@ */ #ifdef CONFIG_EXT4_DEBUG #define ext_debug(ino, fmt, ...) 
\ - pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \ + pr_debug("[%s/%d] EXT4-fs (%s): ino %llu: (%s, %d): %s:" fmt, \ current->comm, task_pid_nr(current), \ ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ __func__, ##__VA_ARGS__) @@ -3229,7 +3229,7 @@ extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, extern __printf(7, 8) void __ext4_grp_locked_error(const char *, unsigned int, struct super_block *, ext4_group_t, - unsigned long, ext4_fsblk_t, + u64, ext4_fsblk_t, const char *, ...); #define EXT4_ERROR_INODE(inode, fmt, a...) \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ae3804f36535..042e1555a674 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4603,7 +4603,7 @@ retry: } ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { - ext4_debug("inode #%lu: block %u: len %u: " + ext4_debug("inode #%llu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); @@ -4955,7 +4955,7 @@ int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, ret = ext4_map_blocks(handle, inode, &map, flags); if (ret != max_blocks) ext4_msg(inode->i_sb, KERN_INFO, - "inode #%lu: block %u: len %u: " + "inode #%llu: block %u: len %u: " "split block mapping found for atomic write, " "ret = %d", inode->i_ino, map.m_lblk, @@ -4974,7 +4974,7 @@ int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, if (ret <= 0 || ret2) ext4_warning(inode->i_sb, - "inode #%lu: block %u: len %u: " + "inode #%llu: block %u: len %u: " "returned %d or %d", inode->i_ino, map.m_lblk, map.m_len, ret, ret2); @@ -5031,7 +5031,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, - "inode #%lu: block %u: len %u: " + "inode #%llu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); diff --git a/fs/ext4/extents_status.c 
b/fs/ext4/extents_status.c index a1538bac51c6..6e4a191e8219 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -214,7 +214,7 @@ static void ext4_es_print_tree(struct inode *inode) struct ext4_es_tree *tree; struct rb_node *node; - printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); + printk(KERN_DEBUG "status extents for inode %llu:", inode->i_ino); tree = &EXT4_I(inode)->i_es_tree; node = rb_first(&tree->root); while (node) { @@ -703,7 +703,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { if (in_range(es->es_lblk, ee_block, ee_len)) { pr_warn("ES insert assertion failed for " - "inode: %lu we can find an extent " + "inode: %llu we can find an extent " "at block [%d/%d/%llu/%c], but we " "want to add a delayed/hole extent " "[%d/%d/%llu/%x]\n", @@ -721,7 +721,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, */ if (es->es_lblk < ee_block || ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { - pr_warn("ES insert assertion failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %llu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, @@ -731,7 +731,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, } if (ee_status ^ es_status) { - pr_warn("ES insert assertion failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %llu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, @@ -744,7 +744,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, * that we don't want to add an written/unwritten extent. 
*/ if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { - pr_warn("ES insert assertion failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %llu " "can't find an extent at block %d but we want " "to add a written/unwritten extent " "[%d/%d/%llu/%x]\n", inode->i_ino, @@ -779,7 +779,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, * We want to add a delayed/hole extent but this * block has been allocated. */ - pr_warn("ES insert assertion failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %llu " "We can find blocks but we want to add a " "delayed/hole extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, @@ -788,13 +788,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, } else if (ext4_es_is_written(es)) { if (retval != es->es_len) { pr_warn("ES insert assertion failed for " - "inode: %lu retval %d != es_len %d\n", + "inode: %llu retval %d != es_len %d\n", inode->i_ino, retval, es->es_len); return; } if (map.m_pblk != ext4_es_pblock(es)) { pr_warn("ES insert assertion failed for " - "inode: %lu m_pblk %llu != " + "inode: %llu m_pblk %llu != " "es_pblk %llu\n", inode->i_ino, map.m_pblk, ext4_es_pblock(es)); @@ -809,7 +809,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, } } else if (retval == 0) { if (ext4_es_is_written(es)) { - pr_warn("ES insert assertion failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %llu " "We can't find the block but we want to add " "a written extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, @@ -919,7 +919,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n", + es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %llu\n", lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino); if (!len) @@ -1631,7 +1631,7 @@ void 
ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + es_debug("remove [%u/%u) from extent status tree of inode %llu\n", lblk, len, inode->i_ino); if (!len) @@ -1821,7 +1821,7 @@ int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); if (inode_cnt) seq_printf(seq, - "maximum:\n %lu inode (%u objects, %u reclaimable)\n" + "maximum:\n %llu inode (%u objects, %u reclaimable)\n" " %llu us max scan time\n", max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, div_u64(es_stats->es_stats_max_scan_time, 1000)); @@ -1998,7 +1998,7 @@ static void ext4_print_pending_tree(struct inode *inode) struct rb_node *node; struct pending_reservation *pr; - printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); + printk(KERN_DEBUG "pending reservations for inode %llu:", inode->i_ino); tree = &EXT4_I(inode)->i_pending_tree; node = rb_first(&tree->root); while (node) { @@ -2214,7 +2214,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n", + es_debug("add [%u/%u) delayed to extent status tree of inode %llu\n", lblk, len, inode->i_ino); if (!len) return; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index f575751f1cae..379fb66dedbc 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -616,7 +616,7 @@ static int __track_range(handle_t *handle, struct inode *inode, void *arg, (struct __track_range_args *)arg; if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { - ext4_debug("Special inode %ld being modified\n", inode->i_ino); + ext4_debug("Special inode %llu being modified\n", inode->i_ino); return -ECANCELED; } @@ -914,7 +914,7 @@ static int 
ext4_fc_write_inode_data(struct inode *inode, u32 *crc) spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; - ext4_debug("will try writing %d to %d for inode %ld\n", + ext4_debug("will try writing %d to %d for inode %llu\n", cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { @@ -1792,7 +1792,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, cur = start; remaining = len; - ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", + ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %llu\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); @@ -1903,7 +1903,7 @@ ext4_fc_replay_del_range(struct super_block *sb, if (ret) goto out; - ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", + ext4_debug("DEL_RANGE, inode %llu, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b20a1bf866ab..628a74b2bbe6 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -253,13 +253,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) return; } if (icount_read(inode) > 1) { - ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: count=%d", __func__, __LINE__, inode->i_ino, icount_read(inode)); return; } if (inode->i_nlink) { - ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: nlink=%d\n", __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } @@ -631,7 +631,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * * So add our directory's i_ino into the starting point for the hash. */ - *group = (*group + parent->i_ino) % ngroups; + *group = (*group + (unsigned int)parent->i_ino) % ngroups; /* * Use a quadratic hash to find a group with a free inode and some free @@ -1275,7 +1275,7 @@ got: * twice. 
*/ err = -EIO; - ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + ext4_error(sb, "failed to insert inode %llu: doubly allocated?", inode->i_ino); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); @@ -1344,7 +1344,7 @@ got: goto fail_free_drop; } - ext4_debug("allocating inode %lu\n", inode->i_ino); + ext4_debug("allocating inode %llu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); brelse(inode_bitmap_bh); return ret; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index da76353b3a57..5aec759eed70 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -102,7 +102,7 @@ static int ext4_block_to_path(struct inode *inode, offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { - ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + ext4_warning(inode->i_sb, "block %lu > max in inode %llu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1f6bc05593df..f846fcb7db24 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -119,7 +119,7 @@ int ext4_get_max_inline_size(struct inode *inode) error = ext4_get_inode_loc(inode, &iloc); if (error) { ext4_error_inode_err(inode, __func__, __LINE__, 0, -error, - "can't get inode location %lu", + "can't get inode location %llu", inode->i_ino); return 0; } @@ -512,7 +512,7 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) BUG_ON(folio->index); if (!EXT4_I(inode)->i_inline_off) { - ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", + ext4_warning(inode->i_sb, "inode %llu doesn't have inline data.", inode->i_ino); goto out; } @@ -934,7 +934,7 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, struct ext4_dir_entry_2 *de = inline_start; void *dlimit = inline_start + inline_size; - trace_printk("inode %lu\n", dir->i_ino); + trace_printk("inode %llu\n", dir->i_ino); offset = 0; while ((void *)de < dlimit) { de_len = 
ext4_rec_len_from_disk(de->rec_len, inline_size); @@ -1071,7 +1071,7 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, ret = ext4_create_inline_data(handle, inode, inline_size); if (ret) { ext4_msg(inode->i_sb, KERN_EMERG, - "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", + "error restoring inline_data for inode -- potential data loss! (inode %llu, error %d)", inode->i_ino, ret); return; } @@ -1740,7 +1740,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE_ERR(dir, -err, - "error %d getting inode %lu block", + "error %d getting inode %llu block", err, dir->i_ino); return false; } @@ -1755,7 +1755,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; if (!le32_to_cpu(de->inode)) { ext4_warning(dir->i_sb, - "bad inline directory (dir #%lu) - no `..'", + "bad inline directory (dir #%llu) - no `..'", dir->i_ino); goto out; } @@ -1769,7 +1769,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) iloc.bh, inline_pos, inline_size, offset)) { ext4_warning(dir->i_sb, - "bad inline directory (dir #%lu) - " + "bad inline directory (dir #%llu) - " "inode %u, rec_len %u, name_len %d" "inline size %d", dir->i_ino, le32_to_cpu(de->inode), diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 396dc3a5d16b..d50f31124a78 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -262,7 +262,7 @@ void ext4_evict_inode(struct inode *inode) err = ext4_truncate(inode); if (err) { ext4_error_err(inode->i_sb, -err, - "couldn't truncate inode %lu (err %d)", + "couldn't truncate inode %llu (err %d)", inode->i_ino, err); goto stop_handle; } @@ -342,7 +342,7 @@ void ext4_da_update_reserve_space(struct inode *inode, spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > 
ei->i_reserved_data_blocks)) { - ext4_warning(inode->i_sb, "%s: ino %lu, used %d " + ext4_warning(inode->i_sb, "%s: ino %llu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); @@ -475,7 +475,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { - printk("ES cache assertion failed for inode: %lu " + printk("ES cache assertion failed for inode: %llu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, @@ -515,7 +515,7 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, if (unlikely(retval != map2.m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", + "%llu: retval %d != map->m_len %d", inode->i_ino, retval, map2.m_len); WARN_ON(1); } @@ -563,7 +563,7 @@ int ext4_map_query_blocks(handle_t *handle, struct inode *inode, if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", + "%llu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } @@ -630,7 +630,7 @@ int ext4_map_create_blocks(handle_t *handle, struct inode *inode, if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, - "ES len assertion failed for inode %lu: " + "ES len assertion failed for inode %llu: " "retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); @@ -937,7 +937,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, { int ret = 0; - ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", + ext4_debug("ext4_get_block_unwritten: inode %llu, create flag %d\n", inode->i_ino, create); ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); @@ -1659,7 +1659,7 @@ void ext4_da_release_space(struct inode 
*inode, int to_free) * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " - "ino %lu, to_free %d with only %d reserved " + "ino %llu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); @@ -2491,7 +2491,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " - "inode %lu at logical offset %llu with" + "inode %llu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, @@ -2535,7 +2535,7 @@ update_disksize: err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, - "Failed to mark inode %lu dirty", + "Failed to mark inode %llu dirty", inode->i_ino); } if (!err) @@ -2909,7 +2909,7 @@ retry: if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " - "%ld pages, ino %lu; err %d", __func__, + "%ld pages, ino %llu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd->io_submit.io_end); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 3ae9cb50a0c0..1d0c3d4bdf47 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -477,7 +477,7 @@ static long swap_inode_boot_loader(struct super_block *sb, if (err < 0) { /* No need to update quota information. */ ext4_warning(inode->i_sb, - "couldn't mark inode #%lu dirty (err %d)", + "couldn't mark inode #%llu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); @@ -493,7 +493,7 @@ static long swap_inode_boot_loader(struct super_block *sb, if (err < 0) { /* No need to update quota information. 
*/ ext4_warning(inode_bl->i_sb, - "couldn't mark inode #%lu dirty (err %d)", + "couldn't mark inode #%llu dirty (err %d)", inode_bl->i_ino, err); goto revert; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 20e9fdaf4301..9e8041ac5623 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2266,7 +2266,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + int hash = (unsigned int)ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); } @@ -3032,7 +3032,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + int hash = (unsigned int)ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]); ac->ac_g_ex.fe_start = -1; @@ -5628,7 +5628,7 @@ void ext4_discard_preallocations(struct inode *inode) if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; - mb_debug(sb, "discard preallocation for inode %lu\n", + mb_debug(sb, "discard preallocation for inode %llu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active)); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 96ab95167bd6..477d43d7e294 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -455,7 +455,7 @@ int ext4_ext_migrate(struct inode *inode) * log, so disable fast commits for this transaction. 
*/ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MIGRATE, handle); - goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + goal = ((((u32)inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index ce1f738dff93..ab17c1d3a7b5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -420,21 +420,21 @@ static int mext_check_validity(struct inode *orig_inode, /* origin and donor should be different inodes */ if (orig_inode == donor_inode) { - ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* origin and donor should belone to the same filesystem */ if (orig_inode->i_sb != donor_inode->i_sb) { - ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* Regular file check */ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { - ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -477,26 +477,26 @@ static int mext_check_validity(struct inode *orig_inode, } if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { - ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, 
donor_inode->i_ino); return -EINVAL; } if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) { - ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EPERM; } /* Ext4 move extent does not support swap files */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -ETXTBSY; } if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EOPNOTSUPP; } @@ -523,7 +523,7 @@ static int mext_check_adjust_range(struct inode *orig_inode, /* Start offset should be same */ if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { - ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -533,7 +533,7 @@ static int mext_check_adjust_range(struct inode *orig_inode, (*len > EXT_MAX_BLOCKS) || (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { - ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %llu, donor %llu]\n", 
EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; @@ -550,7 +550,7 @@ static int mext_check_adjust_range(struct inode *orig_inode, else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { - ext4_debug("ext4 move extent: len should not be 0 [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: len should not be 0 [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c4b5e252af0e..503dc9ffd614 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -144,7 +144,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, - "inode #%lu: lblock %lu: comm %s: " + "inode #%llu: lblock %lu: comm %s: " "error %ld reading directory block", inode->i_ino, (unsigned long)block, current->comm, PTR_ERR(bh)); @@ -841,7 +841,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, indirect = root->info.indirect_levels; if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ext4_warning(dir->i_sb, - "Directory (ino: %lu) htree depth %#06x exceed" + "Directory (ino: %llu) htree depth %#06x exceed" "supported value", dir->i_ino, ext4_dir_htree_level(dir->i_sb)); if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { @@ -1793,7 +1793,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { ext4_warning(inode->i_sb, - "Inconsistent encryption contexts: %lu/%lu", + "Inconsistent encryption contexts: %llu/%llu", dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); @@ -2227,7 +2227,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; - dxtrace(printk(KERN_DEBUG "Creating index: inode 
%lu\n", dir->i_ino)); + dxtrace(printk(KERN_DEBUG "Creating index: inode %llu\n", dir->i_ino)); BUFFER_TRACE(bh, "get_write_access"); retval = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); @@ -2523,7 +2523,7 @@ again: restart = 1; } if (add_level && levels == ext4_dir_htree_level(sb)) { - ext4_warning(sb, "Directory (ino: %lu) index full, " + ext4_warning(sb, "Directory (ino: %llu) index full, " "reach max htree level :%d", dir->i_ino, levels); if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index c0022f0bff87..64ea47624233 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -179,8 +179,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) } else brelse(iloc.bh); - ext4_debug("superblock will point to %lu\n", inode->i_ino); - ext4_debug("orphan inode %lu will point to %d\n", + ext4_debug("superblock will point to %llu\n", inode->i_ino); + ext4_debug("orphan inode %llu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); @@ -249,7 +249,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) } mutex_lock(&sbi->s_orphan_lock); - ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); + ext4_debug("remove inode %llu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); @@ -284,7 +284,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - ext4_debug("orphan inode %lu will point to %u\n", + ext4_debug("orphan inode %llu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { @@ -328,9 +328,9 @@ static void ext4_process_orphan(struct inode *inode, if (inode->i_nlink) { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, - "%s: truncating inode %lu to %lld bytes", + "%s: truncating inode %llu to %lld bytes", __func__, inode->i_ino, inode->i_size); 
- ext4_debug("truncating inode %lu to %lld bytes\n", + ext4_debug("truncating inode %llu to %lld bytes\n", inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); @@ -349,9 +349,9 @@ static void ext4_process_orphan(struct inode *inode, } else { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, - "%s: deleting unreferenced inode %lu", + "%s: deleting unreferenced inode %llu", __func__, inode->i_ino); - ext4_debug("deleting unreferenced inode %lu\n", + ext4_debug("deleting unreferenced inode %llu\n", inode->i_ino); (*nr_orphans)++; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a8c95eee91b7..86011275ad83 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -180,7 +180,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) struct super_block *sb = inode->i_sb; int ret = 0; - ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %llu,list->next 0x%p," "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); @@ -204,7 +204,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) ext4_msg(sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! 
" - "(inode %lu, error %d)", inode->i_ino, ret); + "(inode %llu, error %d)", inode->i_ino, ret); } ext4_clear_io_unwritten_flag(io_end); @@ -221,7 +221,7 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) if (list_empty(head)) return; - ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); + ext4_debug("Dump inode %llu completed io list\n", inode->i_ino); list_for_each_entry(io_end, head, list) { cur = &io_end->list; before = cur->prev; @@ -229,7 +229,7 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) after = cur->next; io_end1 = container_of(after, ext4_io_end_t, list); - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + ext4_debug("io 0x%p from inode %llu,prev 0x%p,next 0x%p\n", io_end, inode->i_ino, io_end0, io_end1); } #endif @@ -366,7 +366,7 @@ static void ext4_end_bio(struct bio *bio) if (bio->bi_status) { struct inode *inode = io_end->inode; - ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %llu " "starting block %llu)", bio->bi_status, inode->i_ino, (unsigned long long) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 43f680c750ae..781c083000c2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -848,12 +848,12 @@ void __ext4_error_inode(struct inode *inode, const char *function, vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " - "inode #%lu: block %llu: comm %s: %pV\n", + "inode #%llu: block %llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " - "inode #%lu: comm %s: %pV\n", + "inode #%llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); @@ -888,13 +888,13 @@ void __ext4_error_file(struct file *file, const char *function, vaf.va = &args; if (block) printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: 
" + "EXT4-fs error (device %s): %s:%d: inode #%llu: " "block %llu: comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, path, &vaf); else printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "EXT4-fs error (device %s): %s:%d: inode #%llu: " "comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, path, &vaf); @@ -1035,14 +1035,14 @@ void __ext4_warning_inode(const struct inode *inode, const char *function, vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " - "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, + "inode #%llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, - unsigned long ino, ext4_fsblk_t block, + u64 ino, ext4_fsblk_t block, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) @@ -1061,7 +1061,7 @@ __acquires(bitlock) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) - printk(KERN_CONT "inode %lu: ", ino); + printk(KERN_CONT "inode %llu: ", ino); if (block) printk(KERN_CONT "block %llu:", (unsigned long long) block); @@ -1170,7 +1170,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); printk(KERN_ERR " " - "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + "inode %s:%llu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, NEXT_ORPHAN(inode)); @@ -1446,7 +1446,7 @@ static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { - pr_warn("%s: inode %ld still in fc list", + pr_warn("%s: inode %llu still in fc list", __func__, inode->i_ino); } 
kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); @@ -1456,7 +1456,7 @@ static void ext4_destroy_inode(struct inode *inode) { if (ext4_inode_orphan_tracked(inode)) { ext4_msg(inode->i_sb, KERN_ERR, - "Inode %lu (%p): inode tracked as orphan!", + "Inode %llu (%p): inode tracked as orphan!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), @@ -1467,7 +1467,7 @@ static void ext4_destroy_inode(struct inode *inode) if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) && WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks)) ext4_msg(inode->i_sb, KERN_ERR, - "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", + "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), EXT4_I(inode)->i_reserved_data_blocks); } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7bf9ba19a89d..60aec4712f7f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -64,7 +64,7 @@ #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, fmt, ...) \ - printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ + printk(KERN_DEBUG "inode %s:%llu: " fmt "\n", \ inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) 
\ printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ @@ -1035,7 +1035,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, ref_count = ext4_xattr_inode_get_ref(ea_inode); if ((ref_count == 0 && ref_change < 0) || (ref_count == U64_MAX && ref_change > 0)) { ext4_error_inode(ea_inode, __func__, __LINE__, 0, - "EA inode %lu ref wraparound: ref_count=%lld ref_change=%d", + "EA inode %llu ref wraparound: ref_count=%lld ref_change=%d", ea_inode->i_ino, ref_count, ref_change); brelse(iloc.bh); ret = -EFSCORRUPTED; @@ -1046,7 +1046,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, if (ref_change > 0) { if (ref_count == 1) { - WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", + WARN_ONCE(ea_inode->i_nlink, "EA inode %llu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); set_nlink(ea_inode, 1); @@ -1055,7 +1055,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, } else { if (ref_count == 0) { WARN_ONCE(ea_inode->i_nlink != 1, - "EA inode %lu i_nlink=%u", + "EA inode %llu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); clear_nlink(ea_inode); @@ -2854,7 +2854,7 @@ shift: cleanup: if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { - ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", + ext4_warning(inode->i_sb, "Unable to expand inode %llu. 
Delete some EAs or run e2fsck.", inode->i_ino); mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 8c76400ba631..0b8be500db65 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -773,7 +773,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); f2fs_info_ratelimited(sbi, - "checksum invalid, nid = %lu, %x vs %x", + "checksum invalid, nid = %llu, %x vs %x", dic->inode->i_ino, provided, calculated); } @@ -932,7 +932,7 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) return false; out: - f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", + f2fs_warn(sbi, "access invalid cluster, ino:%llu, nid:%u, ofs_in_node:%u, reason:%s", dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); set_sbi_flag(sbi, SBI_NEED_FSCK); return true; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f70092e231f0..38802ee2e40d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -368,7 +368,7 @@ start_find_entry: max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { - f2fs_warn(F2FS_I_SB(dir), "Corrupted max_depth of %lu: %u", + f2fs_warn(F2FS_I_SB(dir), "Corrupted max_depth of %llu: %u", dir->i_ino, max_depth); max_depth = MAX_DIR_HASH_DEPTH; f2fs_i_depth_write(dir, max_depth); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 0ed84cc065a7..d73aeef333a2 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -34,7 +34,7 @@ bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio) if (!f2fs_is_valid_blkaddr(sbi, ei.blk, DATA_GENERIC_ENHANCE) || !f2fs_is_valid_blkaddr(sbi, ei.blk + ei.len - 1, DATA_GENERIC_ENHANCE)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", + f2fs_warn(sbi, "%s: inode (ino=%llx) extent info [%u, %u, %u] is 
incorrect, run fsck to fix", __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); return false; @@ -50,14 +50,14 @@ bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio) if (devi == 0) { f2fs_warn(sbi, - "%s: inode (ino=%lx) is an alias of meta device", + "%s: inode (ino=%llx) is an alias of meta device", __func__, inode->i_ino); return false; } if (bdev_is_zoned(FDEV(devi).bdev)) { f2fs_warn(sbi, - "%s: device alias inode (ino=%lx)'s extent info " + "%s: device alias inode (ino=%llx)'s extent info " "[%u, %u, %u] maps to zoned block device", __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); return false; @@ -65,7 +65,7 @@ bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio) return true; } - f2fs_warn(sbi, "%s: device alias inode (ino=%lx)'s extent info " + f2fs_warn(sbi, "%s: device alias inode (ino=%llx)'s extent info " "[%u, %u, %u] is inconsistent w/ any devices", __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); return false; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bb34e864d0ef..760e6d80bbdd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2706,7 +2706,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); if (unlikely(sbi->total_valid_block_count < count)) { - f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%lu, count:%u", + f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%llu, count:%u", sbi->total_valid_block_count, inode->i_ino, count); sbi->total_valid_block_count = 0; set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -2719,7 +2719,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); if (unlikely(inode->i_blocks < sectors)) { - f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu, sectors:%llu", + f2fs_warn(sbi, "Inconsistent i_blocks, ino:%llu, iblocks:%llu, sectors:%llu", inode->i_ino, (unsigned long long)inode->i_blocks, (unsigned long 
long)sectors); @@ -2993,7 +2993,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, dquot_free_inode(inode); } else { if (unlikely(inode->i_blocks == 0)) { - f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu", + f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%llu, iblocks:%llu", inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c8a2f17a8f11..a7957e03ee03 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1917,7 +1917,7 @@ next_alloc: f2fs_up_write(&sbi->pin_sem); err = -ENOSPC; f2fs_warn_ratelimited(sbi, - "ino:%lu, start:%lu, end:%lu, need to trigger GC to " + "ino:%llu, start:%lu, end:%lu, need to trigger GC to " "reclaim enough free segment when checkpoint is enabled", inode->i_ino, pg_start, pg_end); goto out_err; @@ -2307,7 +2307,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) * f2fs_is_atomic_file. 
*/ if (get_dirty_pages(inode)) - f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%llu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) @@ -3494,7 +3494,7 @@ int f2fs_pin_file_control(struct inode *inode, bool inc) return -EINVAL; if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) { - f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials", + f2fs_warn(sbi, "%s: Enable GC = ino %llx after %x GC trials", __func__, inode->i_ino, fi->i_gc_failures); clear_inode_flag(inode, FI_PIN_FILE); return -EAGAIN; @@ -3679,7 +3679,7 @@ static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) { f2fs_warn(F2FS_I_SB(inode), - "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem", + "Can't enable fs-verity on inode %llu: the verity feature is not enabled on this filesystem", inode->i_ino); return -EOPNOTSUPP; } @@ -3950,7 +3950,7 @@ out: } else if (released_blocks && atomic_read(&fi->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " + f2fs_warn(sbi, "%s: partial blocks were released i_ino=%llx " "iblocks=%llu, released=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, inode->i_blocks, @@ -4133,7 +4133,7 @@ unlock_inode: } else if (reserved_blocks && atomic_read(&fi->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: partial blocks were reserved i_ino=%lx " + f2fs_warn(sbi, "%s: partial blocks were reserved i_ino=%llx " "iblocks=%llu, reserved=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, inode->i_blocks, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f46b2673d31f..c0c8a1056d6b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1622,7 +1622,7 @@ next_step: iput(inode); set_sbi_flag(sbi, SBI_NEED_FSCK); 
f2fs_err_ratelimited(sbi, - "inode %lx has both inline_data flag and " + "inode %llu has both inline_data flag and " "data block, nid=%u, ofs_in_node=%u", inode->i_ino, dni.nid, ofs_in_node); continue; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 0a1052d5ee62..2669439b9413 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -176,7 +176,7 @@ int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio) if (unlikely(dn->data_blkaddr != NEW_ADDR)) { f2fs_put_dnode(dn); set_sbi_flag(fio.sbi, SBI_NEED_FSCK); - f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", + f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%llu, i_addr[0]:0x%x, run fsck to fix.", __func__, dn->inode->i_ino, dn->data_blkaddr); f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; @@ -431,7 +431,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct folio *ifolio, if (unlikely(dn.data_blkaddr != NEW_ADDR)) { f2fs_put_dnode(&dn); set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); - f2fs_warn(F2FS_F_SB(folio), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", + f2fs_warn(F2FS_F_SB(folio), "%s: corrupted inline inode ino=%llu, i_addr[0]:0x%x, run fsck to fix.", __func__, dir->i_ino, dn.data_blkaddr); f2fs_handle_error(F2FS_F_SB(folio), ERROR_INVALID_BLKADDR); err = -EFSCORRUPTED; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e0f850b3f0c3..f27198d6695b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -203,14 +203,14 @@ static bool sanity_check_compress_inode(struct inode *inode, if (ri->i_compress_algorithm >= COMPRESS_MAX) { f2fs_warn(sbi, - "%s: inode (ino=%lx) has unsupported compress algorithm: %u, run fsck to fix", + "%s: inode (ino=%llx) has unsupported compress algorithm: %u, run fsck to fix", __func__, inode->i_ino, ri->i_compress_algorithm); return false; } if (le64_to_cpu(ri->i_compr_blocks) > SECTOR_TO_BLOCK(inode->i_blocks)) { f2fs_warn(sbi, - "%s: inode (ino=%lx) 
has inconsistent i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", + "%s: inode (ino=%llx) has inconsistent i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", __func__, inode->i_ino, le64_to_cpu(ri->i_compr_blocks), SECTOR_TO_BLOCK(inode->i_blocks)); return false; @@ -218,7 +218,7 @@ static bool sanity_check_compress_inode(struct inode *inode, if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { f2fs_warn(sbi, - "%s: inode (ino=%lx) has unsupported log cluster size: %u, run fsck to fix", + "%s: inode (ino=%llx) has unsupported log cluster size: %u, run fsck to fix", __func__, inode->i_ino, ri->i_log_cluster_size); return false; } @@ -262,7 +262,7 @@ static bool sanity_check_compress_inode(struct inode *inode, return true; err_level: - f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported compress level: %u, run fsck to fix", + f2fs_warn(sbi, "%s: inode (ino=%llx) has unsupported compress level: %u, run fsck to fix", __func__, inode->i_ino, clevel); return false; } @@ -276,40 +276,40 @@ static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) iblocks = le64_to_cpu(F2FS_INODE(node_folio)->i_blocks); if (!iblocks) { - f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%llx iblocks=%llu, run fsck to fix.", __func__, inode->i_ino, iblocks); return false; } if (ino_of_node(node_folio) != nid_of_node(node_folio)) { - f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%llx, ino,nid: [%u, %u] run fsck to fix.", __func__, inode->i_ino, ino_of_node(node_folio), nid_of_node(node_folio)); return false; } if (ino_of_node(node_folio) == fi->i_xattr_nid) { - f2fs_warn(sbi, "%s: corrupted inode i_ino=%lx, xnid=%x, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode i_ino=%llx, xnid=%x, run fsck to fix.", __func__, 
inode->i_ino, fi->i_xattr_nid); return false; } if (S_ISDIR(inode->i_mode) && unlikely(inode->i_nlink == 1)) { - f2fs_warn(sbi, "%s: directory inode (ino=%lx) has a single i_nlink", + f2fs_warn(sbi, "%s: directory inode (ino=%llx) has a single i_nlink", __func__, inode->i_ino); return false; } if (f2fs_has_extra_attr(inode)) { if (!f2fs_sb_has_extra_attr(sbi)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", + f2fs_warn(sbi, "%s: inode (ino=%llx) is with extra_attr, but extra_attr feature is off", __func__, inode->i_ino); return false; } if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || fi->i_extra_isize < F2FS_MIN_EXTRA_ATTR_SIZE || fi->i_extra_isize % sizeof(__le32)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu", + f2fs_warn(sbi, "%s: inode (ino=%llx) has corrupted i_extra_isize: %d, max: %zu", __func__, inode->i_ino, fi->i_extra_isize, F2FS_TOTAL_EXTRA_ATTR_SIZE); return false; @@ -327,7 +327,7 @@ static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) f2fs_has_inline_xattr(inode) && (fi->i_inline_xattr_size < MIN_INLINE_XATTR_SIZE || fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, min: %zu, max: %lu", + f2fs_warn(sbi, "%s: inode (ino=%llx) has corrupted i_inline_xattr_size: %d, min: %zu, max: %lu", __func__, inode->i_ino, fi->i_inline_xattr_size, MIN_INLINE_XATTR_SIZE, MAX_INLINE_XATTR_SIZE); return false; @@ -335,64 +335,64 @@ static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) if (!f2fs_sb_has_extra_attr(sbi)) { if (f2fs_sb_has_project_quota(sbi)) { - f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode ino=%llx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_PRJQUOTA); return false; } if (f2fs_sb_has_inode_chksum(sbi)) { - f2fs_warn(sbi, "%s: 
corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode ino=%llx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_INODE_CHKSUM); return false; } if (f2fs_sb_has_flexible_inline_xattr(sbi)) { - f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode ino=%llx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); return false; } if (f2fs_sb_has_inode_crtime(sbi)) { - f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode ino=%llx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_INODE_CRTIME); return false; } if (f2fs_sb_has_compression(sbi)) { - f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode ino=%llx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_COMPRESSION); return false; } } if (f2fs_sanity_check_inline_data(inode, node_folio)) { - f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", + f2fs_warn(sbi, "%s: inode (ino=%llx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; } if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) { - f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_dentry, run fsck to fix", + f2fs_warn(sbi, "%s: inode (ino=%llx, mode=%u) should not have inline_dentry, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; } if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off", + f2fs_warn(sbi, "%s: inode (ino=%llx) has casefold flag, but casefold feature is off", __func__, inode->i_ino); return 
false; } if (fi->i_xattr_nid && f2fs_check_nid_range(sbi, fi->i_xattr_nid)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_xattr_nid: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: inode (ino=%llx) has corrupted i_xattr_nid: %u, run fsck to fix.", __func__, inode->i_ino, fi->i_xattr_nid); return false; } if (IS_DEVICE_ALIASING(inode)) { if (!f2fs_sb_has_device_alias(sbi)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but the feature is off", + f2fs_warn(sbi, "%s: inode (ino=%llx) has device alias flag, but the feature is off", __func__, inode->i_ino); return false; } if (!f2fs_is_pinned_file(inode)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but is not pinned", + f2fs_warn(sbi, "%s: inode (ino=%llx) has device alias flag, but is not pinned", __func__, inode->i_ino); return false; } @@ -925,7 +925,7 @@ retry: */ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { f2fs_warn(F2FS_I_SB(inode), - "f2fs_evict_inode: inconsistent node id, ino:%lu", + "f2fs_evict_inode: inconsistent node id, ino:%llu", inode->i_ino); f2fs_inode_synced(inode); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -954,7 +954,7 @@ retry: */ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { f2fs_warn(sbi, - "f2fs_evict_inode: inode is dirty, ino:%lu", + "f2fs_evict_inode: inode is dirty, ino:%llu", inode->i_ino); f2fs_inode_synced(inode); set_sbi_flag(sbi, SBI_NEED_FSCK); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e360f08a9586..efbb0732d420 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -505,7 +505,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } if (inode->i_nlink == 0) { - f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink", + f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%llx) has zero i_nlink", __func__, inode->i_ino); err = -EFSCORRUPTED; set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); @@ -515,7 +515,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ENCRYPTED(dir) && 
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - f2fs_warn(F2FS_I_SB(inode), "Inconsistent encryption contexts: %lu/%lu", + f2fs_warn(F2FS_I_SB(inode), "Inconsistent encryption contexts: %llu/%llu", dir->i_ino, inode->i_ino); err = -EPERM; goto out_iput; @@ -573,11 +573,11 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) } if (unlikely(inode->i_nlink == 0)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has zero i_nlink", + f2fs_warn(sbi, "%s: inode (ino=%llx) has zero i_nlink", __func__, inode->i_ino); goto corrupted; } else if (S_ISDIR(inode->i_mode) && unlikely(inode->i_nlink == 1)) { - f2fs_warn(sbi, "%s: directory inode (ino=%lx) has a single i_nlink", + f2fs_warn(sbi, "%s: directory inode (ino=%llx) has a single i_nlink", __func__, inode->i_ino); goto corrupted; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5ca6f518cfa1..5a7679c6317e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -847,7 +847,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) err = -EFSCORRUPTED; f2fs_err_ratelimited(sbi, "inode mapping table is corrupted, run fsck to fix it, " - "ino:%lu, nid:%u, level:%d, offset:%d", + "ino:%llu, nid:%u, level:%d, offset:%d", dn->inode->i_ino, nids[i], level, offset[level]); set_sbi_flag(sbi, SBI_NEED_FSCK); goto release_pages; @@ -1013,7 +1013,7 @@ static int truncate_dnode(struct dnode_of_data *dn) return PTR_ERR(folio); if (IS_INODE(folio) || ino_of_node(folio) != dn->inode->i_ino) { - f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u", + f2fs_err(sbi, "incorrect node reference, ino: %llu, nid: %u, ino_of_node: %u", dn->inode->i_ino, dn->nid, ino_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE); @@ -1194,7 +1194,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) if (level <= 0) { if (!level) { level = -EFSCORRUPTED; - f2fs_err(sbi, "%s: inode 
ino=%lx has corrupted node block, from:%lu addrs:%u", + f2fs_err(sbi, "%s: inode ino=%llx has corrupted node block, from:%lu addrs:%u", __func__, inode->i_ino, from, ADDRS_PER_INODE(inode)); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -1265,7 +1265,7 @@ skip_partial: set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); f2fs_err_ratelimited(sbi, - "truncate node fail, ino:%lu, nid:%u, " + "truncate node fail, ino:%llu, nid:%u, " "offset[0]:%d, offset[1]:%d, nofs:%d", inode->i_ino, dn.nid, offset[0], offset[1], nofs); @@ -1351,7 +1351,7 @@ int f2fs_remove_inode_page(struct inode *inode) if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) { f2fs_warn(F2FS_I_SB(inode), - "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu", + "f2fs_remove_inode_page: inconsistent i_blocks, ino:%llu, iblocks:%llu", inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a26071f2b0bc..3d3dacec9482 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -232,7 +232,7 @@ out: name = ""; else name = raw_inode->i_name; - f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d", + f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %llu, err = %d", __func__, ino_of_node(ifolio), name, IS_ERR(dir) ? 
0 : dir->i_ino, err); return err; @@ -532,7 +532,7 @@ got_it: max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode); if (ofs_in_node >= max_addrs) { - f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%llu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); return -EFSCORRUPTED; @@ -674,7 +674,7 @@ retry_dn: f2fs_bug_on(sbi, ni.ino != ino_of_node(folio)); if (ofs_of_node(dn.node_folio) != ofs_of_node(folio)) { - f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", + f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%llu, ofs:%u, %u", inode->i_ino, ofs_of_node(dn.node_folio), ofs_of_node(folio)); err = -EFSCORRUPTED; @@ -748,7 +748,7 @@ retry_prev: if (f2fs_is_valid_blkaddr(sbi, dest, DATA_GENERIC_ENHANCE_UPDATE)) { - f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", + f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%llu, ofs:%u", dest, inode->i_ino, dn.ofs_in_node); err = -EFSCORRUPTED; goto err; @@ -768,7 +768,7 @@ retry_prev: err: f2fs_put_dnode(&dn); out: - f2fs_notice(sbi, "recover_data: ino = %lx, nid = %x (i_size: %s), " + f2fs_notice(sbi, "recover_data: ino = %llx, nid = %x (i_size: %s), " "range (%u, %u), recovered = %d, err = %d", inode->i_ino, nid_of_node(folio), file_keep_isize(inode) ? 
"keep" : "recover", diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 941dc62a6d6f..610d5810074d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -365,7 +365,7 @@ static int lookup_all_xattrs(struct inode *inode, struct folio *ifolio, *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { - f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "lookup inode (%llu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); err = -ENODATA; @@ -585,7 +585,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if ((void *)(entry) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { - f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "list inode (%llu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); f2fs_handle_error(F2FS_I_SB(inode), @@ -664,14 +664,14 @@ retry: if (!F2FS_I(inode)->i_xattr_nid) { error = f2fs_recover_xattr_data(inode, NULL); f2fs_notice(F2FS_I_SB(inode), - "recover xattr in inode (%lu), error(%d)", + "recover xattr in inode (%llu), error(%d)", inode->i_ino, error); if (!error) { kfree(base_addr); goto retry; } } - f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "set inode (%llu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; @@ -699,7 +699,7 @@ retry: while (!IS_XATTR_LAST_ENTRY(last)) { if ((void *)(last) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu", + f2fs_err(F2FS_I_SB(inode), "inode (%llu) has invalid last xattr entry, entry_size: %zu", inode->i_ino, ENTRY_SIZE(last)); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; diff --git 
a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c index 26d367e3668d..e85222892038 100644 --- a/fs/freevxfs/vxfs_bmap.c +++ b/fs/freevxfs/vxfs_bmap.c @@ -260,12 +260,12 @@ vxfs_bmap1(struct inode *ip, long iblock) if (VXFS_ISIMMED(vip)) goto unsupp; - printk(KERN_WARNING "vxfs: inode %ld has no valid orgtype (%x)\n", + printk(KERN_WARNING "vxfs: inode %llu has no valid orgtype (%x)\n", ip->i_ino, vip->vii_orgtype); BUG(); unsupp: - printk(KERN_WARNING "vxfs: inode %ld has an unsupported orgtype (%x)\n", + printk(KERN_WARNING "vxfs: inode %llu has an unsupported orgtype (%x)\n", ip->i_ino, vip->vii_orgtype); return 0; } diff --git a/fs/fserror.c b/fs/fserror.c index 06ca86adab9b..1e4d11fd9562 100644 --- a/fs/fserror.c +++ b/fs/fserror.c @@ -176,7 +176,7 @@ lost_event: lost: if (inode) pr_err_ratelimited( - "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d", + "%s: lost file I/O error report for ino %llu type %u pos 0x%llx len 0x%llx error %d", sb->s_id, inode->i_ino, type, pos, len, error); else pr_err_ratelimited( diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index b80ba40e3877..7f5339ee57c1 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -417,7 +417,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, int entry_size, type; int err; - hfs_dbg("cnid %u - (ino %lu, name %s) - (ino %lu, name %s)\n", + hfs_dbg("cnid %u - (ino %llu, name %s) - (ino %llu, name %s)\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index a097908b269d..f066a99a863b 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -411,7 +411,7 @@ int hfs_extend_file(struct inode *inode) goto out; } - hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); + hfs_dbg("ino %llu, start %u, len %u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { 
hfs_dbg("first_extent: start %u, len %u\n", @@ -482,7 +482,7 @@ void hfs_file_truncate(struct inode *inode) u32 size; int res; - hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", + hfs_dbg("ino %llu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)HFS_I(inode)->phys_size, inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 878535db64d6..95f0333a608b 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -270,7 +270,7 @@ void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; - hfs_dbg("ino %lu\n", inode->i_ino); + hfs_dbg("ino %llu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { atomic64_dec(&HFS_SB(sb)->folder_count); if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) @@ -455,7 +455,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_cat_rec rec; int res; - hfs_dbg("ino %lu\n", inode->i_ino); + hfs_dbg("ino %llu\n", inode->i_ino); res = hfs_ext_write_extent(inode); if (res) return res; diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index 4b79cd606276..174cd13106ad 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -203,7 +203,7 @@ int hfsplus_create_attr_nolock(struct inode *inode, const char *name, int entry_size; int err; - hfs_dbg("name %s, ino %ld\n", + hfs_dbg("name %s, ino %llu\n", name ? name : NULL, inode->i_ino); if (name) { @@ -255,7 +255,7 @@ int hfsplus_create_attr(struct inode *inode, hfsplus_attr_entry *entry_ptr; int err; - hfs_dbg("name %s, ino %ld\n", + hfs_dbg("name %s, ino %llu\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -337,7 +337,7 @@ int hfsplus_delete_attr_nolock(struct inode *inode, const char *name, struct super_block *sb = inode->i_sb; int err; - hfs_dbg("name %s, ino %ld\n", + hfs_dbg("name %s, ino %llu\n", name ? 
name : NULL, inode->i_ino); if (name) { @@ -367,7 +367,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) struct super_block *sb = inode->i_sb; struct hfs_find_data fd; - hfs_dbg("name %s, ino %ld\n", + hfs_dbg("name %s, ino %llu\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -436,7 +436,7 @@ int hfsplus_replace_attr(struct inode *inode, hfsplus_attr_entry *entry_ptr; int err = 0; - hfs_dbg("name %s, ino %ld\n", + hfs_dbg("name %s, ino %llu\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 02c1eee4a4b8..0e961e99b985 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -441,7 +441,7 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err; - hfs_dbg("cnid %u - ino %lu, name %s - ino %lu, name %s\n", + hfs_dbg("cnid %u - ino %llu, name %s - ino %llu, name %s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d559bf8625f8..054f6da46033 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -313,7 +313,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, if (!S_ISREG(inode->i_mode)) return -EPERM; - hfs_dbg("src_dir->i_ino %lu, dst_dir->i_ino %lu, inode->i_ino %lu\n", + hfs_dbg("src_dir->i_ino %llu, dst_dir->i_ino %llu, inode->i_ino %llu\n", src_dir->i_ino, dst_dir->i_ino, inode->i_ino); mutex_lock(&sbi->vh_mutex); @@ -385,7 +385,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) if (HFSPLUS_IS_RSRC(inode)) return -EPERM; - hfs_dbg("dir->i_ino %lu, inode->i_ino %lu\n", + hfs_dbg("dir->i_ino %llu, inode->i_ino %llu\n", dir->i_ino, inode->i_ino); mutex_lock(&sbi->vh_mutex); @@ -393,7 +393,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) if (inode->i_ino == cnid && atomic_read(&HFSPLUS_I(inode)->opencnt)) { str.name = 
name; - str.len = sprintf(name, "temp%lu", inode->i_ino); + str.len = sprintf(name, "temp%llu", inode->i_ino); res = hfsplus_rename_cat(inode->i_ino, dir, &dentry->d_name, sbi->hidden_dir, &str); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 8e886514d27f..474fde1a1653 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -275,7 +275,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, mutex_unlock(&hip->extents_lock); done: - hfs_dbg("ino %lu, iblock %llu - dblock %u\n", + hfs_dbg("ino %llu, iblock %llu - dblock %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; @@ -476,7 +476,7 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) goto out; } - hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); + hfs_dbg("ino %llu, start %u, len %u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { @@ -545,7 +545,7 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", + hfs_dbg("ino %llu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 922ff41df042..02be32dc6833 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -230,7 +230,7 @@ static int hfsplus_get_perms(struct inode *inode, inode->i_flags &= ~S_APPEND; return 0; bad_type: - pr_err("invalid file type 0%04o for inode %lu\n", mode, inode->i_ino); + pr_err("invalid file type 0%04o for inode %llu\n", mode, inode->i_ino); return -EIO; } @@ -328,7 +328,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, struct hfsplus_vh *vhdr = sbi->s_vhdr; int error = 0, error2; - hfs_dbg("inode->i_ino %lu, start %llu, end %llu\n", + hfs_dbg("inode->i_ino %llu, start %llu, end %llu\n", inode->i_ino, start, end); error = 
file_write_and_wait_range(file, start, end); @@ -639,7 +639,7 @@ int hfsplus_cat_write_inode(struct inode *inode) hfsplus_cat_entry entry; int res = 0; - hfs_dbg("inode->i_ino %lu\n", inode->i_ino); + hfs_dbg("inode->i_ino %llu\n", inode->i_ino); if (HFSPLUS_IS_RSRC(inode)) main_inode = HFSPLUS_I(inode)->rsrc_inode; @@ -716,7 +716,7 @@ out: if (!res) { res = hfs_btree_write(tree); if (res) { - pr_err("b-tree write err: %d, ino %lu\n", + pr_err("b-tree write err: %d, ino %llu\n", res, inode->i_ino); } } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7229a8ae89f9..b3917249c206 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -156,7 +156,7 @@ static int hfsplus_system_write_inode(struct inode *inode) int err = hfs_btree_write(tree); if (err) { - pr_err("b-tree write err: %d, ino %lu\n", + pr_err("b-tree write err: %d, ino %llu\n", err, inode->i_ino); return err; } @@ -169,7 +169,7 @@ static int hfsplus_write_inode(struct inode *inode, { int err; - hfs_dbg("ino %lu\n", inode->i_ino); + hfs_dbg("ino %llu\n", inode->i_ino); err = hfsplus_ext_write_extent(inode); if (err) @@ -184,7 +184,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { - hfs_dbg("ino %lu\n", inode->i_ino); + hfs_dbg("ino %llu\n", inode->i_ino); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 9904944cbd54..c70bb6f494b2 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -277,7 +277,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, u16 folder_finderinfo_len = sizeof(DInfo) + sizeof(DXInfo); u16 file_finderinfo_len = sizeof(FInfo) + sizeof(FXInfo); - hfs_dbg("ino %lu, name %s, value %p, size %zu\n", + hfs_dbg("ino %llu, name %s, value %p, size %zu\n", inode->i_ino, name ? 
name : NULL, value, size); @@ -447,7 +447,7 @@ int hfsplus_setxattr(struct inode *inode, const char *name, NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1; int res; - hfs_dbg("ino %lu, name %s, prefix %s, prefixlen %zu, " + hfs_dbg("ino %llu, name %s, prefix %s, prefixlen %zu, " "value %p, size %zu\n", inode->i_ino, name ? name : NULL, prefix ? prefix : NULL, prefixlen, @@ -607,7 +607,7 @@ ssize_t hfsplus_getxattr(struct inode *inode, const char *name, int res; char *xattr_name; - hfs_dbg("ino %lu, name %s, prefix %s\n", + hfs_dbg("ino %llu, name %s, prefix %s\n", inode->i_ino, name ? name : NULL, prefix ? prefix : NULL); @@ -717,7 +717,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) size_t strbuf_size; int xattr_name_len; - hfs_dbg("ino %lu\n", inode->i_ino); + hfs_dbg("ino %llu\n", inode->i_ino); if (!is_xattr_operation_supported(inode)) return -EOPNOTSUPP; @@ -819,7 +819,7 @@ static int hfsplus_removexattr(struct inode *inode, const char *name) int is_xattr_acl_deleted; int is_all_xattrs_deleted; - hfs_dbg("ino %lu, name %s\n", + hfs_dbg("ino %llu, name %s\n", inode->i_ino, name ? 
name : NULL); if (!HFSPLUS_SB(inode->i_sb)->attr_tree) diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index ceb50b2dc91a..3bf11202e2d3 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -96,8 +96,8 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx) } if (!fnode_is_dir(fno)) { e = 1; - hpfs_error(inode->i_sb, "not a directory, fnode %08lx", - (unsigned long)inode->i_ino); + hpfs_error(inode->i_sb, "not a directory, fnode %08llx", + inode->i_ino); } if (hpfs_inode->i_dno != le32_to_cpu(fno->u.external[0].disk_secno)) { e = 1; diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index dde764ebe246..8c6aa060fd87 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -550,9 +550,9 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) if (hpfs_sb(i->i_sb)->sb_chk) if (up != i->i_ino) { hpfs_error(i->i_sb, - "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", + "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08llx", dno, up, - (unsigned long)i->i_ino); + i->i_ino); return; } if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index 2149d3ca530b..4664f9ab06ee 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -245,8 +245,8 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, fnode->ea_offs = cpu_to_le16(0xc4); } if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) { - hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x", - (unsigned long)inode->i_ino, + hpfs_error(s, "fnode %08llx: ea_offs == %03x, ea_size_s == %03x", + inode->i_ino, le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); return; } diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 93d528f4f4f2..0e932cc8be1b 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -250,8 +250,8 @@ void hpfs_write_inode_nolock(struct inode *i) hpfs_brelse4(&qbh); } else hpfs_error(i->i_sb, - "directory 
%08lx doesn't have '.' entry", - (unsigned long)i->i_ino); + "directory %08llx doesn't have '.' entry", + i->i_ino); } mark_buffer_dirty(bh); brelse(bh); diff --git a/fs/inode.c b/fs/inode.c index 62df5dda0589..5ad169d51728 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -726,7 +726,7 @@ void dump_mapping(const struct address_space *mapping) struct dentry *dentry_ptr; struct dentry dentry; char fname[64] = {}; - unsigned long ino; + u64 ino; /* * If mapping is an invalid pointer, we don't want to crash @@ -750,14 +750,14 @@ void dump_mapping(const struct address_space *mapping) } if (!dentry_first) { - pr_warn("aops:%ps ino:%lx\n", a_ops, ino); + pr_warn("aops:%ps ino:%llx\n", a_ops, ino); return; } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); if (get_kernel_nofault(dentry, dentry_ptr) || !dentry.d_parent || !dentry.d_name.name) { - pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", + pr_warn("aops:%ps ino:%llx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; } @@ -768,7 +768,7 @@ void dump_mapping(const struct address_space *mapping) * Even if strncpy_from_kernel_nofault() succeeded, * the fname could be unreliable */ - pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n", + pr_warn("aops:%ps ino:%llx dentry name(?):\"%s\"\n", a_ops, ino, fname); } @@ -2641,9 +2641,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) /* leave it no_open_fops */ break; default: - printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" - " inode %s:%lu\n", mode, inode->i_sb->s_id, - inode->i_ino); + pr_debug("init_special_inode: bogus i_mode (%o) for inode %s:%llu\n", + mode, inode->i_sb->s_id, inode->i_ino); break; } } diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index e4d57cb969f1..e655763a82ce 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -48,7 +48,7 @@ static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) mapping_set_error(inode->i_mapping, ioend->io_error); if (!bio_flagged(bio, BIO_QUIET)) { 
pr_err_ratelimited( -"%s: writeback error on inode %lu, offset %lld, sector %llu", +"%s: writeback error on inode %llu, offset %lld, sector %llu", inode->i_sb->s_id, inode->i_ino, ioend->io_offset, ioend->io_sector); } diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 50b4cb3aea87..397568b9c7e7 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -156,7 +156,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, else { printk(KERN_DEBUG "zisofs: zisofs_inflate returned" - " %d, inode = %lu," + " %d, inode = %llu," " page idx = %d, bh idx = %d," " avail_in = %ld," " avail_out = %ld\n", diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 2ca16c3fe5ef..2fd9948d606e 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -152,7 +152,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *file, de_len < de->name_len[0] + sizeof(struct iso_directory_record)) { printk(KERN_NOTICE "iso9660: Corrupted directory entry" - " in block %lu of inode %lu\n", block, + " in block %lu of inode %llu\n", block, inode->i_ino); brelse(bh); return -EIO; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 5c01536c5e8f..3593e02e75fe 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1261,7 +1261,7 @@ out_noread: out_toomany: printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n" - "isofs_read_level3_size: inode=%lu\n", + "isofs_read_level3_size: inode=%llu\n", __func__, inode->i_ino); goto out; } @@ -1380,7 +1380,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) /* I have no idea what file_unit_size is used for, so we will flag it for now */ if (de->file_unit_size[0] != 0) { - printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n", + printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%llu).\n", inode->i_ino); } @@ -1450,7 +1450,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) /* XXX - parse_rock_ridge_inode() had already set i_rdev. 
*/ init_special_inode(inode, inode->i_mode, inode->i_rdev); } else { - printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %lu.\n", + printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %llu.\n", inode->i_mode, inode->i_ino); ret = -EIO; goto fail; diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 58f80e1b3ac0..8dd3911717e0 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -100,7 +100,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, /* Basic sanity check, whether name doesn't exceed dir entry */ if (de_len < dlen + sizeof(struct iso_directory_record)) { printk(KERN_NOTICE "iso9660: Corrupted directory entry" - " in block %lu of inode %lu\n", block, + " in block %lu of inode %llu\n", block, dir->i_ino); brelse(bh); return 0; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index cb2c529a8f1b..b60918ed8a99 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1677,7 +1677,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) return err ? 
ERR_PTR(err) : ERR_PTR(-EINVAL); } - jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", + jbd2_debug(1, "JBD2: inode %s/%llu, size %lld, bits %d, blksize %ld\n", inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); @@ -1689,7 +1689,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) journal->j_inode = inode; snprintf(journal->j_devname, sizeof(journal->j_devname), - "%pg-%lu", journal->j_dev, journal->j_inode->i_ino); + "%pg-%llu", journal->j_dev, journal->j_inode->i_ino); strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index dca4b5d8aaaa..a90f9092706c 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2651,7 +2651,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, return -EROFS; journal = transaction->t_journal; - jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + jbd2_debug(4, "Adding inode %llu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); spin_lock(&journal->j_list_lock); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 2b38ce1fd8e8..c4088c3b4ac0 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -129,7 +129,7 @@ static int jffs2_readdir(struct file *file, struct dir_context *ctx) struct jffs2_full_dirent *fd; unsigned long curofs = 1; - jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino); + jffs2_dbg(1, "jffs2_readdir() for dir_i #%llu\n", inode->i_ino); if (!dir_emit_dots(file, ctx)) return 0; @@ -211,7 +211,7 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i, jffs2_free_raw_inode(ri); - jffs2_dbg(1, "%s(): Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", + jffs2_dbg(1, "%s(): Created ino #%llu with mode %o, nlink %d(%d). 
nrpages %ld\n", __func__, inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->pino_nlink, inode->i_mapping->nrpages); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5e1ef4bc009b..1e18d3a79840 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -88,7 +88,7 @@ static int jffs2_do_readpage_nolock(struct inode *inode, struct folio *folio) unsigned char *kaddr; int ret; - jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n", + jffs2_dbg(2, "%s(): ino #%llu, page at offset 0x%lx\n", __func__, inode->i_ino, folio->index << PAGE_SHIFT); BUG_ON(!folio_test_locked(folio)); @@ -259,7 +259,7 @@ static int jffs2_write_end(const struct kiocb *iocb, uint32_t writtenlen = 0; void *buf; - jffs2_dbg(1, "%s(): ino #%lu, page at 0x%llx, range %d-%d, flags %lx\n", + jffs2_dbg(1, "%s(): ino #%llu, page at 0x%llx, range %d-%d, flags %lx\n", __func__, inode->i_ino, folio_pos(folio), start, end, folio->flags.f); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index c3ce2c868f7a..6ada8369a762 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -43,7 +43,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) int ret; int alloc_type = ALLOC_NORMAL; - jffs2_dbg(1, "%s(): ino #%lu\n", __func__, inode->i_ino); + jffs2_dbg(1, "%s(): ino #%llu\n", __func__, inode->i_ino); /* Special cases - we don't want more than one data node for these types on the medium at any time. 
So setattr @@ -243,7 +243,7 @@ void jffs2_evict_inode (struct inode *inode) struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); - jffs2_dbg(1, "%s(): ino #%lu mode %o\n", + jffs2_dbg(1, "%s(): ino #%llu mode %o\n", __func__, inode->i_ino, inode->i_mode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); @@ -334,8 +334,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) ret = jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size); if (ret < 0) { /* Eep */ - pr_notice("Read device numbers for inode %lu failed\n", - (unsigned long)inode->i_ino); + pr_notice("Read device numbers for inode %llu failed\n", + inode->i_ino); goto error; } if (f->metadata->size == sizeof(jdev.old_id)) @@ -351,8 +351,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) break; default: - pr_warn("%s(): Bogus i_mode %o for ino %lu\n", - __func__, inode->i_mode, (unsigned long)inode->i_ino); + pr_warn("%s(): Bogus i_mode %o for ino %llu\n", + __func__, inode->i_mode, inode->i_ino); } mutex_unlock(&f->sem); @@ -374,12 +374,12 @@ void jffs2_dirty_inode(struct inode *inode, int flags) struct iattr iattr; if (!(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) { - jffs2_dbg(2, "%s(): not calling setattr() for ino #%lu\n", + jffs2_dbg(2, "%s(): not calling setattr() for ino #%llu\n", __func__, inode->i_ino); return; } - jffs2_dbg(1, "%s(): calling setattr() for ino #%lu\n", + jffs2_dbg(1, "%s(): calling setattr() for ino #%llu\n", __func__, inode->i_ino); iattr.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_MTIME|ATTR_CTIME; @@ -428,7 +428,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r struct jffs2_inode_info *f; int ret; - jffs2_dbg(1, "%s(): dir_i %ld, mode 0x%x\n", + jffs2_dbg(1, "%s(): dir_i %llu, mode 0x%x\n", __func__, dir_i->i_ino, mode); c = JFFS2_SB_INFO(sb); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 
4709762713ef..c7914dbc91ed 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -64,7 +64,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &jfs_file_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); } else { - printk(KERN_DEBUG "JFS: Invalid file type 0%04o for inode %lu.\n", + printk(KERN_DEBUG "JFS: Invalid file type 0%04o for inode %llu.\n", inode->i_mode, inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 294a67327c73..13ab21e66f51 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -302,7 +302,7 @@ int diRead(struct inode *ip) unsigned long pageno; int rel_inode; - jfs_info("diRead: ino = %ld", ip->i_ino); + jfs_info("diRead: ino = %llu", ip->i_ino); ipimap = sbi->ipimap; JFS_IP(ip)->ipimap = ipimap; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 64c6eaa7f3f2..c95804f6dc19 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -692,7 +692,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, unsigned long page_index; unsigned long page_offset; - jfs_info("__get_metapage: ino = %ld, lblock = 0x%lx, abs=%d", + jfs_info("__get_metapage: ino = %llu, lblock = 0x%lx, abs=%d", inode->i_ino, lblock, absolute); l2bsize = inode->i_blkbits; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 255a847ca0b6..0b6be8b8aeb1 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -487,7 +487,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, int async_block = 0; __be32 ret; - dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", + dprintk("lockd: nlmsvc_lock(%s/%llu, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", inode->i_sb->s_id, inode->i_ino, lock->fl.c.flc_type, lock->fl.c.flc_pid, @@ -617,7 +617,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, int mode; __be32 ret; - dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", + dprintk("lockd: 
nlmsvc_testlock(%s/%llu, ty=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, lock->fl.c.flc_type, @@ -676,7 +676,7 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) { int error = 0; - dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", + dprintk("lockd: nlmsvc_unlock(%s/%llu, pi=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, lock->fl.c.flc_pid, @@ -716,7 +716,7 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l int status = 0; int mode; - dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", + dprintk("lockd: nlmsvc_cancel(%s/%llu, pi=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, lock->fl.c.flc_pid, diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index dd0214dcb695..79f3dd2fd366 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -47,7 +47,7 @@ static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) { struct inode *inode = nlmsvc_file_inode(file); - dprintk("lockd: %s %s/%ld\n", + dprintk("lockd: %s %s/%llu\n", msg, inode->i_sb->s_id, inode->i_ino); } #else diff --git a/fs/locks.c b/fs/locks.c index d13ec930b7bb..d8b066fb4210 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -234,7 +234,7 @@ locks_check_ctx_lists(struct inode *inode) if (unlikely(!list_empty(&ctx->flc_flock) || !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_lease))) { - pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n", + pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%llx:\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); locks_dump_ctx_list(&ctx->flc_flock, "FLOCK"); @@ -251,7 +251,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_ list_for_each_entry(flc, list, flc_list) if (flc->flc_file == filp) - pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx " + pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%llx " " fl_owner=%p 
fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino, @@ -2896,7 +2896,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock_core *flc, (type == F_RDLCK) ? "READ" : "UNLCK"); if (inode) { /* userspace relies on this representation of dev_t */ - seq_printf(f, "%d %02x:%02x:%lu ", pid, + seq_printf(f, "%d %02x:%02x:%llu ", pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } else { diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 99541c6a5bbf..838b072b6cf0 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -36,7 +36,7 @@ void __minix_error_inode(struct inode *inode, const char *function, vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "minix-fs error (device %s): %s:%d: " - "inode #%lu: comm %s: %pV\n", + "inode #%llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); @@ -520,7 +520,7 @@ void minix_set_inode(struct inode *inode, dev_t rdev) S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { init_special_inode(inode, inode->i_mode, rdev); } else { - printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", + printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %llu.\n", inode->i_mode, inode->i_ino); make_bad_inode(inode); } @@ -542,7 +542,7 @@ static struct inode *V1_minix_iget(struct inode *inode) return ERR_PTR(-EIO); } if (raw_inode->i_nlinks == 0) { - printk("MINIX-fs: deleted inode referenced: %lu\n", + printk("MINIX-fs: deleted inode referenced: %llu\n", inode->i_ino); brelse(bh); iget_failed(inode); @@ -580,7 +580,7 @@ static struct inode *V2_minix_iget(struct inode *inode) return ERR_PTR(-EIO); } if (raw_inode->i_nlinks == 0) { - printk("MINIX-fs: deleted inode referenced: %lu\n", + printk("MINIX-fs: deleted inode referenced: %llu\n", inode->i_ino); brelse(bh); iget_failed(inode); @@ -692,7 +692,7 @@ static int minix_write_inode(struct inode *inode, struct 
writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk("IO error syncing minix inode [%s:%08lx]\n", + printk("IO error syncing minix inode [%s:%08llx]\n", inode->i_sb->s_id, inode->i_ino); err = -EIO; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2402f57c8e7d..ddc3789363a5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1906,7 +1906,7 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) } error = nfs_lookup_verify_inode(inode, flags); - dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", + dfprintk(LOOKUPCACHE, "NFS: %s: inode %llu is %s\n", __func__, inode->i_ino, error ? "invalid" : "valid"); return !error; } @@ -2121,7 +2121,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, /* Expect a negative dentry */ BUG_ON(d_inode(dentry)); - dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", + dfprintk(VFS, "NFS: atomic_open(%s/%llu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); err = nfs_check_flags(open_flags); @@ -2404,7 +2404,7 @@ static int nfs_do_create(struct inode *dir, struct dentry *dentry, open_flags |= O_CREAT; - dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", + dfprintk(VFS, "NFS: create(%s/%llu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; @@ -2442,7 +2442,7 @@ nfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct iattr attr; int status; - dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", + dfprintk(VFS, "NFS: mknod(%s/%llu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; @@ -2469,7 +2469,7 @@ struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct iattr attr; struct dentry *ret; - dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", + dfprintk(VFS, "NFS: mkdir(%s/%llu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_valid = ATTR_MODE; @@ -2507,7 +2507,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; - dfprintk(VFS, "NFS: 
rmdir(%s/%lu), %pd\n", + dfprintk(VFS, "NFS: rmdir(%s/%llu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); @@ -2578,7 +2578,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) { int error; - dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: unlink(%s/%llu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_unlink_enter(dir, dentry); @@ -2638,7 +2638,7 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, unsigned int pathlen = strlen(symname); int error; - dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: symlink(%s/%llu, %pd, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry, symname); if (pathlen > PAGE_SIZE) @@ -2660,7 +2660,7 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { - dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", + dfprintk(VFS, "NFS: symlink(%s/%llu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); @@ -3414,7 +3414,7 @@ out: if (!res && (mask & MAY_EXEC)) res = nfs_execute_ok(inode, mask); - dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", + dfprintk(VFS, "NFS: permission(%s/%llu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); return res; out_notsup: diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5d08b6409c28..25048a3c2364 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -391,7 +391,7 @@ static int nfs_write_begin(const struct kiocb *iocb, trace_nfs_write_begin(file_inode(file), pos, len); - dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%llu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); @@ -432,7 +432,7 @@ static int nfs_write_end(const struct kiocb 
*iocb, int status; trace_nfs_write_end(file_inode(file), pos, len); - dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_end(%pD2(%llu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); /* @@ -557,7 +557,7 @@ static int nfs_launder_folio(struct folio *folio) struct inode *inode = folio->mapping->host; int ret; - dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n", + dfprintk(PAGECACHE, "NFS: launder_folio(%llu, %llu)\n", inode->i_ino, folio_pos(folio)); folio_wait_private_2(folio); /* [DEPRECATED] */ @@ -647,7 +647,7 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) struct address_space *mapping; struct folio *folio = page_folio(vmf->page); - dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%llu), offset %lld)\n", filp, filp->f_mapping->host->i_ino, (long long)folio_pos(folio)); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 90a11afa5d05..e85380e3b11d 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -241,7 +241,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) /* Note: if the write is unstable, don't set end_offs until commit */ pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); - dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + dprintk("%s inode %llu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -456,7 +456,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) u32 j, idx; struct nfs_fh *fh; - dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", + dprintk("--> %s ino %llu pgbase %u req %zu@%llu\n", __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); @@ -514,7 +514,7 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) if (IS_ERR(ds_clnt)) return PNFS_NOT_ATTEMPTED; - dprintk("%s ino %lu sync %d req %zu@%llu 
DS: %s cl_count %d\n", + dprintk("%s ino %llu sync %d req %zu@%llu DS: %s cl_count %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); @@ -1001,7 +1001,7 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) if (IS_ERR(ds_clnt)) goto out_err; - dprintk("%s ino %lu, how %d cl_count %d\n", __func__, + dprintk("%s ino %llu, how %d cl_count %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count)); data->commit_done_cb = filelayout_commit_done_cb; refcount_inc(&ds->ds_clp->cl_count); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index f67773d52830..8b1559171fe3 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1631,7 +1631,7 @@ ff_layout_set_layoutcommit(struct inode *inode, return; pnfs_set_layoutcommit(inode, lseg, end_offset); - dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino, + dprintk("%s inode %llu pls_end_pos %llu\n", __func__, inode->i_ino, (unsigned long long) NFS_I(inode)->layout->plh_lwb); } @@ -2136,7 +2136,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) u32 dss_id; bool ds_fatal_error = false; - dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", + dprintk("--> %s ino %llu pgbase %u req %zu@%llu\n", __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); @@ -2245,7 +2245,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) vers = nfs4_ff_layout_ds_version(mirror, dss_id); - dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", + dprintk("%s ino %llu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); @@ -2336,7 +2336,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) vers = 
nfs4_ff_layout_ds_version(mirror, dss_id); - dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, + dprintk("%s ino %llu, how %d cl_count %d vers %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), vers); data->commit_done_cb = ff_layout_commit_done_cb; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4786343eeee0..98a8f0de1199 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2258,7 +2258,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) bool attr_changed = false; bool have_delegation; - dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", + dfprintk(VFS, "NFS: %s(%s/%llu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), icount_read(inode), fattr->valid); @@ -2288,7 +2288,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* * Big trouble! The inode has become a different object. */ - printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", + printk(KERN_DEBUG "NFS: %s: inode %llu mode changed, %07o to %07o\n", __func__, inode->i_ino, inode->i_mode, fattr->mode); goto out_err; } @@ -2358,7 +2358,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); attr_changed = true; - dprintk("NFS: change_attr change on server for file %s/%ld\n", + dprintk("NFS: change_attr change on server for file %s/%llu\n", inode->i_sb->s_id, inode->i_ino); } else if (!have_delegation) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 91bcf67bd743..d839a97df822 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4714,7 +4714,7 @@ static int _nfs4_proc_lookupp(struct inode *inode, nfs_fattr_init(fattr); nfs4_init_sequence(server->nfs_client, &args.seq_args, &res.seq_res, 0, 0); - dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino); + dprintk("NFS call lookupp ino=0x%llx\n", inode->i_ino); 
status = nfs4_do_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, task_flags); dprintk("NFS reply lookupp: %d\n", status); @@ -10019,7 +10019,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) int status = 0; dprintk("NFS: initiating layoutcommit call. sync %d " - "lbw: %llu inode %lu\n", sync, + "lbw: %llu inode %llu\n", sync, data->args.lastbytewritten, data->args.inode->i_ino); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bc13d1e69449..e79deb9bf664 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -891,7 +891,7 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, while (!list_empty(layout_list)) { lo = list_entry(layout_list->next, struct pnfs_layout_hdr, plh_bulk_destroy); - dprintk("%s freeing layout for inode %lu\n", __func__, + dprintk("%s freeing layout for inode %llu\n", __func__, lo->plh_inode->i_ino); inode = lo->plh_inode; @@ -1440,7 +1440,7 @@ _pnfs_return_layout(struct inode *ino) int status = 0; bool send, valid_layout; - dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); + dprintk("NFS: %s for inode %llu\n", __func__, ino->i_ino); spin_lock(&ino->i_lock); lo = nfsi->layout; @@ -3055,7 +3055,7 @@ pnfs_try_to_write_data(struct nfs_pgio_header *hdr, hdr->mds_ops = call_ops; - dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, + dprintk("%s: Writing ino:%llu %u@%llu (how %d)\n", __func__, inode->i_ino, hdr->args.count, hdr->args.offset, how); trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how); if (trypnfs != PNFS_NOT_ATTEMPTED) @@ -3181,7 +3181,7 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, hdr->mds_ops = call_ops; - dprintk("%s: Reading ino:%lu %u@%llu\n", + dprintk("%s: Reading ino:%llu %u@%llu\n", __func__, inode->i_ino, hdr->args.count, hdr->args.offset); trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr); @@ -3314,7 +3314,7 @@ pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg, if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 
nfsi->layout->plh_lwb = end_pos; mark_as_dirty = true; - dprintk("%s: Set layoutcommit for inode %lu ", + dprintk("%s: Set layoutcommit for inode %llu ", __func__, inode->i_ino); } else if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; @@ -3363,7 +3363,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (!pnfs_layoutcommit_outstanding(inode)) return 0; - dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + dprintk("--> %s inode %llu\n", __func__, inode->i_ino); status = -EAGAIN; if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 8fdbba7cad96..d2259d948cc3 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1039,7 +1039,7 @@ exp_rootfh(struct net *net, struct auth_domain *clp, char *name, } inode = d_inode(path.dentry); - dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", + dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%llu)\n", name, path.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); exp = exp_parent(cd, clp, &path); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6b9c399b89df..a569d89ac912 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1253,7 +1253,7 @@ static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct f if (ret) { struct inode *inode = file_inode(f); - pr_notice_ratelimited("nfsd: Unable to update timestamps on inode %02x:%02x:%lu: %d\n", + pr_notice_ratelimited("nfsd: Unable to update timestamps on inode %02x:%02x:%llu: %d\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino, ret); @@ -2888,7 +2888,7 @@ static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) { struct inode *inode = file_inode(f->nf_file); - seq_printf(s, "superblock: \"%02x:%02x:%ld\"", + seq_printf(s, "superblock: \"%02x:%02x:%llu\"", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ed85dd43da18..ee72c9565e4f 100644 --- 
a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -601,9 +601,9 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, struct inode * inode = d_inode(dentry); dev_t ex_dev = exp_sb(exp)->s_dev; - dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n", + dprintk("nfsd: fh_compose(exp %02x:%02x/%llu %pd2, ino=%llu)\n", MAJOR(ex_dev), MINOR(ex_dev), - (long) d_inode(exp->ex_path.dentry)->i_ino, + d_inode(exp->ex_path.dentry)->i_ino, dentry, (inode ? inode->i_ino : 0)); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c884c3f34afb..eafdf7b7890f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1163,7 +1163,7 @@ nfsd_direct_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } else if (unlikely(host_err == -EINVAL)) { struct inode *inode = d_inode(fhp->fh_dentry); - pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n", + pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%llu\n", inode->i_sb->s_id, inode->i_ino); host_err = -ESERVERFAULT; } diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index e7eebb04f9a4..7b1cd2baefcf 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -707,7 +707,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warn(inode->i_sb, - "%s (ino=%lu): entry number %llu already freed", + "%s (ino=%llu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)req->pr_entry_nr); else @@ -748,7 +748,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warn(inode->i_sb, - "%s (ino=%lu): entry number %llu already freed", + "%s (ino=%llu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)req->pr_entry_nr); else @@ -861,7 +861,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { nilfs_warn(inode->i_sb, - "%s (ino=%lu): 
entry number %llu already freed", + "%s (ino=%llu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)entry_nrs[j]); } else { @@ -906,7 +906,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) last_nrs[k]); if (ret && ret != -ENOENT) nilfs_warn(inode->i_sb, - "error %d deleting block that object (entry=%llu, ino=%lu) belongs to", + "error %d deleting block that object (entry=%llu, ino=%llu) belongs to", ret, (unsigned long long)last_nrs[k], inode->i_ino); } @@ -923,7 +923,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) ret = nilfs_palloc_delete_bitmap_block(inode, group); if (ret && ret != -ENOENT) nilfs_warn(inode->i_sb, - "error %d deleting bitmap block of group=%lu, ino=%lu", + "error %d deleting bitmap block of group=%lu, ino=%llu", ret, group, inode->i_ino); } } diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index ccc1a7aa52d2..824f2bd91c16 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -33,7 +33,7 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, if (err == -EINVAL) { __nilfs_error(inode->i_sb, fname, - "broken bmap (inode number=%lu)", inode->i_ino); + "broken bmap (inode number=%llu)", inode->i_ino); err = -EIO; } return err; diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 568367129092..2e553d698d0f 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -64,7 +64,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) * clearing of an abandoned b-tree node is missing somewhere). 
*/ nilfs_error(inode->i_sb, - "state inconsistency probably due to duplicate use of b-tree node block address %llu (ino=%lu)", + "state inconsistency probably due to duplicate use of b-tree node block address %llu (ino=%llu)", (unsigned long long)blocknr, inode->i_ino); goto failed; } diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index dd0c8e560ef6..3c03f5a741d1 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -353,7 +353,7 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, nchildren <= 0 || nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) { nilfs_crit(inode->i_sb, - "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d", + "bad btree node (ino=%llu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d", inode->i_ino, (unsigned long long)blocknr, level, flags, nchildren); ret = 1; @@ -384,7 +384,7 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX || (nchildren == 0 && level > NILFS_BTREE_LEVEL_NODE_MIN))) { nilfs_crit(inode->i_sb, - "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d", + "bad btree root (ino=%llu): level = %d, flags = 0x%x, nchildren = %d", inode->i_ino, level, flags, nchildren); ret = 1; } @@ -453,7 +453,7 @@ static int nilfs_btree_bad_node(const struct nilfs_bmap *btree, if (unlikely(nilfs_btree_node_get_level(node) != level)) { dump_stack(); nilfs_crit(btree->b_inode->i_sb, - "btree level mismatch (ino=%lu): %d != %d", + "btree level mismatch (ino=%llu): %d != %d", btree->b_inode->i_ino, nilfs_btree_node_get_level(node), level); return 1; @@ -521,7 +521,7 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, out_no_wait: if (!buffer_uptodate(bh)) { nilfs_err(btree->b_inode->i_sb, - "I/O error reading b-tree node block (ino=%lu, blocknr=%llu)", + "I/O error reading b-tree node block (ino=%llu, blocknr=%llu)", btree->b_inode->i_ino, (unsigned long long)ptr); 
brelse(bh); return -EIO; @@ -2104,7 +2104,7 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree, if (ret < 0) { if (unlikely(ret == -ENOENT)) { nilfs_crit(btree->b_inode->i_sb, - "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d", + "writing node/leaf block does not appear in b-tree (ino=%llu) at key=%llu, level=%d", btree->b_inode->i_ino, (unsigned long long)key, level); ret = -EINVAL; @@ -2146,7 +2146,7 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, level >= NILFS_BTREE_LEVEL_MAX) { dump_stack(); nilfs_warn(btree->b_inode->i_sb, - "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)", + "invalid btree level: %d (key=%llu, ino=%llu, blocknr=%llu)", level, (unsigned long long)key, btree->b_inode->i_ino, (unsigned long long)bh->b_blocknr); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index b243199036df..3653db5cdb65 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -150,7 +150,7 @@ out: Ebadsize: nilfs_error(sb, - "size of directory #%lu is not a multiple of chunk size", + "size of directory #%llu is not a multiple of chunk size", dir->i_ino); goto fail; Eshort: @@ -169,7 +169,7 @@ Einumber: error = "disallowed inode number"; bad_entry: nilfs_error(sb, - "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d", + "bad entry in directory #%llu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d", dir->i_ino, error, (folio->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode), rec_len, p->name_len); @@ -177,7 +177,7 @@ bad_entry: Eend: p = (struct nilfs_dir_entry *)(kaddr + offs); nilfs_error(sb, - "entry in directory #%lu spans the page boundary offset=%lu, inode=%lu", + "entry in directory #%llu spans the page boundary offset=%lu, inode=%lu", dir->i_ino, (folio->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode)); fail: @@ -251,7 +251,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) kaddr = 
nilfs_get_folio(inode, n, &folio); if (IS_ERR(kaddr)) { - nilfs_error(sb, "bad page in #%lu", inode->i_ino); + nilfs_error(sb, "bad page in #%llu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return -EIO; } @@ -336,7 +336,7 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, /* next folio is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { nilfs_error(dir->i_sb, - "dir %lu size %lld exceeds block count %llu", + "dir %llu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, (unsigned long long)dir->i_blocks); goto out; @@ -382,7 +382,7 @@ struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop) return next_de; fail: - nilfs_error(dir->i_sb, "directory #%lu %s", dir->i_ino, msg); + nilfs_error(dir->i_sb, "directory #%llu %s", dir->i_ino, msg); folio_release_kmap(folio, de); return NULL; } diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 2d8dc6b35b54..8bd0b1374e25 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -338,7 +338,7 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, key = nilfs_bmap_data_get_key(bmap, *bh); if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { nilfs_crit(bmap->b_inode->i_sb, - "%s (ino=%lu): invalid key: %llu", + "%s (ino=%llu): invalid key: %llu", __func__, bmap->b_inode->i_ino, (unsigned long long)key); return -EINVAL; @@ -346,7 +346,7 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, ptr = nilfs_direct_get_ptr(bmap, key); if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { nilfs_crit(bmap->b_inode->i_sb, - "%s (ino=%lu): invalid pointer: %llu", + "%s (ino=%llu): invalid pointer: %llu", __func__, bmap->b_inode->i_ino, (unsigned long long)ptr); return -EINVAL; diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 561c220799c7..62d4c1b787e9 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -137,7 +137,7 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) struct inode *inode = bh->b_folio->mapping->host; 
nilfs_err(inode->i_sb, - "I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)", + "I/O error reading %s block for GC (ino=%llu, vblocknr=%llu)", buffer_nilfs_node(bh) ? "node" : "data", inode->i_ino, (unsigned long long)bh->b_blocknr); return -EIO; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 51bde45d5865..51f7e125a311 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -108,7 +108,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, * be locked in this case. */ nilfs_warn(inode->i_sb, - "%s (ino=%lu): a race condition while inserting a data block at offset=%llu", + "%s (ino=%llu): a race condition while inserting a data block at offset=%llu", __func__, inode->i_ino, (unsigned long long)blkoff); err = -EAGAIN; @@ -789,7 +789,7 @@ repeat: goto repeat; failed: - nilfs_warn(ii->vfs_inode.i_sb, "error %d truncating bmap (ino=%lu)", + nilfs_warn(ii->vfs_inode.i_sb, "error %d truncating bmap (ino=%llu)", ret, ii->vfs_inode.i_ino); } @@ -1026,7 +1026,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty) * this inode. 
*/ nilfs_warn(inode->i_sb, - "cannot set file dirty (ino=%lu): the file is being freed", + "cannot set file dirty (ino=%llu): the file is being freed", inode->i_ino); spin_unlock(&nilfs->ns_inode_lock); return -EINVAL; /* @@ -1057,7 +1057,7 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags) err = nilfs_load_inode_block(inode, &ibh); if (unlikely(err)) { nilfs_warn(inode->i_sb, - "cannot mark inode dirty (ino=%lu): error %d loading inode block", + "cannot mark inode dirty (ino=%llu): error %d loading inode block", inode->i_ino, err); return err; } diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 946b0d3534a5..09adb40c65e5 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -203,7 +203,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, err = -EIO; if (!buffer_uptodate(first_bh)) { nilfs_err(inode->i_sb, - "I/O error reading meta-data file (ino=%lu, block-offset=%lu)", + "I/O error reading meta-data file (ino=%llu, block-offset=%lu)", inode->i_ino, block); goto failed_bh; } diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 40f4b1a28705..40ac679ec56e 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -292,7 +292,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) { nilfs_warn(inode->i_sb, - "deleting nonexistent file (ino=%lu), %d", + "deleting nonexistent file (ino=%llu), %d", inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 098a3bd103e0..4b1bf559f352 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2024,7 +2024,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, ifile, ii->vfs_inode.i_ino, &ibh); if (unlikely(err)) { nilfs_warn(sci->sc_super, - "log writer: error %d getting inode block (ino=%lu)", + "log writer: error %d getting inode block (ino=%llu)", err, ii->vfs_inode.i_ino); return err; } diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 
9cc7eb863643..0f731eddeb8b 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -84,7 +84,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { - seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ", + seq_printf(m, "inotify wd:%x ino:%llx sdev:%x mask:%x ignored_mask:0 ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, inotify_mark_user_mask(mark)); show_mark_fhandle(m, inode); @@ -111,7 +111,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode = igrab(fsnotify_conn_inode(mark->connector)); if (!inode) return; - seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", + seq_printf(m, "fanotify ino:%llx sdev:%x mflags:%x mask:%x ignored_mask:%x ", inode->i_ino, inode->i_sb->s_dev, mflags, mark->mask, mark->ignore_mask); show_mark_fhandle(m, inode); diff --git a/fs/nsfs.c b/fs/nsfs.c index db91de208645..eac326b85314 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -46,7 +46,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) struct ns_common *ns = inode->i_private; const struct proc_ns_operations *ns_ops = ns->ops; - return dynamic_dname(buffer, buflen, "%s:[%lu]", + return dynamic_dname(buffer, buflen, "%s:[%llu]", ns_ops->name, inode->i_ino); } @@ -394,7 +394,7 @@ static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) const struct ns_common *ns = inode->i_private; const struct proc_ns_operations *ns_ops = ns->ops; - seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); + seq_printf(seq, "%s:[%llu]", ns_ops->name, inode->i_ino); return 0; } diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 174a7cb202a0..51aa008e126a 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -153,7 +153,7 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) 
vaf.fmt = printk_skip_level(fmt); vaf.va = &args; - printk("%c%cntfs3(%s): ino=%lx,%s %pV\n", KERN_SOH_ASCII, level, + printk("%c%cntfs3(%s): ino=%llx,%s %pV\n", KERN_SOH_ASCII, level, sb->s_id, inode->i_ino, name ? name : "", &vaf); va_end(args); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 344fd4d95fbc..d40f5d205bce 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7318,7 +7318,7 @@ start: * to check it up here before changing the tree. */ if (root_el->l_tree_depth && rec->e_int_clusters == 0) { - mlog(ML_ERROR, "Inode %lu has an empty " + mlog(ML_ERROR, "Inode %llu has an empty " "extent record, depth %u\n", inode->i_ino, le16_to_cpu(root_el->l_tree_depth)); status = ocfs2_remove_rightmost_empty_extent(osb, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 17ba79f443ee..c7ad912ec7a0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -137,7 +137,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, (unsigned long long)iblock, bh_result, create); if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) - mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", + mlog(ML_NOTICE, "get_block on system inode 0x%p (%llu)\n", inode, inode->i_ino); if (S_ISLNK(inode->i_mode)) { @@ -2146,7 +2146,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, ((iblock + ((len - 1) >> i_blkbits)) > endblk)) len = (endblk - iblock + 1) << i_blkbits; - mlog(0, "get block of %lu at %llu:%u req %u\n", + mlog(0, "get block of %llu at %llu:%u req %u\n", inode->i_ino, pos, len, total_len); /* diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 1c8abf2c592c..b82fe4431eb1 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -794,7 +794,7 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, if (le16_to_cpu(el->l_count) != ocfs2_extent_recs_per_dx_root(inode->i_sb)) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has invalid extent list length %u\n", + "Inode %llu has invalid extent list length %u\n", inode->i_ino, 
le16_to_cpu(el->l_count)); goto out; } @@ -812,7 +812,7 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in btree tree block %llu\n", + "Inode %llu has non zero tree depth in btree tree block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); goto out; @@ -821,7 +821,7 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has empty extent list at depth %u\n", + "Inode %llu has empty extent list at depth %u\n", inode->i_ino, le16_to_cpu(el->l_tree_depth)); goto out; @@ -839,7 +839,7 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, if (!found) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has bad extent record (%u, %u, 0) in btree\n", + "Inode %llu has bad extent record (%u, %u, 0) in btree\n", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 45cce261da65..5821e33df78f 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -123,7 +123,7 @@ static int dlmfs_file_open(struct inode *inode, if (S_ISDIR(inode->i_mode)) BUG(); - mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino, + mlog(0, "open called on inode %llu, flags 0x%x\n", inode->i_ino, file->f_flags); status = dlmfs_decode_open_flags(file->f_flags, &level, &flags); @@ -170,7 +170,7 @@ static int dlmfs_file_release(struct inode *inode, if (S_ISDIR(inode->i_mode)) BUG(); - mlog(0, "close called on inode %lu\n", inode->i_ino); + mlog(0, "close called on inode %llu\n", inode->i_ino); if (fp) { level = fp->fp_lock_level; @@ -242,7 +242,7 @@ static ssize_t dlmfs_file_write(struct file *filp, int bytes_left; struct inode *inode = file_inode(filp); - mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + mlog(0, "inode %llu, count = %zu, *ppos = %llu\n", inode->i_ino, count, *ppos); if (*ppos >= 
DLM_LVB_LEN) @@ -301,7 +301,7 @@ static void dlmfs_evict_inode(struct inode *inode) clear_inode(inode); - mlog(0, "inode %lu\n", inode->i_ino); + mlog(0, "inode %llu\n", inode->i_ino); ip = DLMFS_I(inode); lockres = &ip->ip_lockres; @@ -490,7 +490,7 @@ static int dlmfs_unlink(struct inode *dir, int status; struct inode *inode = d_inode(dentry); - mlog(0, "unlink inode %lu\n", inode->i_ino); + mlog(0, "unlink inode %llu\n", inode->i_ino); /* if there are no current holders, or none that are waiting * to acquire a lock, this basically destroys our lockres. */ diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index d68229422dda..eb5dcd17d437 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -291,7 +291,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in leaf block %llu\n", + "Inode %llu has non zero tree depth in leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; @@ -427,7 +427,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in leaf block %llu\n", + "Inode %llu has non zero tree depth in leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; @@ -437,7 +437,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { ocfs2_error(inode->i_sb, - "Inode %lu has an invalid extent (next_free_rec %u, count %u)\n", + "Inode %llu has an invalid extent (next_free_rec %u, count %u)\n", inode->i_ino, le16_to_cpu(el->l_next_free_rec), le16_to_cpu(el->l_count)); @@ -472,7 +472,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (!rec->e_blkno) { ocfs2_error(inode->i_sb, - "Inode %lu has bad extent record (%u, %u, 0)\n", + "Inode %llu has bad extent record (%u, %u, 0)\n", inode->i_ino, le32_to_cpu(rec->e_cpos), 
ocfs2_rec_clusters(el, rec)); @@ -561,7 +561,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in xattr leaf block %llu\n", + "Inode %llu has non zero tree depth in xattr leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; @@ -580,7 +580,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (!rec->e_blkno) { ocfs2_error(inode->i_sb, - "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + "Inode %llu has bad extent record (%u, %u, 0) in xattr\n", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 03a51662ea8e..26025ba2656c 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1196,7 +1196,7 @@ static void ocfs2_clear_inode(struct inode *inode) inode->i_nlink); mlog_bug_on_msg(osb == NULL, - "Inode=%lu\n", inode->i_ino); + "Inode=%llu\n", inode->i_ino); dquot_drop(inode); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index c4e0117d8977..269b0f27d567 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -471,7 +471,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, qsize_t spacechange, inodechange; unsigned int memalloc; - trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type); + trace_ocfs2_recover_local_quota_file(lqinode->i_ino, type); list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { chunk = rchunk->rc_chunk; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index c1cdececdfa4..6d7f44d3e929 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2341,7 +2341,7 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super 
block\n", + ret = ocfs2_error(inode->i_sb, "Inode %llu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); goto out; } @@ -2524,7 +2524,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + ret = ocfs2_error(inode->i_sb, "Inode %llu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); goto out; } @@ -2650,7 +2650,7 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in leaf block %llu\n", + "Inode %llu has non zero tree depth in leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); goto out; @@ -2662,7 +2662,7 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, rec = &el->l_recs[i]; if (ocfs2_is_empty_extent(rec)) { - mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " + mlog_bug_on_msg(i != 0, "Inode %llu has empty record in " "index %d\n", inode->i_ino, i); continue; } @@ -3325,7 +3325,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(osb)) { - return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + return ocfs2_error(inode->i_sb, "Inode %llu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 42ee5db362d3..4d55ad963ac5 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3741,7 +3741,7 @@ static int ocfs2_xattr_get_rec(struct inode *inode, if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has non 
zero tree depth in xattr tree block %llu\n", + "Inode %llu has non zero tree depth in xattr tree block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); goto out; @@ -3758,7 +3758,7 @@ static int ocfs2_xattr_get_rec(struct inode *inode, } if (!e_blkno) { - ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + ret = ocfs2_error(inode->i_sb, "Inode %llu has bad extent record (%u, %u, 0) in xattr\n", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 2d4710d0e05e..9e8a2a9e5229 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -1062,7 +1062,7 @@ struct inode *orangefs_iget(struct super_block *sb, unlock_new_inode(inode); gossip_debug(GOSSIP_INODE_DEBUG, - "iget handle %pU, fsid %d hash %ld i_ino %lu\n", + "iget handle %pU, fsid %d hash %ld i_ino %llu\n", &ref->khandle, ref->fs_id, hash, diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 83f80fdb1567..0a35d1a20f13 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -262,7 +262,7 @@ out: return err; fail: - pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n", + pr_warn_ratelimited("failed to encode file handle (ino=%llu, err=%i)\n", inode->i_ino, err); goto out; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index d8dd4b052984..ca899fdfaafd 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -591,7 +591,7 @@ out: fail: inode = d_inode(real); - pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n", + pr_warn_ratelimited("failed to verify %s (%pd2, ino=%llu, err=%i)\n", is_upper ? "upper" : "origin", real, inode ? 
inode->i_ino : 0, err); goto out; @@ -831,7 +831,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, index = NULL; goto out; } - pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" + pr_warn_ratelimited("failed inode index lookup (ino=%llu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 3f1b763a8bb4..2edad9a14648 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1092,7 +1092,7 @@ static void ovl_cleanup_index(struct dentry *dentry) got_write = true; inode = d_inode(upperdentry); if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { - pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", + pr_warn_ratelimited("cleanup linked index (%pd2, ino=%llu, nlink=%u)\n", upperdentry, inode->i_ino, inode->i_nlink); /* * We either have a bug with persistent union nlink or a lower diff --git a/fs/pipe.c b/fs/pipe.c index b44a756c0b41..9841648c9cf3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -873,7 +873,7 @@ static struct vfsmount *pipe_mnt __ro_after_init; */ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(buffer, buflen, "pipe:[%lu]", + return dynamic_dname(buffer, buflen, "pipe:[%llu]", d_inode(dentry)->i_ino); } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 9eeccff49b2a..aae1a83e8846 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -54,7 +54,7 @@ static int seq_show(struct seq_file *m, void *v) if (ret) return ret; - seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n", + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%llu\n", (long long)file->f_pos, f_flags, real_mount(file->f_path.mnt)->mnt_id, file_inode(file)->i_ino); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e091931d7ca1..751b9ba160fb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ 
-442,7 +442,7 @@ static void get_vma_name(struct vm_area_struct *vma, static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, - dev_t dev, unsigned long ino) + dev_t dev, u64 ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_put_hex_ll(m, NULL, start, 8); @@ -465,7 +465,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) const struct path *path; const char *name_fmt, *name; vm_flags_t flags = vma->vm_flags; - unsigned long ino = 0; + u64 ino = 0; unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 8aeb63d397cf..4deb0eeadbde 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -62,7 +62,7 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h { unsigned long phys; - QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock)); + QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%llu] iblock=[%ld]\n", inode->i_ino, iblock)); phys = qnx4_block_map( inode, iblock ); if ( phys ) { @@ -128,7 +128,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock ) brelse( bh ); } - QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block)); + QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %llu = %ld\n", iblock, inode->i_ino, block)); return block; } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index c4049bb8bd60..6de49333acad 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -75,7 +75,7 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock, { unsigned phys; - pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n", + pr_debug("qnx6_get_block inode=[%llu] iblock=[%ld]\n", inode->i_ino, (unsigned long)iblock); phys = qnx6_block_map(inode, iblock); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 160c16aa7b6e..5794de5a9069 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c 
@@ -230,7 +230,7 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) int count = 2; pr_err("Dump in-memory inode:"); - pr_err("\tinode %lu\n", inode->i_ino); + pr_err("\tinode %llu\n", inode->i_ino); pr_err("\tsize %llu\n", (unsigned long long)i_size_read(inode)); pr_err("\tnlink %u\n", inode->i_nlink); @@ -1101,7 +1101,7 @@ int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode) if (ui->ui_size != ui->synced_i_size && !ui->dirty) { ubifs_err(c, "ui_size is %lld, synced_i_size is %lld, but inode is clean", ui->ui_size, ui->synced_i_size); - ubifs_err(c, "i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, + ubifs_err(c, "i_ino %llu, i_mode %#x, i_size %lld", inode->i_ino, inode->i_mode, i_size_read(inode)); dump_stack(); err = -EINVAL; @@ -1163,7 +1163,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) kfree(pdent); if (i_size_read(dir) != size) { - ubifs_err(c, "directory inode %lu has size %llu, but calculated size is %llu", + ubifs_err(c, "directory inode %llu has size %llu, but calculated size is %llu", dir->i_ino, (unsigned long long)i_size_read(dir), (unsigned long long)size); ubifs_dump_inode(c, dir); @@ -1171,7 +1171,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) return -EINVAL; } if (dir->i_nlink != nlink) { - ubifs_err(c, "directory inode %lu has nlink %u, but calculated nlink is %u", + ubifs_err(c, "directory inode %llu has nlink %u, but calculated nlink is %u", dir->i_ino, dir->i_nlink, nlink); ubifs_dump_inode(c, dir); dump_stack(); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 4c9f57f3b2ad..86d41e077e4d 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -223,7 +223,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, struct ubifs_info *c = dir->i_sb->s_fs_info; struct fscrypt_name nm; - dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); + dbg_gen("'%pd' in dir ino %llu", dentry, dir->i_ino); err = fscrypt_prepare_lookup(dir, dentry, 
&nm); if (err == -ENOENT) @@ -281,7 +281,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu", + ubifs_warn(c, "Inconsistent encryption contexts: %llu/%llu", dir->i_ino, inode->i_ino); iput(inode); inode = ERR_PTR(-EPERM); @@ -318,7 +318,7 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, * parent directory inode. */ - dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dbg_gen("dent '%pd', mode %#hx in dir ino %llu", dentry, mode, dir->i_ino); err = ubifs_budget_space(c, &req); @@ -386,7 +386,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) * atomically. */ - dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dbg_gen("dent '%pd', mode %#hx in dir ino %llu", dentry, mode, dir->i_ino); inode = ubifs_new_inode(c, dir, mode, false); @@ -460,7 +460,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, * be released via writeback. */ - dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dbg_gen("dent '%pd', mode %#hx in dir ino %llu", dentry, mode, dir->i_ino); err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); @@ -589,7 +589,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) bool encrypted = IS_ENCRYPTED(dir); struct ubifs_dir_data *data = file->private_data; - dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); + dbg_gen("dir ino %llu, f_pos %#llx", dir->i_ino, ctx->pos); if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2) /* @@ -764,7 +764,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, * changing the parent inode. 
*/ - dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu", + dbg_gen("dent '%pd' to ino %llu (nlink %d) in dir ino %llu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); ubifs_assert(c, inode_is_locked(dir)); @@ -836,7 +836,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) * deletions. */ - dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", + dbg_gen("dent '%pd' from ino %llu (nlink %d) in dir ino %llu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); @@ -941,7 +941,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) * because we have extra space reserved for deletions. */ - dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry, + dbg_gen("directory '%pd', ino %llu in dir ino %llu", dentry, inode->i_ino, dir->i_ino); ubifs_assert(c, inode_is_locked(dir)); ubifs_assert(c, inode_is_locked(inode)); @@ -1018,7 +1018,7 @@ static struct dentry *ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, * directory inode. */ - dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dbg_gen("dent '%pd', mode %#hx in dir ino %llu", dentry, mode, dir->i_ino); err = ubifs_budget_space(c, &req); @@ -1096,7 +1096,7 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, * directory inode. */ - dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino); + dbg_gen("dent '%pd' in dir ino %llu", dentry, dir->i_ino); if (S_ISBLK(mode) || S_ISCHR(mode)) { dev = kmalloc_obj(union ubifs_dev_desc, GFP_NOFS); @@ -1183,7 +1183,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, .dirtied_ino = 1 }; struct fscrypt_name nm; - dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry, + dbg_gen("dent '%pd', target '%s' in dir ino %llu", dentry, symname, dir->i_ino); err = fscrypt_prepare_symlink(dir, symname, len, UBIFS_MAX_INO_DATA, @@ -1349,7 +1349,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, * ino_req: marks the target inode as dirty and does not write it. 
*/ - dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x", + dbg_gen("dent '%pd' ino %llu in dir ino %llu to dent '%pd' in dir ino %llu flags 0x%x", old_dentry, old_inode->i_ino, old_dir->i_ino, new_dentry, new_dir->i_ino, flags); @@ -1597,7 +1597,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, * parent directory inodes. */ - dbg_gen("dent '%pd' ino %lu in dir ino %lu exchange dent '%pd' ino %lu in dir ino %lu", + dbg_gen("dent '%pd' ino %llu in dir ino %llu exchange dent '%pd' ino %llu in dir ino %llu", old_dentry, fst_inode->i_ino, old_dir->i_ino, new_dentry, snd_inode->i_ino, new_dir->i_ino); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index cd04755e792a..e73c28b12f97 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -90,7 +90,7 @@ static int read_block(struct inode *inode, struct folio *folio, size_t offset, return 0; dump: - ubifs_err(c, "bad data node (block %u, inode %lu)", + ubifs_err(c, "bad data node (block %u, inode %llu)", block, inode->i_ino); ubifs_dump_node(c, dn, UBIFS_MAX_DATA_NODE_SZ); return -EINVAL; @@ -106,7 +106,7 @@ static int do_readpage(struct folio *folio) loff_t i_size = i_size_read(inode); size_t offset = 0; - dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + dbg_gen("ino %llu, pg %lu, i_size %lld, flags %#lx", inode->i_ino, folio->index, i_size, folio->flags.f); ubifs_assert(c, !folio_test_checked(folio)); ubifs_assert(c, !folio->private); @@ -162,7 +162,7 @@ static int do_readpage(struct folio *folio) dbg_gen("hole"); err = 0; } else { - ubifs_err(c, "cannot read page %lu of inode %lu, error %d", + ubifs_err(c, "cannot read page %lu of inode %llu, error %d", folio->index, inode->i_ino, err); } } @@ -212,7 +212,7 @@ static int write_begin_slow(struct address_space *mapping, int err, appending = !!(pos + len > inode->i_size); struct folio *folio; - dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", + dbg_gen("ino %llu, pos %llu, len %u, i_size %lld", inode->i_ino, 
pos, len, inode->i_size); /* @@ -526,7 +526,7 @@ static int ubifs_write_end(const struct kiocb *iocb, loff_t end_pos = pos + len; int appending = !!(end_pos > inode->i_size); - dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", + dbg_gen("ino %llu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", inode->i_ino, pos, folio->index, len, copied, inode->i_size); if (unlikely(copied < len && !folio_test_uptodate(folio))) { @@ -599,7 +599,7 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, size_t offset = 0; pgoff_t end_index; - dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + dbg_gen("ino %llu, pg %lu, i_size %lld, flags %#lx", inode->i_ino, folio->index, i_size, folio->flags.f); end_index = (i_size - 1) >> PAGE_SHIFT; @@ -680,7 +680,7 @@ out_hole: return 0; out_err: - ubifs_err(c, "bad data node (block %u, inode %lu)", + ubifs_err(c, "bad data node (block %u, inode %llu)", page_block, inode->i_ino); return -EINVAL; } @@ -913,7 +913,7 @@ static int do_writepage(struct folio *folio, size_t len) } if (err) { mapping_set_error(folio->mapping, err); - ubifs_err(c, "cannot write folio %lu of inode %lu, error %d", + ubifs_err(c, "cannot write folio %lu of inode %llu, error %d", folio->index, inode->i_ino, err); ubifs_ro_mode(c, err); } @@ -987,7 +987,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc) loff_t i_size = i_size_read(inode), synced_i_size; int err, len = folio_size(folio); - dbg_gen("ino %lu, pg %lu, pg flags %#lx", + dbg_gen("ino %llu, pg %lu, pg flags %#lx", inode->i_ino, folio->index, folio->flags.f); ubifs_assert(c, folio->private != NULL); @@ -1106,7 +1106,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; struct ubifs_inode *ui = ubifs_inode(inode); - dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); + dbg_gen("ino %llu, size %lld -> %lld", inode->i_ino, old_size, new_size); 
memset(&req, 0, sizeof(struct ubifs_budget_req)); /* @@ -1258,7 +1258,7 @@ int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode = d_inode(dentry); struct ubifs_info *c = inode->i_sb->s_fs_info; - dbg_gen("ino %lu, mode %#x, ia_valid %#x", + dbg_gen("ino %llu, mode %#x, ia_valid %#x", inode->i_ino, inode->i_mode, attr->ia_valid); err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) @@ -1308,7 +1308,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct ubifs_info *c = inode->i_sb->s_fs_info; int err; - dbg_gen("syncing inode %lu", inode->i_ino); + dbg_gen("syncing inode %llu", inode->i_ino); if (c->ro_mount) /* @@ -1495,7 +1495,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; - dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, folio->index, + dbg_gen("ino %llu, pg %lu, i_size %lld", inode->i_ino, folio->index, i_size_read(inode)); ubifs_assert(c, !c->ro_media && !c->ro_mount); @@ -1531,7 +1531,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) err = ubifs_budget_space(c, &req); if (unlikely(err)) { if (err == -ENOSPC) - ubifs_warn(c, "out of space for mmapped file (inode number %lu)", + ubifs_warn(c, "out of space for mmapped file (inode number %llu)", inode->i_ino); return VM_FAULT_SIGBUS; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index e28ab4395e5c..40a95a2fad50 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -982,7 +982,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) int kill_xattrs = ui->xattr_cnt && last_reference; u8 hash[UBIFS_HASH_ARR_SZ]; - dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); + dbg_jnl("ino %llu, nlink %u", inode->i_ino, inode->i_nlink); if (kill_xattrs && ui->xattr_cnt > ubifs_xattr_max_cnt(c)) { ubifs_err(c, "Cannot delete inode, it has too many xattrs!"); @@ -1743,7 +1743,7 @@ int 
ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, int dn_len = le32_to_cpu(dn->size); if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { - ubifs_err(c, "bad data node (block %u, inode %lu)", + ubifs_err(c, "bad data node (block %u, inode %llu)", blk, inode->i_ino); ubifs_dump_node(c, dn, dn_size); err = -EUCLEAN; @@ -1987,7 +1987,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, u8 hash_host[UBIFS_HASH_ARR_SZ]; u8 hash[UBIFS_HASH_ARR_SZ]; - dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); + dbg_jnl("ino %llu, ino %llu", host->i_ino, inode->i_ino); ubifs_assert(c, inode->i_nlink > 0); ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex)); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 03bf924756ca..9a77d8b64ffa 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -92,7 +92,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) return 5; if (!ubifs_compr_present(c, ui->compr_type)) { - ubifs_warn(c, "inode %lu uses '%s' compression, but it was not compiled in", + ubifs_warn(c, "inode %llu uses '%s' compression, but it was not compiled in", inode->i_ino, ubifs_compr_name(c, ui->compr_type)); } @@ -248,14 +248,14 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) return inode; out_invalid: - ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err); + ubifs_err(c, "inode %llu validation failed, error %d", inode->i_ino, err); ubifs_dump_node(c, ino, UBIFS_MAX_INO_NODE_SZ); ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: kfree(ino); out: - ubifs_err(c, "failed to read inode %lu, error %d", inode->i_ino, err); + ubifs_err(c, "failed to read inode %llu, error %d", inode->i_ino, err); iget_failed(inode); return ERR_PTR(err); } @@ -316,12 +316,12 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) * As an optimization, do not write orphan inodes to the media just * because this is not needed. 
*/ - dbg_gen("inode %lu, mode %#x, nlink %u", + dbg_gen("inode %llu, mode %#x, nlink %u", inode->i_ino, (int)inode->i_mode, inode->i_nlink); if (inode->i_nlink) { err = ubifs_jnl_write_inode(c, inode); if (err) - ubifs_err(c, "can't write inode %lu, error %d", + ubifs_err(c, "can't write inode %llu, error %d", inode->i_ino, err); else err = dbg_check_inode_size(c, inode, ui->ui_size); @@ -357,7 +357,7 @@ static void ubifs_evict_inode(struct inode *inode) */ goto out; - dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); + dbg_gen("inode %llu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); @@ -375,7 +375,7 @@ static void ubifs_evict_inode(struct inode *inode) * Worst case we have a lost orphan inode wasting space, so a * simple error message is OK here. */ - ubifs_err(c, "can't delete inode %lu, error %d", + ubifs_err(c, "can't delete inode %llu, error %d", inode->i_ino, err); out: @@ -399,7 +399,7 @@ static void ubifs_dirty_inode(struct inode *inode, int flags) ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); if (!ui->dirty) { ui->dirty = 1; - dbg_gen("inode %lu", inode->i_ino); + dbg_gen("inode %llu", inode->i_ino); } } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 694b08d27d7d..c9d8935f6678 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -3561,8 +3561,8 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, out_dump: block = key_block(c, key); - ubifs_err(c, "inode %lu has size %lld, but there are data at offset %lld", - (unsigned long)inode->i_ino, size, + ubifs_err(c, "inode %llu has size %lld, but there are data at offset %lld", + inode->i_ino, size, ((loff_t)block) << UBIFS_BLOCK_SHIFT); mutex_unlock(&c->tnc_mutex); ubifs_dump_inode(c, inode); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index c21a0c2b3e90..b5a9ab9d8a10 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -76,7 +76,7 @@ static int create_xattr(struct ubifs_info 
*c, struct inode *host, .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; if (host_ui->xattr_cnt >= ubifs_xattr_max_cnt(c)) { - ubifs_err(c, "inode %lu already has too many xattrs (%d), cannot create more", + ubifs_err(c, "inode %llu already has too many xattrs (%d), cannot create more", host->i_ino, host_ui->xattr_cnt); return -ENOSPC; } @@ -88,7 +88,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, */ names_len = host_ui->xattr_names + host_ui->xattr_cnt + fname_len(nm) + 1; if (names_len > XATTR_LIST_MAX) { - ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d", + ubifs_err(c, "cannot add one more xattr name to inode %llu, total names length would become %d, max. is %d", host->i_ino, names_len, XATTR_LIST_MAX); return -ENOSPC; } @@ -390,7 +390,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) int err, len, written = 0; struct fscrypt_name nm = {0}; - dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino, + dbg_gen("ino %llu ('%pd'), buffer size %zd", host->i_ino, dentry, size); down_read(&host_ui->xattr_sem); @@ -498,7 +498,7 @@ int ubifs_purge_xattrs(struct inode *host) if (ubifs_inode(host)->xattr_cnt <= ubifs_xattr_max_cnt(c)) return 0; - ubifs_warn(c, "inode %lu has too many xattrs, doing a non-atomic deletion", + ubifs_warn(c, "inode %llu has too many xattrs, doing a non-atomic deletion", host->i_ino); down_write(&ubifs_inode(host)->xattr_sem); @@ -641,7 +641,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, &init_xattrs, NULL); if (err) { struct ubifs_info *c = dentry->i_sb->s_fs_info; - ubifs_err(c, "cannot initialize security for inode %lu, error %d", + ubifs_err(c, "cannot initialize security for inode %llu, error %d", inode->i_ino, err); } return err; @@ -652,7 +652,7 @@ static int xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - 
dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name, + dbg_gen("xattr '%s', ino %llu ('%pd'), buf size %zd", name, inode->i_ino, dentry, size); name = xattr_full_name(handler, name); @@ -665,7 +665,7 @@ static int xattr_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { - dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", + dbg_gen("xattr '%s', host ino %llu ('%pd'), size %zd", name, inode->i_ino, dentry, size); name = xattr_full_name(handler, name); diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 632453aa3893..f5c81e13eacb 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -22,7 +22,7 @@ static int udf_verify_fi(struct udf_fileident_iter *iter) if (iter->fi.descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has entry at pos %llu with incorrect tag %x\n", + "directory (ino %llu) has entry at pos %llu with incorrect tag %x\n", iter->dir->i_ino, (unsigned long long)iter->pos, le16_to_cpu(iter->fi.descTag.tagIdent)); return -EFSCORRUPTED; @@ -30,7 +30,7 @@ static int udf_verify_fi(struct udf_fileident_iter *iter) len = udf_dir_entry_len(&iter->fi); if (le16_to_cpu(iter->fi.lengthOfImpUse) & 3) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has entry at pos %llu with unaligned length of impUse field\n", + "directory (ino %llu) has entry at pos %llu with unaligned length of impUse field\n", iter->dir->i_ino, (unsigned long long)iter->pos); return -EFSCORRUPTED; } @@ -41,20 +41,20 @@ static int udf_verify_fi(struct udf_fileident_iter *iter) */ if (len > 1 << iter->dir->i_blkbits) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has too big (%u) entry at pos %llu\n", + "directory (ino %llu) has too big (%u) entry at pos %llu\n", iter->dir->i_ino, len, (unsigned long long)iter->pos); return -EFSCORRUPTED; } if (iter->pos + len > iter->dir->i_size) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has entry past directory size at pos %llu\n", 
+ "directory (ino %llu) has entry past directory size at pos %llu\n", iter->dir->i_ino, (unsigned long long)iter->pos); return -EFSCORRUPTED; } if (udf_dir_entry_len(&iter->fi) != sizeof(struct tag) + le16_to_cpu(iter->fi.descTag.descCRCLength)) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has entry where CRC length (%u) does not match entry length (%u)\n", + "directory (ino %llu) has entry where CRC length (%u) does not match entry length (%u)\n", iter->dir->i_ino, (unsigned)le16_to_cpu(iter->fi.descTag.descCRCLength), (unsigned)(udf_dir_entry_len(&iter->fi) - @@ -78,7 +78,7 @@ static int udf_copy_fi(struct udf_fileident_iter *iter) } if (iter->dir->i_size < iter->pos + sizeof(struct fileIdentDesc)) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has entry straddling EOF\n", + "directory (ino %llu) has entry straddling EOF\n", iter->dir->i_ino); return -EFSCORRUPTED; } @@ -184,7 +184,7 @@ static int udf_fiiter_advance_blk(struct udf_fileident_iter *iter) return 0; } udf_err(iter->dir->i_sb, - "extent after position %llu not allocated in directory (ino %lu)\n", + "extent after position %llu not allocated in directory (ino %llu)\n", (unsigned long long)iter->pos, iter->dir->i_ino); return -EFSCORRUPTED; } @@ -272,7 +272,7 @@ int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir, if (pos == dir->i_size) return 0; udf_err(dir->i_sb, - "position %llu not allocated in directory (ino %lu)\n", + "position %llu not allocated in directory (ino %llu)\n", (unsigned long long)pos, dir->i_ino); err = -EFSCORRUPTED; goto out; @@ -483,7 +483,7 @@ int udf_fiiter_append_blk(struct udf_fileident_iter *iter) &iter->loffset, &etype); if (err <= 0 || etype != (EXT_RECORDED_ALLOCATED >> 30)) { udf_err(iter->dir->i_sb, - "block %llu not allocated in directory (ino %lu)\n", + "block %llu not allocated in directory (ino %llu)\n", (unsigned long long)block, iter->dir->i_ino); return -EFSCORRUPTED; } diff --git a/fs/udf/file.c b/fs/udf/file.c index 
32ae7cfd72c5..b043fe10e5d6 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -133,7 +133,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int result; if (file_permission(filp, MAY_READ) != 0) { - udf_debug("no permission to access inode %lu\n", inode->i_ino); + udf_debug("no permission to access inode %llu\n", inode->i_ino); return -EPERM; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7fae8002344a..902f81729bd8 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -147,7 +147,7 @@ void udf_evict_inode(struct inode *inode) if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { udf_warn(inode->i_sb, - "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", + "Inode %llu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", inode->i_ino, inode->i_mode, (unsigned long long)inode->i_size, (unsigned long long)iinfo->i_lenExtents); @@ -1386,13 +1386,13 @@ reread: */ bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); if (!bh) { - udf_err(inode->i_sb, "(ino %lu) failed !bh\n", inode->i_ino); + udf_err(inode->i_sb, "(ino %llu) failed !bh\n", inode->i_ino); return -EIO; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { - udf_err(inode->i_sb, "(ino %lu) failed ident=%u\n", + udf_err(inode->i_sb, "(ino %llu) failed ident=%u\n", inode->i_ino, ident); goto out; } @@ -1641,7 +1641,7 @@ reread: udf_debug("METADATA BITMAP FILE-----\n"); break; default: - udf_err(inode->i_sb, "(ino %lu) failed unknown file type=%u\n", + udf_err(inode->i_sb, "(ino %llu) failed unknown file type=%u\n", inode->i_ino, fe->icbTag.fileType); goto out; } @@ -1942,7 +1942,7 @@ finish: if (do_sync) { sync_dirty_buffer(bh); if (buffer_write_io_error(bh)) { - udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", + udf_warn(inode->i_sb, "IO error syncing udf inode [%08llx]\n", 
inode->i_ino); err = -EIO; } @@ -2224,7 +2224,7 @@ int udf_next_aext(struct inode *inode, struct extent_position *epos, if (++indirections > UDF_MAX_INDIR_EXTS) { udf_err(inode->i_sb, - "too many indirect extents in inode %lu\n", + "too many indirect extents in inode %llu\n", inode->i_ino); return -EFSCORRUPTED; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 5f2e9a892bff..ccafcaa96809 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -550,7 +550,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) goto end_unlink; if (!inode->i_nlink) { - udf_debug("Deleting nonexistent file (%lu), %u\n", + udf_debug("Deleting nonexistent file (%llu), %u\n", inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } @@ -809,7 +809,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, &diriter); if (retval == -ENOENT) { udf_err(old_inode->i_sb, - "directory (ino %lu) has no '..' entry\n", + "directory (ino %llu) has no '..' entry\n", old_inode->i_ino); retval = -EFSCORRUPTED; } @@ -821,7 +821,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, old_dir->i_ino) { retval = -EFSCORRUPTED; udf_err(old_inode->i_sb, - "directory (ino %lu) has parent entry pointing to another inode (%lu != %u)\n", + "directory (ino %llu) has parent entry pointing to another inode (%llu != %u)\n", old_inode->i_ino, old_dir->i_ino, udf_get_lb_pblock(old_inode->i_sb, &tloc, 0)); goto out_oiter; @@ -869,7 +869,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter); if (retval) { udf_err(old_dir->i_sb, - "failed to find renamed entry again in directory (ino %lu)\n", + "failed to find renamed entry again in directory (ino %llu)\n", old_dir->i_ino); } else { udf_fiiter_delete_entry(&oiter); diff --git a/fs/udf/super.c b/fs/udf/super.c index 27f463fd1d89..3a2d66c7e856 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1166,7 +1166,7 @@ static int 
udf_fill_partdesc_info(struct super_block *sb, } map->s_uspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; - udf_debug("unallocSpaceTable (part %d) @ %lu\n", + udf_debug("unallocSpaceTable (part %d) @ %llu\n", p_index, map->s_uspace.s_table->i_ino); } diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 194ed3ab945e..628edfde3a9f 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -245,7 +245,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, sector_t end, i; struct buffer_head *head, *bh; - UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n", + UFSD("ENTER, ino %llu, count %u, oldb %llu, newb %llu\n", inode->i_ino, count, (unsigned long long)oldb, (unsigned long long)newb); @@ -340,7 +340,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, unsigned cgno, oldcount, newcount; u64 tmp, request, result; - UFSD("ENTER, ino %lu, fragment %llu, goal %llu, count %u\n", + UFSD("ENTER, ino %llu, fragment %llu, goal %llu, count %u\n", inode->i_ino, (unsigned long long)fragment, (unsigned long long)goal, count); @@ -583,7 +583,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, unsigned oldcg, i, j, k, allocsize; u64 result; - UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", + UFSD("ENTER, ino %llu, cgno %u, goal %llu, count %u\n", inode->i_ino, cgno, (unsigned long long)goal, count); sb = inode->i_sb; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 43f1578ab866..f10a50f7e78b 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -150,7 +150,7 @@ out: Ebadsize: ufs_error(sb, __func__, - "size of directory #%lu is not a multiple of chunk size", + "size of directory #%llu is not a multiple of chunk size", dir->i_ino ); goto fail; @@ -169,7 +169,7 @@ Espan: Einumber: error = "inode out of bounds"; bad_entry: - ufs_error(sb, __func__, "bad entry in directory #%lu: %s - " + ufs_error(sb, __func__, "bad entry in directory #%llu: %s - " "offset=%llu, rec_len=%d, name_len=%d", dir->i_ino, 
error, folio_pos(folio) + offs, rec_len, ufs_get_de_namlen(sb, p)); @@ -177,7 +177,7 @@ bad_entry: Eend: p = (struct ufs_dir_entry *)(kaddr + offs); ufs_error(sb, __func__, - "entry in directory #%lu spans the page boundary" + "entry in directory #%llu spans the page boundary" "offset=%llu", dir->i_ino, folio_pos(folio) + offs); fail: @@ -258,7 +258,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; - UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen); + UFSD("ENTER, dir_ino %llu, name %s, namlen %u\n", dir->i_ino, name, namelen); if (npages == 0 || namelen > UFS_MAXNAMLEN) goto out; @@ -434,7 +434,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) if (IS_ERR(kaddr)) { ufs_error(sb, __func__, - "bad page in #%lu", + "bad page in #%llu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return PTR_ERR(kaddr); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 73531827ecee..8e51f4630d18 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -63,7 +63,7 @@ void ufs_free_inode (struct inode * inode) int is_directory; unsigned ino, cg, bit; - UFSD("ENTER, ino %lu\n", inode->i_ino); + UFSD("ENTER, ino %llu\n", inode->i_ino); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -317,7 +317,7 @@ cg_found: bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); if (!bh) { ufs_warning(sb, "ufs_read_inode", - "unable to read inode %lu\n", + "unable to read inode %llu\n", inode->i_ino); err = -EIO; goto fail_remove_inode; @@ -336,7 +336,7 @@ cg_found: } mutex_unlock(&sbi->s_lock); - UFSD("allocating inode %lu\n", inode->i_ino); + UFSD("allocating inode %llu\n", inode->i_ino); UFSD("EXIT\n"); return inode; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e2b0a35de2a7..2a8728c87979 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -400,7 +400,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff 
mutex_lock(&UFS_I(inode)->truncate_mutex); - UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); + UFSD("ENTER, ino %llu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); if (unlikely(!depth)) { ufs_warning(sb, "ufs_get_block", "block > big"); err = -EIO; @@ -595,7 +595,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) struct super_block *sb = inode->i_sb; umode_t mode; - UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); + UFSD("Reading ufs2 inode, ino %llu\n", inode->i_ino); /* * Copy data to the in-core inode. */ @@ -662,7 +662,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); if (!bh) { - ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n", + ufs_warning(sb, "ufs_read_inode", "unable to read inode %llu\n", inode->i_ino); goto bad_inode; } @@ -793,17 +793,17 @@ static int ufs_update_inode(struct inode * inode, int do_sync) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct buffer_head * bh; - UFSD("ENTER, ino %lu\n", inode->i_ino); + UFSD("ENTER, ino %llu\n", inode->i_ino); if (inode->i_ino < UFS_ROOTINO || inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { - ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino); + ufs_warning (sb, "ufs_read_inode", "bad inode number (%llu)\n", inode->i_ino); return -1; } bh = sb_bread(sb, ufs_inotofsba(inode->i_ino)); if (!bh) { - ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino); + ufs_warning (sb, "ufs_read_inode", "unable to read inode %llu\n", inode->i_ino); return -1; } if (uspi->fs_magic == UFS2_MAGIC) { @@ -891,7 +891,7 @@ static void ufs_trunc_direct(struct inode *inode) unsigned int old_tail, new_tail; struct to_free ctx = {.inode = inode}; - UFSD("ENTER: ino %lu\n", inode->i_ino); + UFSD("ENTER: ino %llu\n", inode->i_ino); new_frags = DIRECT_FRAGMENT; // new_frags = first fragment 
past the new EOF @@ -956,7 +956,7 @@ static void ufs_trunc_direct(struct inode *inode) } } done: - UFSD("EXIT: ino %lu\n", inode->i_ino); + UFSD("EXIT: ino %llu\n", inode->i_ino); } static void free_full_branch(struct inode *inode, u64 ind_block, int depth) @@ -1169,7 +1169,7 @@ static int ufs_truncate(struct inode *inode, loff_t size) { int err = 0; - UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", + UFSD("ENTER: ino %llu, i_size: %llu, old_i_size: %llu\n", inode->i_ino, (unsigned long long)size, (unsigned long long)i_size_read(inode)); diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 0905f9a16b91..b8dc354ae90f 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -226,10 +226,10 @@ typedef __u16 __bitwise __fs16; * inode number to cylinder group number. * inode number to file system block address. */ -#define ufs_inotocg(x) ((x) / uspi->s_ipg) -#define ufs_inotocgoff(x) ((x) % uspi->s_ipg) +#define ufs_inotocg(x) ((unsigned int)(x) / uspi->s_ipg) +#define ufs_inotocgoff(x) ((unsigned int)(x) % uspi->s_ipg) #define ufs_inotofsba(x) (((u64)ufs_cgimin(ufs_inotocg(x))) + ufs_inotocgoff(x) / uspi->s_inopf) -#define ufs_inotofsbo(x) ((x) % uspi->s_inopf) +#define ufs_inotofsbo(x) ((unsigned int)(x) % uspi->s_inopf) /* * Compute the cylinder and rotational position of a cyl block addr. 
diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 034b1d82c355..dff6f74618de 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -203,7 +203,7 @@ struct folio *ufs_get_locked_folio(struct address_space *mapping, folio = read_mapping_folio(mapping, index, NULL); if (IS_ERR(folio)) { - printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %lu, index: %lu\n", + printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %llu, index: %lu\n", mapping->host->i_ino, index); return folio; } diff --git a/fs/verity/init.c b/fs/verity/init.c index d65206608583..3aa55dec88fc 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -50,7 +50,7 @@ void fsverity_msg(const struct inode *inode, const char *level, vaf.fmt = fmt; vaf.va = &args; if (inode) - printk("%sfs-verity (%s, inode %lu): %pV\n", + printk("%sfs-verity (%s, inode %llu): %pV\n", level, inode->i_sb->s_id, inode->i_ino, &vaf); else printk("%sfs-verity: %pV\n", level, &vaf); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index e83b2ec5e49f..9b646cb5335d 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -297,7 +297,7 @@ static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, */ if (isize != data_size) zonefs_warn(sb, - "inode %lu: invalid size %lld (should be %lld)\n", + "inode %llu: invalid size %lld (should be %lld)\n", inode->i_ino, isize, data_size); /* @@ -308,7 +308,7 @@ static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, */ if ((z->z_flags & ZONEFS_ZONE_OFFLINE) || (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { - zonefs_warn(sb, "inode %lu: read/write access disabled\n", + zonefs_warn(sb, "inode %llu: read/write access disabled\n", inode->i_ino); if (!(z->z_flags & ZONEFS_ZONE_OFFLINE)) z->z_flags |= ZONEFS_ZONE_OFFLINE; @@ -316,7 +316,7 @@ static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, data_size = 0; } else if ((z->z_flags & ZONEFS_ZONE_READONLY) || (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { 
- zonefs_warn(sb, "inode %lu: write access disabled\n", + zonefs_warn(sb, "inode %llu: write access disabled\n", inode->i_ino); if (!(z->z_flags & ZONEFS_ZONE_READONLY)) z->z_flags |= ZONEFS_ZONE_READONLY; @@ -402,7 +402,7 @@ void __zonefs_io_error(struct inode *inode, bool write) memalloc_noio_restore(noio_flag); if (ret != 1) { - zonefs_err(sb, "Get inode %lu zone information failed %d\n", + zonefs_err(sb, "Get inode %llu zone information failed %d\n", inode->i_ino, ret); zonefs_warn(sb, "remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; diff --git a/include/linux/fs.h b/include/linux/fs.h index b4f5e5fdbe4b..e820c14f9c5a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -783,7 +783,7 @@ struct inode { #endif /* Stat data, not accessed from path walking */ - unsigned long i_ino; + u64 i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 923b24b321cc..4084e926e284 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -344,7 +344,7 @@ out: static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { - pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " + pr_warn("ref_ctr %s failed for inode: 0x%llx offset: " "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, @@ -982,7 +982,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) static void ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) { - pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " + pr_warn("ref_ctr_offset mismatch. 
inode: 0x%llx offset: 0x%llx " "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) cur_uprobe->ref_ctr_offset, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index b816c56124ab..5fc54836dfa8 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1305,7 +1305,7 @@ static int nr_info_show(struct seq_file *seq, void *v) seq_printf(seq, "%-9s ", ax2asc(buf, &nr->user_addr)); seq_printf(seq, "%-9s ", ax2asc(buf, &nr->dest_addr)); seq_printf(seq, -"%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n", +"%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %llu\n", ax2asc(buf, &nr->source_addr), devname, nr->my_index, @@ -1329,7 +1329,7 @@ static int nr_info_show(struct seq_file *seq, void *v) nr->window, sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), - s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); + s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : (u64)0); bh_unlock_sock(s); } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 841d62481048..53557176b41e 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1479,7 +1479,7 @@ static int rose_info_show(struct seq_file *seq, void *v) callsign = ax2asc(buf, &rose->source_call); seq_printf(seq, - "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n", + "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %llu\n", rose2asc(rsbuf, &rose->source_addr), callsign, devname, @@ -1498,7 +1498,7 @@ static int rose_info_show(struct seq_file *seq, void *v) rose->idle / (60 * HZ), sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), - s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); + s->sk_socket ? 
SOCK_INODE(s->sk_socket)->i_ino : (u64)0); } return 0; diff --git a/net/socket.c b/net/socket.c index 136b98c54fb3..395c271afb1d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -373,7 +373,7 @@ static const struct super_operations sockfs_ops = { */ static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(buffer, buflen, "socket:[%lu]", + return dynamic_dname(buffer, buflen, "socket:[%llu]", d_inode(dentry)->i_ino); } diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c index 0412814a2295..7e0dbff8f538 100644 --- a/net/x25/x25_proc.c +++ b/net/x25/x25_proc.c @@ -96,7 +96,7 @@ static int x25_seq_socket_show(struct seq_file *seq, void *v) devname = x25->neighbour->dev->name; seq_printf(seq, "%-10s %-10s %-5s %3.3X %d %d %d %d %3lu %3lu " - "%3lu %3lu %3lu %5d %5d %ld\n", + "%3lu %3lu %3lu %5d %5d %llu\n", !x25->dest_addr.x25_addr[0] ? "*" : x25->dest_addr.x25_addr, !x25->source_addr.x25_addr[0] ? "*" : x25->source_addr.x25_addr, devname, x25->lci & 0x0FFF, x25->state, x25->vs, x25->vr, @@ -104,7 +104,7 @@ static int x25_seq_socket_show(struct seq_file *seq, void *v) x25->t21 / HZ, x25->t22 / HZ, x25->t23 / HZ, sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), - s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); + s->sk_socket ? 
SOCK_INODE(s->sk_socket)->i_ino : (u64)0); out: return 0; } diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 2f84bd23edb6..7b645f40e71c 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -149,7 +149,7 @@ static int aafs_count; static int aafs_show_path(struct seq_file *seq, struct dentry *dentry) { - seq_printf(seq, "%s:[%lu]", AAFS_NAME, d_inode(dentry)->i_ino); + seq_printf(seq, "%s:[%llu]", AAFS_NAME, d_inode(dentry)->i_ino); return 0; } @@ -2644,7 +2644,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer, char name[32]; int res; - res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME, + res = snprintf(name, sizeof(name), "%s:[%llu]", AAFS_NAME, d_inode(dentry)->i_ino); if (res > 0 && res < sizeof(name)) res = readlink_copy(buffer, buflen, name, strlen(name)); diff --git a/security/integrity/integrity_audit.c b/security/integrity/integrity_audit.c index 0ec5e4c22cb2..d8d9e5ff1cd2 100644 --- a/security/integrity/integrity_audit.c +++ b/security/integrity/integrity_audit.c @@ -62,7 +62,7 @@ void integrity_audit_message(int audit_msgno, struct inode *inode, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } audit_log_format(ab, " res=%d errno=%d", !result, errno); audit_log_end(ab); diff --git a/security/ipe/audit.c b/security/ipe/audit.c index 3f0deeb54912..93fb59fbddd6 100644 --- a/security/ipe/audit.c +++ b/security/ipe/audit.c @@ -153,7 +153,7 @@ void ipe_audit_match(const struct ipe_eval_ctx *const ctx, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } else { audit_log_format(ab, " dev=? 
ino=?"); } diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 7d623b00495c..737f5a263a8f 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -202,7 +202,7 @@ void audit_log_lsm_data(struct audit_buffer *ab, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } break; } @@ -215,7 +215,7 @@ void audit_log_lsm_data(struct audit_buffer *ab, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } break; } @@ -228,7 +228,7 @@ void audit_log_lsm_data(struct audit_buffer *ab, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); @@ -246,7 +246,7 @@ void audit_log_lsm_data(struct audit_buffer *ab, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } break; } @@ -265,7 +265,7 @@ void audit_log_lsm_data(struct audit_buffer *ab, } audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); rcu_read_unlock(); break; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d8224ea113d1..8f38de4d223e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1400,7 +1400,7 @@ static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry, if (rc < 0) { kfree(context); if (rc != -ENODATA) { - pr_warn("SELinux: %s: getxattr returned %d for dev=%s ino=%ld\n", + pr_warn("SELinux: %s: 
getxattr returned %d for dev=%s ino=%llu\n", __func__, -rc, inode->i_sb->s_id, inode->i_ino); return rc; } @@ -1412,13 +1412,13 @@ static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry, def_sid, GFP_NOFS); if (rc) { char *dev = inode->i_sb->s_id; - unsigned long ino = inode->i_ino; + u64 ino = inode->i_ino; if (rc == -EINVAL) { - pr_notice_ratelimited("SELinux: inode=%lu on dev=%s was found to have an invalid context=%s. This indicates you may need to relabel the inode or the filesystem in question.\n", + pr_notice_ratelimited("SELinux: inode=%llu on dev=%s was found to have an invalid context=%s. This indicates you may need to relabel the inode or the filesystem in question.\n", ino, dev, context); } else { - pr_warn("SELinux: %s: context_to_sid(%s) returned %d for dev=%s ino=%ld\n", + pr_warn("SELinux: %s: context_to_sid(%s) returned %d for dev=%s ino=%llu\n", __func__, context, -rc, dev, ino); } } @@ -3477,7 +3477,7 @@ static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name, &newsid); if (rc) { pr_err("SELinux: unable to map context to SID" - "for (%s, %lu), rc=%d\n", + "for (%s, %llu), rc=%d\n", inode->i_sb->s_id, inode->i_ino, -rc); return; } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 98af9d7b9434..2eb3368a3632 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -182,7 +182,7 @@ static int smk_bu_inode(struct inode *inode, int mode, int rc) char acc[SMK_NUM_ACCESS_TYPE + 1]; if (isp->smk_flags & SMK_INODE_IMPURE) - pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n", + pr_info("Smack Unconfined Corruption: inode=(%s %llu) %s\n", inode->i_sb->s_id, inode->i_ino, current->comm); if (rc <= 0) @@ -195,7 +195,7 @@ static int smk_bu_inode(struct inode *inode, int mode, int rc) smk_bu_mode(mode, acc); - pr_info("Smack %s: (%s %s %s) inode=(%s %ld) %s\n", smk_bu_mess[rc], + pr_info("Smack %s: (%s %s %s) inode=(%s %llu) %s\n", smk_bu_mess[rc], 
tsp->smk_task->smk_known, isp->smk_inode->smk_known, acc, inode->i_sb->s_id, inode->i_ino, current->comm); return 0; @@ -214,7 +214,7 @@ static int smk_bu_file(struct file *file, int mode, int rc) char acc[SMK_NUM_ACCESS_TYPE + 1]; if (isp->smk_flags & SMK_INODE_IMPURE) - pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n", + pr_info("Smack Unconfined Corruption: inode=(%s %llu) %s\n", inode->i_sb->s_id, inode->i_ino, current->comm); if (rc <= 0) @@ -223,7 +223,7 @@ static int smk_bu_file(struct file *file, int mode, int rc) rc = 0; smk_bu_mode(mode, acc); - pr_info("Smack %s: (%s %s %s) file=(%s %ld %pD) %s\n", smk_bu_mess[rc], + pr_info("Smack %s: (%s %s %s) file=(%s %llu %pD) %s\n", smk_bu_mess[rc], sskp->smk_known, smk_of_inode(inode)->smk_known, acc, inode->i_sb->s_id, inode->i_ino, file, current->comm); @@ -244,7 +244,7 @@ static int smk_bu_credfile(const struct cred *cred, struct file *file, char acc[SMK_NUM_ACCESS_TYPE + 1]; if (isp->smk_flags & SMK_INODE_IMPURE) - pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n", + pr_info("Smack Unconfined Corruption: inode=(%s %llu) %s\n", inode->i_sb->s_id, inode->i_ino, current->comm); if (rc <= 0) @@ -253,7 +253,7 @@ static int smk_bu_credfile(const struct cred *cred, struct file *file, rc = 0; smk_bu_mode(mode, acc); - pr_info("Smack %s: (%s %s %s) file=(%s %ld %pD) %s\n", smk_bu_mess[rc], + pr_info("Smack %s: (%s %s %s) file=(%s %llu %pD) %s\n", smk_bu_mess[rc], sskp->smk_known, smk_of_inode(inode)->smk_known, acc, inode->i_sb->s_id, inode->i_ino, file, current->comm); -- cgit v1.2.3 From 309a4f8ffd1d428e426cd7b80f9f8956490dd233 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 6 Mar 2026 10:00:10 +0100 Subject: ext4: fix signed format specifier in ext4_load_inode trace event The ext4_load_inode trace event uses %lld (signed) to print the ino field which is u64 (unsigned). Use %llu instead. 
Signed-off-by: Christian Brauner --- include/trace/events/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 84ef091af2d3..f493642cf121 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1781,7 +1781,7 @@ TRACE_EVENT(ext4_load_inode, __entry->ino = ino; ), - TP_printk("dev %d,%d ino %lld", + TP_printk("dev %d,%d ino %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino) ); -- cgit v1.2.3 From ebeca1f930eac8f11f815d58eb38fa5d07e7c16e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2026 07:58:03 -1000 Subject: sched_ext: Introduce cgroup sub-sched support A system often runs multiple workloads especially in multi-tenant server environments where a system is split into partitions servicing separate more-or-less independent workloads each requiring an application-specific scheduler. To support such and other use cases, sched_ext is in the process of growing multiple scheduler support. When partitioning a system in terms of CPUs for such use cases, an oft-taken approach is hard partitioning the system using cpuset. While it would be possible to tie sched_ext multiple scheduler support to cpuset partitions, such an approach would have fundamental limitations stemming from the lack of dynamism and flexibility. Users often don't care which specific CPUs are assigned to which workload and want to take advantage of optimizations which are enabled by running workloads on a larger machine - e.g. opportunistic over-commit, improving latency critical workload characteristics while maintaining bandwidth fairness, employing control mechanisms based on different criteria than on-CPU time for e.g. flexible memory bandwidth isolation, packing similar parts from different workloads on same L3s to improve cache efficiency, and so on. 
As this sort of dynamic behavior is impossible
- scx_is_descendant() is added but not yet used in this patch. It is used by later changes in the series and placed here as this is where the function belongs. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/cgroup-defs.h | 4 + include/linux/sched/ext.h | 3 + init/Kconfig | 4 + kernel/sched/ext.c | 532 ++++++++++++++++++++++++++++++++++++++--- kernel/sched/ext_internal.h | 67 +++++- tools/sched_ext/scx_qmap.bpf.c | 9 +- tools/sched_ext/scx_qmap.c | 13 +- 7 files changed, 596 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index bb92f5c169ca..dd61767cf9bb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -624,6 +625,9 @@ struct cgroup { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *bpf_cgrp_storage; #endif +#ifdef CONFIG_EXT_SUB_SCHED + struct scx_sched __rcu *scx_sched; +#endif /* All ancestors including self */ union { diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 0150b3fe6230..fa4349b319e6 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -78,6 +78,7 @@ struct scx_dispatch_q { u64 id; struct rhash_head hash_node; struct llist_node free_node; + struct scx_sched *sched; struct rcu_head rcu; }; @@ -157,6 +158,8 @@ struct scx_dsq_list_node { .priv = (__priv), \ } +struct scx_sched; + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. 
diff --git a/init/Kconfig b/init/Kconfig index b55deae9256c..06abd8e272cb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1176,6 +1176,10 @@ config EXT_GROUP_SCHED endif #CGROUP_SCHED +config EXT_SUB_SCHED + def_bool y + depends on SCHED_CLASS_EXT + config SCHED_MM_CID def_bool y depends on SMP && RSEQ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 142845bcddaa..bb3e33b660da 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,6 +9,8 @@ #include #include "ext_idle.h" +static DEFINE_RAW_SPINLOCK(scx_sched_lock); + /* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the @@ -19,6 +21,12 @@ */ static struct scx_sched __rcu *scx_root; +/* + * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. + * Readers can hold either or rcu_read_lock(). + */ +static LIST_HEAD(scx_sched_all); + /* * During exit, a task may schedule after losing its PIDs. When disabling the * BPF scheduler, we need to be able to iterate tasks in every state to @@ -197,6 +205,7 @@ static void process_ddsp_deferred_locals(struct rq *rq); static bool task_dead_and_done(struct task_struct *p); static u32 reenq_local(struct rq *rq); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); +static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); @@ -245,6 +254,88 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_parent - Find the parent sched + * @sch: sched to find the parent of + * + * Returns the parent scheduler or %NULL if @sch is root. 
+ */ +static struct scx_sched *scx_parent(struct scx_sched *sch) +{ + if (sch->level) + return sch->ancestors[sch->level - 1]; + else + return NULL; +} + +/** + * scx_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: sched whose descendants to walk + * + * To be used by scx_for_each_descendant_pre(). Find the next descendant to + * visit for pre-order traversal of @root's descendants. @root is included in + * the iteration and the first node to be visited. + */ +static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, + struct scx_sched *root) +{ + struct scx_sched *next; + + lockdep_assert(lockdep_is_held(&scx_enable_mutex) || + lockdep_is_held(&scx_sched_lock)); + + /* if first iteration, visit @root */ + if (!pos) + return root; + + /* visit the first child if exists */ + next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling); + if (next) + return next; + + /* no child, visit my or the closest ancestor's next sibling */ + while (pos != root) { + if (!list_is_last(&pos->sibling, &scx_parent(pos)->children)) + return list_next_entry(pos, sibling); + pos = scx_parent(pos); + } + + return NULL; +} +#else /* CONFIG_EXT_SUB_SCHED */ +static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } +static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } +#endif /* CONFIG_EXT_SUB_SCHED */ + +/** + * scx_is_descendant - Test whether sched is a descendant + * @sch: sched to test + * @ancestor: ancestor sched to test against + * + * Test whether @sch is a descendant of @ancestor. 
+ */ +static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor) +{ + if (sch->level < ancestor->level) + return false; + return sch->ancestors[ancestor->level] == ancestor; +} + +/** + * scx_for_each_descendant_pre - pre-order walk of a sched's descendants + * @pos: iteration cursor + * @root: sched to walk the descendants of + * + * Walk @root's descendants. @root is included in the iteration and the first + * node to be visited. Must be called with either scx_enable_mutex or + * scx_sched_lock held. + */ +#define scx_for_each_descendant_pre(pos, root) \ + for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \ + (pos) = scx_next_descendant_pre((pos), (root))) + static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, struct task_struct *p) { @@ -514,7 +605,7 @@ struct scx_task_iter { struct rq_flags rf; u32 cnt; bool list_locked; -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED struct cgroup *cgrp; struct cgroup_subsys_state *css_pos; struct css_task_iter css_iter; @@ -553,7 +644,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) { memset(iter, 0, sizeof(*iter)); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED if (cgrp) { lockdep_assert_held(&cgroup_mutex); iter->cgrp = cgrp; @@ -614,7 +705,7 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED if (iter->cgrp) { if (iter->css_pos) css_task_iter_end(&iter->css_iter); @@ -645,7 +736,7 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) cond_resched(); } -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED if (iter->cgrp) { while (iter->css_pos) { struct task_struct *p; @@ -3032,7 +3123,10 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork scx_set_task_state(p, SCX_TASK_INIT); if (p->scx.disallow) { - if (unlikely(fork)) { + if 
(unlikely(scx_parent(sch))) { + scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", + p->comm, p->pid); + } else if (unlikely(fork)) { scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", p->comm, p->pid); } else { @@ -3555,25 +3649,51 @@ void scx_group_set_bandwidth(struct task_group *tg, percpu_up_read(&scx_cgroup_ops_rwsem); } +#endif /* CONFIG_EXT_GROUP_SCHED */ + +#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +static struct cgroup *root_cgroup(void) +{ + return &cgrp_dfl_root.cgrp; +} + +static struct cgroup *sch_cgroup(struct scx_sched *sch) +{ + return sch->cgrp; +} + +/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) +{ + struct cgroup *pos; + struct cgroup_subsys_state *css; + + cgroup_for_each_live_descendant_pre(pos, css, cgrp) + rcu_assign_pointer(pos->scx_sched, sch); +} static void scx_cgroup_lock(void) { +#ifdef CONFIG_EXT_GROUP_SCHED percpu_down_write(&scx_cgroup_ops_rwsem); +#endif cgroup_lock(); } static void scx_cgroup_unlock(void) { cgroup_unlock(); +#ifdef CONFIG_EXT_GROUP_SCHED percpu_up_write(&scx_cgroup_ops_rwsem); +#endif } - -#else /* CONFIG_EXT_GROUP_SCHED */ - +#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ +static struct cgroup *root_cgroup(void) { return NULL; } +static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} static void scx_cgroup_lock(void) {} static void scx_cgroup_unlock(void) {} - -#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ /* * Omitted operations: @@ -3622,13 +3742,15 @@ DEFINE_SCHED_CLASS(ext) = { #endif }; -static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, + struct scx_sched *sch) { memset(dsq, 0, sizeof(*dsq)); 
raw_spin_lock_init(&dsq->lock); INIT_LIST_HEAD(&dsq->list); dsq->id = dsq_id; + dsq->sched = sch; } static void free_dsq_irq_workfn(struct irq_work *irq_work) @@ -3826,6 +3948,12 @@ static void scx_sched_free_rcu_work(struct work_struct *work) irq_work_sync(&sch->error_irq_work); kthread_destroy_worker(sch->helper); +#ifdef CONFIG_EXT_SUB_SCHED + kfree(sch->cgrp_path); + if (sch_cgroup(sch)) + cgroup_put(sch_cgroup(sch)); +#endif /* CONFIG_EXT_SUB_SCHED */ + free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) @@ -4405,6 +4533,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) return "unregistered from the main kernel"; case SCX_EXIT_SYSRQ: return "disabled by sysrq-S"; + case SCX_EXIT_PARENT: + return "parent exiting"; case SCX_EXIT_ERROR: return "runtime error"; case SCX_EXIT_ERROR_BPF: @@ -4430,6 +4560,69 @@ static void free_kick_syncs(void) } } +#ifdef CONFIG_EXT_SUB_SCHED +static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); + +static void drain_descendants(struct scx_sched *sch) +{ + /* + * Child scheds that finished the critical part of disabling will take + * themselves off @sch->children. Wait for it to drain. As propagation + * is recursive, empty @sch->children means that all proper descendant + * scheds reached unlinking stage. + */ + wait_event(scx_unlink_waitq, list_empty(&sch->children)); +} + +static void scx_sub_disable(struct scx_sched *sch) +{ + struct scx_sched *parent = scx_parent(sch); + + drain_descendants(sch); + + mutex_lock(&scx_enable_mutex); + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + set_cgroup_sched(sch_cgroup(sch), parent); + + /* TODO - perform actual disabling here */ + + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + raw_spin_lock_irq(&scx_sched_lock); + list_del_init(&sch->sibling); + list_del_rcu(&sch->all); + raw_spin_unlock_irq(&scx_sched_lock); + + mutex_unlock(&scx_enable_mutex); + + /* + * @sch is now unlinked from the parent's children list. 
Notify and call + * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called + * after unlinking and releasing all locks. See scx_claim_exit(). + */ + wake_up_all(&scx_unlink_waitq); + + if (sch->ops.sub_detach && sch->sub_attached) { + struct scx_sub_detach_args sub_detach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL, + &sub_detach_args); + } + + if (sch->ops.exit) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info); + kobject_del(&sch->kobj); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static void drain_descendants(struct scx_sched *sch) { } +static void scx_sub_disable(struct scx_sched *sch) { } +#endif /* CONFIG_EXT_SUB_SCHED */ + static void scx_root_disable(struct scx_sched *sch) { struct scx_exit_info *ei = sch->exit_info; @@ -4437,9 +4630,10 @@ static void scx_root_disable(struct scx_sched *sch) struct task_struct *p; int cpu; - /* guarantee forward progress by bypassing scx_ops */ + /* guarantee forward progress and wait for descendants to be disabled */ scx_bypass(true); WRITE_ONCE(scx_aborting, false); + drain_descendants(sch); switch (scx_set_enable_state(SCX_DISABLING)) { case SCX_DISABLING: @@ -4498,6 +4692,11 @@ static void scx_root_disable(struct scx_sched *sch) scx_exit_task(p); } scx_task_iter_stop(&sti); + + scx_cgroup_lock(); + set_cgroup_sched(sch_cgroup(sch), NULL); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); /* @@ -4534,6 +4733,10 @@ static void scx_root_disable(struct scx_sched *sch) cancel_delayed_work_sync(&scx_watchdog_work); + raw_spin_lock_irq(&scx_sched_lock); + list_del_rcu(&sch->all); + raw_spin_unlock_irq(&scx_sched_lock); + /* * scx_root clearing must be inside cpus_read_lock(). See * handle_hotplug(). @@ -4591,6 +4794,24 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) * successfully reach scx_bypass(). */ WRITE_ONCE(scx_aborting, true); + + /* + * Propagate exits to descendants immediately. 
Each has a dedicated + * helper kthread and can run in parallel. While most of disabling is + * serialized, running them in separate threads allows parallelizing + * ops.exit(), which can take arbitrarily long prolonging bypass mode. + * + * This doesn't cause recursions as propagation only takes place for + * non-propagation exits. + */ + if (kind != SCX_EXIT_PARENT) { + scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) { + struct scx_sched *pos; + scx_for_each_descendant_pre(pos, sch) + scx_disable(pos, SCX_EXIT_PARENT); + } + } + return true; } @@ -4611,7 +4832,10 @@ static void scx_disable_workfn(struct kthread_work *work) ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); - scx_root_disable(sch); + if (scx_parent(sch)) + scx_sub_disable(sch); + else + scx_root_disable(sch); } static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) @@ -4987,12 +5211,15 @@ static int alloc_kick_syncs(void) return 0; } -static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) +static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, + struct cgroup *cgrp, + struct scx_sched *parent) { struct scx_sched *sch; + s32 level = parent ? 
parent->level + 1 : 0; int node, ret; - sch = kzalloc_obj(*sch); + sch = kzalloc_flex(*sch, ancestors, level); if (!sch) return ERR_PTR(-ENOMEM); @@ -5021,7 +5248,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) goto err_free_gdsqs; } - init_dsq(dsq, SCX_DSQ_GLOBAL); + init_dsq(dsq, SCX_DSQ_GLOBAL, sch); sch->global_dsqs[node] = dsq; } @@ -5039,6 +5266,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sched_set_fifo(sch->helper->task); + if (parent) + memcpy(sch->ancestors, parent->ancestors, + level * sizeof(parent->ancestors[0])); + sch->ancestors[level] = sch; + sch->level = level; + atomic_set(&sch->exit_kind, SCX_EXIT_NONE); init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); @@ -5046,10 +5279,46 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) ops->priv = sch; sch->kobj.kset = scx_kset; + +#ifdef CONFIG_EXT_SUB_SCHED + char *buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto err_stop_helper; + cgroup_path(cgrp, buf, PATH_MAX); + sch->cgrp_path = kstrdup(buf, GFP_KERNEL); + kfree(buf); + if (!sch->cgrp_path) + goto err_stop_helper; + + sch->cgrp = cgrp; + INIT_LIST_HEAD(&sch->children); + INIT_LIST_HEAD(&sch->sibling); + + if (parent) + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, + &parent->sub_kset->kobj, + "sub-%llu", cgroup_id(cgrp)); + else + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + + if (ret < 0) { + kfree(sch->cgrp_path); + goto err_stop_helper; + } + + if (ops->sub_attach) { + sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); + if (!sch->sub_kset) { + kobject_put(&sch->kobj); + return ERR_PTR(-ENOMEM); + } + } + +#else /* CONFIG_EXT_SUB_SCHED */ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) goto err_stop_helper; - +#endif /* CONFIG_EXT_SUB_SCHED */ return sch; err_stop_helper: @@ -5157,7 +5426,7 @@ static void 
scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_unlock; - sch = scx_alloc_and_add_sched(ops); + sch = scx_alloc_and_add_sched(ops, root_cgroup(), NULL); if (IS_ERR(sch)) { ret = PTR_ERR(sch); goto err_free_ksyncs; @@ -5174,8 +5443,13 @@ static void scx_root_enable_workfn(struct kthread_work *work) atomic_long_set(&scx_nr_rejected, 0); - for_each_possible_cpu(cpu) - cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + rq->scx.local_dsq.sched = sch; + rq->scx.bypass_dsq.sched = sch; + rq->scx.cpuperf_target = SCX_CPUPERF_ONE; + } /* * Keep CPUs stable during enable so that the BPF scheduler can track @@ -5189,6 +5463,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) */ rcu_assign_pointer(scx_root, sch); + raw_spin_lock_irq(&scx_sched_lock); + list_add_tail_rcu(&sch->all, &scx_sched_all); + raw_spin_unlock_irq(&scx_sched_lock); + scx_idle_enable(ops); if (sch->ops.init) { @@ -5278,6 +5556,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) * never sees uninitialized tasks. */ scx_cgroup_lock(); + set_cgroup_sched(sch_cgroup(sch), sch); ret = scx_cgroup_init(sch); if (ret) goto err_disable_unlock_all; @@ -5392,6 +5671,185 @@ err_disable: cmd->ret = 0; } +#ifdef CONFIG_EXT_SUB_SCHED +/* verify that a scheduler can be attached to @cgrp and return the parent */ +static struct scx_sched *find_parent_sched(struct cgroup *cgrp) +{ + struct scx_sched *parent = cgrp->scx_sched; + struct scx_sched *pos; + + lockdep_assert_held(&scx_sched_lock); + + /* can't attach twice to the same cgroup */ + if (parent->cgrp == cgrp) + return ERR_PTR(-EBUSY); + + /* does $parent allow sub-scheds? 
*/ + if (!parent->ops.sub_attach) + return ERR_PTR(-EOPNOTSUPP); + + /* can't insert between $parent and its exiting children */ + list_for_each_entry(pos, &parent->children, sibling) + if (cgroup_is_descendant(pos->cgrp, cgrp)) + return ERR_PTR(-EBUSY); + + return parent; +} + +static void scx_sub_enable_workfn(struct kthread_work *work) +{ + struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); + struct sched_ext_ops *ops = cmd->ops; + struct cgroup *cgrp; + struct scx_sched *parent, *sch; + s32 ret; + + mutex_lock(&scx_enable_mutex); + + if (!scx_enabled()) { + ret = -ENODEV; + goto out_unlock; + } + + cgrp = cgroup_get_from_id(ops->sub_cgroup_id); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; + } + + raw_spin_lock_irq(&scx_sched_lock); + parent = find_parent_sched(cgrp); + if (IS_ERR(parent)) { + raw_spin_unlock_irq(&scx_sched_lock); + ret = PTR_ERR(parent); + goto out_put_cgrp; + } + kobject_get(&parent->kobj); + raw_spin_unlock_irq(&scx_sched_lock); + + sch = scx_alloc_and_add_sched(ops, cgrp, parent); + kobject_put(&parent->kobj); + if (IS_ERR(sch)) { + ret = PTR_ERR(sch); + goto out_put_cgrp; + } + + raw_spin_lock_irq(&scx_sched_lock); + list_add_tail(&sch->sibling, &parent->children); + list_add_tail_rcu(&sch->all, &scx_sched_all); + raw_spin_unlock_irq(&scx_sched_lock); + + if (sch->level >= SCX_SUB_MAX_DEPTH) { + scx_error(sch, "max nesting depth %d violated", + SCX_SUB_MAX_DEPTH); + goto err_disable; + } + + if (sch->ops.init) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); + if (ret) { + ret = ops_sanitize_err(sch, "init", ret); + scx_error(sch, "ops.init() failed (%d)", ret); + goto err_disable; + } + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; + } + + if (validate_ops(sch, ops)) + goto err_disable; + + struct scx_sub_attach_args sub_attach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + + ret = SCX_CALL_OP_RET(parent, SCX_KF_UNLOCKED, sub_attach, NULL, + 
&sub_attach_args); + if (ret) { + ret = ops_sanitize_err(sch, "sub_attach", ret); + scx_error(sch, "parent rejected (%d)", ret); + goto err_disable; + } + sch->sub_attached = true; + + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + /* + * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see + * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. + */ + set_cgroup_sched(sch_cgroup(sch), sch); + if (!(cgrp->self.flags & CSS_ONLINE)) { + scx_error(sch, "cgroup is not online"); + goto err_unlock_and_disable; + } + + /* TODO - perform actual enabling here */ + + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); + kobject_uevent(&sch->kobj, KOBJ_ADD); + ret = 0; + goto out_unlock; + +out_put_cgrp: + cgroup_put(cgrp); +out_unlock: + mutex_unlock(&scx_enable_mutex); + cmd->ret = ret; + return; + +err_unlock_and_disable: + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); +err_disable: + mutex_unlock(&scx_enable_mutex); + kthread_flush_work(&sch->disable_work); + cmd->ret = 0; +} + +static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct cgroup *cgrp = data; + struct cgroup *parent = cgroup_parent(cgrp); + + if (!cgroup_on_dfl(cgrp)) + return NOTIFY_OK; + + switch (action) { + case CGROUP_LIFETIME_ONLINE: + /* inherit ->scx_sched from $parent */ + if (parent) + rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); + break; + case CGROUP_LIFETIME_OFFLINE: + /* if there is a sched attached, shoot it down */ + if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) + scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, + SCX_ECODE_RSN_CGROUP_OFFLINE, + "cgroup %llu going offline", cgroup_id(cgrp)); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block scx_cgroup_lifetime_nb = { + .notifier_call = scx_cgroup_lifetime_notify, +}; + +static s32 __init 
scx_cgroup_lifetime_notifier_init(void) +{ + return blocking_notifier_chain_register(&cgroup_lifetime_notifier, + &scx_cgroup_lifetime_nb); +} +core_initcall(scx_cgroup_lifetime_notifier_init); +#endif /* CONFIG_EXT_SUB_SCHED */ + static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) { static struct kthread_worker *helper; @@ -5418,7 +5876,12 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_unlock(&helper_mutex); } - kthread_init_work(&cmd.work, scx_root_enable_workfn); +#ifdef CONFIG_EXT_SUB_SCHED + if (ops->sub_cgroup_id > 1) + kthread_init_work(&cmd.work, scx_sub_enable_workfn); + else +#endif /* CONFIG_EXT_SUB_SCHED */ + kthread_init_work(&cmd.work, scx_root_enable_workfn); cmd.ops = ops; kthread_queue_work(READ_ONCE(helper), &cmd.work); @@ -5520,6 +5983,11 @@ static int bpf_scx_init_member(const struct btf_type *t, case offsetof(struct sched_ext_ops, hotplug_seq): ops->hotplug_seq = *(u64 *)(udata + moff); return 1; +#ifdef CONFIG_EXT_SUB_SCHED + case offsetof(struct sched_ext_ops, sub_cgroup_id): + ops->sub_cgroup_id = *(u64 *)(udata + moff); + return 1; +#endif /* CONFIG_EXT_SUB_SCHED */ } return 0; @@ -5542,6 +6010,8 @@ static int bpf_scx_check_member(const struct btf_type *t, case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): + case offsetof(struct sched_ext_ops, sub_attach): + case offsetof(struct sched_ext_ops, sub_detach): break; default: if (prog->sleepable) @@ -5619,7 +6089,9 @@ static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgro static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} -#endif +#endif /* CONFIG_EXT_GROUP_SCHED */ +static s32 sched_ext_ops__sub_attach(struct 
scx_sub_attach_args *args) { return -EINVAL; } +static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {} static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} static s32 sched_ext_ops__init(void) { return -EINVAL; } @@ -5659,6 +6131,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, #endif + .sub_attach = sched_ext_ops__sub_attach, + .sub_detach = sched_ext_ops__sub_detach, .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, .init = sched_ext_ops__init, @@ -5941,8 +6415,10 @@ void __init init_sched_ext_class(void) struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); - init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); - init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); + /* local/bypass dsq's sch will be set during scx_root_enable() */ + init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL); + init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS, NULL); + INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -6598,16 +7074,16 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) if (!dsq) return -ENOMEM; - init_dsq(dsq, dsq_id); - rcu_read_lock(); sch = rcu_dereference(scx_root); - if (sch) + if (sch) { + init_dsq(dsq, dsq_id, sch); ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, dsq_hash_params); - else + } else { ret = -ENODEV; + } rcu_read_unlock(); if (ret) diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 417d3c6f02fe..75b7f57e20ab 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -28,6 +28,8 @@ enum scx_consts { SCX_BYPASS_LB_DONOR_PCT = 125, SCX_BYPASS_LB_MIN_DELTA_DIV = 4, SCX_BYPASS_LB_BATCH = 256, + + SCX_SUB_MAX_DEPTH = 4, }; enum scx_exit_kind { @@ -38,6 +40,7 @@ enum scx_exit_kind { SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ 
SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + SCX_EXIT_PARENT, /* parent exiting */ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ @@ -62,6 +65,7 @@ enum scx_exit_kind { enum scx_exit_code { /* Reasons */ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + SCX_ECODE_RSN_CGROUP_OFFLINE = 2LLU << 32, /* Actions */ SCX_ECODE_ACT_RESTART = 1LLU << 48, @@ -213,7 +217,7 @@ struct scx_exit_task_args { bool cancelled; }; -/* argument container for ops->cgroup_init() */ +/* argument container for ops.cgroup_init() */ struct scx_cgroup_init_args { /* the weight of the cgroup [1..10000] */ u32 weight; @@ -236,12 +240,12 @@ enum scx_cpu_preempt_reason { }; /* - * Argument container for ops->cpu_acquire(). Currently empty, but may be + * Argument container for ops.cpu_acquire(). Currently empty, but may be * expanded in the future. */ struct scx_cpu_acquire_args {}; -/* argument container for ops->cpu_release() */ +/* argument container for ops.cpu_release() */ struct scx_cpu_release_args { /* the reason the CPU was preempted */ enum scx_cpu_preempt_reason reason; @@ -250,9 +254,7 @@ struct scx_cpu_release_args { struct task_struct *task; }; -/* - * Informational context provided to dump operations. 
- */ +/* informational context provided to dump operations */ struct scx_dump_ctx { enum scx_exit_kind kind; s64 exit_code; @@ -261,6 +263,18 @@ struct scx_dump_ctx { u64 at_jiffies; }; +/* argument container for ops.sub_attach() */ +struct scx_sub_attach_args { + struct sched_ext_ops *ops; + char *cgroup_path; +}; + +/* argument container for ops.sub_detach() */ +struct scx_sub_detach_args { + struct sched_ext_ops *ops; + char *cgroup_path; +}; + /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * @@ -721,6 +735,20 @@ struct sched_ext_ops { #endif /* CONFIG_EXT_GROUP_SCHED */ + /** + * @sub_attach: Attach a sub-scheduler + * @args: argument container, see the struct definition + * + * Return 0 to accept the sub-scheduler. -errno to reject. + */ + s32 (*sub_attach)(struct scx_sub_attach_args *args); + + /** + * @sub_detach: Detach a sub-scheduler + * @args: argument container, see the struct definition + */ + void (*sub_detach)(struct scx_sub_detach_args *args); + /* * All online ops must come before ops.cpu_online(). */ @@ -762,6 +790,10 @@ struct sched_ext_ops { */ void (*exit)(struct scx_exit_info *info); + /* + * Data fields must comes after all ops fields. + */ + /** * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch */ @@ -796,6 +828,12 @@ struct sched_ext_ops { */ u64 hotplug_seq; + /** + * @cgroup_id: When >1, attach the scheduler as a sub-scheduler on the + * specified cgroup. + */ + u64 sub_cgroup_id; + /** * @name: BPF scheduler's name * @@ -900,6 +938,8 @@ struct scx_sched { struct scx_dispatch_q **global_dsqs; struct scx_sched_pcpu __percpu *pcpu; + s32 level; + /* * Updates to the following warned bitfields can race causing RMW issues * but it doesn't really matter. 
@@ -907,6 +947,18 @@ struct scx_sched { bool warned_zero_slice:1; bool warned_deprecated_rq:1; + struct list_head all; + +#ifdef CONFIG_EXT_SUB_SCHED + struct list_head children; + struct list_head sibling; + struct cgroup *cgrp; + char *cgrp_path; + struct kset *sub_kset; + + bool sub_attached; +#endif /* CONFIG_EXT_SUB_SCHED */ + atomic_t exit_kind; struct scx_exit_info *exit_info; @@ -916,6 +968,9 @@ struct scx_sched { struct irq_work error_irq_work; struct kthread_work disable_work; struct rcu_work rcu_work; + + /* all ancestors including self */ + struct scx_sched *ancestors[]; }; enum scx_wake_flags { diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index d51d8c38f1cf..ff6ff34177ab 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -41,6 +41,7 @@ const volatile u32 dsp_batch; const volatile bool highpri_boosting; const volatile bool print_dsqs_and_events; const volatile bool print_msgs; +const volatile u64 sub_cgroup_id; const volatile s32 disallow_tgid; const volatile bool suppress_dump; @@ -862,7 +863,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) struct bpf_timer *timer; s32 ret; - if (print_msgs) + if (print_msgs && !sub_cgroup_id) print_cpus(); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); @@ -892,6 +893,11 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) UEI_RECORD(uei, ei); } +s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args) +{ + return 0; +} + SCX_OPS_DEFINE(qmap_ops, .select_cpu = (void *)qmap_select_cpu, .enqueue = (void *)qmap_enqueue, @@ -907,6 +913,7 @@ SCX_OPS_DEFINE(qmap_ops, .cgroup_init = (void *)qmap_cgroup_init, .cgroup_set_weight = (void *)qmap_cgroup_set_weight, .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth, + .sub_attach = (void *)qmap_sub_attach, .cpu_online = (void *)qmap_cpu_online, .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 
ef701d45ba43..5d762d10f4db 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "scx_qmap.bpf.skel.h" @@ -67,7 +68,7 @@ int main(int argc, char **argv) skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -96,6 +97,16 @@ int main(int argc, char **argv) case 'H': skel->rodata->highpri_boosting = true; break; + case 'c': { + struct stat st; + if (stat(optarg, &st) < 0) { + perror("stat"); + return 1; + } + skel->struct_ops.qmap_ops->sub_cgroup_id = st.st_ino; + skel->rodata->sub_cgroup_id = st.st_ino; + break; + } case 'd': skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); if (skel->rodata->disallow_tgid < 0) -- cgit v1.2.3 From 88234b075c3fc23d57406e1867523b6aba783ebf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2026 07:58:03 -1000 Subject: sched_ext: Introduce scx_task_sched[_rcu]() In preparation of multiple scheduler support, add p->scx.sched which points to the scx_sched instance that the task is scheduled by, which is currently always scx_root. Add scx_task_sched[_rcu]() accessors which return the associated scx_sched of the specified task and replace the raw scx_root dereferences with it where applicable. scx_task_on_sched() is also added to test whether a given task is on the specified sched. As scx_root is still the only scheduler, this shouldn't introduce user-visible behavior changes. 
Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 7 +++++ kernel/sched/ext.c | 63 ++++++++++++++++++++++++++++----------------- kernel/sched/ext_internal.h | 59 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index fa4349b319e6..3213e31c7979 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -165,6 +165,13 @@ struct scx_sched; * for a task to be scheduled by SCX. */ struct sched_ext_entity { +#ifdef CONFIG_CGROUPS + /* + * Associated scx_sched. Updated either during fork or while holding + * both p->pi_lock and rq lock. + */ + struct scx_sched __rcu *sched; +#endif struct scx_dispatch_q *dsq; atomic_long_t ops_state; u64 ddsp_dsq_id; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index bb3e33b660da..d56539449f26 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -19,7 +19,7 @@ static DEFINE_RAW_SPINLOCK(scx_sched_lock); * are used as temporary markers to indicate that the dereferences need to be * updated to point to the associated scheduler instances rather than scx_root. */ -static struct scx_sched __rcu *scx_root; +struct scx_sched __rcu *scx_root; /* * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. @@ -304,9 +304,15 @@ static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, return NULL; } + +static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) +{ + rcu_assign_pointer(p->scx.sched, sch); +} #else /* CONFIG_EXT_SUB_SCHED */ static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? 
NULL : root; } +static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} #endif /* CONFIG_EXT_SUB_SCHED */ /** @@ -1542,7 +1548,7 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); struct task_struct **ddsp_taskp; struct scx_dispatch_q *dsq; unsigned long qseq; @@ -1672,7 +1678,7 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); int sticky_cpu = p->scx.sticky_cpu; if (enq_flags & ENQUEUE_WAKEUP) @@ -1723,7 +1729,7 @@ out: static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); unsigned long opss; u64 op_deq_flags = deq_flags; @@ -1794,7 +1800,7 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); if (!(p->scx.flags & SCX_TASK_QUEUED)) { WARN_ON_ONCE(task_runnable(p)); @@ -1838,8 +1844,8 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags static void yield_task_scx(struct rq *rq) { - struct scx_sched *sch = scx_root; struct task_struct *p = rq->donor; + struct scx_sched *sch = scx_task_sched(p); if (SCX_HAS_OP(sch, yield)) SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); @@ -1849,10 +1855,10 @@ static void yield_task_scx(struct rq *rq) static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { - struct scx_sched *sch = scx_root; struct task_struct *from = rq->donor; + struct scx_sched *sch = scx_task_sched(from); - if (SCX_HAS_OP(sch, 
yield)) + if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to)) return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, from, to); else @@ -2517,7 +2523,7 @@ static void process_ddsp_deferred_locals(struct rq *rq) */ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, struct task_struct, scx.dsq_list.node))) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); struct scx_dispatch_q *dsq; list_del_init(&p->scx.dsq_list.node); @@ -2531,7 +2537,7 @@ static void process_ddsp_deferred_locals(struct rq *rq) static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); if (p->scx.flags & SCX_TASK_QUEUED) { /* @@ -2628,7 +2634,7 @@ static void switch_class(struct rq *rq, struct task_struct *next) static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); /* see kick_cpus_irq_workfn() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); @@ -2722,14 +2728,14 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) if (keep_prev) { p = prev; if (!p->scx.slice) - refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); + refill_task_slice_dfl(scx_task_sched(p), p); } else { p = first_local_task(rq); if (!p) return NULL; if (unlikely(!p->scx.slice)) { - struct scx_sched *sch = rcu_dereference_sched(scx_root); + struct scx_sched *sch = scx_task_sched(p); if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", @@ -2817,7 +2823,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); bool rq_bypass; /* @@ -2878,7 +2884,7 @@ 
static void task_woken_scx(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_scx(struct task_struct *p, struct affinity_context *ac) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); set_cpus_allowed_common(p, ac); @@ -3022,7 +3028,7 @@ void scx_tick(struct rq *rq) static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(curr); update_curr_scx(rq); @@ -3212,11 +3218,12 @@ static void scx_disable_task(struct task_struct *p) static void scx_exit_task(struct task_struct *p) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); struct scx_exit_task_args args = { .cancelled = false, }; + lockdep_assert_held(&p->pi_lock); lockdep_assert_rq_held(task_rq(p)); switch (scx_get_task_state(p)) { @@ -3238,6 +3245,7 @@ static void scx_exit_task(struct task_struct *p) if (SCX_HAS_OP(sch, exit_task)) SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p), p, &args); + scx_set_task_sched(p, NULL); scx_set_task_state(p, SCX_TASK_NONE); } @@ -3267,12 +3275,18 @@ void scx_pre_fork(struct task_struct *p) int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) { + s32 ret; + percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_init_task_enabled) - return scx_init_task(p, task_group(p), true); - else - return 0; + if (scx_init_task_enabled) { + ret = scx_init_task(p, task_group(p), true); + if (!ret) + scx_set_task_sched(p, scx_root); + return ret; + } + + return 0; } void scx_post_fork(struct task_struct *p) @@ -3377,7 +3391,7 @@ void sched_ext_dead(struct task_struct *p) static void reweight_task_scx(struct rq *rq, struct task_struct *p, const struct load_weight *lw) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); lockdep_assert_rq_held(task_rq(p)); @@ -3396,7 +3410,7 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) static 
void switching_to_scx(struct rq *rq, struct task_struct *p) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); if (task_dead_and_done(p)) return; @@ -4062,7 +4076,7 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) if (!scx_enabled()) return true; - sch = rcu_dereference_sched(scx_root); + sch = scx_task_sched(p); if (unlikely(!sch)) return true; @@ -5582,6 +5596,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable_unlock_all; } + scx_set_task_sched(p, sch); scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 75b7f57e20ab..0612006019da 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -1223,6 +1223,7 @@ enum scx_ops_state { #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) +extern struct scx_sched __rcu *scx_root; DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); /* @@ -1243,3 +1244,61 @@ static inline bool scx_rq_bypassing(struct rq *rq) { return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); } + +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_task_sched - Find scx_sched scheduling a task + * @p: task of interest + * + * Return @p's scheduler instance. Must be called with @p's pi_lock or rq lock + * held. + */ +static inline struct scx_sched *scx_task_sched(const struct task_struct *p) +{ + return rcu_dereference_protected(p->scx.sched, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(__rq_lockp(task_rq(p)))); +} + +/** + * scx_task_sched_rcu - Find scx_sched scheduling a task + * @p: task of interest + * + * Return @p's scheduler instance. The returned scx_sched is RCU protected. + */ +static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p) +{ + return rcu_dereference_all(p->scx.sched); +} + +/** + * scx_task_on_sched - Is a task on the specified sched? 
+ * @sch: sched to test against + * @p: task of interest + * + * Returns %true if @p is on @sch, %false otherwise. + */ +static inline bool scx_task_on_sched(struct scx_sched *sch, + const struct task_struct *p) +{ + return rcu_access_pointer(p->scx.sched) == sch; +} +#else /* CONFIG_EXT_SUB_SCHED */ +static inline struct scx_sched *scx_task_sched(const struct task_struct *p) +{ + return rcu_dereference_protected(scx_root, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(__rq_lockp(task_rq(p)))); +} + +static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p) +{ + return rcu_dereference_all(scx_root); +} + +static inline bool scx_task_on_sched(struct scx_sched *sch, + const struct task_struct *p) +{ + return true; +} +#endif /* CONFIG_EXT_SUB_SCHED */ -- cgit v1.2.3 From 337ec00b1d9c676f637651c2cefddb8612b867ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2026 07:58:04 -1000 Subject: sched_ext: Implement cgroup sub-sched enabling and disabling The preceding changes implemented the framework to support cgroup sub-scheds and updated scheduling paths and kfuncs so that they have minimal but working support for sub-scheds. However, actual sub-sched enabling/disabling hasn't been implemented yet and all tasks stayed on scx_root. Implement cgroup sub-sched enabling and disabling to actually activate sub-scheds: - Both enable and disable operations bypass only the tasks in the subtree of the child being enabled or disabled to limit disruptions. - When enabling, all candidate tasks are first initialized for the child sched. Once that succeeds, the tasks are exited for the parent and then switched over to the child. This adds a bit of complication but guarantees that child scheduler failures are always contained. - Disabling works the same way in the other direction. However, when the parent may fail to initialize a task, disabling is propagated up to the parent. 
While this means that a parent sched can fail due to a child sched event, the failure can only originate from the parent itself (its ops.init_task()). The only effect a malfunctioning child can have on the parent is attempting to move the tasks back to the parent. After this change, although not all the necessary mechanisms are in place yet, sub-scheds can take control of their tasks and schedule them. v2: Fix missing scx_cgroup_unlock()/percpu_up_write() in abort path (Cheng-Yang Chou). Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 1 + kernel/sched/ext.c | 285 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 280 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3213e31c7979..f354d7d34306 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -88,6 +88,7 @@ enum scx_ent_flags { SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ SCX_TASK_STATE_BITS = 2, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3f237b9da970..70d0f9e8ef61 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -51,6 +51,17 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all); static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); +#ifdef CONFIG_EXT_SUB_SCHED +/* + * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit + * tasks for the sub-sched being enabled. Use a global variable instead of a + * per-task field as all enables are serialized. 
+ */ +static struct scx_sched *scx_enabling_sub_sched; +#else +#define scx_enabling_sub_sched (struct scx_sched *)NULL +#endif /* CONFIG_EXT_SUB_SCHED */ + /* * A monotically increasing sequence number that is incremented every time a * scheduler is enabled. This can be used by to check if any custom sched_ext @@ -3342,6 +3353,17 @@ static void scx_disable_and_exit_task(struct scx_sched *sch, { __scx_disable_and_exit_task(sch, p); + /* + * If set, @p exited between __scx_init_task() and scx_enable_task() in + * scx_sub_enable() and is initialized for both the associated sched and + * its parent. Disable and exit for the child too. + */ + if ((p->scx.flags & SCX_TASK_SUB_INIT) && + !WARN_ON_ONCE(!scx_enabling_sub_sched)) { + __scx_disable_and_exit_task(scx_enabling_sub_sched, p); + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + scx_set_task_sched(p, NULL); scx_set_task_state(p, SCX_TASK_NONE); } @@ -3377,9 +3399,14 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) percpu_rwsem_assert_held(&scx_fork_rwsem); if (scx_init_task_enabled) { - ret = scx_init_task(scx_root, p, true); +#ifdef CONFIG_EXT_SUB_SCHED + struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; +#else + struct scx_sched *sch = scx_root; +#endif + ret = scx_init_task(sch, p, true); if (!ret) - scx_set_task_sched(p, scx_root); + scx_set_task_sched(p, sch); return ret; } @@ -4643,9 +4670,9 @@ static void scx_bypass(struct scx_sched *sch, bool bypass) struct rq *rq = cpu_rq(cpu); struct task_struct *p, *n; + raw_spin_lock(&scx_sched_lock); raw_spin_rq_lock(rq); - raw_spin_lock(&scx_sched_lock); scx_for_each_descendant_pre(pos, sch) { struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu); @@ -4654,6 +4681,7 @@ static void scx_bypass(struct scx_sched *sch, bool bypass) else pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING; } + raw_spin_unlock(&scx_sched_lock); /* @@ -4798,23 +4826,139 @@ static void drain_descendants(struct scx_sched *sch) wait_event(scx_unlink_waitq, 
list_empty(&sch->children)); } +static void scx_fail_parent(struct scx_sched *sch, + struct task_struct *failed, s32 fail_code) +{ + struct scx_sched *parent = scx_parent(sch); + struct scx_task_iter sti; + struct task_struct *p; + + scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler", + fail_code, failed->comm, failed->pid); + + /* + * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into + * it. This may cause downstream failures on the BPF side but $parent is + * dying anyway. + */ + scx_bypass(parent, true); + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + if (scx_task_on_sched(parent, p)) + continue; + + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + scx_disable_and_exit_task(sch, p); + rcu_assign_pointer(p->scx.sched, parent); + } + } + scx_task_iter_stop(&sti); +} + static void scx_sub_disable(struct scx_sched *sch) { struct scx_sched *parent = scx_parent(sch); + struct scx_task_iter sti; + struct task_struct *p; + int ret; + /* + * Guarantee forward progress and wait for descendants to be disabled. + * To limit + * disruptions, $parent is not bypassed. Tasks are fully prepped and + * then inserted back into $parent. + */ + scx_bypass(sch, true); drain_descendants(sch); + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. + */ mutex_lock(&scx_enable_mutex); percpu_down_write(&scx_fork_rwsem); scx_cgroup_lock(); set_cgroup_sched(sch_cgroup(sch), parent); - /* TODO - perform actual disabling here */ + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + struct rq *rq; + struct rq_flags rf; + + /* filter out duplicate visits */ + if (scx_task_on_sched(parent, p)) + continue; + + /* + * By the time control reaches here, all descendant schedulers + * should already have been disabled. 
+ */ + WARN_ON_ONCE(!scx_task_on_sched(sch, p)); + + /* + * If $p is about to be freed, nothing prevents $sch from + * unloading before $p reaches sched_ext_free(). Disable and + * exit $p right away. + */ + if (!tryget_task_struct(p)) { + scx_disable_and_exit_task(sch, p); + continue; + } + + scx_task_iter_unlock(&sti); + + /* + * $p is READY or ENABLED on @sch. Initialize for $parent, + * disable and exit from @sch, and then switch over to $parent. + * + * If a task fails to initialize for $parent, the only available + * action is disabling $parent too. While this allows disabling + * of a child sched to cause the parent scheduler to fail, the + * failure can only originate from ops.init_task() of the + * parent. A child can't directly affect the parent through its + * own failures. + */ + ret = __scx_init_task(parent, p, false); + if (ret) { + scx_fail_parent(sch, p, ret); + put_task_struct(p); + break; + } + + rq = task_rq_lock(p, &rf); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* + * $p is initialized for $parent and still attached to + * @sch. Disable and exit for @sch, switch over to + * $parent, override the state to READY to account for + * $p having already been initialized, and then enable. + */ + scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT); + rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_state(p, SCX_TASK_READY); + scx_enable_task(parent, p); + } + task_rq_unlock(rq, p, &rf); + + put_task_struct(p); + } + scx_task_iter_stop(&sti); scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); + /* + * All tasks are moved off of @sch but there may still be on-going + * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use + * the expedited version as ancestors may be waiting in bypass mode. + * Also, tell the parent that there is no need to keep running bypass + * DSQs for us. 
+ */ + synchronize_rcu_expedited(); disable_bypass_dsp(sch); raw_spin_lock_irq(&scx_sched_lock); @@ -5933,13 +6077,30 @@ static struct scx_sched *find_parent_sched(struct cgroup *cgrp) return parent; } +static bool assert_task_ready_or_enabled(struct task_struct *p) +{ + enum scx_task_state state = scx_get_task_state(p); + + switch (state) { + case SCX_TASK_READY: + case SCX_TASK_ENABLED: + return true; + default: + WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", + state, p->comm, p->pid); + return false; + } +} + static void scx_sub_enable_workfn(struct kthread_work *work) { struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); struct sched_ext_ops *ops = cmd->ops; struct cgroup *cgrp; struct scx_sched *parent, *sch; - s32 ret; + struct scx_task_iter sti; + struct task_struct *p; + s32 i, ret; mutex_lock(&scx_enable_mutex); @@ -6011,6 +6172,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work) } sch->sub_attached = true; + scx_bypass(sch, true); + + for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) + if (((void (**)(void))ops)[i]) + set_bit(i, sch->has_op); + percpu_down_write(&scx_fork_rwsem); scx_cgroup_lock(); @@ -6024,16 +6191,121 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto err_unlock_and_disable; } - /* TODO - perform actual enabling here */ + /* + * Initialize tasks for the new child $sch without exiting them for + * $parent so that the tasks can always be reverted back to $parent + * sched on child init failure. + */ + WARN_ON_ONCE(scx_enabling_sub_sched); + scx_enabling_sub_sched = sch; + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + struct rq *rq; + struct rq_flags rf; + + /* + * Task iteration may visit the same task twice when racing + * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which + * finished __scx_init_task() and skip if set. 
+ * + * A task may exit and get freed between __scx_init_task() + * completion and scx_enable_task(). In such cases, + * scx_disable_and_exit_task() must exit the task for both the + * parent and child scheds. + */ + if (p->scx.flags & SCX_TASK_SUB_INIT) + continue; + + /* see scx_root_enable() */ + if (!tryget_task_struct(p)) + continue; + + if (!assert_task_ready_or_enabled(p)) { + ret = -EINVAL; + goto abort; + } + + scx_task_iter_unlock(&sti); + + /* + * As $p is still on $parent, it can't be transitioned to INIT. + * Let's worry about task state later. Use __scx_init_task(). + */ + ret = __scx_init_task(sch, p, false); + if (ret) + goto abort; + + rq = task_rq_lock(p, &rf); + p->scx.flags |= SCX_TASK_SUB_INIT; + task_rq_unlock(rq, p, &rf); + + put_task_struct(p); + } + scx_task_iter_stop(&sti); + + /* + * All tasks are prepped. Disable/exit tasks for $parent and enable for + * the new @sch. + */ + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + /* + * Use clearing of %SCX_TASK_SUB_INIT to detect and skip + * duplicate iterations. + */ + if (!(p->scx.flags & SCX_TASK_SUB_INIT)) + continue; + + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* + * $p must be either READY or ENABLED. If ENABLED, + * __scx_disable_and_exit_task() first disables and + * makes it READY. However, after exiting $p, it will + * leave $p as READY. + */ + assert_task_ready_or_enabled(p); + __scx_disable_and_exit_task(parent, p); + + /* + * $p is now only initialized for @sch and READY, which + * is what we want. Assign it to @sch and enable. 
+ */ + rcu_assign_pointer(p->scx.sched, sch); + scx_enable_task(sch, p); + + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + } + scx_task_iter_stop(&sti); + + scx_enabling_sub_sched = NULL; scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); + scx_bypass(sch, false); + pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); kobject_uevent(&sch->kobj, KOBJ_ADD); ret = 0; goto out_unlock; +abort: + put_task_struct(p); + scx_task_iter_stop(&sti); + scx_enabling_sub_sched = NULL; + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + if (p->scx.flags & SCX_TASK_SUB_INIT) { + __scx_disable_and_exit_task(sch, p); + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + } + scx_task_iter_stop(&sti); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); out_put_cgrp: cgroup_put(cgrp); out_unlock: @@ -6042,6 +6314,7 @@ out_unlock: return; err_unlock_and_disable: + /* we'll soon enter disable path, keep bypass on */ scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); err_disable: -- cgit v1.2.3 From 1837c76b780a4201e3d9f718e17c09b536df700f Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 4 Mar 2026 13:58:38 +0100 Subject: drm/amdgpu: Move test for fbdev GEM object into generic helper Provide a generic helper that tests if fbdev emulation is backed by a specific GEM object. Not all drivers use client buffers (yet), hence also test against the first GEM object in the fbdev framebuffer. Convert amdgpu. The helper will also be useful for radeon. 
Reviewed-by: Alex Deucher Signed-off-by: Thomas Zimmermann Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 21 +++----------------- drivers/gpu/drm/drm_fb_helper.c | 30 +++++++++++++++++++++++++++++ include/drm/drm_fb_helper.h | 8 ++++++++ 3 files changed, 41 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index bef9dce2e7ea..f5cd68542442 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -1738,21 +1738,6 @@ bool amdgpu_crtc_get_scanout_position(struct drm_crtc *crtc, stime, etime, mode); } -static bool -amdgpu_display_robj_is_fb(struct amdgpu_device *adev, struct amdgpu_bo *robj) -{ - struct drm_device *dev = adev_to_drm(adev); - struct drm_fb_helper *fb_helper = dev->fb_helper; - - if (!fb_helper || !fb_helper->buffer) - return false; - - if (gem_to_amdgpu_bo(fb_helper->buffer->gem) != robj) - return false; - - return true; -} - int amdgpu_display_suspend_helper(struct amdgpu_device *adev) { struct drm_device *dev = adev_to_drm(adev); @@ -1775,7 +1760,6 @@ int amdgpu_display_suspend_helper(struct amdgpu_device *adev) list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); struct drm_framebuffer *fb = crtc->primary->fb; - struct amdgpu_bo *robj; if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); @@ -1790,8 +1774,9 @@ int amdgpu_display_suspend_helper(struct amdgpu_device *adev) if (!fb || !fb->obj[0]) continue; - robj = gem_to_amdgpu_bo(fb->obj[0]); - if (!amdgpu_display_robj_is_fb(adev, robj)) { + if (!drm_fb_helper_gem_is_fb(dev->fb_helper, fb->obj[0])) { + struct amdgpu_bo *robj = gem_to_amdgpu_bo(fb->obj[0]); + r = amdgpu_bo_reserve(robj, true); if (r == 0) { amdgpu_bo_unpin(robj); diff --git a/drivers/gpu/drm/drm_fb_helper.c 
b/drivers/gpu/drm/drm_fb_helper.c index 05803169bed5..a883534e19e4 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1786,3 +1787,32 @@ int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper) return 0; } EXPORT_SYMBOL(drm_fb_helper_hotplug_event); + +/** + * drm_fb_helper_gem_is_fb - Tests if GEM object is framebuffer + * @fb_helper: fb_helper instance, can be NULL + * @obj: The GEM object to test, can be NULL + * + * Call drm_fb_helper_gem_is_fb to test if a DRM device's fbdev emulation + * uses the specified GEM object for its framebuffer. The result is always + * false if either pointer is NULL. + * + * Returns: + * True if fbdev emulation uses the provided GEM object, or false otherwise. + */ +bool drm_fb_helper_gem_is_fb(const struct drm_fb_helper *fb_helper, + const struct drm_gem_object *obj) +{ + const struct drm_gem_object *gem = NULL; + + if (!fb_helper || !obj) + return false; + if (fb_helper->buffer && fb_helper->buffer->gem) + gem = fb_helper->buffer->gem; + else if (fb_helper->fb) + gem = drm_gem_fb_get_obj(fb_helper->fb, 0); + + return gem == obj; +} +EXPORT_SYMBOL_GPL(drm_fb_helper_gem_is_fb); + diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h index 05cca77b7249..ca214b5c0283 100644 --- a/include/drm/drm_fb_helper.h +++ b/include/drm/drm_fb_helper.h @@ -271,7 +271,15 @@ int drm_fb_helper_ioctl(struct fb_info *info, unsigned int cmd, int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper); int drm_fb_helper_initial_config(struct drm_fb_helper *fb_helper); +bool drm_fb_helper_gem_is_fb(const struct drm_fb_helper *fb_helper, + const struct drm_gem_object *obj); #else +static inline bool drm_fb_helper_gem_is_fb(const struct drm_fb_helper *fb_helper, + const struct drm_gem_object *obj) +{ + return false; +} + static inline void drm_fb_helper_prepare(struct drm_device *dev, struct drm_fb_helper 
*helper, unsigned int preferred_bpp, -- cgit v1.2.3 From 3cd963fa915c494a6d0da0287bd10cb6f2204f9e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Mar 2026 10:42:57 +0000 Subject: net: stmmac: mdio_bus_data->default_an_inband is boolean default_an_inband is declared as an unsigned int, but is set to true/ false and is assigned to phylink_config's member of the same name which is a bool. Declare this also as a bool for consistency. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/E1vy6AT-0000000BtxD-2qm7@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 2fc169c7117e..678d03d6d3bd 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -86,10 +86,10 @@ struct stmmac_priv; struct stmmac_mdio_bus_data { unsigned int phy_mask; unsigned int pcs_mask; - unsigned int default_an_inband; int *irqs; int probed_phy_irq; bool needs_reset; + bool default_an_inband; }; struct stmmac_dma_cfg { -- cgit v1.2.3 From e4fd855c52ec5af34d920206190be29919fadca3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Mar 2026 10:43:02 +0000 Subject: net: stmmac: make pcs_mask and phy_mask u32 The PCS and PHY masks are passed to the mdio bus layer as phy_mask to prevent bus addresses between 0 and 31 inclusive being scanned, and this is declared as u32. Also declare these as u32 in stmmac for type consistency. Since this is a u32, use BIT_U32() rather than BIT() to generate values for these fields. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/E1vy6AY-0000000BtxJ-3smT@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 +- include/linux/stmmac.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index ece2a0c38562..fc13bfb47783 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -699,7 +699,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, /* Intel mgbe SGMII interface uses pcs-xcps */ if (plat->phy_interface == PHY_INTERFACE_MODE_SGMII || plat->phy_interface == PHY_INTERFACE_MODE_1000BASEX) { - plat->mdio_bus_data->pcs_mask = BIT(INTEL_MGBE_XPCS_ADDR); + plat->mdio_bus_data->pcs_mask = BIT_U32(INTEL_MGBE_XPCS_ADDR); plat->mdio_bus_data->default_an_inband = true; plat->select_pcs = intel_mgbe_select_pcs; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index b913fe5af488..ada6c6ef1f5c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -168,7 +168,7 @@ static int loongson_gnet_data(struct pci_dev *pdev, loongson_default_data(pdev, plat); plat->phy_interface = PHY_INTERFACE_MODE_GMII; - plat->mdio_bus_data->phy_mask = ~(u32)BIT(2); + plat->mdio_bus_data->phy_mask = ~BIT_U32(2); plat->fix_mac_speed = loongson_gnet_fix_speed; return 0; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 678d03d6d3bd..965ada809fdf 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -84,8 +84,8 @@ struct stmmac_priv; /* Platfrom data for platform device structure's platform_data field 
*/ struct stmmac_mdio_bus_data { - unsigned int phy_mask; - unsigned int pcs_mask; + u32 phy_mask; + u32 pcs_mask; int *irqs; int probed_phy_irq; bool needs_reset; -- cgit v1.2.3 From c698f5cc940de5871ea3c65c94f5fd7fbc6844e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Mar 2026 11:48:29 +0000 Subject: inet_diag: report delayed ack timer information inet_sk_diag_fill() populates r->idiag_timer with the following precedence order: 1 - Retransmit timer. 4 - Probe0 timer. 2 - Keepalive timer. This patch adds a new value, last in the list, if other timers are not active. 5 - Delayed ACK timer. A corresponding iproute2 patch will follow to replace "unknown" with "delack": ESTAB 10 0 [2002:a05:6830:1f86::]:12875 [2002:a05:6830:1f85::]:50438 timer:(unknown,003ms,0) ino:152178 sk:3004 cgroup:unreachable:189 <-> skmem:(r1344,rb12780520,t0,tb262144,f2752,w0,o250,bl0,d0) ts usec_ts ... Also add the following enum in uapi/linux/inet_diag.h as suggested by David Ahern. enum { IDIAG_TIMER_OFF, IDIAG_TIMER_ON, IDIAG_TIMER_KEEPALIVE, IDIAG_TIMER_TIMEWAIT, IDIAG_TIMER_PROBE0, IDIAG_TIMER_DELACK, }; Neal Cardwell suggested to test for ICSK_ACK_TIMER: inet_csk_clear_xmit_timer() does not call sk_stop_timer() because INET_CSK_CLEAR_TIMERS is unset. 
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260305114829.2163276-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/inet_diag.h | 9 +++++++++ net/ipv4/inet_diag.c | 13 +++++++++---- net/ipv4/tcp_diag.c | 4 ++-- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 86bb2e8b17c9..21f0d735fbae 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -129,6 +129,15 @@ struct inet_diag_msg { __u32 idiag_inode; }; +enum { + IDIAG_TIMER_OFF, + IDIAG_TIMER_ON, + IDIAG_TIMER_KEEPALIVE, + IDIAG_TIMER_TIMEWAIT, + IDIAG_TIMER_PROBE0, + IDIAG_TIMER_DELACK, +}; + /* Extensions */ enum { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 9d215485b5c7..34b77aa87d0a 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -241,7 +241,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; - r->idiag_timer = 0; + r->idiag_timer = IDIAG_TIMER_OFF; r->idiag_retrans = 0; r->idiag_expires = 0; @@ -284,20 +284,25 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (icsk_pending == ICSK_TIME_RETRANS || icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { - r->idiag_timer = 1; + r->idiag_timer = IDIAG_TIMER_ON; r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits); r->idiag_expires = jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { - r->idiag_timer = 4; + r->idiag_timer = IDIAG_TIMER_PROBE0; r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies); } else if (timer_pending(&icsk->icsk_keepalive_timer)) { - r->idiag_timer = 2; + r->idiag_timer = 
IDIAG_TIMER_KEEPALIVE; r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = jiffies_delta_to_msecs(icsk->icsk_keepalive_timer.expires - jiffies); + } else if ((READ_ONCE(icsk->icsk_ack.pending) & ICSK_ACK_TIMER) && + timer_pending(&icsk->icsk_delack_timer)) { + r->idiag_timer = IDIAG_TIMER_DELACK; + r->idiag_expires = + jiffies_delta_to_msecs(icsk_delack_timeout(icsk) - jiffies); } if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 7935702e394b..ba1fdbe9807f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -212,7 +212,7 @@ static int tcp_twsk_diag_fill(struct sock *sk, r->idiag_retrans = 0; r->idiag_state = READ_ONCE(tw->tw_substate); - r->idiag_timer = 3; + r->idiag_timer = IDIAG_TIMER_TIMEWAIT; tmo = tw->tw_timer.expires - jiffies; r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; @@ -247,7 +247,7 @@ static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb, r = nlmsg_data(nlh); inet_diag_msg_common_fill(r, sk); r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; + r->idiag_timer = IDIAG_TIMER_ON; r->idiag_retrans = READ_ONCE(reqsk->num_retrans); BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != -- cgit v1.2.3 From 260d27b3aec9f30d68f9f3cacc674655897eb745 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 4 Mar 2026 21:17:28 +0100 Subject: net: phy: remove phy_attach 378e6523ebb1 ("net: bcmgenet: remove unused platform code") removed the last user of phy_attach(). So remove this function. 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/8812176a-e319-4e9f-815d-99ea339df8b2@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 38 -------------------------------------- include/linux/phy.h | 2 -- 2 files changed, 40 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 3bd415710bf3..d1cbcfc3d2a6 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1895,44 +1895,6 @@ error_put_device: } EXPORT_SYMBOL(phy_attach_direct); -/** - * phy_attach - attach a network device to a particular PHY device - * @dev: network device to attach - * @bus_id: Bus ID of PHY device to attach - * @interface: PHY device's interface - * - * Description: Same as phy_attach_direct() except that a PHY bus_id - * string is passed instead of a pointer to a struct phy_device. - */ -struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, - phy_interface_t interface) -{ - struct phy_device *phydev; - struct device *d; - int rc; - - if (!dev) - return ERR_PTR(-EINVAL); - - /* Search the list of PHY devices on the mdio bus for the - * PHY with the requested name - */ - d = bus_find_device_by_name(&mdio_bus_type, NULL, bus_id); - if (!d) { - pr_err("PHY %s not found\n", bus_id); - return ERR_PTR(-ENODEV); - } - phydev = to_phy_device(d); - - rc = phy_attach_direct(dev, phydev, phydev->dev_flags, interface); - put_device(d); - if (rc) - return ERR_PTR(rc); - - return phydev; -} -EXPORT_SYMBOL(phy_attach); - /** * phy_detach - detach a PHY device from its network device * @phydev: target phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index 6f9979a26892..e9b0d7427b0e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2152,8 +2152,6 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool 
enable, int speed); -struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, - phy_interface_t interface); struct phy_device *phy_find_next(struct mii_bus *bus, struct phy_device *pos); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); -- cgit v1.2.3 From c9429bf56405a326845a8a35357b5bdf1dc4558c Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 24 Feb 2026 19:29:54 +0000 Subject: rhashtable: consolidate hash computation in rht_key_get_hash() The else-if and else branches in rht_key_get_hash() both compute a hash using either params.hashfn or jhash, differing only in the source of key_len (params.key_len vs ht->p.key_len). Merge the two branches into one by using the ternary `params.key_len ?: ht->p.key_len` to select the key length, removing the duplicated logic. This also improves the performance of the else branch which previously always used jhash and never fell through to jhash2. This branch is going to be used by BPF resizable hashmap, which wraps rhashtable: https://lore.kernel.org/bpf/20260205-rhash-v1-0-30dd6d63c462@meta.com/ Signed-off-by: Mykyta Yatsenko Signed-off-by: Herbert Xu --- include/linux/rhashtable.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 133ccb39137a..0480509a6339 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -129,10 +129,10 @@ static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht, unsigned int hash; /* params must be equal to ht->p if it isn't constant. */ - if (!__builtin_constant_p(params.key_len)) + if (!__builtin_constant_p(params.key_len)) { hash = ht->p.hashfn(key, ht->key_len, hash_rnd); - else if (params.key_len) { - unsigned int key_len = params.key_len; + } else { + unsigned int key_len = params.key_len ? 
: ht->p.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); @@ -140,13 +140,6 @@ static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht, hash = jhash(key, key_len, hash_rnd); else hash = jhash2(key, key_len / sizeof(u32), hash_rnd); - } else { - unsigned int key_len = ht->p.key_len; - - if (params.hashfn) - hash = params.hashfn(key, key_len, hash_rnd); - else - hash = jhash(key, key_len, hash_rnd); } return hash; -- cgit v1.2.3 From 2f5b5ce1e4b89c76a2b177ee689101a274d1a3c6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 24 Feb 2026 17:45:00 -0800 Subject: crypto: acomp - repair kernel-doc warnings Correct kernel-doc: - add the @extra function parameter - add "_extra" to the mismatched function name - spell the "cmpl" parameter correctly to avoid these warnings: Warning: include/crypto/acompress.h:251 function parameter 'extra' not described in 'acomp_request_alloc_extra' Warning: include/crypto/acompress.h:251 expecting prototype for acomp_request_alloc(). 
Prototype was for acomp_request_alloc_extra() instead Warning: include/crypto/acompress.h:327 function parameter 'cmpl' not described in 'acomp_request_set_callback' Signed-off-by: Randy Dunlap Signed-off-by: Herbert Xu --- include/crypto/acompress.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h index 9eacb9fa375d..5d5358dfab73 100644 --- a/include/crypto/acompress.h +++ b/include/crypto/acompress.h @@ -240,9 +240,10 @@ static inline const char *crypto_acomp_driver_name(struct crypto_acomp *tfm) } /** - * acomp_request_alloc() -- allocates asynchronous (de)compression request + * acomp_request_alloc_extra() -- allocates asynchronous (de)compression request * * @tfm: ACOMPRESS tfm handle allocated with crypto_alloc_acomp() + * @extra: amount of extra memory * @gfp: gfp to pass to kzalloc (defaults to GFP_KERNEL) * * Return: allocated handle in case of success or NULL in case of an error @@ -318,7 +319,7 @@ static inline void acomp_request_free(struct acomp_req *req) * * @req: request that the callback will be set for * @flgs: specify for instance if the operation may backlog - * @cmlp: callback which will be called + * @cmpl: callback which will be called * @data: private data used by the caller */ static inline void acomp_request_set_callback(struct acomp_req *req, -- cgit v1.2.3 From 8fe0cdfd9cb073d4090e2f20f16dd4b44de7526e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 24 Feb 2026 17:45:18 -0800 Subject: crypto: des - fix all kernel-doc warnings Use correct function parameter names and add Returns: sections to eliminate all kernel-doc warnings in des.h: Warning: include/crypto/des.h:41 function parameter 'keylen' not described in 'des_expand_key' Warning: include/crypto/des.h:41 No description found for return value of 'des_expand_key' Warning: include/crypto/des.h:54 function parameter 'keylen' not described in 'des3_ede_expand_key' Warning: 
include/crypto/des.h:54 No description found for return value of 'des3_ede_expand_key' Signed-off-by: Randy Dunlap Signed-off-by: Herbert Xu --- include/crypto/des.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/crypto/des.h b/include/crypto/des.h index 7812b4331ae4..73eec617f480 100644 --- a/include/crypto/des.h +++ b/include/crypto/des.h @@ -34,9 +34,9 @@ void des3_ede_decrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src); * des_expand_key - Expand a DES input key into a key schedule * @ctx: the key schedule * @key: buffer containing the input key - * @len: size of the buffer contents + * @keylen: size of the buffer contents * - * Returns 0 on success, -EINVAL if the input key is rejected and -ENOKEY if + * Returns: 0 on success, -EINVAL if the input key is rejected and -ENOKEY if * the key is accepted but has been found to be weak. */ int des_expand_key(struct des_ctx *ctx, const u8 *key, unsigned int keylen); @@ -45,9 +45,9 @@ int des_expand_key(struct des_ctx *ctx, const u8 *key, unsigned int keylen); * des3_ede_expand_key - Expand a triple DES input key into a key schedule * @ctx: the key schedule * @key: buffer containing the input key - * @len: size of the buffer contents + * @keylen: size of the buffer contents * - * Returns 0 on success, -EINVAL if the input key is rejected and -ENOKEY if + * Returns: 0 on success, -EINVAL if the input key is rejected and -ENOKEY if * the key is accepted but has been found to be weak. Note that weak keys will * be rejected (and -EINVAL will be returned) when running in FIPS mode. 
*/ -- cgit v1.2.3 From d2ad1cf29a98adafaf85ddd5ccad6e40c14bcff9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 24 Feb 2026 17:45:28 -0800 Subject: crypto: ecc - correct kernel-doc format Fix all kernel-doc warnings in ecc.h: - use correct kernel-doc format - add some Returns: sections - fix spelling and parameter names Fixes these warnings: Warning: include/crypto/internal/ecc.h:82 function parameter 'nbytes' not described in 'ecc_digits_from_bytes' Warning: include/crypto/internal/ecc.h:82 function parameter 'out' not described in 'ecc_digits_from_bytes' Warning: include/crypto/internal/ecc.h:95 No description found for return value of 'ecc_is_key_valid' Warning: include/crypto/internal/ecc.h:110 No description found for return value of 'ecc_gen_privkey' Warning: include/crypto/internal/ecc.h:124 No description found for return value of 'ecc_make_pub_key' Warning: include/crypto/internal/ecc.h:143 No description found for return value of 'crypto_ecdh_shared_secret' Warning: include/crypto/internal/ecc.h:182 No description found for return value of 'vli_is_zero' Warning: include/crypto/internal/ecc.h:194 No description found for return value of 'vli_cmp' Warning: include/crypto/internal/ecc.h:209 function parameter 'right' not described in 'vli_sub' Warning: include/crypto/internal/ecc.h:271 expecting prototype for ecc_aloc_point(). 
Prototype was for ecc_alloc_point() instead Warning: include/crypto/internal/ecc.h:287 function parameter 'point' not described in 'ecc_point_is_zero' Signed-off-by: Randy Dunlap Signed-off-by: Herbert Xu --- include/crypto/internal/ecc.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/crypto/internal/ecc.h b/include/crypto/internal/ecc.h index 57cd75242141..a4b48d76f53a 100644 --- a/include/crypto/internal/ecc.h +++ b/include/crypto/internal/ecc.h @@ -72,8 +72,8 @@ static inline void ecc_swap_digits(const void *in, u64 *out, unsigned int ndigit /** * ecc_digits_from_bytes() - Create ndigits-sized digits array from byte array * @in: Input byte array - * @nbytes Size of input byte array - * @out Output digits array + * @nbytes: Size of input byte array + * @out: Output digits array * @ndigits: Number of digits to create from byte array * * The first byte in the input byte array is expected to hold the most @@ -90,7 +90,7 @@ void ecc_digits_from_bytes(const u8 *in, unsigned int nbytes, * @private_key: private key to be used for the given curve * @private_key_len: private key length * - * Returns 0 if the key is acceptable, a negative value otherwise + * Returns: 0 if the key is acceptable, a negative value otherwise */ int ecc_is_key_valid(unsigned int curve_id, unsigned int ndigits, const u64 *private_key, unsigned int private_key_len); @@ -104,7 +104,7 @@ int ecc_is_key_valid(unsigned int curve_id, unsigned int ndigits, * @ndigits: curve number of digits * @private_key: buffer for storing the generated private key * - * Returns 0 if the private key was generated successfully, a negative value + * Returns: 0 if the private key was generated successfully, a negative value * if an error occurred. 
*/ int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, @@ -118,7 +118,7 @@ int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, * @private_key: pregenerated private key for the given curve * @public_key: buffer for storing the generated public key * - * Returns 0 if the public key was generated successfully, a negative value + * Returns: 0 if the public key was generated successfully, a negative value * if an error occurred. */ int ecc_make_pub_key(const unsigned int curve_id, unsigned int ndigits, @@ -136,7 +136,7 @@ int ecc_make_pub_key(const unsigned int curve_id, unsigned int ndigits, * Note: It is recommended that you hash the result of crypto_ecdh_shared_secret * before using it for symmetric encryption or HMAC. * - * Returns 0 if the shared secret was generated successfully, a negative value + * Returns: 0 if the shared secret was generated successfully, a negative value * if an error occurred. */ int crypto_ecdh_shared_secret(unsigned int curve_id, unsigned int ndigits, @@ -179,6 +179,8 @@ int ecc_is_pubkey_valid_full(const struct ecc_curve *curve, * * @vli: vli to check. * @ndigits: length of the @vli + * + * Returns: %true if vli == 0, %false otherwise. */ bool vli_is_zero(const u64 *vli, unsigned int ndigits); @@ -189,7 +191,7 @@ bool vli_is_zero(const u64 *vli, unsigned int ndigits); * @right: vli * @ndigits: length of both vlis * - * Returns sign of @left - @right, i.e. -1 if @left < @right, + * Returns: sign of @left - @right, i.e. -1 if @left < @right, * 0 if @left == @right, 1 if @left > @right. */ int vli_cmp(const u64 *left, const u64 *right, unsigned int ndigits); @@ -199,7 +201,7 @@ int vli_cmp(const u64 *left, const u64 *right, unsigned int ndigits); * * @result: where to write result * @left: vli - * @right vli + * @right: vli * @ndigits: length of all vlis * * Note: can modify in-place. 
@@ -263,7 +265,7 @@ void vli_mod_mult_slow(u64 *result, const u64 *left, const u64 *right, unsigned int vli_num_bits(const u64 *vli, unsigned int ndigits); /** - * ecc_aloc_point() - Allocate ECC point. + * ecc_alloc_point() - Allocate ECC point. * * @ndigits: Length of vlis in u64 qwords. * @@ -281,7 +283,7 @@ void ecc_free_point(struct ecc_point *p); /** * ecc_point_is_zero() - Check if point is zero. * - * @p: Point to check for zero. + * @point: Point to check for zero. * * Return: true if point is the point at infinity, false otherwise. */ -- cgit v1.2.3 From 30b0515342db48ac9ffd9999648de0f7ca1d6a87 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Add per-CPU data to DSQs Add per-CPU data structure to dispatch queues. Each DSQ now has a percpu scx_dsq_pcpu which contains a back-pointer to the DSQ. This will be used by future changes to implement per-CPU reenqueue tracking for user DSQs. init_dsq() now allocates the percpu data and can fail, so it returns an error code. All callers are updated to handle failures. exit_dsq() is added to free the percpu data and is called from all DSQ cleanup paths. In scx_bpf_create_dsq(), init_dsq() is called before rcu_read_lock() since alloc_percpu() requires GFP_KERNEL context, and dsq->sched is set afterwards. v2: Fix err_free_pcpu to only exit_dsq() initialized bypass DSQs (Andrea Righi). 
Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 5 +++ kernel/sched/ext.c | 87 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 77 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index f354d7d34306..98cc1f41b91e 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -62,6 +62,10 @@ enum scx_dsq_id_flags { SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; +struct scx_dsq_pcpu { + struct scx_dispatch_q *dsq; +}; + /* * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to @@ -79,6 +83,7 @@ struct scx_dispatch_q { struct rhash_head hash_node; struct llist_node free_node; struct scx_sched *sched; + struct scx_dsq_pcpu __percpu *pcpu; struct rcu_head rcu; }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d8ea12ddc206..aea09eb36873 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4020,15 +4020,42 @@ DEFINE_SCHED_CLASS(ext) = { #endif }; -static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, - struct scx_sched *sch) +static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, + struct scx_sched *sch) { + s32 cpu; + memset(dsq, 0, sizeof(*dsq)); raw_spin_lock_init(&dsq->lock); INIT_LIST_HEAD(&dsq->list); dsq->id = dsq_id; dsq->sched = sch; + + dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu); + if (!dsq->pcpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); + + pcpu->dsq = dsq; + } + + return 0; +} + +static void exit_dsq(struct scx_dispatch_q *dsq) +{ + free_percpu(dsq->pcpu); +} + +static void free_dsq_rcufn(struct rcu_head *rcu) +{ + struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu); + + exit_dsq(dsq); + kfree(dsq); } static void free_dsq_irq_workfn(struct irq_work *irq_work) @@ -4037,7 +4064,7 @@ static void 
free_dsq_irq_workfn(struct irq_work *irq_work) struct scx_dispatch_q *dsq, *tmp_dsq; llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) - kfree_rcu(dsq, rcu); + call_rcu(&dsq->rcu, free_dsq_rcufn); } static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); @@ -4234,15 +4261,17 @@ static void scx_sched_free_rcu_work(struct work_struct *work) cgroup_put(sch_cgroup(sch)); #endif /* CONFIG_EXT_SUB_SCHED */ - /* - * $sch would have entered bypass mode before the RCU grace period. As - * that blocks new deferrals, all deferred_reenq_local_node's must be - * off-list by now. - */ for_each_possible_cpu(cpu) { struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); + /* + * $sch would have entered bypass mode before the RCU grace + * period. As that blocks new deferrals, all + * deferred_reenq_local_node's must be off-list by now. + */ WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node)); + + exit_dsq(bypass_dsq(sch, cpu)); } free_percpu(sch->pcpu); @@ -5787,6 +5816,9 @@ static int alloc_kick_syncs(void) static void free_pnode(struct scx_sched_pnode *pnode) { + if (!pnode) + return; + exit_dsq(&pnode->global_dsq); kfree(pnode); } @@ -5798,7 +5830,10 @@ static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node) if (!pnode) return NULL; - init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch); + if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) { + kfree(pnode); + return NULL; + } return pnode; } @@ -5809,7 +5844,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, { struct scx_sched *sch; s32 level = parent ? 
parent->level + 1 : 0; - s32 node, cpu, ret; + s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids; sch = kzalloc_flex(*sch, ancestors, level); if (!sch) @@ -5848,8 +5883,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, goto err_free_pnode; } - for_each_possible_cpu(cpu) - init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch); + for_each_possible_cpu(cpu) { + ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch); + if (ret) { + bypass_fail_cpu = cpu; + goto err_free_pcpu; + } + } for_each_possible_cpu(cpu) { struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); @@ -5931,6 +5971,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, err_stop_helper: kthread_destroy_worker(sch->helper); err_free_pcpu: + for_each_possible_cpu(cpu) { + if (cpu == bypass_fail_cpu) + break; + exit_dsq(bypass_dsq(sch, cpu)); + } free_percpu(sch->pcpu); err_free_pnode: for_each_node_state(node, N_POSSIBLE) @@ -7173,7 +7218,7 @@ void __init init_sched_ext_class(void) int n = cpu_to_node(cpu); /* local_dsq's sch will be set during scx_root_enable() */ - init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL); + BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL)); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -7872,11 +7917,21 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_a if (!dsq) return -ENOMEM; + /* + * init_dsq() must be called in GFP_KERNEL context. Init it with NULL + * @sch and update afterwards. 
+ */ + ret = init_dsq(dsq, dsq_id, NULL); + if (ret) { + kfree(dsq); + return ret; + } + rcu_read_lock(); sch = scx_prog_sched(aux); if (sch) { - init_dsq(dsq, dsq_id, sch); + dsq->sched = sch; ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, dsq_hash_params); } else { @@ -7884,8 +7939,10 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_a } rcu_read_unlock(); - if (ret) + if (ret) { + exit_dsq(dsq); kfree(dsq); + } return ret; } -- cgit v1.2.3 From 35250720d6ed1e83e0d1e12b7e8bf7b8316d7d58 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Factor out nldsq_cursor_next_task() and nldsq_cursor_lost_task() Factor out cursor-based DSQ iteration from bpf_iter_scx_dsq_next() into nldsq_cursor_next_task() and the task-lost check from scx_dsq_move() into nldsq_cursor_lost_task() to prepare for reuse. As ->priv is only used to record dsq->seq for cursors, update INIT_DSQ_LIST_CURSOR() to take the DSQ pointer and set ->priv from dsq->seq so that users don't have to read it manually. Move scx_dsq_iter_flags enum earlier so nldsq_cursor_next_task() can use SCX_DSQ_ITER_REV. bypass_lb_cpu() now sets cursor.priv to dsq->seq but doesn't use it. 
Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 6 +- kernel/sched/ext.c | 154 +++++++++++++++++++++++++++++----------------- 2 files changed, 102 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 98cc1f41b91e..303f57dfb947 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -157,11 +157,11 @@ struct scx_dsq_list_node { u32 priv; /* can be used by iter cursor */ }; -#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \ +#define INIT_DSQ_LIST_CURSOR(__cursor, __dsq, __flags) \ (struct scx_dsq_list_node) { \ - .node = LIST_HEAD_INIT((__node).node), \ + .node = LIST_HEAD_INIT((__cursor).node), \ .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ - .priv = (__priv), \ + .priv = READ_ONCE((__dsq)->seq), \ } struct scx_sched; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index aea09eb36873..f51e4c20cd95 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -570,9 +570,22 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, return true; } +enum scx_dsq_iter_flags { + /* iterate in the reverse dispatch order */ + SCX_DSQ_ITER_REV = 1U << 16, + + __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, + __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, + + __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, + __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | + __SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME, +}; + /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being iterated + * @dsq: non-local dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -612,6 +625,85 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ (p) = nldsq_next_task((dsq), (p), false)) +/** + * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ + * @cursor: 
scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() + * @dsq: non-local dsq being iterated + * + * Find the next task in a cursor based iteration. The caller must have + * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock + * between the iteration steps. + * + * Only tasks which were queued before @cursor was initialized are visible. This + * bounds the iteration and guarantees that vtime never jumps in the other + * direction while iterating. + */ +static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor, + struct scx_dispatch_q *dsq) +{ + bool rev = cursor->flags & SCX_DSQ_ITER_REV; + struct task_struct *p; + + lockdep_assert_held(&dsq->lock); + BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR)); + + if (list_empty(&cursor->node)) + p = NULL; + else + p = container_of(cursor, struct task_struct, scx.dsq_list); + + /* skip cursors and tasks that were queued after @cursor init */ + do { + p = nldsq_next_task(dsq, p, rev); + } while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq))); + + if (p) { + if (rev) + list_move_tail(&cursor->node, &p->scx.dsq_list.node); + else + list_move(&cursor->node, &p->scx.dsq_list.node); + } else { + list_del_init(&cursor->node); + } + + return p; +} + +/** + * nldsq_cursor_lost_task - Test whether someone else took the task since iteration + * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() + * @rq: rq @p was on + * @dsq: dsq @p was on + * @p: target task + * + * @p is a task returned by nldsq_cursor_next_task(). The locks may have been + * dropped and re-acquired inbetween. Verify that no one else took or is in the + * process of taking @p from @dsq. + * + * On %false return, the caller can assume full ownership of @p. 
+ */ +static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor, + struct rq *rq, struct scx_dispatch_q *dsq, + struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + lockdep_assert_held(&dsq->lock); + + /* + * @p could have already left $src_dsq, got re-enqueued, or be in the + * process of being consumed by someone else. + */ + if (unlikely(p->scx.dsq != dsq || + u32_before(cursor->priv, p->scx.dsq_seq) || + p->scx.holding_cpu >= 0)) + return true; + + /* if @p has stayed on @dsq, its rq couldn't have changed */ + if (WARN_ON_ONCE(rq != task_rq(p))) + return true; + + return false; +} /* * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] @@ -619,19 +711,6 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, * changes without breaking backward compatibility. Can be used with * bpf_for_each(). See bpf_iter_scx_dsq_*(). */ -enum scx_dsq_iter_flags { - /* iterate in the reverse dispatch order */ - SCX_DSQ_ITER_REV = 1U << 16, - - __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, - __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, - - __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, - __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | - __SCX_DSQ_ITER_HAS_SLICE | - __SCX_DSQ_ITER_HAS_VTIME, -}; - struct bpf_iter_scx_dsq_kern { struct scx_dsq_list_node cursor; struct scx_dispatch_q *dsq; @@ -4497,7 +4576,7 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor, struct rq *donor_rq = cpu_rq(donor); struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor); struct task_struct *p, *n; - struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0); s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; u32 nr_balanced = 0, min_delta_us; @@ -7542,14 +7621,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); - /* - * Did someone else get to it? 
@p could have already left $src_dsq, got - * re-enqueud, or be in the process of being consumed by someone else. - */ - if (unlikely(p->scx.dsq != src_dsq || - u32_before(kit->cursor.priv, p->scx.dsq_seq) || - p->scx.holding_cpu >= 0) || - WARN_ON_ONCE(src_rq != task_rq(p))) { + /* did someone else get to it while we dropped the locks? */ + if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) { raw_spin_unlock(&src_dsq->lock); goto out; } @@ -8188,8 +8261,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (!kit->dsq) return -ENOENT; - kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, - READ_ONCE(kit->dsq->seq)); + kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); return 0; } @@ -8203,41 +8275,13 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) { struct bpf_iter_scx_dsq_kern *kit = (void *)it; - bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; - struct task_struct *p; - unsigned long flags; if (!kit->dsq) return NULL; - raw_spin_lock_irqsave(&kit->dsq->lock, flags); + guard(raw_spinlock_irqsave)(&kit->dsq->lock); - if (list_empty(&kit->cursor.node)) - p = NULL; - else - p = container_of(&kit->cursor, struct task_struct, scx.dsq_list); - - /* - * Only tasks which were queued before the iteration started are - * visible. This bounds BPF iterations and guarantees that vtime never - * jumps in the other direction while iterating. 
- */ - do { - p = nldsq_next_task(kit->dsq, p, rev); - } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); - - if (p) { - if (rev) - list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); - else - list_move(&kit->cursor.node, &p->scx.dsq_list.node); - } else { - list_del_init(&kit->cursor.node); - } - - raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); - - return p; + return nldsq_cursor_next_task(&kit->cursor, kit->dsq); } /** -- cgit v1.2.3 From 84b1a0ea0b7c23dec240783a592e480780efe459 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs scx_bpf_dsq_reenq() currently only supports local DSQs. Extend it to support user-defined DSQs by adding a deferred re-enqueue mechanism similar to the local DSQ handling. Add per-cpu deferred_reenq_user_node/flags to scx_dsq_pcpu and deferred_reenq_users list to scx_rq. When scx_bpf_dsq_reenq() is called on a user DSQ, the DSQ's per-cpu node is added to the current rq's deferred list. process_deferred_reenq_users() then iterates the DSQ using the cursor helpers and re-enqueues each task. 
Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 6 ++ kernel/sched/ext.c | 128 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 1 + tools/sched_ext/scx_qmap.bpf.c | 57 +++++++++++++++++- 4 files changed, 190 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 303f57dfb947..e77504faa0bc 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -62,8 +62,14 @@ enum scx_dsq_id_flags { SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; +struct scx_deferred_reenq_user { + struct list_head node; + u64 flags; +}; + struct scx_dsq_pcpu { struct scx_dispatch_q *dsq; + struct scx_deferred_reenq_user deferred_reenq_user; }; /* diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f51e4c20cd95..805c6689c99a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1180,6 +1180,18 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq drl->flags |= reenq_flags; } + schedule_deferred(rq); + } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) { + struct rq *rq = this_rq(); + struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq)); + struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user; + + scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) { + if (list_empty(&dru->node)) + list_move_tail(&dru->node, &rq->scx.deferred_reenq_users); + dru->flags |= reenq_flags; + } + schedule_deferred(rq); } else { scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id); @@ -3784,12 +3796,108 @@ static void process_deferred_reenq_locals(struct rq *rq) } } +static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags) +{ + struct rq *locked_rq = rq; + struct scx_sched *sch = dsq->sched; + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0); + struct task_struct *p; + s32 nr_enqueued = 0; + + lockdep_assert_rq_held(rq); + + raw_spin_lock(&dsq->lock); + + 
while (likely(!READ_ONCE(sch->bypass_depth))) { + struct rq *task_rq; + + p = nldsq_cursor_next_task(&cursor, dsq); + if (!p) + break; + + if (!task_should_reenq(p, reenq_flags)) + continue; + + task_rq = task_rq(p); + + if (locked_rq != task_rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + if (unlikely(!raw_spin_rq_trylock(task_rq))) { + raw_spin_unlock(&dsq->lock); + raw_spin_rq_lock(task_rq); + raw_spin_lock(&dsq->lock); + } + locked_rq = task_rq; + + /* did we lose @p while switching locks? */ + if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p)) + continue; + } + + /* @p is on @dsq, its rq and @dsq are locked */ + dispatch_dequeue_locked(p, dsq); + raw_spin_unlock(&dsq->lock); + do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); + + if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { + raw_spin_rq_unlock(locked_rq); + locked_rq = NULL; + cpu_relax(); + } + + raw_spin_lock(&dsq->lock); + } + + list_del_init(&cursor.node); + raw_spin_unlock(&dsq->lock); + + if (locked_rq != rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(rq); + } +} + +static void process_deferred_reenq_users(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + while (true) { + struct scx_dispatch_q *dsq; + u64 reenq_flags = 0; + + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { + struct scx_deferred_reenq_user *dru = + list_first_entry_or_null(&rq->scx.deferred_reenq_users, + struct scx_deferred_reenq_user, + node); + struct scx_dsq_pcpu *dsq_pcpu; + + if (!dru) + return; + + dsq_pcpu = container_of(dru, struct scx_dsq_pcpu, + deferred_reenq_user); + dsq = dsq_pcpu->dsq; + swap(dru->flags, reenq_flags); + list_del_init(&dru->node); + } + + BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN); + reenq_user(rq, dsq, reenq_flags); + } +} + static void run_deferred(struct rq *rq) { process_ddsp_deferred_locals(rq); if (!list_empty(&rq->scx.deferred_reenq_locals)) process_deferred_reenq_locals(rq); + + if (!list_empty(&rq->scx.deferred_reenq_users)) + 
process_deferred_reenq_users(rq); } #ifdef CONFIG_NO_HZ_FULL @@ -4119,6 +4227,7 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); pcpu->dsq = dsq; + INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node); } return 0; @@ -4126,6 +4235,23 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, static void exit_dsq(struct scx_dispatch_q *dsq) { + s32 cpu; + + for_each_possible_cpu(cpu) { + struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); + struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user; + struct rq *rq = cpu_rq(cpu); + + /* + * There must have been a RCU grace period since the last + * insertion and @dsq should be off the deferred list by now. + */ + if (WARN_ON_ONCE(!list_empty(&dru->node))) { + guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); + list_del_init(&dru->node); + } + } + free_percpu(dsq->pcpu); } @@ -7308,6 +7434,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); raw_spin_lock_init(&rq->scx.deferred_reenq_lock); INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); + INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); @@ -8354,6 +8481,7 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, * supported: * * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) + * - User DSQs * * Re-enqueues are performed asynchronously. Can be called from anywhere. 
*/ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0794852524e7..893f89ce2a77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -810,6 +810,7 @@ struct scx_rq { raw_spinlock_t deferred_reenq_lock; struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */ + struct list_head deferred_reenq_users; /* user DSQs requesting reenq */ struct balance_callback deferred_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 83e8289e8c0c..a4a1b84fe359 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -26,8 +26,11 @@ enum consts { ONE_SEC_IN_NS = 1000000000, + ONE_MSEC_IN_NS = 1000000, + LOWPRI_INTV_NS = 10 * ONE_MSEC_IN_NS, SHARED_DSQ = 0, HIGHPRI_DSQ = 1, + LOWPRI_DSQ = 2, HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */ }; @@ -172,6 +175,9 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, if (!(tctx = lookup_task_ctx(p))) return -ESRCH; + if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD)) + return prev_cpu; + cpu = pick_direct_dispatch_cpu(p, prev_cpu); if (cpu >= 0) { @@ -242,6 +248,13 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* see lowpri_timerfn() */ + if (__COMPAT_has_generic_reenq() && + p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) { + scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags); + return; + } + /* if select_cpu() wasn't called, try direct dispatch */ if (!__COMPAT_is_enq_cpu_selected(enq_flags) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { @@ -873,6 +886,28 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) return 0; } +struct lowpri_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct lowpri_timer); +} lowpri_timer 
SEC(".maps"); + +/* + * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and + * the tasks are transferred to SHARED_DSQ. + */ +static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + scx_bpf_dsq_reenq(LOWPRI_DSQ, 0); + bpf_timer_start(timer, LOWPRI_INTV_NS, 0); + return 0; +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { u32 key = 0; @@ -894,14 +929,32 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) return ret; } + ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1); + if (ret) + return ret; + timer = bpf_map_lookup_elem(&monitor_timer, &key); if (!timer) return -ESRCH; - bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC); bpf_timer_set_callback(timer, monitor_timerfn); + ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0); + if (ret) + return ret; - return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); + if (__COMPAT_has_generic_reenq()) { + /* see lowpri_timerfn() */ + timer = bpf_map_lookup_elem(&lowpri_timer, &key); + if (!timer) + return -ESRCH; + bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, lowpri_timerfn); + ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0); + if (ret) + return ret; + } + + return 0; } void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) -- cgit v1.2.3 From 7203d77d6e04f83f7b78838eed099d9cac31700b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Simplify task state handling Task states (NONE, INIT, READY, ENABLED) were defined in a separate enum with unshifted values and then shifted when stored in scx_entity.flags. Simplify by defining them as pre-shifted values directly in scx_ent_flags and removing the separate scx_task_state enum. This removes the need for shifting when reading/writing state values. scx_get_task_state() now returns the masked flags value directly. scx_set_task_state() accepts the pre-shifted state value. scx_dump_task() shifts down for display to maintain readable output. No functional changes. 
Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 28 ++++++++++++++++------------ kernel/sched/ext.c | 19 +++++++++---------- 2 files changed, 25 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index e77504faa0bc..e822b374b17f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -93,7 +93,7 @@ struct scx_dispatch_q { struct rcu_head rcu; }; -/* scx_entity.flags */ +/* sched_ext_entity.flags */ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ @@ -101,21 +101,25 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ - SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + /* + * Bits 8 and 9 are used to carry task state: + * + * NONE ops.init_task() not called yet + * INIT ops.init_task() succeeded, but task can be cancelled + * READY fully initialized, but not in sched_ext + * ENABLED fully initialized and in sched_ext + */ + SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ SCX_TASK_STATE_BITS = 2, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, - SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ -}; - -/* scx_entity.flags & SCX_TASK_STATE_MASK */ -enum scx_task_state { - SCX_TASK_NONE, /* ops.init_task() not called yet */ - SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ - SCX_TASK_READY, /* fully initialized, but not in sched_ext */ - SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, - SCX_TASK_NR_STATES, + /* iteration cursor, 
not a task */ + SCX_TASK_CURSOR = 1 << 31, }; /* scx_entity.dsq_flags */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ee756f1a70e1..f55e1603fc8c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3284,18 +3284,16 @@ static struct cgroup *tg_cgrp(struct task_group *tg) #endif /* CONFIG_EXT_GROUP_SCHED */ -static enum scx_task_state scx_get_task_state(const struct task_struct *p) +static u32 scx_get_task_state(const struct task_struct *p) { - return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; + return p->scx.flags & SCX_TASK_STATE_MASK; } -static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) +static void scx_set_task_state(struct task_struct *p, u32 state) { - enum scx_task_state prev_state = scx_get_task_state(p); + u32 prev_state = scx_get_task_state(p); bool warn = false; - BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); - switch (state) { case SCX_TASK_NONE: break; @@ -3313,11 +3311,11 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) return; } - WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", + WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", prev_state, state, p->comm, p->pid); p->scx.flags &= ~SCX_TASK_STATE_MASK; - p->scx.flags |= state << SCX_TASK_STATE_SHIFT; + p->scx.flags |= state; } static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) @@ -5794,7 +5792,8 @@ static void scx_dump_task(struct scx_sched *sch, own_marker, sch_id_buf, jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", - scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, + scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT, + p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); dump_line(s, " sticky/holding_cpu=%d/%d 
dsq_id=%s", @@ -6558,7 +6557,7 @@ static struct scx_sched *find_parent_sched(struct cgroup *cgrp) static bool assert_task_ready_or_enabled(struct task_struct *p) { - enum scx_task_state state = scx_get_task_state(p); + u32 state = scx_get_task_state(p); switch (state) { case SCX_TASK_READY: -- cgit v1.2.3 From ce897abc21b2d5e74981ff2b848f3a08a580d50a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Add SCX_TASK_REENQ_REASON flags SCX_ENQ_REENQ indicates that a task is being re-enqueued but doesn't tell the BPF scheduler why. Add SCX_TASK_REENQ_REASON flags using bits 12-13 of p->scx.flags to communicate the reason during ops.enqueue(): - NONE: Not being reenqueued - KFUNC: Reenqueued by scx_bpf_dsq_reenq() and friends More reasons will be added. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 15 +++++++++++++++ kernel/sched/ext.c | 25 ++++++++++++++++++++++--- kernel/sched/ext_internal.h | 10 +++------- 3 files changed, 40 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index e822b374b17f..60a4f65d0174 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -118,6 +118,21 @@ enum scx_ent_flags { SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, + /* + * Bits 12 and 13 are used to carry reenqueue reason. In addition to + * %SCX_ENQ_REENQ flag, ops.enqueue() can also test for + * %SCX_TASK_REENQ_REASON_NONE to distinguish reenqueues. 
+ * + * NONE not being reenqueued + * KFUNC reenqueued by scx_bpf_dsq_reenq() and friends + */ + SCX_TASK_REENQ_REASON_SHIFT = 12, + SCX_TASK_REENQ_REASON_BITS = 2, + SCX_TASK_REENQ_REASON_MASK = ((1 << SCX_TASK_REENQ_REASON_BITS) - 1) << SCX_TASK_REENQ_REASON_SHIFT, + + SCX_TASK_REENQ_NONE = 0 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_KFUNC = 1 << SCX_TASK_REENQ_REASON_SHIFT, + /* iteration cursor, not a task */ SCX_TASK_CURSOR = 1 << 31, }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f55e1603fc8c..d5849ed4cd3e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3728,8 +3728,10 @@ static void process_ddsp_deferred_locals(struct rq *rq) } } -static bool task_should_reenq(struct task_struct *p, u64 reenq_flags) +static bool task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason) { + *reason = SCX_TASK_REENQ_KFUNC; + if (reenq_flags & SCX_REENQ_ANY) return true; return false; @@ -3751,6 +3753,7 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, scx.dsq_list.node) { struct scx_sched *task_sch = scx_task_sched(p); + u32 reason; /* * If @p is being migrated, @p's current CPU may not agree with @@ -3769,16 +3772,24 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) if (!scx_is_descendant(task_sch, sch)) continue; - if (!task_should_reenq(p, reenq_flags)) + if (!task_should_reenq(p, reenq_flags, &reason)) continue; dispatch_dequeue(rq, p); + + if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + p->scx.flags |= reason; + list_add_tail(&p->scx.dsq_list.node, &tasks); } list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { list_del_init(&p->scx.dsq_list.node); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; nr_enqueued++; } @@ -3832,12 +3843,13 @@ static void reenq_user(struct rq *rq, struct scx_dispatch_q 
*dsq, u64 reenq_flag while (likely(!READ_ONCE(sch->bypass_depth))) { struct rq *task_rq; + u32 reason; p = nldsq_cursor_next_task(&cursor, dsq); if (!p) break; - if (!task_should_reenq(p, reenq_flags)) + if (!task_should_reenq(p, reenq_flags, &reason)) continue; task_rq = task_rq(p); @@ -3860,8 +3872,15 @@ static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flag /* @p is on @dsq, its rq and @dsq are locked */ dispatch_dequeue_locked(p, dsq); raw_spin_unlock(&dsq->lock); + + if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + p->scx.flags |= reason; + do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { raw_spin_rq_unlock(locked_rq); locked_rq = NULL; diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index d9eda2e8701c..f8df73044515 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -1080,13 +1080,9 @@ enum scx_enq_flags { SCX_ENQ_PREEMPT = 1LLU << 32, /* - * The task being enqueued was previously enqueued on the current CPU's - * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was - * invoked in a ->cpu_release() callback, and the task is again - * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the - * task will not be scheduled on the CPU until at least the next invocation - * of the ->cpu_acquire() callback. + * The task being enqueued was previously enqueued on a DSQ, but was + * removed and is being re-enqueued. See SCX_TASK_REENQ_* flags to find + * out why a given task is being reenqueued. 
*/ SCX_ENQ_REENQ = 1LLU << 40, -- cgit v1.2.3 From 7a3aff163c77159d262217382ec0e9c06c847b46 Mon Sep 17 00:00:00 2001 From: Chaohai Chen Date: Thu, 5 Mar 2026 10:51:24 +0800 Subject: scsi: core: Drop using the host_lock to protect async_scan race condition Previously, host_lock was used to prevent bit-set conflicts in async_scan, but this approach introduced naked reads in some code paths. Convert async_scan from a bitfield to a bool type to eliminate bit-level conflicts entirely. Use __guarded_by(&scan_mutex) to indicate that the async_scan variable is protected by scan_mutex. Signed-off-by: Chaohai Chen Reviewed-by: Bart Van Assche Reviewed-by: John Garry Link: https://patch.msgid.link/20260305025125.3649517-1-wdhh6@aliyun.com Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_scan.c | 10 ++-------- include/scsi/scsi_host.h | 7 ++++--- 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 60c06fa4ec32..efcaf85ff699 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -1943,7 +1943,6 @@ static void scsi_sysfs_add_devices(struct Scsi_Host *shost) static struct async_scan_data *scsi_prep_async_scan(struct Scsi_Host *shost) { struct async_scan_data *data = NULL; - unsigned long flags; if (strncmp(scsi_scan_type, "sync", 4) == 0) return NULL; @@ -1962,9 +1961,7 @@ static struct async_scan_data *scsi_prep_async_scan(struct Scsi_Host *shost) goto err; init_completion(&data->prev_finished); - spin_lock_irqsave(shost->host_lock, flags); - shost->async_scan = 1; - spin_unlock_irqrestore(shost->host_lock, flags); + shost->async_scan = true; mutex_unlock(&shost->scan_mutex); spin_lock(&async_scan_lock); @@ -1992,7 +1989,6 @@ static struct async_scan_data *scsi_prep_async_scan(struct Scsi_Host *shost) static void scsi_finish_async_scan(struct async_scan_data *data) { struct Scsi_Host *shost; - unsigned long flags; if (!data) return; @@ -2012,9 +2008,7 @@ static 
void scsi_finish_async_scan(struct async_scan_data *data) scsi_sysfs_add_devices(shost); - spin_lock_irqsave(shost->host_lock, flags); - shost->async_scan = 0; - spin_unlock_irqrestore(shost->host_lock, flags); + shost->async_scan = false; mutex_unlock(&shost->scan_mutex); diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index f6e12565a81d..7e2011830ba4 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -660,6 +660,10 @@ struct Scsi_Host { */ unsigned nr_hw_queues; unsigned nr_maps; + + /* Asynchronous scan in progress */ + bool async_scan __guarded_by(&scan_mutex); + unsigned active_mode:2; /* @@ -678,9 +682,6 @@ struct Scsi_Host { /* Task mgmt function in progress */ unsigned tmf_in_progress:1; - /* Asynchronous scan in progress */ - unsigned async_scan:1; - /* Don't resume host in EH */ unsigned eh_noresume:1; -- cgit v1.2.3 From b5e21a29fe9459aef1e6b20b9315e8f3690f8f31 Mon Sep 17 00:00:00 2001 From: Can Guo Date: Thu, 5 Mar 2026 03:08:56 -0800 Subject: scsi: ufs: core: Add support to notify userspace of UniPro QoS events The UniPro stack manages to repair many potential Link problems without the need to notify the Application Layer. Repair mechanisms of the stack include L2 re-transmission and successful handling of PA_INIT.req. Nevertheless, any successful repair sequence requires Link bandwidth that is no longer available for the Application. Therefore, it may be useful for an Application to understand how often such repair attempts are made. The DME implements Quality of Service monitoring using a simple counting scheme, counting error events and comparing them against the number of correctly received or transmitted bytes. When the error counter exceeds a programmed threshold before the byte counter overflows, a DME_QoS.ind is issued to the Application and both counters are reset.
When the byte counter overflows before the error counter has reached the programmed threshold, both counters are reset without triggering a DME_QoS.ind. The DME provides Link quality monitoring for the following purposes: 1. Detection of re-occurring repaired fatal error conditions on the Link (PA_INIT loop). This kind of detection is useful if capabilities exchanged between local and peer permit a potential operation at a higher M-PHY Gear, but the physical interconnect between local and peer Device does not, or, after Line quality degradation, no longer satisfies channel characteristics. 2. Detection of degraded inbound or outbound Link quality, to allow an Application to issue an ADAPT sequence for a Link running in HS-G4 or higher HS Gears. This kind of detection is used to monitor a slowly degrading Link quality, e.g., one being affected by temperature and voltage variations, against the expected M-PHY bit error rate. Userspace can configure and enable UniPro QoS via UniPro QoS Attributes (via UFS BSG) and get notified by dme_qos_notification without polling UniPro QoS Status attribute. The dme_qos_notification attribute is a bitfield with the following bit assignments: Bit Description === ====================================== 0 DME QoS Monitor has been reset by host 1 QoS from TX is detected 2 QoS from RX is detected 3 QoS from PA_INIT is detected Signed-off-by: Can Guo Reviewed-by: Bart Van Assche Reviewed-by: Peter Wang Link: https://patch.msgid.link/20260305110856.959211-2-can.guo@oss.qualcomm.com Signed-off-by: Martin K. 
Petersen --- Documentation/ABI/testing/sysfs-driver-ufs | 23 +++++++++++++++++++++++ drivers/ufs/core/ufs-sysfs.c | 30 ++++++++++++++++++++++++++++++ drivers/ufs/core/ufshcd.c | 24 +++++++++++++++++++++--- include/ufs/ufshcd.h | 9 +++++++++ include/ufs/ufshci.h | 1 + 5 files changed, 84 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-driver-ufs b/Documentation/ABI/testing/sysfs-driver-ufs index a90612ab5780..3c422aac778b 100644 --- a/Documentation/ABI/testing/sysfs-driver-ufs +++ b/Documentation/ABI/testing/sysfs-driver-ufs @@ -1768,3 +1768,26 @@ Description: ==================== =========================== The attribute is read only. + +What: /sys/bus/platform/drivers/ufshcd/*/dme_qos_notification +What: /sys/bus/platform/devices/*.ufs/dme_qos_notification +Date: March 2026 +Contact: Can Guo +Description: + This attribute reports and clears pending DME (Device Management + Entity) Quality of Service (QoS) notifications. This attribute + is a bitfield with the following bit assignments: + + Bit Description + === ====================================== + 0 DME QoS Monitor has been reset by host + 1 QoS from TX is detected + 2 QoS from RX is detected + 3 QoS from PA_INIT is detected + + Reading this attribute returns the pending DME QoS notification + bits. Writing '0' to this attribute clears pending DME QoS + notification bits. Writing any non-zero value is invalid and + will be rejected. + + The attribute is read/write. 
diff --git a/drivers/ufs/core/ufs-sysfs.c b/drivers/ufs/core/ufs-sysfs.c index 384d958615d7..99af3c73f1af 100644 --- a/drivers/ufs/core/ufs-sysfs.c +++ b/drivers/ufs/core/ufs-sysfs.c @@ -605,6 +605,34 @@ static ssize_t device_lvl_exception_id_show(struct device *dev, return sysfs_emit(buf, "%llu\n", exception_id); } +static ssize_t dme_qos_notification_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return sysfs_emit(buf, "0x%x\n", atomic_read(&hba->dme_qos_notification)); +} + +static ssize_t dme_qos_notification_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + unsigned int value; + + if (kstrtouint(buf, 0, &value)) + return -EINVAL; + + /* the only supported usecase is to reset the dme_qos_notification */ + if (value) + return -EINVAL; + + atomic_set(&hba->dme_qos_notification, 0); + + return count; +} + static DEVICE_ATTR_RW(rpm_lvl); static DEVICE_ATTR_RO(rpm_target_dev_state); static DEVICE_ATTR_RO(rpm_target_link_state); @@ -621,6 +649,7 @@ static DEVICE_ATTR_RW(pm_qos_enable); static DEVICE_ATTR_RO(critical_health); static DEVICE_ATTR_RW(device_lvl_exception_count); static DEVICE_ATTR_RO(device_lvl_exception_id); +static DEVICE_ATTR_RW(dme_qos_notification); static struct attribute *ufs_sysfs_ufshcd_attrs[] = { &dev_attr_rpm_lvl.attr, @@ -639,6 +668,7 @@ static struct attribute *ufs_sysfs_ufshcd_attrs[] = { &dev_attr_critical_health.attr, &dev_attr_device_lvl_exception_count.attr, &dev_attr_device_lvl_exception_id.attr, + &dev_attr_dme_qos_notification.attr, NULL }; diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 017d05ef94e2..8658e6dc8634 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -6966,10 +6966,19 @@ static irqreturn_t ufshcd_update_uic_error(struct ufs_hba *hba) } reg = ufshcd_readl(hba, REG_UIC_ERROR_CODE_DME); - if ((reg & 
UIC_DME_ERROR) && - (reg & UIC_DME_ERROR_CODE_MASK)) { + if (reg & UIC_DME_ERROR) { ufshcd_update_evt_hist(hba, UFS_EVT_DME_ERR, reg); - hba->uic_error |= UFSHCD_UIC_DME_ERROR; + + if (reg & UIC_DME_ERROR_CODE_MASK) + hba->uic_error |= UFSHCD_UIC_DME_ERROR; + + if (reg & UIC_DME_QOS_MASK) { + atomic_set(&hba->dme_qos_notification, + reg & UIC_DME_QOS_MASK); + if (hba->dme_qos_sysfs_handle) + sysfs_notify_dirent(hba->dme_qos_sysfs_handle); + } + retval |= IRQ_HANDLED; } @@ -9101,6 +9110,12 @@ static int ufshcd_post_device_init(struct ufs_hba *hba) /* UFS device is also active now */ ufshcd_set_ufs_dev_active(hba); + + /* Indicate that DME QoS Monitor has been reset */ + atomic_set(&hba->dme_qos_notification, 0x1); + if (hba->dme_qos_sysfs_handle) + sysfs_notify_dirent(hba->dme_qos_sysfs_handle); + ufshcd_force_reset_auto_bkops(hba); ufshcd_set_timestamp_attr(hba); @@ -9733,6 +9748,7 @@ static void ufshcd_hba_exit(struct ufs_hba *hba) hba->is_powered = false; ufs_put_device_desc(hba); } + sysfs_put(hba->dme_qos_sysfs_handle); } static int ufshcd_execute_start_stop(struct scsi_device *sdev, @@ -11052,6 +11068,8 @@ initialized: goto out_disable; ufs_sysfs_add_nodes(hba->dev); + hba->dme_qos_sysfs_handle = sysfs_get_dirent(hba->dev->kobj.sd, + "dme_qos_notification"); async_schedule(ufshcd_async_scan, hba); device_enable_async_suspend(dev); diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 8563b6648976..182f301c11e7 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -943,6 +943,11 @@ enum ufshcd_mcq_opr { * @critical_health_count: count of critical health exceptions * @dev_lvl_exception_count: count of device level exceptions since last reset * @dev_lvl_exception_id: vendor specific information about the device level exception event. + * @dme_qos_notification: Bitfield of pending DME Quality of Service (QoS) + * events. Bits[3:1] reflect the corresponding bits of UIC DME Error Code + * field within the Host Controller's UECDME register. 
Bit[0] is a flag + * indicating that the DME QoS Monitor has been reset by the host. + * @dme_qos_sysfs_handle: handle for 'dme_qos_notification' sysfs entry * @rpmbs: list of OP-TEE RPMB devices (one per RPMB region) */ struct ufs_hba { @@ -1116,6 +1121,10 @@ struct ufs_hba { int critical_health_count; atomic_t dev_lvl_exception_count; u64 dev_lvl_exception_id; + + atomic_t dme_qos_notification; + struct kernfs_node *dme_qos_sysfs_handle; + u32 vcc_off_delay_us; struct list_head rpmbs; }; diff --git a/include/ufs/ufshci.h b/include/ufs/ufshci.h index 806fdaf52bd9..49a3a279e448 100644 --- a/include/ufs/ufshci.h +++ b/include/ufs/ufshci.h @@ -271,6 +271,7 @@ enum { /* UECDME - Host UIC Error Code DME 48h */ #define UIC_DME_ERROR 0x80000000 #define UIC_DME_ERROR_CODE_MASK 0x1 +#define UIC_DME_QOS_MASK 0xE /* UTRIACR - Interrupt Aggregation control register - 0x4Ch */ #define INT_AGGR_TIMEOUT_VAL_MASK 0xFF -- cgit v1.2.3 From b8a177f18df1b439aac708da2d8bd9fcd68bb1eb Mon Sep 17 00:00:00 2001 From: Ketan Patil Date: Thu, 26 Feb 2026 16:31:10 +0000 Subject: memory: tegra: Group error handling related registers Group MC error related registers into a struct as they could have SoC specific values. Tegra264 has different register offsets than the existing devices and so in order to add support for Tegra264 we need to first make this change. 
Signed-off-by: Ketan Patil Reviewed-by: Jon Hunter Tested-by: Jon Hunter Link: https://patch.msgid.link/20260226163115.1152181-2-ketanp@nvidia.com Signed-off-by: Krzysztof Kozlowski --- drivers/memory/tegra/mc.c | 47 ++++++++++++++++++++++++++++------------- drivers/memory/tegra/mc.h | 16 +------------- drivers/memory/tegra/tegra114.c | 3 ++- drivers/memory/tegra/tegra124.c | 4 +++- drivers/memory/tegra/tegra186.c | 3 ++- drivers/memory/tegra/tegra194.c | 3 ++- drivers/memory/tegra/tegra20.c | 3 ++- drivers/memory/tegra/tegra210.c | 3 ++- drivers/memory/tegra/tegra234.c | 3 ++- drivers/memory/tegra/tegra30.c | 3 ++- include/soc/tegra/mc.h | 22 ++++++++++++++++++- 11 files changed, 71 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/drivers/memory/tegra/mc.c b/drivers/memory/tegra/mc.c index 67a0b0c07712..63f402aa1976 100644 --- a/drivers/memory/tegra/mc.c +++ b/drivers/memory/tegra/mc.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2014-2025 NVIDIA CORPORATION. All rights reserved. + * Copyright (C) 2014-2026 NVIDIA CORPORATION. All rights reserved. 
*/ #include @@ -56,6 +56,23 @@ static const struct of_device_id tegra_mc_of_match[] = { }; MODULE_DEVICE_TABLE(of, tegra_mc_of_match); +const struct tegra_mc_regs tegra20_mc_regs = { + .cfg_channel_enable = 0xdf8, + .err_status = 0x08, + .err_add = 0x0c, + .err_add_hi = 0x11fc, + .err_vpr_status = 0x654, + .err_vpr_add = 0x658, + .err_sec_status = 0x67c, + .err_sec_add = 0x680, + .err_mts_status = 0x9b0, + .err_mts_add = 0x9b4, + .err_gen_co_status = 0xc00, + .err_gen_co_add = 0xc04, + .err_route_status = 0x9c0, + .err_route_add = 0x9c4, +}; + static void tegra_mc_devm_action_put_device(void *data) { struct tegra_mc *mc = data; @@ -591,37 +608,37 @@ irqreturn_t tegra30_mc_handle_irq(int irq, void *data) switch (intmask) { case MC_INT_DECERR_VPR: - status_reg = MC_ERR_VPR_STATUS; - addr_reg = MC_ERR_VPR_ADR; + status_reg = mc->soc->regs->err_vpr_status; + addr_reg = mc->soc->regs->err_vpr_add; break; case MC_INT_SECERR_SEC: - status_reg = MC_ERR_SEC_STATUS; - addr_reg = MC_ERR_SEC_ADR; + status_reg = mc->soc->regs->err_sec_status; + addr_reg = mc->soc->regs->err_sec_add; break; case MC_INT_DECERR_MTS: - status_reg = MC_ERR_MTS_STATUS; - addr_reg = MC_ERR_MTS_ADR; + status_reg = mc->soc->regs->err_mts_status; + addr_reg = mc->soc->regs->err_mts_add; break; case MC_INT_DECERR_GENERALIZED_CARVEOUT: - status_reg = MC_ERR_GENERALIZED_CARVEOUT_STATUS; - addr_reg = MC_ERR_GENERALIZED_CARVEOUT_ADR; + status_reg = mc->soc->regs->err_gen_co_status; + addr_reg = mc->soc->regs->err_gen_co_add; break; case MC_INT_DECERR_ROUTE_SANITY: - status_reg = MC_ERR_ROUTE_SANITY_STATUS; - addr_reg = MC_ERR_ROUTE_SANITY_ADR; + status_reg = mc->soc->regs->err_route_status; + addr_reg = mc->soc->regs->err_route_add; break; default: - status_reg = MC_ERR_STATUS; - addr_reg = MC_ERR_ADR; + status_reg = mc->soc->regs->err_status; + addr_reg = mc->soc->regs->err_add; #ifdef CONFIG_PHYS_ADDR_T_64BIT if (mc->soc->has_addr_hi_reg) - addr_hi_reg = MC_ERR_ADR_HI; + addr_hi_reg = 
mc->soc->regs->err_add_hi; #endif break; } @@ -874,7 +891,7 @@ static void tegra_mc_num_channel_enabled(struct tegra_mc *mc) unsigned int i; u32 value; - value = mc_ch_readl(mc, 0, MC_EMEM_ADR_CFG_CHANNEL_ENABLE); + value = mc_ch_readl(mc, 0, mc->soc->regs->cfg_channel_enable); if (value <= 0) { mc->num_channels = mc->soc->num_channels; return; diff --git a/drivers/memory/tegra/mc.h b/drivers/memory/tegra/mc.h index 1d97cf4d3a94..bbe3e2690c64 100644 --- a/drivers/memory/tegra/mc.h +++ b/drivers/memory/tegra/mc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2014-2025 NVIDIA CORPORATION. All rights reserved. + * Copyright (C) 2014-2026 NVIDIA CORPORATION. All rights reserved. */ #ifndef MEMORY_TEGRA_MC_H @@ -14,8 +14,6 @@ #define MC_INTSTATUS 0x00 #define MC_INTMASK 0x04 -#define MC_ERR_STATUS 0x08 -#define MC_ERR_ADR 0x0c #define MC_GART_ERROR_REQ 0x30 #define MC_EMEM_ADR_CFG 0x54 #define MC_DECERR_EMEM_OTHERS_STATUS 0x58 @@ -43,19 +41,7 @@ #define MC_EMEM_ARB_OVERRIDE 0xe8 #define MC_TIMING_CONTROL_DBG 0xf8 #define MC_TIMING_CONTROL 0xfc -#define MC_ERR_VPR_STATUS 0x654 -#define MC_ERR_VPR_ADR 0x658 -#define MC_ERR_SEC_STATUS 0x67c -#define MC_ERR_SEC_ADR 0x680 -#define MC_ERR_MTS_STATUS 0x9b0 -#define MC_ERR_MTS_ADR 0x9b4 -#define MC_ERR_ROUTE_SANITY_STATUS 0x9c0 -#define MC_ERR_ROUTE_SANITY_ADR 0x9c4 -#define MC_ERR_GENERALIZED_CARVEOUT_STATUS 0xc00 -#define MC_ERR_GENERALIZED_CARVEOUT_ADR 0xc04 -#define MC_EMEM_ADR_CFG_CHANNEL_ENABLE 0xdf8 #define MC_GLOBAL_INTSTATUS 0xf24 -#define MC_ERR_ADR_HI 0x11fc #define MC_INT_DECERR_ROUTE_SANITY BIT(20) #define MC_INT_DECERR_GENERALIZED_CARVEOUT BIT(17) diff --git a/drivers/memory/tegra/tegra114.c b/drivers/memory/tegra/tegra114.c index 41350570c815..ea7e4c7bb5f8 100644 --- a/drivers/memory/tegra/tegra114.c +++ b/drivers/memory/tegra/tegra114.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2014 NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (C) 2014-2026 NVIDIA CORPORATION. All rights reserved. */ #include @@ -1114,4 +1114,5 @@ const struct tegra_mc_soc tegra114_mc_soc = { .resets = tegra114_mc_resets, .num_resets = ARRAY_SIZE(tegra114_mc_resets), .ops = &tegra30_mc_ops, + .regs = &tegra20_mc_regs, }; diff --git a/drivers/memory/tegra/tegra124.c b/drivers/memory/tegra/tegra124.c index 991d4f7bc070..f0cfe14bb475 100644 --- a/drivers/memory/tegra/tegra124.c +++ b/drivers/memory/tegra/tegra124.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2014 NVIDIA CORPORATION. All rights reserved. + * Copyright (C) 2014-2026 NVIDIA CORPORATION. All rights reserved. */ #include @@ -1275,6 +1275,7 @@ const struct tegra_mc_soc tegra124_mc_soc = { .num_resets = ARRAY_SIZE(tegra124_mc_resets), .icc_ops = &tegra124_mc_icc_ops, .ops = &tegra30_mc_ops, + .regs = &tegra20_mc_regs, }; #endif /* CONFIG_ARCH_TEGRA_124_SOC */ @@ -1307,5 +1308,6 @@ const struct tegra_mc_soc tegra132_mc_soc = { .num_resets = ARRAY_SIZE(tegra124_mc_resets), .icc_ops = &tegra124_mc_icc_ops, .ops = &tegra30_mc_ops, + .regs = &tegra20_mc_regs, }; #endif /* CONFIG_ARCH_TEGRA_132_SOC */ diff --git a/drivers/memory/tegra/tegra186.c b/drivers/memory/tegra/tegra186.c index aee11457bf8e..51e2dd628fb4 100644 --- a/drivers/memory/tegra/tegra186.c +++ b/drivers/memory/tegra/tegra186.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2017-2025 NVIDIA CORPORATION. All rights reserved. + * Copyright (C) 2017-2026 NVIDIA CORPORATION. All rights reserved. 
*/ #include @@ -914,5 +914,6 @@ const struct tegra_mc_soc tegra186_mc_soc = { .ops = &tegra186_mc_ops, .ch_intmask = 0x0000000f, .global_intstatus_channel_shift = 0, + .regs = &tegra20_mc_regs, }; #endif diff --git a/drivers/memory/tegra/tegra194.c b/drivers/memory/tegra/tegra194.c index 26035ac3a1eb..5b7ff2dd6812 100644 --- a/drivers/memory/tegra/tegra194.c +++ b/drivers/memory/tegra/tegra194.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2017-2021 NVIDIA CORPORATION. All rights reserved. + * Copyright (C) 2017-2026 NVIDIA CORPORATION. All rights reserved. */ #include @@ -1358,4 +1358,5 @@ const struct tegra_mc_soc tegra194_mc_soc = { .icc_ops = &tegra_mc_icc_ops, .ch_intmask = 0x00000f00, .global_intstatus_channel_shift = 8, + .regs = &tegra20_mc_regs, }; diff --git a/drivers/memory/tegra/tegra20.c b/drivers/memory/tegra/tegra20.c index 4748113bfe9d..1b2b598ab564 100644 --- a/drivers/memory/tegra/tegra20.c +++ b/drivers/memory/tegra/tegra20.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2012 NVIDIA CORPORATION. All rights reserved. + * Copyright (C) 2012-2026 NVIDIA CORPORATION. All rights reserved. */ #include @@ -778,4 +778,5 @@ const struct tegra_mc_soc tegra20_mc_soc = { .num_resets = ARRAY_SIZE(tegra20_mc_resets), .icc_ops = &tegra20_mc_icc_ops, .ops = &tegra20_mc_ops, + .regs = &tegra20_mc_regs, }; diff --git a/drivers/memory/tegra/tegra210.c b/drivers/memory/tegra/tegra210.c index 3c2949c16fde..e166b33848e9 100644 --- a/drivers/memory/tegra/tegra210.c +++ b/drivers/memory/tegra/tegra210.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2015 NVIDIA CORPORATION. All rights reserved. + * Copyright (C) 2015-2026 NVIDIA CORPORATION. All rights reserved. 
*/ #include @@ -1287,4 +1287,5 @@ const struct tegra_mc_soc tegra210_mc_soc = { .resets = tegra210_mc_resets, .num_resets = ARRAY_SIZE(tegra210_mc_resets), .ops = &tegra30_mc_ops, + .regs = &tegra20_mc_regs, }; diff --git a/drivers/memory/tegra/tegra234.c b/drivers/memory/tegra/tegra234.c index 5f57cea48b62..512d054d7592 100644 --- a/drivers/memory/tegra/tegra234.c +++ b/drivers/memory/tegra/tegra234.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (C) 2022-2026, NVIDIA CORPORATION. All rights reserved. */ #include @@ -1152,4 +1152,5 @@ const struct tegra_mc_soc tegra234_mc_soc = { * supported. */ .num_carveouts = 32, + .regs = &tegra20_mc_regs, }; diff --git a/drivers/memory/tegra/tegra30.c b/drivers/memory/tegra/tegra30.c index a6bcde4b92c0..337501a30a73 100644 --- a/drivers/memory/tegra/tegra30.c +++ b/drivers/memory/tegra/tegra30.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2014 NVIDIA CORPORATION. All rights reserved. + * Copyright (C) 2014-2026 NVIDIA CORPORATION. All rights reserved. 
*/ #include @@ -1400,4 +1400,5 @@ const struct tegra_mc_soc tegra30_mc_soc = { .num_resets = ARRAY_SIZE(tegra30_mc_resets), .icc_ops = &tegra30_mc_icc_ops, .ops = &tegra30_mc_ops, + .regs = &tegra20_mc_regs, }; diff --git a/include/soc/tegra/mc.h b/include/soc/tegra/mc.h index 6ee4c59db620..372f47e824d5 100644 --- a/include/soc/tegra/mc.h +++ b/include/soc/tegra/mc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2014 NVIDIA Corporation + * Copyright (C) 2014-2026 NVIDIA Corporation */ #ifndef __SOC_TEGRA_MC_H__ @@ -168,6 +168,23 @@ struct tegra_mc_ops { int (*probe_device)(struct tegra_mc *mc, struct device *dev); }; +struct tegra_mc_regs { + unsigned int cfg_channel_enable; + unsigned int err_status; + unsigned int err_add; + unsigned int err_add_hi; + unsigned int err_vpr_status; + unsigned int err_vpr_add; + unsigned int err_sec_status; + unsigned int err_sec_add; + unsigned int err_mts_status; + unsigned int err_mts_add; + unsigned int err_gen_co_status; + unsigned int err_gen_co_add; + unsigned int err_route_status; + unsigned int err_route_add; +}; + struct tegra_mc_soc { const struct tegra_mc_client *clients; unsigned int num_clients; @@ -196,6 +213,7 @@ struct tegra_mc_soc { const struct tegra_mc_icc_ops *icc_ops; const struct tegra_mc_ops *ops; + const struct tegra_mc_regs *regs; }; struct tegra_mc { @@ -256,4 +274,6 @@ tegra_mc_get_carveout_info(struct tegra_mc *mc, unsigned int id, } #endif +extern const struct tegra_mc_regs tegra20_mc_regs; + #endif /* __SOC_TEGRA_MC_H__ */ -- cgit v1.2.3 From 4d865a2374037d2d0842f88822fd753f0918b370 Mon Sep 17 00:00:00 2001 From: Ketan Patil Date: Thu, 26 Feb 2026 16:31:12 +0000 Subject: memory: tegra: Add support for multiple IRQs Add support to handle multiple MC interrupt lines, as supported by Tegra264. Turn the single IRQ handler callback into a counted array to allow specifying a separate handler for each interrupt. 
Move IRQ handlers into tegra_mc_soc struct, so as to specify SoC specific values. Signed-off-by: Ketan Patil Reviewed-by: Jon Hunter Tested-by: Jon Hunter Link: https://patch.msgid.link/20260226163115.1152181-4-ketanp@nvidia.com Signed-off-by: Krzysztof Kozlowski --- drivers/memory/tegra/mc.c | 34 +++++++++++++++++++++------------- drivers/memory/tegra/mc.h | 1 + drivers/memory/tegra/tegra114.c | 2 ++ drivers/memory/tegra/tegra124.c | 4 ++++ drivers/memory/tegra/tegra186.c | 3 ++- drivers/memory/tegra/tegra194.c | 2 ++ drivers/memory/tegra/tegra20.c | 7 ++++++- drivers/memory/tegra/tegra210.c | 2 ++ drivers/memory/tegra/tegra234.c | 2 ++ drivers/memory/tegra/tegra30.c | 2 ++ include/soc/tegra/mc.h | 8 +++++--- 11 files changed, 49 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/memory/tegra/mc.c b/drivers/memory/tegra/mc.c index 63f402aa1976..8114574374d5 100644 --- a/drivers/memory/tegra/mc.c +++ b/drivers/memory/tegra/mc.c @@ -398,6 +398,10 @@ unsigned int tegra_mc_get_emem_device_count(struct tegra_mc *mc) } EXPORT_SYMBOL_GPL(tegra_mc_get_emem_device_count); +const irq_handler_t tegra30_mc_irq_handlers[] = { + tegra30_mc_handle_irq +}; + #if defined(CONFIG_ARCH_TEGRA_3x_SOC) || \ defined(CONFIG_ARCH_TEGRA_114_SOC) || \ defined(CONFIG_ARCH_TEGRA_124_SOC) || \ @@ -542,7 +546,6 @@ int tegra30_mc_probe(struct tegra_mc *mc) const struct tegra_mc_ops tegra30_mc_ops = { .probe = tegra30_mc_probe, - .handle_irq = tegra30_mc_handle_irq, }; #endif @@ -943,26 +946,31 @@ static int tegra_mc_probe(struct platform_device *pdev) tegra_mc_num_channel_enabled(mc); - if (mc->soc->ops && mc->soc->ops->handle_irq) { - mc->irq = platform_get_irq(pdev, 0); - if (mc->irq < 0) - return mc->irq; + if (mc->soc->handle_irq) { + unsigned int i; WARN(!mc->soc->client_id_mask, "missing client ID mask for this SoC\n"); + for (i = 0; i < mc->soc->num_interrupts; i++) { + int irq; + + irq = platform_get_irq(pdev, i); + if (irq < 0) + return irq; + + err = 
devm_request_irq(&pdev->dev, irq, mc->soc->handle_irq[i], 0, + dev_name(&pdev->dev), mc); + if (err < 0) { + dev_err(&pdev->dev, "failed to request IRQ#%u: %d\n", irq, err); + return err; + } + } + if (mc->soc->num_channels) mc_ch_writel(mc, MC_BROADCAST_CHANNEL, mc->soc->intmask, MC_INTMASK); else mc_writel(mc, mc->soc->intmask, MC_INTMASK); - - err = devm_request_irq(&pdev->dev, mc->irq, mc->soc->ops->handle_irq, 0, - dev_name(&pdev->dev), mc); - if (err < 0) { - dev_err(&pdev->dev, "failed to request IRQ#%u: %d\n", mc->irq, - err); - return err; - } } if (mc->soc->reset_ops) { diff --git a/drivers/memory/tegra/mc.h b/drivers/memory/tegra/mc.h index 5f816d703d81..34ce03ebc51c 100644 --- a/drivers/memory/tegra/mc.h +++ b/drivers/memory/tegra/mc.h @@ -193,6 +193,7 @@ extern const struct tegra_mc_ops tegra186_mc_ops; #endif irqreturn_t tegra30_mc_handle_irq(int irq, void *data); +extern const irq_handler_t tegra30_mc_irq_handlers[1]; extern const char * const tegra_mc_status_names[32]; extern const char * const tegra_mc_error_names[8]; diff --git a/drivers/memory/tegra/tegra114.c b/drivers/memory/tegra/tegra114.c index ea7e4c7bb5f8..fffb28eea57f 100644 --- a/drivers/memory/tegra/tegra114.c +++ b/drivers/memory/tegra/tegra114.c @@ -1115,4 +1115,6 @@ const struct tegra_mc_soc tegra114_mc_soc = { .num_resets = ARRAY_SIZE(tegra114_mc_resets), .ops = &tegra30_mc_ops, .regs = &tegra20_mc_regs, + .handle_irq = tegra30_mc_irq_handlers, + .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), }; diff --git a/drivers/memory/tegra/tegra124.c b/drivers/memory/tegra/tegra124.c index f0cfe14bb475..2cf733198782 100644 --- a/drivers/memory/tegra/tegra124.c +++ b/drivers/memory/tegra/tegra124.c @@ -1276,6 +1276,8 @@ const struct tegra_mc_soc tegra124_mc_soc = { .icc_ops = &tegra124_mc_icc_ops, .ops = &tegra30_mc_ops, .regs = &tegra20_mc_regs, + .handle_irq = tegra30_mc_irq_handlers, + .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), }; #endif /* CONFIG_ARCH_TEGRA_124_SOC */ @@ 
-1309,5 +1311,7 @@ const struct tegra_mc_soc tegra132_mc_soc = { .icc_ops = &tegra124_mc_icc_ops, .ops = &tegra30_mc_ops, .regs = &tegra20_mc_regs, + .handle_irq = tegra30_mc_irq_handlers, + .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), }; #endif /* CONFIG_ARCH_TEGRA_132_SOC */ diff --git a/drivers/memory/tegra/tegra186.c b/drivers/memory/tegra/tegra186.c index 51e2dd628fb4..eb1eaaffc79a 100644 --- a/drivers/memory/tegra/tegra186.c +++ b/drivers/memory/tegra/tegra186.c @@ -174,7 +174,6 @@ const struct tegra_mc_ops tegra186_mc_ops = { .remove = tegra186_mc_remove, .resume = tegra186_mc_resume, .probe_device = tegra186_mc_probe_device, - .handle_irq = tegra30_mc_handle_irq, }; #if defined(CONFIG_ARCH_TEGRA_186_SOC) @@ -915,5 +914,7 @@ const struct tegra_mc_soc tegra186_mc_soc = { .ch_intmask = 0x0000000f, .global_intstatus_channel_shift = 0, .regs = &tegra20_mc_regs, + .handle_irq = tegra30_mc_irq_handlers, + .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), }; #endif diff --git a/drivers/memory/tegra/tegra194.c b/drivers/memory/tegra/tegra194.c index 5b7ff2dd6812..cb0e7886857d 100644 --- a/drivers/memory/tegra/tegra194.c +++ b/drivers/memory/tegra/tegra194.c @@ -1359,4 +1359,6 @@ const struct tegra_mc_soc tegra194_mc_soc = { .ch_intmask = 0x00000f00, .global_intstatus_channel_shift = 8, .regs = &tegra20_mc_regs, + .handle_irq = tegra30_mc_irq_handlers, + .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), }; diff --git a/drivers/memory/tegra/tegra20.c b/drivers/memory/tegra/tegra20.c index 1b2b598ab564..6750b08d875f 100644 --- a/drivers/memory/tegra/tegra20.c +++ b/drivers/memory/tegra/tegra20.c @@ -761,9 +761,12 @@ static irqreturn_t tegra20_mc_handle_irq(int irq, void *data) return IRQ_HANDLED; } +static const irq_handler_t tegra20_mc_irq_handlers[] = { + tegra20_mc_handle_irq +}; + static const struct tegra_mc_ops tegra20_mc_ops = { .probe = tegra20_mc_probe, - .handle_irq = tegra20_mc_handle_irq, }; const struct tegra_mc_soc tegra20_mc_soc = { 
@@ -779,4 +782,6 @@ const struct tegra_mc_soc tegra20_mc_soc = { .icc_ops = &tegra20_mc_icc_ops, .ops = &tegra20_mc_ops, .regs = &tegra20_mc_regs, + .handle_irq = tegra20_mc_irq_handlers, + .num_interrupts = ARRAY_SIZE(tegra20_mc_irq_handlers), }; diff --git a/drivers/memory/tegra/tegra210.c b/drivers/memory/tegra/tegra210.c index e166b33848e9..8283601ab52c 100644 --- a/drivers/memory/tegra/tegra210.c +++ b/drivers/memory/tegra/tegra210.c @@ -1288,4 +1288,6 @@ const struct tegra_mc_soc tegra210_mc_soc = { .num_resets = ARRAY_SIZE(tegra210_mc_resets), .ops = &tegra30_mc_ops, .regs = &tegra20_mc_regs, + .handle_irq = tegra30_mc_irq_handlers, + .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), }; diff --git a/drivers/memory/tegra/tegra234.c b/drivers/memory/tegra/tegra234.c index 512d054d7592..9586d7528fb7 100644 --- a/drivers/memory/tegra/tegra234.c +++ b/drivers/memory/tegra/tegra234.c @@ -1153,4 +1153,6 @@ const struct tegra_mc_soc tegra234_mc_soc = { */ .num_carveouts = 32, .regs = &tegra20_mc_regs, + .handle_irq = tegra30_mc_irq_handlers, + .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), }; diff --git a/drivers/memory/tegra/tegra30.c b/drivers/memory/tegra/tegra30.c index 337501a30a73..ff89b9078772 100644 --- a/drivers/memory/tegra/tegra30.c +++ b/drivers/memory/tegra/tegra30.c @@ -1401,4 +1401,6 @@ const struct tegra_mc_soc tegra30_mc_soc = { .icc_ops = &tegra30_mc_icc_ops, .ops = &tegra30_mc_ops, .regs = &tegra20_mc_regs, + .handle_irq = tegra30_mc_irq_handlers, + .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), }; diff --git a/include/soc/tegra/mc.h b/include/soc/tegra/mc.h index 372f47e824d5..d07de04c0f33 100644 --- a/include/soc/tegra/mc.h +++ b/include/soc/tegra/mc.h @@ -10,10 +10,11 @@ #include #include #include +#include #include #include -#include #include +#include struct clk; struct device; @@ -164,7 +165,6 @@ struct tegra_mc_ops { int (*probe)(struct tegra_mc *mc); void (*remove)(struct tegra_mc *mc); int (*resume)(struct tegra_mc 
*mc); - irqreturn_t (*handle_irq)(int irq, void *data); int (*probe_device)(struct tegra_mc *mc, struct device *dev); }; @@ -214,6 +214,9 @@ struct tegra_mc_soc { const struct tegra_mc_icc_ops *icc_ops; const struct tegra_mc_ops *ops; const struct tegra_mc_regs *regs; + + const irq_handler_t *handle_irq; + unsigned int num_interrupts; }; struct tegra_mc { @@ -224,7 +227,6 @@ struct tegra_mc { void __iomem *bcast_ch_regs; void __iomem **ch_regs; struct clk *clk; - int irq; const struct tegra_mc_soc *soc; unsigned long tick; -- cgit v1.2.3 From 2e4cfaa78eb98d2623367818c859225c6b6bf701 Mon Sep 17 00:00:00 2001 From: Ketan Patil Date: Thu, 26 Feb 2026 16:31:13 +0000 Subject: memory: tegra: Group SoC specific fields Introduce new SoC specific fields in tegra_mc_soc struct for high address mask and error status type mask because Tegra264 has different values for these than the existing devices. Error status registers e.g. MC_ERR_STATUS_0 have a few bits which indicate the type of the error. In order to obtain such type of error from error status register, we use error status type mask. Similarly, these error status registers have bits which indicate the higher address bits of the address responsible for mc error. In order to obtain such higher address, we use high address mask. Make this change to prepare for adding MC interrupt support for Tegra264. 
Signed-off-by: Ketan Patil Reviewed-by: Jon Hunter Tested-by: Jon Hunter Link: https://patch.msgid.link/20260226163115.1152181-5-ketanp@nvidia.com [krzk: Fix checkpatch warning] Signed-off-by: Krzysztof Kozlowski --- drivers/memory/tegra/mc.c | 11 +++++++---- drivers/memory/tegra/mc.h | 2 -- drivers/memory/tegra/tegra114.c | 1 + drivers/memory/tegra/tegra124.c | 4 ++++ drivers/memory/tegra/tegra186.c | 2 ++ drivers/memory/tegra/tegra194.c | 2 ++ drivers/memory/tegra/tegra20.c | 1 + drivers/memory/tegra/tegra210.c | 2 ++ drivers/memory/tegra/tegra234.c | 2 ++ drivers/memory/tegra/tegra30.c | 1 + include/soc/tegra/mc.h | 2 ++ 11 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/memory/tegra/mc.c b/drivers/memory/tegra/mc.c index 8114574374d5..5d0d9b7fc534 100644 --- a/drivers/memory/tegra/mc.c +++ b/drivers/memory/tegra/mc.c @@ -658,9 +658,12 @@ irqreturn_t tegra30_mc_handle_irq(int irq, void *data) addr = mc_ch_readl(mc, channel, addr_hi_reg); else addr = mc_readl(mc, addr_hi_reg); - } else { + } else if (mc->soc->mc_addr_hi_mask) { addr = ((value >> MC_ERR_STATUS_ADR_HI_SHIFT) & - MC_ERR_STATUS_ADR_HI_MASK); + mc->soc->mc_addr_hi_mask); + } else { + dev_err_ratelimited(mc->dev, "Unable to determine high address!"); + return IRQ_NONE; } addr <<= 32; } @@ -685,11 +688,11 @@ irqreturn_t tegra30_mc_handle_irq(int irq, void *data) } } - type = (value & MC_ERR_STATUS_TYPE_MASK) >> + type = (value & mc->soc->mc_err_status_type_mask) >> MC_ERR_STATUS_TYPE_SHIFT; desc = tegra_mc_error_names[type]; - switch (value & MC_ERR_STATUS_TYPE_MASK) { + switch (value & mc->soc->mc_err_status_type_mask) { case MC_ERR_STATUS_TYPE_INVALID_SMMU_PAGE: perm[0] = ' '; perm[1] = '['; diff --git a/drivers/memory/tegra/mc.h b/drivers/memory/tegra/mc.h index 34ce03ebc51c..b286c2558fd5 100644 --- a/drivers/memory/tegra/mc.h +++ b/drivers/memory/tegra/mc.h @@ -78,10 +78,8 @@ #define MC_ERR_STATUS_TYPE_SHIFT 28 #define MC_ERR_STATUS_TYPE_INVALID_SMMU_PAGE 
(0x6 << 28) -#define MC_ERR_STATUS_TYPE_MASK (0x7 << 28) #define MC_ERR_STATUS_ADR_HI_SHIFT 20 -#define MC_ERR_STATUS_ADR_HI_MASK 0x3 #define MC_BROADCAST_CHANNEL ~0 diff --git a/drivers/memory/tegra/tegra114.c b/drivers/memory/tegra/tegra114.c index fffb28eea57f..3a061a2d881e 100644 --- a/drivers/memory/tegra/tegra114.c +++ b/drivers/memory/tegra/tegra114.c @@ -1117,4 +1117,5 @@ const struct tegra_mc_soc tegra114_mc_soc = { .regs = &tegra20_mc_regs, .handle_irq = tegra30_mc_irq_handlers, .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), + .mc_err_status_type_mask = (0x7 << 28), }; diff --git a/drivers/memory/tegra/tegra124.c b/drivers/memory/tegra/tegra124.c index 2cf733198782..4d394889c1e9 100644 --- a/drivers/memory/tegra/tegra124.c +++ b/drivers/memory/tegra/tegra124.c @@ -1278,6 +1278,8 @@ const struct tegra_mc_soc tegra124_mc_soc = { .regs = &tegra20_mc_regs, .handle_irq = tegra30_mc_irq_handlers, .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), + .mc_addr_hi_mask = 0x3, + .mc_err_status_type_mask = (0x7 << 28), }; #endif /* CONFIG_ARCH_TEGRA_124_SOC */ @@ -1313,5 +1315,7 @@ const struct tegra_mc_soc tegra132_mc_soc = { .regs = &tegra20_mc_regs, .handle_irq = tegra30_mc_irq_handlers, .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), + .mc_addr_hi_mask = 0x3, + .mc_err_status_type_mask = (0x7 << 28), }; #endif /* CONFIG_ARCH_TEGRA_132_SOC */ diff --git a/drivers/memory/tegra/tegra186.c b/drivers/memory/tegra/tegra186.c index eb1eaaffc79a..94cad76c52ac 100644 --- a/drivers/memory/tegra/tegra186.c +++ b/drivers/memory/tegra/tegra186.c @@ -916,5 +916,7 @@ const struct tegra_mc_soc tegra186_mc_soc = { .regs = &tegra20_mc_regs, .handle_irq = tegra30_mc_irq_handlers, .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), + .mc_addr_hi_mask = 0x3, + .mc_err_status_type_mask = (0x7 << 28), }; #endif diff --git a/drivers/memory/tegra/tegra194.c b/drivers/memory/tegra/tegra194.c index cb0e7886857d..38852b2a0f44 100644 --- 
a/drivers/memory/tegra/tegra194.c +++ b/drivers/memory/tegra/tegra194.c @@ -1361,4 +1361,6 @@ const struct tegra_mc_soc tegra194_mc_soc = { .regs = &tegra20_mc_regs, .handle_irq = tegra30_mc_irq_handlers, .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), + .mc_addr_hi_mask = 0x3, + .mc_err_status_type_mask = (0x7 << 28), }; diff --git a/drivers/memory/tegra/tegra20.c b/drivers/memory/tegra/tegra20.c index 6750b08d875f..a5cc770437ae 100644 --- a/drivers/memory/tegra/tegra20.c +++ b/drivers/memory/tegra/tegra20.c @@ -784,4 +784,5 @@ const struct tegra_mc_soc tegra20_mc_soc = { .regs = &tegra20_mc_regs, .handle_irq = tegra20_mc_irq_handlers, .num_interrupts = ARRAY_SIZE(tegra20_mc_irq_handlers), + .mc_err_status_type_mask = (0x7 << 28), }; diff --git a/drivers/memory/tegra/tegra210.c b/drivers/memory/tegra/tegra210.c index 8283601ab52c..aa606df8a679 100644 --- a/drivers/memory/tegra/tegra210.c +++ b/drivers/memory/tegra/tegra210.c @@ -1290,4 +1290,6 @@ const struct tegra_mc_soc tegra210_mc_soc = { .regs = &tegra20_mc_regs, .handle_irq = tegra30_mc_irq_handlers, .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), + .mc_addr_hi_mask = 0x3, + .mc_err_status_type_mask = (0x7 << 28), }; diff --git a/drivers/memory/tegra/tegra234.c b/drivers/memory/tegra/tegra234.c index 9586d7528fb7..67d5d4e01a65 100644 --- a/drivers/memory/tegra/tegra234.c +++ b/drivers/memory/tegra/tegra234.c @@ -1155,4 +1155,6 @@ const struct tegra_mc_soc tegra234_mc_soc = { .regs = &tegra20_mc_regs, .handle_irq = tegra30_mc_irq_handlers, .num_interrupts = ARRAY_SIZE(tegra30_mc_irq_handlers), + .mc_addr_hi_mask = 0x3, + .mc_err_status_type_mask = (0x7 << 28), }; diff --git a/drivers/memory/tegra/tegra30.c b/drivers/memory/tegra/tegra30.c index ff89b9078772..8a26a2f204e9 100644 --- a/drivers/memory/tegra/tegra30.c +++ b/drivers/memory/tegra/tegra30.c @@ -1403,4 +1403,5 @@ const struct tegra_mc_soc tegra30_mc_soc = { .regs = &tegra20_mc_regs, .handle_irq = tegra30_mc_irq_handlers, .num_interrupts 
= ARRAY_SIZE(tegra30_mc_irq_handlers), + .mc_err_status_type_mask = (0x7 << 28), }; diff --git a/include/soc/tegra/mc.h b/include/soc/tegra/mc.h index d07de04c0f33..b9b1763b10b5 100644 --- a/include/soc/tegra/mc.h +++ b/include/soc/tegra/mc.h @@ -217,6 +217,8 @@ struct tegra_mc_soc { const irq_handler_t *handle_irq; unsigned int num_interrupts; + unsigned int mc_addr_hi_mask; + unsigned int mc_err_status_type_mask; }; struct tegra_mc { -- cgit v1.2.3 From 9f2614510960f0761144d14e1b4c4d82e0c098e9 Mon Sep 17 00:00:00 2001 From: Ketan Patil Date: Thu, 26 Feb 2026 16:31:14 +0000 Subject: memory: tegra: Prepare for supporting multiple intmask registers Add a new structure for the intmask register e.g. MC_INTMASK_0 and its mask value. Add an array of these new structures to prepare for supporting multiple intmask registers. This is done in preparation for adding support for Tegra264 which supports multiple intmask registers. Signed-off-by: Ketan Patil Reviewed-by: Jon Hunter Tested-by: Jon Hunter Link: https://patch.msgid.link/20260226163115.1152181-6-ketanp@nvidia.com [krzk: Fix checkpatch warning] Signed-off-by: Krzysztof Kozlowski --- drivers/memory/tegra/mc.c | 16 +++++++++------- drivers/memory/tegra/tegra114.c | 12 ++++++++++-- drivers/memory/tegra/tegra124.c | 28 ++++++++++++++++++++++------ drivers/memory/tegra/tegra186.c | 14 +++++++++++--- drivers/memory/tegra/tegra194.c | 15 +++++++++++---- drivers/memory/tegra/tegra20.c | 14 +++++++++++--- drivers/memory/tegra/tegra210.c | 14 +++++++++++--- drivers/memory/tegra/tegra234.c | 15 +++++++++++---- drivers/memory/tegra/tegra264.c | 17 ++++++++++++----- drivers/memory/tegra/tegra30.c | 12 ++++++++++-- include/soc/tegra/mc.h | 8 +++++++- 11 files changed, 125 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/drivers/memory/tegra/mc.c b/drivers/memory/tegra/mc.c index 5d0d9b7fc534..dccebbed7833 100644 --- a/drivers/memory/tegra/mc.c +++ b/drivers/memory/tegra/mc.c @@ -586,9 +586,9 @@ irqreturn_t 
tegra30_mc_handle_irq(int irq, void *data) } /* mask all interrupts to avoid flooding */ - status = mc_ch_readl(mc, channel, MC_INTSTATUS) & mc->soc->intmask; + status = mc_ch_readl(mc, channel, MC_INTSTATUS) & mc->soc->intmasks[0].mask; } else { - status = mc_readl(mc, MC_INTSTATUS) & mc->soc->intmask; + status = mc_readl(mc, MC_INTSTATUS) & mc->soc->intmasks[0].mask; } if (!status) @@ -969,11 +969,13 @@ static int tegra_mc_probe(struct platform_device *pdev) } } - if (mc->soc->num_channels) - mc_ch_writel(mc, MC_BROADCAST_CHANNEL, mc->soc->intmask, - MC_INTMASK); - else - mc_writel(mc, mc->soc->intmask, MC_INTMASK); + for (i = 0; i < mc->soc->num_intmasks; i++) { + if (mc->soc->num_channels) + mc_ch_writel(mc, MC_BROADCAST_CHANNEL, mc->soc->intmasks[i].mask, + mc->soc->intmasks[i].reg); + else + mc_writel(mc, mc->soc->intmasks[i].mask, mc->soc->intmasks[i].reg); + } } if (mc->soc->reset_ops) { diff --git a/drivers/memory/tegra/tegra114.c b/drivers/memory/tegra/tegra114.c index 3a061a2d881e..02dd4e26288a 100644 --- a/drivers/memory/tegra/tegra114.c +++ b/drivers/memory/tegra/tegra114.c @@ -1101,6 +1101,14 @@ static const struct tegra_mc_reset tegra114_mc_resets[] = { TEGRA114_MC_RESET(VI, 0x200, 0x204, 17), }; +static const struct tegra_mc_intmask tegra114_mc_intmasks[] = { + { + .reg = MC_INTMASK, + .mask = MC_INT_INVALID_SMMU_PAGE | MC_INT_SECURITY_VIOLATION | + MC_INT_DECERR_EMEM, + }, +}; + const struct tegra_mc_soc tegra114_mc_soc = { .clients = tegra114_mc_clients, .num_clients = ARRAY_SIZE(tegra114_mc_clients), @@ -1108,8 +1116,8 @@ const struct tegra_mc_soc tegra114_mc_soc = { .atom_size = 32, .client_id_mask = 0x7f, .smmu = &tegra114_smmu_soc, - .intmask = MC_INT_INVALID_SMMU_PAGE | MC_INT_SECURITY_VIOLATION | - MC_INT_DECERR_EMEM, + .intmasks = tegra114_mc_intmasks, + .num_intmasks = ARRAY_SIZE(tegra114_mc_intmasks), .reset_ops = &tegra_mc_reset_ops_common, .resets = tegra114_mc_resets, .num_resets = ARRAY_SIZE(tegra114_mc_resets), diff --git 
a/drivers/memory/tegra/tegra124.c b/drivers/memory/tegra/tegra124.c index 4d394889c1e9..df87c5038625 100644 --- a/drivers/memory/tegra/tegra124.c +++ b/drivers/memory/tegra/tegra124.c @@ -1258,6 +1258,15 @@ static const struct tegra_smmu_soc tegra124_smmu_soc = { .num_asids = 128, }; +static const struct tegra_mc_intmask tegra124_mc_intmasks[] = { + { + .reg = MC_INTMASK, + .mask = MC_INT_DECERR_MTS | MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | + MC_INT_INVALID_APB_ASID_UPDATE | MC_INT_INVALID_SMMU_PAGE | + MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + }, +}; + const struct tegra_mc_soc tegra124_mc_soc = { .clients = tegra124_mc_clients, .num_clients = ARRAY_SIZE(tegra124_mc_clients), @@ -1267,9 +1276,8 @@ const struct tegra_mc_soc tegra124_mc_soc = { .smmu = &tegra124_smmu_soc, .emem_regs = tegra124_mc_emem_regs, .num_emem_regs = ARRAY_SIZE(tegra124_mc_emem_regs), - .intmask = MC_INT_DECERR_MTS | MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | - MC_INT_INVALID_APB_ASID_UPDATE | MC_INT_INVALID_SMMU_PAGE | - MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + .intmasks = tegra124_mc_intmasks, + .num_intmasks = ARRAY_SIZE(tegra124_mc_intmasks), .reset_ops = &tegra_mc_reset_ops_common, .resets = tegra124_mc_resets, .num_resets = ARRAY_SIZE(tegra124_mc_resets), @@ -1297,6 +1305,15 @@ static const struct tegra_smmu_soc tegra132_smmu_soc = { .num_asids = 128, }; +static const struct tegra_mc_intmask tegra132_mc_intmasks[] = { + { + .reg = MC_INTMASK, + .mask = MC_INT_DECERR_MTS | MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | + MC_INT_INVALID_APB_ASID_UPDATE | MC_INT_INVALID_SMMU_PAGE | + MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + }, +}; + const struct tegra_mc_soc tegra132_mc_soc = { .clients = tegra124_mc_clients, .num_clients = ARRAY_SIZE(tegra124_mc_clients), @@ -1304,9 +1321,8 @@ const struct tegra_mc_soc tegra132_mc_soc = { .atom_size = 32, .client_id_mask = 0x7f, .smmu = &tegra132_smmu_soc, - .intmask = MC_INT_DECERR_MTS | MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | - 
MC_INT_INVALID_APB_ASID_UPDATE | MC_INT_INVALID_SMMU_PAGE | - MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + .intmasks = tegra132_mc_intmasks, + .num_intmasks = ARRAY_SIZE(tegra132_mc_intmasks), .reset_ops = &tegra_mc_reset_ops_common, .resets = tegra124_mc_resets, .num_resets = ARRAY_SIZE(tegra124_mc_resets), diff --git a/drivers/memory/tegra/tegra186.c b/drivers/memory/tegra/tegra186.c index 94cad76c52ac..91d56165605f 100644 --- a/drivers/memory/tegra/tegra186.c +++ b/drivers/memory/tegra/tegra186.c @@ -901,15 +901,23 @@ static const struct tegra_mc_client tegra186_mc_clients[] = { }, }; +static const struct tegra_mc_intmask tegra186_mc_intmasks[] = { + { + .reg = MC_INTMASK, + .mask = MC_INT_DECERR_GENERALIZED_CARVEOUT | MC_INT_DECERR_MTS | + MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | + MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + }, +}; + const struct tegra_mc_soc tegra186_mc_soc = { .num_clients = ARRAY_SIZE(tegra186_mc_clients), .clients = tegra186_mc_clients, .num_address_bits = 40, .num_channels = 4, .client_id_mask = 0xff, - .intmask = MC_INT_DECERR_GENERALIZED_CARVEOUT | MC_INT_DECERR_MTS | - MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | - MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + .intmasks = tegra186_mc_intmasks, + .num_intmasks = ARRAY_SIZE(tegra186_mc_intmasks), .ops = &tegra186_mc_ops, .ch_intmask = 0x0000000f, .global_intstatus_channel_shift = 0, diff --git a/drivers/memory/tegra/tegra194.c b/drivers/memory/tegra/tegra194.c index 38852b2a0f44..a8cc57690696 100644 --- a/drivers/memory/tegra/tegra194.c +++ b/drivers/memory/tegra/tegra194.c @@ -1343,16 +1343,23 @@ static const struct tegra_mc_client tegra194_mc_clients[] = { }, }; +static const struct tegra_mc_intmask tegra194_mc_intmasks[] = { + { + .reg = MC_INTMASK, + .mask = MC_INT_DECERR_ROUTE_SANITY | MC_INT_DECERR_GENERALIZED_CARVEOUT | + MC_INT_DECERR_MTS | MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | + MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + }, +}; + const struct tegra_mc_soc 
tegra194_mc_soc = { .num_clients = ARRAY_SIZE(tegra194_mc_clients), .clients = tegra194_mc_clients, .num_address_bits = 40, .num_channels = 16, .client_id_mask = 0xff, - .intmask = MC_INT_DECERR_ROUTE_SANITY | - MC_INT_DECERR_GENERALIZED_CARVEOUT | MC_INT_DECERR_MTS | - MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | - MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + .intmasks = tegra194_mc_intmasks, + .num_intmasks = ARRAY_SIZE(tegra194_mc_intmasks), .has_addr_hi_reg = true, .ops = &tegra186_mc_ops, .icc_ops = &tegra_mc_icc_ops, diff --git a/drivers/memory/tegra/tegra20.c b/drivers/memory/tegra/tegra20.c index a5cc770437ae..ff9e151b5a4c 100644 --- a/drivers/memory/tegra/tegra20.c +++ b/drivers/memory/tegra/tegra20.c @@ -695,7 +695,7 @@ static irqreturn_t tegra20_mc_handle_irq(int irq, void *data) unsigned int bit; /* mask all interrupts to avoid flooding */ - status = mc_readl(mc, MC_INTSTATUS) & mc->soc->intmask; + status = mc_readl(mc, MC_INTSTATUS) & mc->soc->intmasks[0].mask; if (!status) return IRQ_NONE; @@ -769,13 +769,21 @@ static const struct tegra_mc_ops tegra20_mc_ops = { .probe = tegra20_mc_probe, }; +static const struct tegra_mc_intmask tegra20_mc_intmasks[] = { + { + .reg = MC_INTMASK, + .mask = MC_INT_SECURITY_VIOLATION | MC_INT_INVALID_GART_PAGE | + MC_INT_DECERR_EMEM, + }, +}; + const struct tegra_mc_soc tegra20_mc_soc = { .clients = tegra20_mc_clients, .num_clients = ARRAY_SIZE(tegra20_mc_clients), .num_address_bits = 32, .client_id_mask = 0x3f, - .intmask = MC_INT_SECURITY_VIOLATION | MC_INT_INVALID_GART_PAGE | - MC_INT_DECERR_EMEM, + .intmasks = tegra20_mc_intmasks, + .num_intmasks = ARRAY_SIZE(tegra20_mc_intmasks), .reset_ops = &tegra20_mc_reset_ops, .resets = tegra20_mc_resets, .num_resets = ARRAY_SIZE(tegra20_mc_resets), diff --git a/drivers/memory/tegra/tegra210.c b/drivers/memory/tegra/tegra210.c index aa606df8a679..f58f3ef6f681 100644 --- a/drivers/memory/tegra/tegra210.c +++ b/drivers/memory/tegra/tegra210.c @@ -1273,6 +1273,15 @@ static const 
struct tegra_mc_reset tegra210_mc_resets[] = { TEGRA210_MC_RESET(TSECB, 0x970, 0x974, 13), }; +static const struct tegra_mc_intmask tegra210_mc_intmasks[] = { + { + .reg = MC_INTMASK, + .mask = MC_INT_DECERR_MTS | MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | + MC_INT_INVALID_APB_ASID_UPDATE | MC_INT_INVALID_SMMU_PAGE | + MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + }, +}; + const struct tegra_mc_soc tegra210_mc_soc = { .clients = tegra210_mc_clients, .num_clients = ARRAY_SIZE(tegra210_mc_clients), @@ -1280,9 +1289,8 @@ const struct tegra_mc_soc tegra210_mc_soc = { .atom_size = 64, .client_id_mask = 0xff, .smmu = &tegra210_smmu_soc, - .intmask = MC_INT_DECERR_MTS | MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | - MC_INT_INVALID_APB_ASID_UPDATE | MC_INT_INVALID_SMMU_PAGE | - MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + .intmasks = tegra210_mc_intmasks, + .num_intmasks = ARRAY_SIZE(tegra210_mc_intmasks), .reset_ops = &tegra_mc_reset_ops_common, .resets = tegra210_mc_resets, .num_resets = ARRAY_SIZE(tegra210_mc_resets), diff --git a/drivers/memory/tegra/tegra234.c b/drivers/memory/tegra/tegra234.c index 67d5d4e01a65..87b22038a5fb 100644 --- a/drivers/memory/tegra/tegra234.c +++ b/drivers/memory/tegra/tegra234.c @@ -1132,16 +1132,23 @@ static const struct tegra_mc_icc_ops tegra234_mc_icc_ops = { .set = tegra234_mc_icc_set, }; +static const struct tegra_mc_intmask tegra234_mc_intmasks[] = { + { + .reg = MC_INTMASK, + .mask = MC_INT_DECERR_ROUTE_SANITY | MC_INT_DECERR_GENERALIZED_CARVEOUT | + MC_INT_DECERR_MTS | MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | + MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + }, +}; + const struct tegra_mc_soc tegra234_mc_soc = { .num_clients = ARRAY_SIZE(tegra234_mc_clients), .clients = tegra234_mc_clients, .num_address_bits = 40, .num_channels = 16, .client_id_mask = 0x1ff, - .intmask = MC_INT_DECERR_ROUTE_SANITY | - MC_INT_DECERR_GENERALIZED_CARVEOUT | MC_INT_DECERR_MTS | - MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | - MC_INT_SECURITY_VIOLATION | 
MC_INT_DECERR_EMEM, + .intmasks = tegra234_mc_intmasks, + .num_intmasks = ARRAY_SIZE(tegra234_mc_intmasks), .has_addr_hi_reg = true, .ops = &tegra186_mc_ops, .icc_ops = &tegra234_mc_icc_ops, diff --git a/drivers/memory/tegra/tegra264.c b/drivers/memory/tegra/tegra264.c index 5203e6c11372..172a48b56484 100644 --- a/drivers/memory/tegra/tegra264.c +++ b/drivers/memory/tegra/tegra264.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2025, NVIDIA CORPORATION. All rights reserved. + * Copyright (C) 2025-2026, NVIDIA CORPORATION. All rights reserved. */ #include @@ -290,16 +290,23 @@ static const struct tegra_mc_icc_ops tegra264_mc_icc_ops = { .set = tegra264_mc_icc_set, }; +static const struct tegra_mc_intmask tegra264_mc_intmasks[] = { + { + .reg = MC_INTMASK, + .mask = MC_INT_DECERR_ROUTE_SANITY | MC_INT_DECERR_GENERALIZED_CARVEOUT | + MC_INT_DECERR_MTS | MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | + MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + }, +}; + const struct tegra_mc_soc tegra264_mc_soc = { .num_clients = ARRAY_SIZE(tegra264_mc_clients), .clients = tegra264_mc_clients, .num_address_bits = 40, .num_channels = 16, .client_id_mask = 0x1ff, - .intmask = MC_INT_DECERR_ROUTE_SANITY | - MC_INT_DECERR_GENERALIZED_CARVEOUT | MC_INT_DECERR_MTS | - MC_INT_SECERR_SEC | MC_INT_DECERR_VPR | - MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, + .intmasks = tegra264_mc_intmasks, + .num_intmasks = ARRAY_SIZE(tegra264_mc_intmasks), .has_addr_hi_reg = true, .ops = &tegra186_mc_ops, .icc_ops = &tegra264_mc_icc_ops, diff --git a/drivers/memory/tegra/tegra30.c b/drivers/memory/tegra/tegra30.c index 8a26a2f204e9..8389e3af0121 100644 --- a/drivers/memory/tegra/tegra30.c +++ b/drivers/memory/tegra/tegra30.c @@ -1384,6 +1384,14 @@ static const struct tegra_mc_icc_ops tegra30_mc_icc_ops = { .set = tegra30_mc_icc_set, }; +static const struct tegra_mc_intmask tegra30_mc_intmasks[] = { + { + .reg = MC_INTMASK, + .mask = MC_INT_INVALID_SMMU_PAGE | 
MC_INT_SECURITY_VIOLATION | + MC_INT_DECERR_EMEM, + }, +}; + const struct tegra_mc_soc tegra30_mc_soc = { .clients = tegra30_mc_clients, .num_clients = ARRAY_SIZE(tegra30_mc_clients), @@ -1393,8 +1401,8 @@ const struct tegra_mc_soc tegra30_mc_soc = { .smmu = &tegra30_smmu_soc, .emem_regs = tegra30_mc_emem_regs, .num_emem_regs = ARRAY_SIZE(tegra30_mc_emem_regs), - .intmask = MC_INT_INVALID_SMMU_PAGE | MC_INT_SECURITY_VIOLATION | - MC_INT_DECERR_EMEM, + .intmasks = tegra30_mc_intmasks, + .num_intmasks = ARRAY_SIZE(tegra30_mc_intmasks), .reset_ops = &tegra_mc_reset_ops_common, .resets = tegra30_mc_resets, .num_resets = ARRAY_SIZE(tegra30_mc_resets), diff --git a/include/soc/tegra/mc.h b/include/soc/tegra/mc.h index b9b1763b10b5..e6da035d1306 100644 --- a/include/soc/tegra/mc.h +++ b/include/soc/tegra/mc.h @@ -185,6 +185,11 @@ struct tegra_mc_regs { unsigned int err_route_add; }; +struct tegra_mc_intmask { + u32 reg; + u32 mask; +}; + struct tegra_mc_soc { const struct tegra_mc_client *clients; unsigned int num_clients; @@ -202,7 +207,6 @@ struct tegra_mc_soc { const struct tegra_smmu_soc *smmu; - u32 intmask; u32 ch_intmask; u32 global_intstatus_channel_shift; bool has_addr_hi_reg; @@ -219,6 +223,8 @@ struct tegra_mc_soc { unsigned int num_interrupts; unsigned int mc_addr_hi_mask; unsigned int mc_err_status_type_mask; + const struct tegra_mc_intmask *intmasks; + unsigned int num_intmasks; }; struct tegra_mc { -- cgit v1.2.3 From 0da9ca4c08e709144a1bd2f765c14205960ac64d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 3 Mar 2026 16:50:03 -0800 Subject: futex: add missing function parameter comments Correct or add the missing function parameter kernel-doc comments to avoid warnings: Warning: include/asm-generic/futex.h:38 function parameter 'op' not described in 'futex_atomic_op_inuser_local' Warning: include/asm-generic/futex.h:38 function parameter 'oparg' not described in 'futex_atomic_op_inuser_local' Warning: include/asm-generic/futex.h:38 function parameter 
'oval' not described in 'futex_atomic_op_inuser_local' Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260304005008.409858-1-rdunlap@infradead.org --- include/asm-generic/futex.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h index 2a19215baae5..fbbcfd801cd0 100644 --- a/include/asm-generic/futex.h +++ b/include/asm-generic/futex.h @@ -25,7 +25,9 @@ * argument and comparison of the previous * futex value with another constant. * - * @encoded_op: encoded operation to execute + * @op: operation to execute + * @oparg: argument of the operation + * @oval: previous value at @uaddr on successful return * @uaddr: pointer to user space address * * Return: -- cgit v1.2.3 From 1ea4b473504b6dc6a0d21c298519aff2d52433c9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Mar 2026 19:55:41 +0000 Subject: locking/rwsem: Remove the list_head from struct rw_semaphore Instead of embedding a list_head in struct rw_semaphore, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list, remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink some core data structures like struct inode. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260305195545.3707590-2-willy@infradead.org --- include/linux/rwsem.h | 8 ++--- kernel/locking/rwsem.c | 90 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 62 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 9bf1d93d3d7b..e7829531c4ba 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -57,7 +57,7 @@ context_lock_struct(rw_semaphore) { struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; - struct list_head wait_list; + struct rwsem_waiter *first_waiter; #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif @@ -106,7 +106,7 @@ static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore * .owner = ATOMIC_LONG_INIT(0), \ __RWSEM_OPT_INIT(name) \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ - .wait_list = LIST_HEAD_INIT((name).wait_list), \ + .first_waiter = NULL, \ __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } @@ -129,9 +129,9 @@ do { \ * rwsem to see if somebody from an incompatible type is wanting access to the * lock. */ -static inline int rwsem_is_contended(struct rw_semaphore *sem) +static inline bool rwsem_is_contended(struct rw_semaphore *sem) { - return !list_empty(&sem->wait_list); + return sem->first_waiter != NULL; } #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 24df4d98f7d2..e66f37ebc6f6 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -72,7 +72,7 @@ #c, atomic_long_read(&(sem)->count), \ (unsigned long) sem->magic, \ atomic_long_read(&(sem)->owner), (long)current, \ - list_empty(&(sem)->wait_list) ? "" : "not ")) \ + (sem)->first_waiter ? 
"" : "not ")) \ debug_locks_off(); \ } while (0) #else @@ -321,7 +321,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, #endif atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); + sem->first_waiter = NULL; atomic_long_set(&sem->owner, 0L); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); @@ -341,8 +341,6 @@ struct rwsem_waiter { unsigned long timeout; bool handoff_set; }; -#define rwsem_first_waiter(sem) \ - list_first_entry(&sem->wait_list, struct rwsem_waiter, list) enum rwsem_wake_type { RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ @@ -365,12 +363,21 @@ enum rwsem_wake_type { */ #define MAX_READERS_WAKEUP 0x100 -static inline void -rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +static inline +bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { - lockdep_assert_held(&sem->wait_lock); - list_add_tail(&waiter->list, &sem->wait_list); - /* caller will set RWSEM_FLAG_WAITERS */ + if (list_empty(&waiter->list)) { + sem->first_waiter = NULL; + return true; + } + + if (sem->first_waiter == waiter) { + sem->first_waiter = list_first_entry(&waiter->list, + struct rwsem_waiter, list); + } + list_del(&waiter->list); + + return false; } /* @@ -385,14 +392,23 @@ static inline bool rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { lockdep_assert_held(&sem->wait_lock); - list_del(&waiter->list); - if (likely(!list_empty(&sem->wait_list))) + if (__rwsem_del_waiter(sem, waiter)) return true; - atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); return false; } +static inline +struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem, + const struct rwsem_waiter *waiter) +{ + struct rwsem_waiter *next = list_first_entry(&waiter->list, + struct rwsem_waiter, list); + if (next == sem->first_waiter) + return NULL; + return next; +} + /* * handle the lock release 
when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -411,7 +427,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { - struct rwsem_waiter *waiter, *tmp; + struct rwsem_waiter *waiter, *next; long oldcount, woken = 0, adjustment = 0; struct list_head wlist; @@ -421,7 +437,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * Take a peek at the queue head waiter such that we can determine * the wakeup(s) to perform. */ - waiter = rwsem_first_waiter(sem); + waiter = sem->first_waiter; if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { @@ -506,25 +522,28 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * put them into wake_q to be woken up later. */ INIT_LIST_HEAD(&wlist); - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + do { + next = next_waiter(sem, waiter); if (waiter->type == RWSEM_WAITING_FOR_WRITE) continue; woken++; list_move_tail(&waiter->list, &wlist); + if (sem->first_waiter == waiter) + sem->first_waiter = next; /* * Limit # of readers that can be woken up per wakeup call. */ if (unlikely(woken >= MAX_READERS_WAKEUP)) break; - } + } while ((waiter = next) != NULL); adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); oldcount = atomic_long_read(&sem->count); - if (list_empty(&sem->wait_list)) { + if (!sem->first_waiter) { /* * Combined with list_move_tail() above, this implies * rwsem_del_waiter(). 
@@ -545,7 +564,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, atomic_long_add(adjustment, &sem->count); /* 2nd pass */ - list_for_each_entry_safe(waiter, tmp, &wlist, list) { + list_for_each_entry_safe(waiter, next, &wlist, list) { struct task_struct *tsk; tsk = waiter->task; @@ -577,7 +596,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, struct wake_q_head *wake_q) __releases(&sem->wait_lock) { - bool first = rwsem_first_waiter(sem) == waiter; + bool first = sem->first_waiter == waiter; wake_q_init(wake_q); @@ -603,7 +622,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { - struct rwsem_waiter *first = rwsem_first_waiter(sem); + struct rwsem_waiter *first = sem->first_waiter; long count, new; lockdep_assert_held(&sem->wait_lock); @@ -639,7 +658,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, new |= RWSEM_WRITER_LOCKED; new &= ~RWSEM_FLAG_HANDOFF; - if (list_is_singular(&sem->wait_list)) + if (list_empty(&first->list)) new &= ~RWSEM_FLAG_WAITERS; } } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); @@ -659,7 +678,8 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on * success. 
*/ - list_del(&waiter->list); + __rwsem_del_waiter(sem, waiter); + rwsem_set_owner(sem); return true; } @@ -994,7 +1014,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat { long adjustment = -RWSEM_READER_BIAS; long rcnt = (count >> RWSEM_READER_SHIFT); - struct rwsem_waiter waiter; + struct rwsem_waiter waiter, *first; DEFINE_WAKE_Q(wake_q); /* @@ -1019,7 +1039,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat */ if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) { raw_spin_lock_irq(&sem->wait_lock); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); @@ -1035,7 +1055,8 @@ queue: waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { + first = sem->first_waiter; + if (!first) { /* * In case the wait queue is empty and the lock isn't owned * by a writer, this reader can exit the slowpath and return @@ -1051,8 +1072,11 @@ queue: return sem; } adjustment += RWSEM_FLAG_WAITERS; + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; + } else { + list_add_tail(&waiter.list, &first->list); } - rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1110,7 +1134,7 @@ out_nolock: static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { - struct rwsem_waiter waiter; + struct rwsem_waiter waiter, *first; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ @@ -1129,10 +1153,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - rwsem_add_waiter(sem, &waiter); - /* we're now waiting on the lock */ - if (rwsem_first_waiter(sem) != &waiter) { + first = sem->first_waiter; + if (first) { + 
list_add_tail(&waiter.list, &first->list); rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), &wake_q); if (!wake_q_empty(&wake_q)) { @@ -1145,6 +1169,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) raw_spin_lock_irq(&sem->wait_lock); } } else { + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } @@ -1218,7 +1244,7 @@ static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -1239,7 +1265,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -- cgit v1.2.3 From b9bdd4b6840454ef87f61b6506c9635c57a81650 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Mar 2026 19:55:42 +0000 Subject: locking/semaphore: Remove the list_head from struct semaphore Instead of embedding a list_head in struct semaphore, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list and remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink data structures which embed a semaphore. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260305195545.3707590-3-willy@infradead.org --- drivers/acpi/osl.c | 2 +- include/linux/semaphore.h | 4 ++-- kernel/locking/semaphore.c | 41 +++++++++++++++++++++++++++++++---------- 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 5b777316b9ac..2af0db9210fe 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1257,7 +1257,7 @@ acpi_status acpi_os_delete_semaphore(acpi_handle handle) ACPI_DEBUG_PRINT((ACPI_DB_MUTEX, "Deleting semaphore[%p].\n", handle)); - BUG_ON(!list_empty(&sem->wait_list)); + BUG_ON(sem->first_waiter); kfree(sem); sem = NULL; diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 89706157e622..a4c8651ef021 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -15,7 +15,7 @@ struct semaphore { raw_spinlock_t lock; unsigned int count; - struct list_head wait_list; + struct semaphore_waiter *first_waiter; #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER unsigned long last_holder; @@ -33,7 +33,7 @@ struct semaphore { { \ .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count = n, \ - .wait_list = LIST_HEAD_INIT((name).wait_list) \ + .first_waiter = NULL \ __LAST_HOLDER_SEMAPHORE_INITIALIZER \ } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 3ef032e22f7e..74d41433ba13 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -21,7 +21,7 @@ * too. * * The ->count variable represents how many more tasks can acquire this - * semaphore. If it's zero, there may be tasks waiting on the wait_list. + * semaphore. If it's zero, there may be waiters. 
*/ #include @@ -226,7 +226,7 @@ void __sched up(struct semaphore *sem) hung_task_sem_clear_if_holder(sem); - if (likely(list_empty(&sem->wait_list))) + if (likely(!sem->first_waiter)) sem->count++; else __up(sem, &wake_q); @@ -244,6 +244,21 @@ struct semaphore_waiter { bool up; }; +static inline +void sem_del_waiter(struct semaphore *sem, struct semaphore_waiter *waiter) +{ + if (list_empty(&waiter->list)) { + sem->first_waiter = NULL; + return; + } + + if (sem->first_waiter == waiter) { + sem->first_waiter = list_first_entry(&waiter->list, + struct semaphore_waiter, list); + } + list_del(&waiter->list); +} + /* * Because this function is inlined, the 'state' parameter will be * constant, and thus optimised away by the compiler. Likewise the @@ -252,9 +267,15 @@ struct semaphore_waiter { static inline int __sched ___down_common(struct semaphore *sem, long state, long timeout) { - struct semaphore_waiter waiter; - - list_add_tail(&waiter.list, &sem->wait_list); + struct semaphore_waiter waiter, *first; + + first = sem->first_waiter; + if (first) { + list_add_tail(&waiter.list, &first->list); + } else { + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; + } waiter.task = current; waiter.up = false; @@ -274,11 +295,11 @@ static inline int __sched ___down_common(struct semaphore *sem, long state, } timed_out: - list_del(&waiter.list); + sem_del_waiter(sem, &waiter); return -ETIME; interrupted: - list_del(&waiter.list); + sem_del_waiter(sem, &waiter); return -EINTR; } @@ -321,9 +342,9 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) static noinline void __sched __up(struct semaphore *sem, struct wake_q_head *wake_q) { - struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, - struct semaphore_waiter, list); - list_del(&waiter->list); + struct semaphore_waiter *waiter = sem->first_waiter; + + sem_del_waiter(sem, waiter); waiter->up = true; wake_q_add(wake_q, waiter->task); } -- cgit v1.2.3 From 
25500ba7e77ce9d3d9b5a1929d41a2ee2e23f6fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Mar 2026 19:55:43 +0000 Subject: locking/mutex: Remove the list_head from struct mutex Instead of embedding a list_head in struct mutex, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list, remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink data structures which embed a mutex like struct file. Some of the debug checks have to be deleted because there's no equivalent to checking them in the new scheme (eg an empty waiter->list now means that it is the only waiter, not that the waiter is no longer on the list). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260305195545.3707590-4-willy@infradead.org --- include/linux/mutex.h | 2 +- include/linux/mutex_types.h | 2 +- kernel/locking/mutex-debug.c | 5 +---- kernel/locking/mutex.c | 49 ++++++++++++++++++++++++-------------------- kernel/locking/ww_mutex.h | 25 +++++++--------------- 5 files changed, 37 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2f648ee204e7..c471b129f703 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -79,7 +79,7 @@ do { \ #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ - , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ + , .first_waiter = NULL \ __DEBUG_MUTEX_INITIALIZER(lockname) \ __DEP_MAP_MUTEX_INITIALIZER(lockname) } diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h index 80975935ec48..a8f119f81177 100644 --- a/include/linux/mutex_types.h +++ b/include/linux/mutex_types.h @@ -44,7 +44,7 @@ context_lock_struct(mutex) { #ifdef 
CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif - struct list_head wait_list; + struct mutex_waiter *first_waiter; #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 2c6b02d4699b..94930d506bcf 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -37,9 +37,8 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) { lockdep_assert_held(&lock->wait_lock); - DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); + DEBUG_LOCKS_WARN_ON(!lock->first_waiter); DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); } void debug_mutex_free_waiter(struct mutex_waiter *waiter) @@ -62,7 +61,6 @@ void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, { struct mutex *blocked_on = __get_task_blocked_on(task); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock); @@ -74,7 +72,6 @@ void debug_mutex_unlock(struct mutex *lock) { if (likely(debug_locks)) { DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); } } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index c867f6c15530..95f1822122a1 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -47,7 +47,7 @@ static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); + lock->first_waiter = NULL; #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif @@ -194,33 +194,42 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) atomic_long_andnot(flag, &lock->owner); } -static inline bool __mutex_waiter_is_first(struct 
mutex *lock, struct mutex_waiter *waiter) -{ - return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; -} - /* * Add @waiter to a given location in the lock wait_list and set the * FLAG_WAITERS flag if it's the first waiter. */ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct list_head *list) + struct mutex_waiter *first) { hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); debug_mutex_add_waiter(lock, waiter, current); - list_add_tail(&waiter->list, list); - if (__mutex_waiter_is_first(lock, waiter)) + if (!first) + first = lock->first_waiter; + + if (first) { + list_add_tail(&waiter->list, &first->list); + } else { + INIT_LIST_HEAD(&waiter->list); + lock->first_waiter = waiter; __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + } } static void __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) { - list_del(&waiter->list); - if (likely(list_empty(&lock->wait_list))) + if (list_empty(&waiter->list)) { __mutex_clear_flag(lock, MUTEX_FLAGS); + lock->first_waiter = NULL; + } else { + if (lock->first_waiter == waiter) { + lock->first_waiter = list_first_entry(&waiter->list, + struct mutex_waiter, list); + } + list_del(&waiter->list); + } debug_mutex_remove_waiter(lock, waiter, current); hung_task_clear_blocker(); @@ -340,7 +349,7 @@ bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, * Similarly, stop spinning if we are no longer the * first waiter. 
*/ - if (waiter && !__mutex_waiter_is_first(lock, waiter)) + if (waiter && lock->first_waiter != waiter) return false; return true; @@ -645,7 +654,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ - __mutex_add_waiter(lock, &waiter, &lock->wait_list); + __mutex_add_waiter(lock, &waiter, NULL); } else { /* * Add in stamp order, waking up waiters that must kill @@ -691,7 +700,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas schedule_preempt_disabled(); - first = __mutex_waiter_is_first(lock, &waiter); + first = lock->first_waiter == &waiter; /* * As we likely have been woken up by task @@ -734,8 +743,7 @@ acquired: * Wound-Wait; we stole the lock (!first_waiter), check the * waiters as anyone might want to wound us. */ - if (!ww_ctx->is_wait_die && - !__mutex_waiter_is_first(lock, &waiter)) + if (!ww_ctx->is_wait_die && lock->first_waiter != &waiter) __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); } @@ -931,6 +939,7 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { struct task_struct *next = NULL; + struct mutex_waiter *waiter; DEFINE_WAKE_Q(wake_q); unsigned long owner; unsigned long flags; @@ -962,12 +971,8 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { - /* get the first entry from the wait-list: */ - struct mutex_waiter *waiter = - list_first_entry(&lock->wait_list, - struct mutex_waiter, list); - + waiter = lock->first_waiter; + if (waiter) { next = waiter->task; debug_mutex_wake_waiter(lock, waiter); diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 31a785afee6c..a0847e91ae04 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ 
-8,20 +8,14 @@ static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) { - struct mutex_waiter *w; - - w = list_first_entry(&lock->wait_list, struct mutex_waiter, list); - if (list_entry_is_head(w, &lock->wait_list, list)) - return NULL; - - return w; + return lock->first_waiter; } static inline struct mutex_waiter * __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) { w = list_next_entry(w, list); - if (list_entry_is_head(w, &lock->wait_list, list)) + if (lock->first_waiter == w) return NULL; return w; @@ -31,7 +25,7 @@ static inline struct mutex_waiter * __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) { w = list_prev_entry(w, list); - if (list_entry_is_head(w, &lock->wait_list, list)) + if (lock->first_waiter == w) return NULL; return w; @@ -40,22 +34,17 @@ __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) static inline struct mutex_waiter * __ww_waiter_last(struct mutex *lock) { - struct mutex_waiter *w; - - w = list_last_entry(&lock->wait_list, struct mutex_waiter, list); - if (list_entry_is_head(w, &lock->wait_list, list)) - return NULL; + struct mutex_waiter *w = lock->first_waiter; + if (w) + w = list_prev_entry(w, list); return w; } static inline void __ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos) { - struct list_head *p = &lock->wait_list; - if (pos) - p = &pos->list; - __mutex_add_waiter(lock, waiter, p); + __mutex_add_waiter(lock, waiter, pos); } static inline struct task_struct * -- cgit v1.2.3 From 07574b8ebaac7927e2355b4f343b03b50e04494c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Jan 2026 13:40:30 +0100 Subject: compiler-context-analysys: Add __cond_releases() Useful for things like unlock fastpaths, which on success release the lock. 
Suggested-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Link: https://patch.msgid.link/20260121111213.634625032@infradead.org --- include/linux/compiler-context-analysis.h | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-context-analysis.h b/include/linux/compiler-context-analysis.h index 00c074a2ccb0..a9317571e6af 100644 --- a/include/linux/compiler-context-analysis.h +++ b/include/linux/compiler-context-analysis.h @@ -320,6 +320,38 @@ static inline void _context_unsafe_alias(void **p) { } */ #define __releases(...) __releases_ctx_lock(__VA_ARGS__) +/* + * Clang's analysis does not care precisely about the value, only that it is + * either zero or non-zero. So the __cond_acquires() interface might be + * misleading if we say that @ret is the value returned if acquired. Instead, + * provide symbolic variants which we translate. + */ +#define __cond_acquires_impl_not_true(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(0, x) +#define __cond_acquires_impl_not_false(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(1, x) +#define __cond_acquires_impl_not_nonzero(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(0, x) +#define __cond_acquires_impl_not_0(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(1, x) +#define __cond_acquires_impl_not_nonnull(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(0, x) +#define __cond_acquires_impl_not_NULL(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(1, x) + +/** + * __cond_releases() - function attribute, function conditionally + * releases a context lock exclusively + * @ret: abstract value returned by function if context lock releases + * @x: context lock instance pointer + * + * Function attribute declaring that the function conditionally releases the + * given context lock instance @x exclusively. The associated context(s) must + * be active on entry. The function return value @ret denotes when the context + * lock is released. 
+ * + * @ret may be one of: true, false, nonzero, 0, nonnull, NULL. + * + * NOTE: clang does not have a native attribute for this; instead implement + * it as an unconditional release and a conditional acquire for the + * inverted condition -- which is semantically equivalent. + */ +#define __cond_releases(ret, x) __releases(x) __cond_acquires_impl_not_##ret(x) + /** * __acquire() - function to acquire context lock exclusively * @x: context lock instance pointer -- cgit v1.2.3 From 5c4326231cde36fd5e90c41e403df9fac6238f4b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Jan 2026 10:06:08 +0100 Subject: locking/mutex: Add context analysis Add compiler context analysis annotations. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121111213.745353747@infradead.org --- include/linux/mutex.h | 2 +- include/linux/mutex_types.h | 2 +- kernel/locking/Makefile | 2 ++ kernel/locking/mutex.c | 33 ++++++++++++++++++++++++++++----- kernel/locking/mutex.h | 1 + kernel/locking/ww_mutex.h | 12 ++++++++++++ 6 files changed, 45 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index c471b129f703..734048c02f4f 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -183,7 +183,7 @@ static inline int __must_check __devm_mutex_init(struct device *dev, struct mute */ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass) __acquires(lock); -extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); +extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock) __acquires(lock); extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) __cond_acquires(0, lock); extern int __must_check _mutex_lock_killable(struct mutex *lock, diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h index a8f119f81177..24ed599fdda8 100644 --- 
a/include/linux/mutex_types.h +++ b/include/linux/mutex_types.h @@ -44,7 +44,7 @@ context_lock_struct(mutex) { #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif - struct mutex_waiter *first_waiter; + struct mutex_waiter *first_waiter __guarded_by(&wait_lock); #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index a114949eeed5..264447d606a6 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,6 +3,8 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n +CONTEXT_ANALYSIS_mutex.o := y + obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o # Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 95f1822122a1..427187ff02db 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -46,8 +46,9 @@ static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); - raw_spin_lock_init(&lock->wait_lock); - lock->first_waiter = NULL; + scoped_guard (raw_spinlock_init, &lock->wait_lock) { + lock->first_waiter = NULL; + } #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif @@ -150,6 +151,7 @@ EXPORT_SYMBOL(mutex_init_generic); * follow with a __mutex_trylock() before failing. 
*/ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) + __cond_acquires(true, lock) { unsigned long curr = (unsigned long)current; unsigned long zero = 0UL; @@ -163,6 +165,7 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) } static __always_inline bool __mutex_unlock_fast(struct mutex *lock) + __cond_releases(true, lock) { unsigned long curr = (unsigned long)current; @@ -201,6 +204,7 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *first) + __must_hold(&lock->wait_lock) { hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); debug_mutex_add_waiter(lock, waiter, current); @@ -219,6 +223,7 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, static void __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { if (list_empty(&waiter->list)) { __mutex_clear_flag(lock, MUTEX_FLAGS); @@ -268,7 +273,8 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static void __sched __mutex_lock_slowpath(struct mutex *lock); +static void __sched __mutex_lock_slowpath(struct mutex *lock) + __acquires(lock); /** * mutex_lock - acquire the mutex @@ -349,7 +355,7 @@ bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, * Similarly, stop spinning if we are no longer the * first waiter. 
*/ - if (waiter && lock->first_waiter != waiter) + if (waiter && data_race(lock->first_waiter != waiter)) return false; return true; @@ -534,7 +540,8 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, } #endif -static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip); +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) + __releases(lock); /** * mutex_unlock - release the mutex @@ -574,6 +581,7 @@ EXPORT_SYMBOL(mutex_unlock); * of a unlocked mutex is not allowed. */ void __sched ww_mutex_unlock(struct ww_mutex *lock) + __no_context_analysis { __ww_mutex_unlock(lock); mutex_unlock(&lock->base); @@ -587,6 +595,7 @@ static __always_inline int __sched __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + __cond_acquires(0, lock) { DEFINE_WAKE_Q(wake_q); struct mutex_waiter waiter; @@ -780,6 +789,7 @@ err_early_kill: static int __sched __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip) + __cond_acquires(0, lock) { return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); } @@ -787,6 +797,7 @@ __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, static int __sched __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, unsigned long ip, struct ww_acquire_ctx *ww_ctx) + __cond_acquires(0, lock) { return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true); } @@ -834,6 +845,7 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + __acquire(lock); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -842,6 +854,7 @@ void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { 
__mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); + __acquire(lock); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -870,12 +883,14 @@ mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) token = io_schedule_prepare(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_, NULL, 0); + __acquire(lock); io_schedule_finish(token); } EXPORT_SYMBOL_GPL(mutex_lock_io_nested); static inline int ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_releases(nonzero, lock) { #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH unsigned tmp; @@ -937,6 +952,7 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); * Release the lock, slowpath: */ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) + __releases(lock) { struct task_struct *next = NULL; struct mutex_waiter *waiter; @@ -945,6 +961,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne unsigned long flags; mutex_release(&lock->dep_map, ip); + __release(lock); /* * Release the lock before (potentially) taking the spinlock such that @@ -1066,24 +1083,29 @@ EXPORT_SYMBOL_GPL(mutex_lock_io); static noinline void __sched __mutex_lock_slowpath(struct mutex *lock) + __acquires(lock) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + __acquire(lock); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) + __cond_acquires(0, lock) { return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) + __cond_acquires(0, lock) { return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_acquires(0, lock) { return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, _RET_IP_, ctx); @@ -1092,6 +1114,7 @@ __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct 
ww_acquire_ctx *ctx) static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_acquires(0, lock) { return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, _RET_IP_, ctx); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 9ad4da8cea00..b94ef40c1f48 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -7,6 +7,7 @@ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar */ #ifndef CONFIG_PREEMPT_RT +#include /* * This is the control structure for tasks blocked on mutex, which resides * on the blocked task's kernel stack: diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index a0847e91ae04..c50ea5dd3c44 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -7,12 +7,14 @@ static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) + __must_hold(&lock->wait_lock) { return lock->first_waiter; } static inline struct mutex_waiter * __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) + __must_hold(&lock->wait_lock) { w = list_next_entry(w, list); if (lock->first_waiter == w) @@ -23,6 +25,7 @@ __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) static inline struct mutex_waiter * __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) + __must_hold(&lock->wait_lock) { w = list_prev_entry(w, list); if (lock->first_waiter == w) @@ -33,6 +36,7 @@ __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) static inline struct mutex_waiter * __ww_waiter_last(struct mutex *lock) + __must_hold(&lock->wait_lock) { struct mutex_waiter *w = lock->first_waiter; @@ -43,6 +47,7 @@ __ww_waiter_last(struct mutex *lock) static inline void __ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos) + __must_hold(&lock->wait_lock) { __mutex_add_waiter(lock, waiter, pos); } @@ -60,16 +65,19 @@ __ww_mutex_has_waiters(struct mutex *lock) } static inline void lock_wait_lock(struct mutex 
*lock, unsigned long *flags) + __acquires(&lock->wait_lock) { raw_spin_lock_irqsave(&lock->wait_lock, *flags); } static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags) + __releases(&lock->wait_lock) { raw_spin_unlock_irqrestore(&lock->wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct mutex *lock) + __must_hold(&lock->wait_lock) { lockdep_assert_held(&lock->wait_lock); } @@ -296,6 +304,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct ww_acquire_ctx *hold_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct task_struct *owner = __ww_mutex_owner(lock); @@ -360,6 +369,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, static void __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct MUTEX_WAITER *cur; @@ -453,6 +463,7 @@ __ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) static inline int __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, struct ww_acquire_ctx *ctx) + __must_hold(&lock->wait_lock) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); @@ -503,6 +514,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct MUTEX_WAITER *cur, *pos = NULL; bool is_wait_die; -- cgit v1.2.3 From 90bb681dcdf7e69c90b56a18f06c0389a0810b92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Jan 2026 18:17:50 +0100 Subject: locking/rtmutex: Add context analysis Add compiler context analysis annotations. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121111213.851599178@infradead.org --- include/linux/rtmutex.h | 8 ++++---- kernel/locking/Makefile | 2 ++ kernel/locking/rtmutex.c | 18 +++++++++++++++++- kernel/locking/rtmutex_api.c | 2 ++ kernel/locking/rtmutex_common.h | 27 +++++++++++++++++++-------- kernel/locking/ww_mutex.h | 20 +++++++++++++++----- kernel/locking/ww_rt_mutex.c | 1 + scripts/context-analysis-suppression.txt | 1 + 8 files changed, 61 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index ede4c6bf6f22..78e7e588817c 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -22,8 +22,8 @@ extern int max_lock_depth; struct rt_mutex_base { raw_spinlock_t wait_lock; - struct rb_root_cached waiters; - struct task_struct *owner; + struct rb_root_cached waiters __guarded_by(&wait_lock); + struct task_struct *owner __guarded_by(&wait_lock); }; #define __RT_MUTEX_BASE_INITIALIZER(rtbasename) \ @@ -41,7 +41,7 @@ struct rt_mutex_base { */ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) { - return READ_ONCE(lock->owner) != NULL; + return data_race(READ_ONCE(lock->owner) != NULL); } #ifdef CONFIG_RT_MUTEXES @@ -49,7 +49,7 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) { - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + unsigned long owner = (unsigned long) data_race(READ_ONCE(lock->owner)); return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 264447d606a6..0c07de79388c 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -4,6 +4,8 @@ KCOV_INSTRUMENT := n CONTEXT_ANALYSIS_mutex.o := y +CONTEXT_ANALYSIS_rtmutex_api.o := y +CONTEXT_ANALYSIS_ww_rt_mutex.o := y obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git 
a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c80902eacd79..ccaba6148b61 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -94,6 +94,7 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock, static __always_inline struct task_struct * rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) + __must_hold(&lock->wait_lock) { unsigned long val = (unsigned long)owner; @@ -105,6 +106,7 @@ rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) static __always_inline void rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) + __must_hold(&lock->wait_lock) { /* * lock->wait_lock is held but explicit acquire semantics are needed @@ -114,12 +116,14 @@ rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) } static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { /* lock->wait_lock is held so the unlock provides release semantics. 
*/ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL)); } static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); @@ -127,6 +131,7 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock) + __must_hold(&lock->wait_lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -328,6 +333,7 @@ static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, } static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); @@ -1206,6 +1212,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, enum rtmutex_chainwalk chwalk, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; @@ -1249,6 +1256,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, /* Check whether the waiter should back out immediately */ rtm = container_of(lock, struct rt_mutex, rtmutex); + __assume_ctx_lock(&rtm->rtmutex.wait_lock); res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); if (res) { raw_spin_lock(&task->pi_lock); @@ -1356,6 +1364,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, } static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { int ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1505,7 +1514,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * - the VCPU on which owner runs is preempted */ if (!owner_on_cpu(owner) || need_resched() || - !rt_mutex_waiter_is_top_waiter(lock, 
waiter)) { + !data_race(rt_mutex_waiter_is_top_waiter(lock, waiter))) { res = false; break; } @@ -1538,6 +1547,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, */ static void __sched remove_waiter(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); @@ -1613,6 +1623,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + __assume_ctx_lock(&rtm->rtmutex.wait_lock); + lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ @@ -1658,6 +1670,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, struct rt_mutex_base *lock, struct rt_mutex_waiter *w) + __must_hold(&lock->wait_lock) { /* * If the result is not -EDEADLOCK or the caller requested @@ -1694,11 +1707,13 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, enum rtmutex_chainwalk chwalk, struct rt_mutex_waiter *waiter, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); struct ww_mutex *ww = ww_container_of(rtm); int ret; + __assume_ctx_lock(&rtm->rtmutex.wait_lock); lockdep_assert_held(&lock->wait_lock); lockevent_inc(rtmutex_slowlock); @@ -1750,6 +1765,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct rt_mutex_waiter waiter; int ret; diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 59dbd29cb219..124219aea46e 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -526,6 +526,7 @@ static __always_inline int __mutex_lock_common(struct mutex *lock, unsigned int subclass, struct 
lockdep_map *nest_lock, unsigned long ip) + __acquires(lock) __no_context_analysis { int ret; @@ -647,6 +648,7 @@ EXPORT_SYMBOL(mutex_trylock); #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_unlock(struct mutex *lock) + __releases(lock) __no_context_analysis { mutex_release(&lock->dep_map, _RET_IP_); __rt_mutex_unlock(&lock->rtmutex); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index cf6ddd1b23a2..c38b7bdea7b3 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -79,12 +79,18 @@ struct rt_wake_q_head { * PI-futex support (proxy locking functions, etc.): */ extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); + struct task_struct *proxy_owner) + __must_hold(&lock->wait_lock); + +extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock); + extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - struct wake_q_head *); + struct wake_q_head *) + __must_hold(&lock->wait_lock); + extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); @@ -94,8 +100,9 @@ extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter); -extern int rt_mutex_futex_trylock(struct rt_mutex_base *l); -extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l); +extern int rt_mutex_futex_trylock(struct rt_mutex_base *lock); +extern int __rt_mutex_futex_trylock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock); extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock, @@ -109,6 +116,7 @@ extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh); */ #ifdef 
CONFIG_RT_MUTEXES static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } @@ -120,6 +128,7 @@ static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) */ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); @@ -127,6 +136,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, } static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; @@ -170,9 +180,10 @@ enum rtmutex_chainwalk { static inline void __rt_mutex_base_init(struct rt_mutex_base *lock) { - raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT_CACHED; - lock->owner = NULL; + scoped_guard (raw_spinlock_init, &lock->wait_lock) { + lock->waiters = RB_ROOT_CACHED; + lock->owner = NULL; + } } /* Debug functions */ diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index c50ea5dd3c44..b1834ab7e782 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -4,6 +4,7 @@ #define MUTEX mutex #define MUTEX_WAITER mutex_waiter +#define WAIT_LOCK wait_lock static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) @@ -86,9 +87,11 @@ static inline void lockdep_assert_wait_lock_held(struct mutex *lock) #define MUTEX rt_mutex #define MUTEX_WAITER rt_mutex_waiter +#define WAIT_LOCK rtmutex.wait_lock static inline struct rt_mutex_waiter * __ww_waiter_first(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); if (!n) @@ -116,6 +119,7 @@ __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) static inline struct rt_mutex_waiter * __ww_waiter_last(struct 
rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); if (!n) @@ -137,21 +141,25 @@ __ww_mutex_owner(struct rt_mutex *lock) static inline bool __ww_mutex_has_waiters(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { return rt_mutex_has_waiters(&lock->rtmutex); } static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags) + __acquires(&lock->rtmutex.wait_lock) { raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags); } static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags) + __releases(&lock->rtmutex.wait_lock) { raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { lockdep_assert_held(&lock->rtmutex.wait_lock); } @@ -304,7 +312,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct ww_acquire_ctx *hold_ctx, struct wake_q_head *wake_q) - __must_hold(&lock->wait_lock) + __must_hold(&lock->WAIT_LOCK) { struct task_struct *owner = __ww_mutex_owner(lock); @@ -369,7 +377,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, static void __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) - __must_hold(&lock->wait_lock) + __must_hold(&lock->WAIT_LOCK) { struct MUTEX_WAITER *cur; @@ -396,6 +404,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { DEFINE_WAKE_Q(wake_q); unsigned long flags; + bool has_waiters; ww_mutex_lock_acquired(lock, ctx); @@ -417,7 +426,8 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx * and/or !empty list. 
*/ - if (likely(!__ww_mutex_has_waiters(&lock->base))) + has_waiters = data_race(__ww_mutex_has_waiters(&lock->base)); + if (likely(!has_waiters)) return; /* @@ -463,7 +473,7 @@ __ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) static inline int __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, struct ww_acquire_ctx *ctx) - __must_hold(&lock->wait_lock) + __must_hold(&lock->WAIT_LOCK) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); @@ -514,7 +524,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) - __must_hold(&lock->wait_lock) + __must_hold(&lock->WAIT_LOCK) { struct MUTEX_WAITER *cur, *pos = NULL; bool is_wait_die; diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index c7196de838ed..e07fb3b96bc3 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -90,6 +90,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) EXPORT_SYMBOL(ww_mutex_lock_interruptible); void __sched ww_mutex_unlock(struct ww_mutex *lock) + __no_context_analysis { struct rt_mutex *rtm = &lock->base; diff --git a/scripts/context-analysis-suppression.txt b/scripts/context-analysis-suppression.txt index fd8951d06706..1c51b6153f08 100644 --- a/scripts/context-analysis-suppression.txt +++ b/scripts/context-analysis-suppression.txt @@ -24,6 +24,7 @@ src:*include/linux/mutex*.h=emit src:*include/linux/rcupdate.h=emit src:*include/linux/refcount.h=emit src:*include/linux/rhashtable.h=emit +src:*include/linux/rtmutex*.h=emit src:*include/linux/rwlock*.h=emit src:*include/linux/rwsem.h=emit src:*include/linux/sched*=emit -- cgit v1.2.3 From 739690915ce1f017223ef4e6f3cc966ccfa3c861 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Mar 2026 10:43:56 +0100 Subject: locking/rwsem: Add context analysis Add compiler context analysis 
annotations. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260306101417.GT1282955@noisy.programming.kicks-ass.net --- include/linux/rwsem.h | 4 ++-- kernel/locking/Makefile | 1 + kernel/locking/rwbase_rt.c | 1 + kernel/locking/rwsem.c | 27 ++++++++++++++++++++++++--- 4 files changed, 28 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index e7829531c4ba..6a1a7bae5f81 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -57,7 +57,7 @@ context_lock_struct(rw_semaphore) { struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; - struct rwsem_waiter *first_waiter; + struct rwsem_waiter *first_waiter __guarded_by(&wait_lock); #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif @@ -131,7 +131,7 @@ do { \ */ static inline bool rwsem_is_contended(struct rw_semaphore *sem) { - return sem->first_waiter != NULL; + return data_race(sem->first_waiter != NULL); } #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0c07de79388c..cee1901d4cff 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -6,6 +6,7 @@ KCOV_INSTRUMENT := n CONTEXT_ANALYSIS_mutex.o := y CONTEXT_ANALYSIS_rtmutex_api.o := y CONTEXT_ANALYSIS_ww_rt_mutex.o := y +CONTEXT_ANALYSIS_rwsem.o := y obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 9f4322c07486..82e078c0665a 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -186,6 +186,7 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, unsigned long flags) + __releases(&rwb->rtmutex.wait_lock) { struct rt_mutex_base *rtm = &rwb->rtmutex; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e66f37ebc6f6..ba4cb74de064 100644 
--- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -72,7 +72,7 @@ #c, atomic_long_read(&(sem)->count), \ (unsigned long) sem->magic, \ atomic_long_read(&(sem)->owner), (long)current, \ - (sem)->first_waiter ? "" : "not ")) \ + rwsem_is_contended(sem) ? "" : "not ")) \ debug_locks_off(); \ } while (0) #else @@ -320,9 +320,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->magic = sem; #endif atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); - raw_spin_lock_init(&sem->wait_lock); - sem->first_waiter = NULL; atomic_long_set(&sem->owner, 0L); + scoped_guard (raw_spinlock_init, &sem->wait_lock) { + sem->first_waiter = NULL; + } #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif @@ -365,6 +366,7 @@ enum rwsem_wake_type { static inline bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) { if (list_empty(&waiter->list)) { sem->first_waiter = NULL; @@ -401,6 +403,7 @@ rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) static inline struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem, const struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) { struct rwsem_waiter *next = list_first_entry(&waiter->list, struct rwsem_waiter, list); @@ -621,6 +624,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, */ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) { struct rwsem_waiter *first = sem->first_waiter; long count, new; @@ -1558,6 +1562,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) * lock for reading */ void __sched down_read(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1567,6 +1572,7 @@ void __sched down_read(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read); int __sched down_read_interruptible(struct rw_semaphore *sem) + 
__no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1581,6 +1587,7 @@ int __sched down_read_interruptible(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read_interruptible); int __sched down_read_killable(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1598,6 +1605,7 @@ EXPORT_SYMBOL(down_read_killable); * trylock for reading -- returns 1 if successful, 0 if contention */ int down_read_trylock(struct rw_semaphore *sem) + __no_context_analysis { int ret = __down_read_trylock(sem); @@ -1611,6 +1619,7 @@ EXPORT_SYMBOL(down_read_trylock); * lock for writing */ void __sched down_write(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -1622,6 +1631,7 @@ EXPORT_SYMBOL(down_write); * lock for writing */ int __sched down_write_killable(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -1640,6 +1650,7 @@ EXPORT_SYMBOL(down_write_killable); * trylock for writing -- returns 1 if successful, 0 if contention */ int down_write_trylock(struct rw_semaphore *sem) + __no_context_analysis { int ret = __down_write_trylock(sem); @@ -1654,6 +1665,7 @@ EXPORT_SYMBOL(down_write_trylock); * release a read lock */ void up_read(struct rw_semaphore *sem) + __no_context_analysis { rwsem_release(&sem->dep_map, _RET_IP_); __up_read(sem); @@ -1664,6 +1676,7 @@ EXPORT_SYMBOL(up_read); * release a write lock */ void up_write(struct rw_semaphore *sem) + __no_context_analysis { rwsem_release(&sem->dep_map, _RET_IP_); __up_write(sem); @@ -1674,6 +1687,7 @@ EXPORT_SYMBOL(up_write); * downgrade write lock to read lock */ void downgrade_write(struct rw_semaphore *sem) + __no_context_analysis { lock_downgrade(&sem->dep_map, _RET_IP_); __downgrade_write(sem); @@ -1683,6 +1697,7 @@ EXPORT_SYMBOL(downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC void 
down_read_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1691,6 +1706,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); int down_read_killable_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1705,6 +1721,7 @@ int down_read_killable_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_killable_nested); void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) + __no_context_analysis { might_sleep(); rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); @@ -1713,6 +1730,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) EXPORT_SYMBOL(_down_write_nest_lock); void down_read_non_owner(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); __down_read(sem); @@ -1727,6 +1745,7 @@ void down_read_non_owner(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read_non_owner); void down_write_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1735,6 +1754,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1750,6 +1770,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) + __no_context_analysis { DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); __up_read(sem); -- cgit v1.2.3 From b51caeb24aad565ef26689fb667c60daa60094aa Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Mar 2026 15:49:59 -0400 
Subject: RDMA/core: Add rdma_udata_to_dev() Get an ib_device out of a udata so it can be used for debug prints. Link: https://patch.msgid.link/r/2-v3-bd56dd443069+49-bnxt_re_uapi_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ib_core_uverbs.c | 27 +++++++++++++++++++++++++++ include/rdma/uverbs_ioctl.h | 2 ++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c index d3836a62a004..bfe37a9c8a72 100644 --- a/drivers/infiniband/core/ib_core_uverbs.c +++ b/drivers/infiniband/core/ib_core_uverbs.c @@ -389,3 +389,30 @@ int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, U32_MAX); } EXPORT_SYMBOL(rdma_user_mmap_entry_insert); + +/** + * rdma_udata_to_dev - Get a ib_device from a udata + * @udata: The system calls ib_udata struct + * + * The struct ib_device that is handling the uverbs call. Must not be called if + * udata is NULL. The result can be NULL. + */ +struct ib_device *rdma_udata_to_dev(struct ib_udata *udata) +{ + struct uverbs_attr_bundle *bundle = + rdma_udata_to_uverbs_attr_bundle(udata); + + lockdep_assert_held(&bundle->ufile->device->disassociate_srcu); + + if (bundle->context) + return bundle->context->device; + + /* + * If the context hasn't been created yet use the ufile's dev, but it + * might be NULL if we are racing with disassociate. + */ + return srcu_dereference(bundle->ufile->device->ib_dev, + &bundle->ufile->device->disassociate_srcu); +} +EXPORT_SYMBOL(rdma_udata_to_dev); + diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index e6c0de227fad..bb86d8ae8a83 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -667,6 +667,8 @@ rdma_udata_to_uverbs_attr_bundle(struct ib_udata *udata) (udata ? 
container_of(rdma_udata_to_uverbs_attr_bundle(udata)->context, \ drv_dev_struct, member) : (drv_dev_struct *)NULL) +struct ib_device *rdma_udata_to_dev(struct ib_udata *udata); + #define IS_UVERBS_COPY_ERR(_ret) ((_ret) && (_ret) != -ENOENT) static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle, -- cgit v1.2.3 From 1de9287ece44022bd694e669153fb7644804e10d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Mar 2026 15:50:00 -0400 Subject: RDMA: Add ib_copy_validate_udata_in() Add a new function to consolidate the required compatibility pattern for driver data of checking against a minimum size, and checking for unknown trailing bytes to be zero into a function. This new function uses the faster copy_struct_from_user() instead of trying to directly check for zero. Incorporate the common ibdev_dbg() logging directly into the error paths of the helper. Link: https://patch.msgid.link/r/3-v3-bd56dd443069+49-bnxt_re_uapi_jgg@nvidia.com Tested-by: Sriharsha Basavapatna Acked-by: Sriharsha Basavapatna Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rdma_core.h | 3 ++ drivers/infiniband/core/uverbs_ioctl.c | 51 ++++++++++++++++++++++++++++++++++ include/rdma/uverbs_ioctl.h | 26 +++++++++++++++++ 3 files changed, 80 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 55f1e3558856..269b393799ab 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -151,6 +151,9 @@ void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs); void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); +typedef int (*uverbs_api_ioctl_handler_fn)(struct uverbs_attr_bundle *attrs); +uverbs_api_ioctl_handler_fn uverbs_get_handler_fn(struct ib_udata *udata); + extern const struct uapi_definition uverbs_def_obj_async_fd[]; extern const struct uapi_definition 
uverbs_def_obj_counters[]; extern const struct uapi_definition uverbs_def_obj_cq[]; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index f37bb447c230..81798c0875ed 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -70,6 +70,19 @@ struct bundle_priv { u64 internal_buffer[32]; }; +uverbs_api_ioctl_handler_fn uverbs_get_handler_fn(struct ib_udata *udata) +{ + struct uverbs_attr_bundle *bundle = + rdma_udata_to_uverbs_attr_bundle(udata); + struct bundle_priv *pbundle = + container_of(&bundle->hdr, struct bundle_priv, bundle); + + lockdep_assert_held(&bundle->ufile->device->disassociate_srcu); + + return srcu_dereference(pbundle->method_elm->handler, + &bundle->ufile->device->disassociate_srcu); +} + /* * Each method has an absolute minimum amount of memory it needs to allocate, * precompute that amount and determine if the onstack memory can be used or @@ -847,3 +860,41 @@ void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle, pbundle->uobj_hw_obj_valid); } EXPORT_SYMBOL(uverbs_finalize_uobj_create); + +int _ib_copy_validate_udata_in(struct ib_udata *udata, void *req, + size_t kernel_size, size_t minimum_size) +{ + int err; + + if (udata->inlen < minimum_size) { + ibdev_dbg( + rdma_udata_to_dev(udata), + "System call driver input udata too small (%zu < %zu) for ioctl %ps called by %pSR\n", + udata->inlen, minimum_size, + uverbs_get_handler_fn(udata), + __builtin_return_address(0)); + return -EINVAL; + } + + err = copy_struct_from_user(req, kernel_size, udata->inbuf, + udata->inlen); + if (err) { + if (err == -E2BIG) { + ibdev_dbg( + rdma_udata_to_dev(udata), + "System call driver input udata not zero from %zu -> %zu for ioctl %ps called by %pSR\n", + minimum_size, udata->inlen, + uverbs_get_handler_fn(udata), + __builtin_return_address(0)); + return -EOPNOTSUPP; + } + ibdev_dbg( + rdma_udata_to_dev(udata), + "System call driver input udata EFAULT for ioctl 
%ps called by %pSR\n", + uverbs_get_handler_fn(udata), + __builtin_return_address(0)); + return err; + } + return 0; +} +EXPORT_SYMBOL(_ib_copy_validate_udata_in); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index bb86d8ae8a83..505492443c36 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -897,6 +897,9 @@ int _uverbs_get_const_unsigned(u64 *to, size_t idx, u64 upper_bound, u64 *def_val); int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, size_t idx, const void *from, size_t size); + +int _ib_copy_validate_udata_in(struct ib_udata *udata, void *req, + size_t kernel_size, size_t minimum_size); #else static inline int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, @@ -953,6 +956,14 @@ _uverbs_get_const_unsigned(u64 *to, { return -EINVAL; } + +static inline int _ib_copy_validate_udata_in(struct ib_udata *udata, void *req, + size_t kernel_size, + size_t minimum_size) +{ + return -EINVAL; +} + #endif #define uverbs_get_const_signed(_to, _attrs_bundle, _idx) \ @@ -1018,4 +1029,19 @@ uverbs_get_raw_fd(int *to, const struct uverbs_attr_bundle *attrs_bundle, return uverbs_get_const_signed(to, attrs_bundle, idx); } +/** + * ib_copy_validate_udata_in - Copy and validate that the request structure is + * compatible with this kernel + * @_udata: The system calls ib_udata struct + * @_req: The name of an on-stack structure that holds the driver data + * @_end_member: The member in the struct that is the original end of struct + * from the first kernel to introduce it. + * + * Check that the udata input request struct is properly formed for this kernel. 
+ * Then copy it into req + */ +#define ib_copy_validate_udata_in(_udata, _req, _end_member) \ + _ib_copy_validate_udata_in(_udata, &(_req), sizeof(_req), \ + offsetofend(typeof(_req), _end_member)) + #endif -- cgit v1.2.3 From dbf6491bb98d2821f0a23f4e8efd215cb2e5ff21 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Mar 2026 15:50:01 -0400 Subject: RDMA: Add ib_copy_validate_udata_in_cm() For structures with comp_mask also absorb the check of comp_mask valid bits into the helper. This is slightly tricky because ~ might not fully extend to 64 bits, the helper inserts an explicit type to ensure that ~ covers all bits. Link: https://patch.msgid.link/r/4-v3-bd56dd443069+49-bnxt_re_uapi_jgg@nvidia.com Tested-by: Sriharsha Basavapatna Acked-by: Sriharsha Basavapatna Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 12 ++++++++++++ include/rdma/uverbs_ioctl.h | 25 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 81798c0875ed..5e5b00c6236f 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -898,3 +898,15 @@ int _ib_copy_validate_udata_in(struct ib_udata *udata, void *req, return 0; } EXPORT_SYMBOL(_ib_copy_validate_udata_in); + +int _ib_copy_validate_udata_cm_fail(struct ib_udata *udata, u64 req_cm, + u64 valid_cm) +{ + ibdev_dbg( + rdma_udata_to_dev(udata), + "System call driver input udata has unsupported comp_mask %llx & ~%llx = %llx for ioctl %ps called by %pSR\n", + req_cm, valid_cm, req_cm & ~valid_cm, + uverbs_get_handler_fn(udata), __builtin_return_address(0)); + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(_ib_copy_validate_udata_cm_fail); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 505492443c36..a73016a977a1 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -1044,4 +1044,29 @@ 
uverbs_get_raw_fd(int *to, const struct uverbs_attr_bundle *attrs_bundle, _ib_copy_validate_udata_in(_udata, &(_req), sizeof(_req), \ offsetofend(typeof(_req), _end_member)) +int _ib_copy_validate_udata_cm_fail(struct ib_udata *udata, u64 req_cm, + u64 valid_cm); + +/** + * ib_copy_validate_udata_in_cm - Copy the req structure and check the comp_mask + * @_udata: The system calls ib_udata struct + * @_req: The name of an on-stack structure that holds the driver data + * @_end_member: The member in the struct that is the original end of struct + * from the first kernel to introduce it. + * @_valid_cm: A bitmask of bits permitted in the comp_mask_field. + * + * Check that the udata input request struct is properly formed for this kernel. + * Then copy it into req + */ +#define ib_copy_validate_udata_in_cm(_udata, _req, _end_member, _valid_cm) \ + ({ \ + typeof((_req).comp_mask) __valid_cm = _valid_cm; \ + int ret = \ + ib_copy_validate_udata_in(_udata, _req, _end_member); \ + if (!ret && ((_req).comp_mask & ~__valid_cm)) \ + ret = _ib_copy_validate_udata_cm_fail( \ + _udata, (_req).comp_mask, __valid_cm); \ + ret; \ + }) + #endif -- cgit v1.2.3 From 14badc323ed7153c24a0a9c3175e594aaf1366c9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Mar 2026 15:50:02 -0400 Subject: RDMA: Add ib_respond_udata() Wrap the common copy_to_user() pattern used in drivers and enhance it to zero pad as well. Include debug logging on failures. 
Link: https://patch.msgid.link/r/5-v3-bd56dd443069+49-bnxt_re_uapi_jgg@nvidia.com Tested-by: Sriharsha Basavapatna Acked-by: Sriharsha Basavapatna Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 24 ++++++++++++++++++++++++ include/rdma/uverbs_ioctl.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 5e5b00c6236f..b61af625e679 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -910,3 +910,27 @@ int _ib_copy_validate_udata_cm_fail(struct ib_udata *udata, u64 req_cm, return -EOPNOTSUPP; } EXPORT_SYMBOL(_ib_copy_validate_udata_cm_fail); + +int _ib_respond_udata(struct ib_udata *udata, const void *src, size_t len) +{ + size_t copy_len; + + /* 0 length copy_len is a NOP for copy_to_user() and doesn't fail. */ + copy_len = min(len, udata->outlen); + if (copy_to_user(udata->outbuf, src, copy_len)) + goto err_fault; + if (copy_len < udata->outlen) { + if (clear_user(udata->outbuf + copy_len, + udata->outlen - copy_len)) + goto err_fault; + } + return 0; +err_fault: + ibdev_dbg( + rdma_udata_to_dev(udata), + "System call driver out udata has EFAULT (%zu into %zu) for ioctl %ps called by %pSR\n", + len, udata->outlen, uverbs_get_handler_fn(udata), + __builtin_return_address(0)); + return -EFAULT; +} +EXPORT_SYMBOL(_ib_respond_udata); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index a73016a977a1..38a11bfe1374 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -900,6 +900,7 @@ int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, int _ib_copy_validate_udata_in(struct ib_udata *udata, void *req, size_t kernel_size, size_t minimum_size); +int _ib_respond_udata(struct ib_udata *udata, const void *src, size_t len); #else static inline int uverbs_get_flags64(u64 *to, const struct 
uverbs_attr_bundle *attrs_bundle, @@ -964,6 +965,11 @@ static inline int _ib_copy_validate_udata_in(struct ib_udata *udata, void *req, return -EINVAL; } +static inline int _ib_respond_udata(struct ib_udata *udata, const void *src, + size_t len) +{ + return -EINVAL; +} #endif #define uverbs_get_const_signed(_to, _attrs_bundle, _idx) \ @@ -1069,4 +1075,31 @@ int _ib_copy_validate_udata_cm_fail(struct ib_udata *udata, u64 req_cm, ret; \ }) +/** + * ib_respond_udata - Copy a driver data response to userspace + * @_udata: The system calls ib_udata struct + * @_rep: Kernel buffer containing the response driver data on the stack + * + * Copy driver data response structures back to userspace in a way that + * is forwards and backwards compatible. Longer kernel structs are truncated, + * userspace has made some kind of error if it needed the truncated information. + * Shorter structs are zero padded. + */ +#define ib_respond_udata(_udata, _rep) \ + _ib_respond_udata(_udata, &(_rep), sizeof(_rep)) + +/** + * ib_respond_empty_udata - Zero fill the response buffer to userspace + * @_udata: The system calls ib_udata struct + * + * Used when there is no driver response data to return. Provides forward + * compatability by zeroing any buffer the user may have provided. + */ +static inline int ib_respond_empty_udata(struct ib_udata *udata) +{ + if (udata && udata->outlen && clear_user(udata->outbuf, udata->outlen)) + return -EFAULT; + return 0; +} + #endif -- cgit v1.2.3 From 4c379ba04c110ba55182535140fda3a7f285d597 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Mar 2026 15:50:03 -0400 Subject: RDMA: Add ib_is_udata_in_empty() If the driver doesn't yet support any request driver data it should check that it is all zeroed. This is a common pattern, add a helper around _ib_copy_validate_udata_in() to do this. 
Link: https://patch.msgid.link/r/6-v3-bd56dd443069+49-bnxt_re_uapi_jgg@nvidia.com Tested-by: Sriharsha Basavapatna Acked-by: Sriharsha Basavapatna Signed-off-by: Jason Gunthorpe --- include/rdma/uverbs_ioctl.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 38a11bfe1374..e2af17da3e32 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -1075,6 +1075,21 @@ int _ib_copy_validate_udata_cm_fail(struct ib_udata *udata, u64 req_cm, ret; \ }) +/** + * ib_is_udata_in_empty - Check if the udata input buffer is all zeros + * @udata: The system calls ib_udata struct + * + * This should be used if the driver does not currently define a driver data + * struct. Returns 0 if the buffer is empty or all zeros, -EOPNOTSUPP if + * non-zero data is present, or a negative error code on failure. + */ +static inline int ib_is_udata_in_empty(struct ib_udata *udata) +{ + if (!udata || udata->inlen == 0) + return 0; + return _ib_copy_validate_udata_in(udata, NULL, 0, 0); +} + /** * ib_respond_udata - Copy a driver data response to userspace * @_udata: The system calls ib_udata struct -- cgit v1.2.3 From 5ebe8832ef900a889bfb72794086ebcde5fd40b7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Mar 2026 15:50:04 -0400 Subject: RDMA: Provide documentation about the uABI compatibility rules Write down how all of this is supposed to work using the new helpers. 
Link: https://patch.msgid.link/r/7-v3-bd56dd443069+49-bnxt_re_uapi_jgg@nvidia.com Tested-by: Sriharsha Basavapatna Acked-by: Sriharsha Basavapatna Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6142f7e39700..effcaff455ca 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1577,6 +1577,93 @@ struct ib_uobject { const struct uverbs_api_object *uapi_object; }; +/** + * struct ib_udata - Driver request/response data from userspace + * @inbuf: Pointer to request data from userspace + * @outbuf: Pointer to response buffer in userspace + * @inlen: Length of request data + * @outlen: Length of response buffer + * + * struct ib_udata is used to hold the driver data request and response + * structures defined in the uapi. They follow these rules for forwards and + * backwards compatibility: + * + * 1) Userspace can provide a longer request so long as the trailing part the + * kernel doesn't understand is all zeros. + * + * This provides a degree of safety if userspace wrongly tries to use a new + * feature the kernel does not understand with some non-zero value. + * + * It allows a simpler rdma-core implementation because the library can + * simply always use the latest structs for the request, even if they are + * bigger. It simply has to avoid using the new members if they are not + * supported/required. + * + * 2) Userspace can provide a shorter request; the kernel will zero-pad it out + * to fill the storage. The newer kernel should understand that older + * userspace will provide 0 to new fields. 
The kernel has three options to + * enable new request fields: + * + * - Input comp_mask that says the field is supported + * - Look for non-zero values + * - Check if the udata->inlen size covers the field + * + * This also corrects any bugs related to not filling in request structures + * as the new helper always fully writes to the struct. + * + * 3) Userspace can provide a shorter or longer response struct. If shorter, + * the kernel reply is truncated. The kernel should be designed to not write + * to new reply fields unless userspace has affirmatively requested them. + * + * If the user buffer is longer, the kernel will zero-fill it. + * + * Userspace has three options to enable new response fields: + * + * - Output comp_mask that says the field is supported + * - Look for non-zero values + * - Infer the output must be valid because the request contents demand it + * and old kernels will fail the request + * + * The following helper functions implement these semantics: + * + * ib_copy_validate_udata_in() - Checks the minimum length, and zero trailing:: + * + * struct driver_create_cq_req req; + * int err; + * + * err = ib_copy_validate_udata_in(udata, req, end_member); + * if (err) + * return err; + * + * The third argument specifies the last member of the struct in the first + * kernel version that introduced it, establishing the minimum required size. 
+ * + * ib_copy_validate_udata_in_cm() - The above but also validate a + * comp_mask member only has supported bits set:: + * + * err = ib_copy_validate_udata_in_cm(udata, req, first_version_last_member, + * DRIVER_CREATE_CQ_MASK_FEATURE_A | + * DRIVER_CREATE_CQ_MASK_FEATURE_B); + * + * ib_respond_udata() - Implements the response rules:: + * + * struct driver_create_cq_resp resp = {}; + * + * resp.some_field = value; + * return ib_respond_udata(udata, resp); + * + * ib_is_udata_in_empty() - Used instead of ib_copy_validate_udata_in() if the + * driver does not have a request structure:: + * + * ret = ib_is_udata_in_empty(udata); + * if (ret) + * return ret; + * + * Similarly ib_respond_empty_udata() is used instead of ib_respond_udata() if + * the driver does not have a response structure:: + * + * return ib_respond_empty_udata(udata); + */ struct ib_udata { const void __user *inbuf; void __user *outbuf; -- cgit v1.2.3 From 613713f251c89d089a0da7241573149b9ae8b8ab Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Mar 2026 15:50:10 -0400 Subject: RDMA: Add IB_UVERBS_CORE_SUPPORT_ROBUST_UDATA This flag can be set by drivers once they have finished auditing and implementing the full udata support on every udata operation. My intention going forward is that driver authors proposing new udata uAPI for their drivers must first do the work and set this flag. If this flag is not set the userspace should not try to use udata based uAPI newer than this commit, though on a case by case basis it may be OK based on what checks historical kernels performed on the specific call. Since bnxt_re is audited now, it is the first driver to set the flag. 
Link: https://patch.msgid.link/r/13-v3-bd56dd443069+49-bnxt_re_uapi_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/uverbs_std_types_device.c | 8 ++++++++ drivers/infiniband/hw/bnxt_re/main.c | 1 + include/rdma/ib_verbs.h | 6 ++++++ include/uapi/rdma/ib_user_ioctl_verbs.h | 1 + 5 files changed, 17 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 2ad760e34122..236061a33bf6 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2706,6 +2706,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) dev_ops->uverbs_no_driver_id_binding |= ops->uverbs_no_driver_id_binding; + dev_ops->uverbs_robust_udata |= ops->uverbs_robust_udata; SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, add_sub_dev); diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index a28f9f21bed8..12ca15739cd2 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -247,13 +247,21 @@ static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)( { u32 num_comp = attrs->ufile->device->num_comp_vectors; u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS; + struct ib_device *ib_dev; int ret; + ib_dev = srcu_dereference(attrs->ufile->device->ib_dev, + &attrs->ufile->device->disassociate_srcu); + if (!ib_dev) + return -EIO; + ret = uverbs_copy_to(attrs, UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, &num_comp, sizeof(num_comp)); if (IS_UVERBS_COPY_ERR(ret)) return ret; + if (ib_dev->ops.uverbs_robust_udata) + core_support |= IB_UVERBS_CORE_SUPPORT_ROBUST_UDATA; ret = uverbs_copy_to(attrs, UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, &core_support, sizeof(core_support)); if (IS_UVERBS_COPY_ERR(ret)) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c 
index b576f05e3b26..7af514524632 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1326,6 +1326,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_BNXT_RE, .uverbs_abi_ver = BNXT_RE_ABI_VERSION, + .uverbs_robust_udata = true, .add_gid = bnxt_re_add_gid, .alloc_hw_port_stats = bnxt_re_ib_alloc_hw_port_stats, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index effcaff455ca..6354c613e9a8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2481,6 +2481,12 @@ struct ib_device_ops { enum rdma_driver_id driver_id; u32 uverbs_abi_ver; unsigned int uverbs_no_driver_id_binding:1; + /* + * Indicates the driver checks every op accepting a udata for the + * correct size on input and always handles the output using the udata + * helpers. + */ + unsigned int uverbs_robust_udata:1; /* * NOTE: New drivers should not make use of device_group; instead new diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 89e6a3f13191..90c5cd8e7753 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -46,6 +46,7 @@ enum ib_uverbs_core_support { IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS = 1 << 0, + IB_UVERBS_CORE_SUPPORT_ROBUST_UDATA = 1 << 1, }; enum ib_uverbs_access_flags { -- cgit v1.2.3 From 1234a9d8aebbf24a46ef5d323bf9074bc911423e Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 2 Mar 2026 16:30:33 +0530 Subject: RDMA/bnxt_re: Support doorbell extensions Some applications may need multiple doorbells to support parallel processing of threads that each operate on a group of resources. The following uapi methods have been implemented in this patch. - BNXT_RE_METHOD_DBR_ALLOC: This will allow the appliation to create extra doorbell regions and use the associated doorbell page index in CREATE_QP and use the associated DB address while ringing the doorbell. 
- BNXT_RE_METHOD_DBR_FREE: Free the allocated doorbell region. - BNXT_RE_METHOD_GET_DEFAULT_DBR: Return the default doorbell page index and doorbell page address associated with the ucontext. Link: https://patch.msgid.link/r/20260302110036.36387-4-sriharsha.basavapatna@broadcom.com Co-developed-by: Sriharsha Basavapatna Signed-off-by: Sriharsha Basavapatna Signed-off-by: Kalesh AP Reviewed-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 7 ++ drivers/infiniband/hw/bnxt_re/qplib_res.c | 43 ++++++++++ drivers/infiniband/hw/bnxt_re/qplib_res.h | 4 + drivers/infiniband/hw/bnxt_re/uapi.c | 130 ++++++++++++++++++++++++++++++ include/uapi/rdma/bnxt_re-abi.h | 29 +++++++ 5 files changed, 213 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index a11f56730a31..33e0f66b39eb 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -164,6 +164,13 @@ struct bnxt_re_user_mmap_entry { u8 mmap_flag; }; +struct bnxt_re_dbr_obj { + struct bnxt_re_dev *rdev; + struct bnxt_qplib_dpi dpi; + struct bnxt_re_user_mmap_entry *entry; + atomic_t usecnt; /* QPs using this dbr */ +}; + struct bnxt_re_flow { struct ib_flow ib_flow; struct bnxt_re_dev *rdev; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index fa6b8cd137e5..95e0489c53c3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -683,6 +683,49 @@ static int bnxt_qplib_alloc_pd_tbl(struct bnxt_qplib_res *res, } /* DPIs */ +int bnxt_qplib_alloc_uc_dpi(struct bnxt_qplib_res *res, struct bnxt_qplib_dpi *dpi) +{ + struct bnxt_qplib_dpi_tbl *dpit = &res->dpi_tbl; + struct bnxt_qplib_reg_desc *reg; + u32 bit_num; + int rc = 0; + + reg = &dpit->wcreg; + mutex_lock(&res->dpi_tbl_lock); + bit_num = find_first_bit(dpit->tbl, dpit->max); + if (bit_num >= dpit->max) { + 
rc = -ENOMEM; + goto unlock; + } + /* Found unused DPI */ + clear_bit(bit_num, dpit->tbl); + dpi->bit = bit_num; + dpi->dpi = bit_num + (reg->offset - dpit->ucreg.offset) / PAGE_SIZE; + dpi->umdbr = reg->bar_base + reg->offset + bit_num * PAGE_SIZE; +unlock: + mutex_unlock(&res->dpi_tbl_lock); + return rc; +} + +int bnxt_qplib_free_uc_dpi(struct bnxt_qplib_res *res, struct bnxt_qplib_dpi *dpi) +{ + struct bnxt_qplib_dpi_tbl *dpit = &res->dpi_tbl; + int rc = 0; + + mutex_lock(&res->dpi_tbl_lock); + if (dpi->bit >= dpit->max) { + rc = -EINVAL; + goto unlock; + } + + if (test_and_set_bit(dpi->bit, dpit->tbl)) + rc = -EINVAL; + memset(dpi, 0, sizeof(*dpi)); +unlock: + mutex_unlock(&res->dpi_tbl_lock); + return rc; +} + int bnxt_qplib_alloc_dpi(struct bnxt_qplib_res *res, struct bnxt_qplib_dpi *dpi, void *app, u8 type) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index f01c1bb1fcb4..ffe31c952d50 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -436,6 +436,10 @@ int bnxt_qplib_alloc_dpi(struct bnxt_qplib_res *res, void *app, u8 type); int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, struct bnxt_qplib_dpi *dpi); +int bnxt_qplib_alloc_uc_dpi(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi *dpi); +int bnxt_qplib_free_uc_dpi(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi *dpi); void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res); int bnxt_qplib_init_res(struct bnxt_qplib_res *res); void bnxt_qplib_free_res(struct bnxt_qplib_res *res); diff --git a/drivers/infiniband/hw/bnxt_re/uapi.c b/drivers/infiniband/hw/bnxt_re/uapi.c index 0145882e49f6..3eaee7101615 100644 --- a/drivers/infiniband/hw/bnxt_re/uapi.c +++ b/drivers/infiniband/hw/bnxt_re/uapi.c @@ -331,9 +331,139 @@ DECLARE_UVERBS_NAMED_OBJECT(BNXT_RE_OBJECT_GET_TOGGLE_MEM, &UVERBS_METHOD(BNXT_RE_METHOD_GET_TOGGLE_MEM), &UVERBS_METHOD(BNXT_RE_METHOD_RELEASE_TOGGLE_MEM)); +static int 
UVERBS_HANDLER(BNXT_RE_METHOD_DBR_ALLOC)(struct uverbs_attr_bundle *attrs) +{ + struct bnxt_re_db_region dbr = {}; + struct bnxt_re_ucontext *uctx; + struct bnxt_re_dbr_obj *obj; + struct ib_ucontext *ib_uctx; + struct bnxt_qplib_dpi *dpi; + struct bnxt_re_dev *rdev; + struct ib_uobject *uobj; + u64 mmap_offset; + int ret; + + ib_uctx = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ib_uctx)) + return PTR_ERR(ib_uctx); + + uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); + rdev = uctx->rdev; + uobj = uverbs_attr_get_uobject(attrs, BNXT_RE_ALLOC_DBR_HANDLE); + + obj = kzalloc_obj(*obj); + if (!obj) + return -ENOMEM; + + dpi = &obj->dpi; + ret = bnxt_qplib_alloc_uc_dpi(&rdev->qplib_res, dpi); + if (ret) + goto free_mem; + + obj->entry = bnxt_re_mmap_entry_insert(uctx, dpi->umdbr, + BNXT_RE_MMAP_UC_DB, + &mmap_offset); + if (!obj->entry) { + ret = -ENOMEM; + goto free_dpi; + } + + obj->rdev = rdev; + uobj->object = obj; + uverbs_finalize_uobj_create(attrs, BNXT_RE_ALLOC_DBR_HANDLE); + + dbr.umdbr = dpi->umdbr; + dbr.dpi = dpi->dpi; + ret = uverbs_copy_to_struct_or_zero(attrs, BNXT_RE_ALLOC_DBR_ATTR, + &dbr, sizeof(dbr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, BNXT_RE_ALLOC_DBR_OFFSET, + &mmap_offset, sizeof(mmap_offset)); + if (ret) + return ret; + return 0; +free_dpi: + bnxt_qplib_free_uc_dpi(&rdev->qplib_res, dpi); +free_mem: + kfree(obj); + return ret; +} + +static int bnxt_re_dbr_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct bnxt_re_dbr_obj *obj = uobject->object; + struct bnxt_re_dev *rdev = obj->rdev; + + rdma_user_mmap_entry_remove(&obj->entry->rdma_entry); + bnxt_qplib_free_uc_dpi(&rdev->qplib_res, &obj->dpi); + return 0; +} + +static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_DEFAULT_DBR)(struct uverbs_attr_bundle *attrs) +{ + struct bnxt_re_db_region dpi = {}; + struct bnxt_re_ucontext *uctx; + struct ib_ucontext *ib_uctx; + int ret; + + ib_uctx = 
ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ib_uctx)) + return PTR_ERR(ib_uctx); + + uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); + dpi.umdbr = uctx->dpi.umdbr; + dpi.dpi = uctx->dpi.dpi; + + ret = uverbs_copy_to_struct_or_zero(attrs, BNXT_RE_DEFAULT_DBR_ATTR, + &dpi, sizeof(dpi)); + if (ret) + return ret; + + return 0; +} + +DECLARE_UVERBS_NAMED_METHOD(BNXT_RE_METHOD_DBR_ALLOC, + UVERBS_ATTR_IDR(BNXT_RE_ALLOC_DBR_HANDLE, + BNXT_RE_OBJECT_DBR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(BNXT_RE_ALLOC_DBR_ATTR, + UVERBS_ATTR_STRUCT(struct bnxt_re_db_region, + umdbr), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(BNXT_RE_ALLOC_DBR_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY(BNXT_RE_METHOD_DBR_FREE, + UVERBS_ATTR_IDR(BNXT_RE_FREE_DBR_HANDLE, + BNXT_RE_OBJECT_DBR, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(BNXT_RE_OBJECT_DBR, + UVERBS_TYPE_ALLOC_IDR(bnxt_re_dbr_cleanup), + &UVERBS_METHOD(BNXT_RE_METHOD_DBR_ALLOC), + &UVERBS_METHOD(BNXT_RE_METHOD_DBR_FREE)); + +DECLARE_UVERBS_NAMED_METHOD(BNXT_RE_METHOD_GET_DEFAULT_DBR, + UVERBS_ATTR_PTR_OUT(BNXT_RE_DEFAULT_DBR_ATTR, + UVERBS_ATTR_STRUCT(struct bnxt_re_db_region, + umdbr), + UA_MANDATORY)); + +DECLARE_UVERBS_GLOBAL_METHODS(BNXT_RE_OBJECT_DEFAULT_DBR, + &UVERBS_METHOD(BNXT_RE_METHOD_GET_DEFAULT_DBR)); + const struct uapi_definition bnxt_re_uapi_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(BNXT_RE_OBJECT_ALLOC_PAGE), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(BNXT_RE_OBJECT_NOTIFY_DRV), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(BNXT_RE_OBJECT_GET_TOGGLE_MEM), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(BNXT_RE_OBJECT_DBR), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(BNXT_RE_OBJECT_DEFAULT_DBR), {} }; diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index f24edf1c75eb..ef14e24836b1 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -163,6 +163,8 @@ enum bnxt_re_objects { BNXT_RE_OBJECT_ALLOC_PAGE = (1U << 
UVERBS_ID_NS_SHIFT), BNXT_RE_OBJECT_NOTIFY_DRV, BNXT_RE_OBJECT_GET_TOGGLE_MEM, + BNXT_RE_OBJECT_DBR, + BNXT_RE_OBJECT_DEFAULT_DBR, }; enum bnxt_re_alloc_page_type { @@ -231,4 +233,31 @@ struct bnxt_re_packet_pacing_caps { struct bnxt_re_query_device_ex_resp { struct bnxt_re_packet_pacing_caps packet_pacing_caps; }; + +struct bnxt_re_db_region { + __u32 dpi; + __u32 reserved; + __aligned_u64 umdbr; +}; + +enum bnxt_re_obj_dbr_alloc_attrs { + BNXT_RE_ALLOC_DBR_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + BNXT_RE_ALLOC_DBR_ATTR, + BNXT_RE_ALLOC_DBR_OFFSET, +}; + +enum bnxt_re_obj_dbr_free_attrs { + BNXT_RE_FREE_DBR_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum bnxt_re_obj_default_dbr_attrs { + BNXT_RE_DEFAULT_DBR_ATTR = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum bnxt_re_obj_dpi_methods { + BNXT_RE_METHOD_DBR_ALLOC = (1U << UVERBS_ID_NS_SHIFT), + BNXT_RE_METHOD_DBR_FREE, + BNXT_RE_METHOD_GET_DEFAULT_DBR, +}; + #endif /* __BNXT_RE_UVERBS_ABI_H__*/ -- cgit v1.2.3 From a06165a705eefff4b524ad72c50c9ad82bdf4fae Mon Sep 17 00:00:00 2001 From: Sriharsha Basavapatna Date: Mon, 2 Mar 2026 16:30:36 +0530 Subject: RDMA/bnxt_re: Support application specific CQs This patch supports application allocated memory for CQs. The application allocates and manages the CQs directly. To support this, the driver exports a new comp_mask to indicate direct control of the CQ. When this comp_mask bit is set in the ureq, the driver maps this application allocated CQ memory into hardware. As the application manages this memory, the CQ depth ('cqe') passed by it must be used as is and the driver shouldn't update it. For CQs, ib_core supports pinning dmabuf based application memory, specified through provider attributes. This umem is mananged by the ib_core and is available in ib_cq. Register 'create_cq_user' devop to process this umem. The driver also supports the legacy interface that allocates umem internally. 
Link: https://patch.msgid.link/r/20260302110036.36387-7-sriharsha.basavapatna@broadcom.com Signed-off-by: Sriharsha Basavapatna Reviewed-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 36 +++++++++++++++++--------------- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 3 ++- drivers/infiniband/hw/bnxt_re/main.c | 1 + include/uapi/rdma/bnxt_re-abi.h | 7 ++++++- 4 files changed, 28 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 395225169742..182128ee4f24 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3342,7 +3342,6 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); bnxt_re_put_nq(rdev, nq); - ib_umem_release(cq->umem); atomic_dec(&rdev->stats.res.cq_count); kfree(cq->cql); @@ -3369,8 +3368,8 @@ static int bnxt_re_setup_sginfo(struct bnxt_re_dev *rdev, return 0; } -static int bnxt_re_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct uverbs_attr_bundle *attrs) +int bnxt_re_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs) { struct bnxt_re_cq *cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibcq->device, ibdev); @@ -3402,19 +3401,25 @@ static int bnxt_re_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_at if (entries > dev_attr->max_cq_wqes + 1) entries = dev_attr->max_cq_wqes + 1; - rc = ib_copy_validate_udata_in(udata, req, cq_handle); + rc = ib_copy_validate_udata_in_cm(udata, req, cq_handle, + BNXT_RE_CQ_FIXED_NUM_CQE_ENABLE); if (rc) return rc; - cq->umem = ib_umem_get(&rdev->ibdev, req.cq_va, - entries * sizeof(struct cq_base), - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(cq->umem)) { - rc = PTR_ERR(cq->umem); - return rc; + if (req.comp_mask & 
BNXT_RE_CQ_FIXED_NUM_CQE_ENABLE) + entries = cqe; + + if (!ibcq->umem) { + ibcq->umem = ib_umem_get(&rdev->ibdev, req.cq_va, + entries * sizeof(struct cq_base), + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(ibcq->umem)) { + rc = PTR_ERR(ibcq->umem); + goto fail; + } } - rc = bnxt_re_setup_sginfo(rdev, cq->umem, &cq->qplib_cq.sg_info); + rc = bnxt_re_setup_sginfo(rdev, ibcq->umem, &cq->qplib_cq.sg_info); if (rc) goto fail; @@ -3462,7 +3467,6 @@ static int bnxt_re_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_at free_mem: free_page((unsigned long)cq->uctx_cq_page); fail: - ib_umem_release(cq->umem); return rc; } @@ -3475,7 +3479,6 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; - struct bnxt_qplib_chip_ctx *cctx; int cqe = attr->cqe; int rc, entries; u32 active_cqs; @@ -3493,7 +3496,6 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, } cq->rdev = rdev; - cctx = rdev->chip_ctx; cq->qplib_cq.cq_handle = (u64)(unsigned long)(&cq->qplib_cq); entries = bnxt_re_init_depth(cqe + 1, uctx); @@ -3542,8 +3544,8 @@ static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq) cq->qplib_cq.max_wqe = cq->resize_cqe; if (cq->resize_umem) { - ib_umem_release(cq->umem); - cq->umem = cq->resize_umem; + ib_umem_release(cq->ib_cq.umem); + cq->ib_cq.umem = cq->resize_umem; cq->resize_umem = NULL; cq->resize_cqe = 0; } @@ -4142,7 +4144,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) /* User CQ; the only processing we do is to * complete any pending CQ resize operation. 
*/ - if (cq->umem) { + if (cq->ib_cq.umem) { if (cq->resize_umem) bnxt_re_resize_cq_complete(cq); return 0; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 33e0f66b39eb..3d02c16f54b6 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -108,7 +108,6 @@ struct bnxt_re_cq { struct bnxt_qplib_cqe *cql; #define MAX_CQL_PER_POLL 1024 u32 max_cql; - struct ib_umem *umem; struct ib_umem *resize_umem; int resize_cqe; void *uctx_cq_page; @@ -254,6 +253,8 @@ int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); +int bnxt_re_create_user_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs); int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 7af514524632..13ad63b9b1de 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1335,6 +1335,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .alloc_ucontext = bnxt_re_alloc_ucontext, .create_ah = bnxt_re_create_ah, .create_cq = bnxt_re_create_cq, + .create_user_cq = bnxt_re_create_user_cq, .create_qp = bnxt_re_create_qp, .create_srq = bnxt_re_create_srq, .create_user_ah = bnxt_re_create_ah, diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index ef14e24836b1..40955eaba32e 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -102,12 +102,17 @@ struct bnxt_re_pd_resp { struct bnxt_re_cq_req { __aligned_u64 cq_va; __aligned_u64 cq_handle; + __aligned_u64 
comp_mask; }; -enum bnxt_re_cq_mask { +enum bnxt_re_resp_cq_mask { BNXT_RE_CQ_TOGGLE_PAGE_SUPPORT = 0x1, }; +enum bnxt_re_req_cq_mask { + BNXT_RE_CQ_FIXED_NUM_CQE_ENABLE = 0x1, +}; + struct bnxt_re_cq_resp { __u32 cqid; __u32 tail; -- cgit v1.2.3 From ff85a2ebacbdaec9f28c4660c991295ace93cd1c Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Thu, 5 Mar 2026 17:08:24 +0000 Subject: RDMA/umem: Add pinned revocable dmabuf import interface Added an interface for importing a pinned but revocable dmabuf. This interface can be used by drivers that are capable of revocation so that they can import dmabufs from exporters that may require it, such as VFIO. This interface implements a two step process, where drivers will first call ib_umem_dmabuf_get_pinned_revocable_and_lock() which will pin and map the dmabuf (and provide a functional move_notify/invalidate_mappings callback), but will return with the lock still held so that the driver can then populate the callback via ib_umem_dmabuf_set_revoke_locked() without races from concurrent revocations. This scheme also allows for easier integration with drivers that may not have actually allocated their internal MR objects at the time of the get_pinned_revocable* call. 
Signed-off-by: Jacob Moroni Link: https://patch.msgid.link/20260305170826.3803155-4-jmoroni@google.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_dmabuf.c | 61 +++++++++++++++++++++++++++++++++++ include/rdma/ib_umem.h | 19 +++++++++++ 2 files changed, 80 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 9cf9cfc93006..1a810dbdea9a 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -203,6 +203,10 @@ static void ib_umem_dmabuf_revoke_locked(struct dma_buf_attachment *attach) if (umem_dmabuf->revoked) return; + + if (umem_dmabuf->pinned_revoke) + umem_dmabuf->pinned_revoke(umem_dmabuf->private); + ib_umem_dmabuf_unmap_pages(umem_dmabuf); if (umem_dmabuf->pinned) { dma_buf_unpin(umem_dmabuf->attach); @@ -211,6 +215,11 @@ static void ib_umem_dmabuf_revoke_locked(struct dma_buf_attachment *attach) umem_dmabuf->revoked = 1; } +static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_revocable_ops = { + .allow_peer2peer = true, + .move_notify = ib_umem_dmabuf_revoke_locked, +}; + static struct ib_umem_dmabuf * ib_umem_dmabuf_get_pinned_and_lock(struct ib_device *device, struct device *dma_device, @@ -263,6 +272,58 @@ ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, } EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned_with_dma_device); +/** + * ib_umem_dmabuf_get_pinned_revocable_and_lock - Map & pin a revocable dmabuf + * @device: IB device. + * @offset: Start offset. + * @size: Length. + * @fd: dmabuf fd. + * @access: Access flags. + * + * Obtains a umem from a dmabuf for drivers/devices that can support revocation. + * + * Returns with dma_resv_lock held upon success. The driver must set the revoke + * callback prior to unlock by calling ib_umem_dmabuf_set_revoke_locked(). + * + * When a revocation occurs, the revoke callback will be called. 
The driver must + * ensure that the region is no longer accessed when the callback returns. Any + * subsequent access attempts should also probably cause an AE for MRs. + * + * If the umem is used for an MR, the driver must ensure that the key remains in + * use such that it cannot be obtained by a new region until this region is + * fully deregistered (i.e., ibv_dereg_mr). If a driver needs to serialize with + * revoke calls, it can use dma_resv_lock. + * + * If successful, then the revoke callback may be called at any time and will + * also be called automatically upon ib_umem_release (serialized). The revoke + * callback will be called one time at most. + * + * Return: A pointer to ib_umem_dmabuf on success, or an ERR_PTR on failure. + */ +struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_revocable_and_lock(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access) +{ + const struct dma_buf_attach_ops *ops = + &ib_umem_dmabuf_attach_pinned_revocable_ops; + + return ib_umem_dmabuf_get_pinned_and_lock(device, device->dma_device, + offset, size, fd, access, + ops); +} +EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned_revocable_and_lock); + +void ib_umem_dmabuf_set_revoke_locked(struct ib_umem_dmabuf *umem_dmabuf, + void (*revoke)(void *priv), void *priv) +{ + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + + umem_dmabuf->pinned_revoke = revoke; + umem_dmabuf->private = priv; +} +EXPORT_SYMBOL(ib_umem_dmabuf_set_revoke_locked); + struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, unsigned long offset, size_t size, int fd, diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 1cc1d4077353..28075e617480 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -32,6 +32,7 @@ struct ib_umem_dmabuf { struct scatterlist *last_sg; unsigned long first_sg_offset; unsigned long last_sg_trim; + void (*pinned_revoke)(void *priv); void *private; u8 pinned : 1; u8 revoked : 1; @@ -137,6 +138,12 @@ struct 
ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, size_t size, int fd, int access); struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_revocable_and_lock(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access); +void ib_umem_dmabuf_set_revoke_locked(struct ib_umem_dmabuf *umem_dmabuf, + void (*revoke)(void *priv), void *priv); +struct ib_umem_dmabuf * ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, struct device *dma_device, unsigned long offset, size_t size, @@ -189,6 +196,18 @@ ib_umem_dmabuf_get_pinned(struct ib_device *device, unsigned long offset, return ERR_PTR(-EOPNOTSUPP); } +static inline struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_revocable_and_lock(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void +ib_umem_dmabuf_set_revoke_locked(struct ib_umem_dmabuf *umem_dmabuf, + void (*revoke)(void *priv), void *priv) {} + static inline struct ib_umem_dmabuf * ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, struct device *dma_device, -- cgit v1.2.3 From 3a0b171302eea1732a168e26db3b8461f51cc1f9 Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Thu, 5 Mar 2026 17:08:25 +0000 Subject: RDMA/umem: Add helpers for umem dmabuf revoke lock Added helpers to acquire and release the umem dmabuf revoke lock. The intent is to avoid the need for drivers to peek into the ib_umem_dmabuf internals to get the dma_resv_lock and bring us one step closer to abstracting ib_umem_dmabuf away from drivers in general. 
Signed-off-by: Jacob Moroni Link: https://patch.msgid.link/20260305170826.3803155-5-jmoroni@google.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_dmabuf.c | 16 ++++++++++++++++ include/rdma/ib_umem.h | 4 ++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 1a810dbdea9a..9deded3d58b5 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -334,6 +334,22 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, } EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned); +void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; + + dma_resv_lock(dmabuf->resv, NULL); +} +EXPORT_SYMBOL(ib_umem_dmabuf_revoke_lock); + +void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; + + dma_resv_unlock(dmabuf->resv); +} +EXPORT_SYMBOL(ib_umem_dmabuf_revoke_unlock); + void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) { struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 28075e617480..38414281a686 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -151,6 +151,8 @@ ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf); +void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf); +void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf); #else /* CONFIG_INFINIBAND_USER_MEM */ @@ -223,6 +225,8 @@ static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) } static 
inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { } static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) { } +static inline void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf) {} +static inline void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf) {} static inline void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) {} #endif /* CONFIG_INFINIBAND_USER_MEM */ -- cgit v1.2.3 From 27ab4f1e4909a674dfd03058fb9802cae2343a36 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Thu, 26 Feb 2026 12:25:54 +0530 Subject: soundwire: amd: refactor bandwidth calculation logic For current platforms(ACP6.3/ACP7.0/ACP7.1/ACP7.2), AMD SoundWire manager doesn't have banked registers for data port programming on Manager's side. Need to use fixed block offsets, hstart & hstop for manager ports. Earlier amd manager driver has support for 12 MHz as a bus clock frequency where frame rate is 48000 and number of bits is 500, frame shape as 50 x 10 with fixed block offset mapping based on port number. Got a new requirement to support 6 MHz as a bus clock frequency. For 6 MHz bus clock frequency amd manager driver needs to support two different frame shapes i.e number of bits as 250 with frame rate as 48000 and frame shape as 125 x 2 and for the second combination number of bits as 500 where frame rate is 24000 and frame shape is 50 x 10. Few SoundWire peripherals doesn't support 125 x 2 as a frame shape for 6 MHz bus clock frequency. They have explicit requirement for the frame shape. In this scenario, amd manager driver needs to use 50 x 10 as a frame shape where frame rate is 24000. Based on the platform and SoundWire topology for 6Mhz support frame shape will be decided which is part of SoundWire manager DisCo tables. For current platforms, amd manager driver supports only two bus clock frequencies(12 MHz & 6 MHz). Refactor bandwidth logic to support different bus clock frequencies. 
Signed-off-by: Vijendar Mukunda Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20260226065638.1251771-3-Vijendar.Mukunda@amd.com Signed-off-by: Vinod Koul --- drivers/soundwire/amd_manager.c | 57 ++++++++++++++++++++++++++++++++++++--- include/linux/soundwire/sdw_amd.h | 4 +++ 2 files changed, 57 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c index 61df5cecdccc..a3316efdf8ac 100644 --- a/drivers/soundwire/amd_manager.c +++ b/drivers/soundwire/amd_manager.c @@ -467,12 +467,16 @@ static u32 amd_sdw_read_ping_status(struct sdw_bus *bus) static int amd_sdw_compute_params(struct sdw_bus *bus, struct sdw_stream_runtime *stream) { + struct amd_sdw_manager *amd_manager = to_amd_sdw(bus); struct sdw_transport_data t_data = {0}; struct sdw_master_runtime *m_rt; struct sdw_port_runtime *p_rt; struct sdw_bus_params *b_params = &bus->params; int port_bo, hstart, hstop, sample_int; - unsigned int rate, bps; + unsigned int rate, bps, channels; + unsigned int stream_slot_size, max_slots; + static unsigned int next_offset[AMD_SDW_MAX_MANAGER_COUNT] = {1}; + unsigned int inst_id = amd_manager->instance; port_bo = 0; hstart = 1; @@ -483,11 +487,51 @@ static int amd_sdw_compute_params(struct sdw_bus *bus, struct sdw_stream_runtime list_for_each_entry(m_rt, &bus->m_rt_list, bus_node) { rate = m_rt->stream->params.rate; bps = m_rt->stream->params.bps; + channels = m_rt->stream->params.ch_count; sample_int = (bus->params.curr_dr_freq / rate); + + /* Compute slots required for this stream dynamically */ + stream_slot_size = bps * channels; + list_for_each_entry(p_rt, &m_rt->port_list, port_node) { - port_bo = (p_rt->num * 64) + 1; - dev_dbg(bus->dev, "p_rt->num=%d hstart=%d hstop=%d port_bo=%d\n", - p_rt->num, hstart, hstop, port_bo); + if (p_rt->num >= amd_manager->max_ports) { + dev_err(bus->dev, "Port %d exceeds max ports %d\n", + p_rt->num, amd_manager->max_ports); + return -EINVAL; 
+ } + + if (!amd_manager->port_offset_map[p_rt->num]) { + /* + * port block offset calculation for 6MHz bus clock frequency with + * different frame sizes 50 x 10 and 125 x 2 + */ + if (bus->params.curr_dr_freq == 12000000) { + max_slots = bus->params.row * (bus->params.col - 1); + if (next_offset[inst_id] + stream_slot_size <= + (max_slots - 1)) { + amd_manager->port_offset_map[p_rt->num] = + next_offset[inst_id]; + next_offset[inst_id] += stream_slot_size; + } else { + dev_err(bus->dev, + "No space for port %d\n", p_rt->num); + return -ENOMEM; + } + } else { + /* + * port block offset calculation for 12MHz bus clock + * frequency + */ + amd_manager->port_offset_map[p_rt->num] = + (p_rt->num * 64) + 1; + } + } + port_bo = amd_manager->port_offset_map[p_rt->num]; + dev_dbg(bus->dev, + "Port=%d hstart=%d hstop=%d port_bo=%d slots=%d max_ports=%d\n", + p_rt->num, hstart, hstop, port_bo, stream_slot_size, + amd_manager->max_ports); + sdw_fill_xport_params(&p_rt->transport_params, p_rt->num, false, SDW_BLK_GRP_CNT_1, sample_int, port_bo, port_bo >> 8, hstart, hstop, @@ -1079,6 +1123,11 @@ static int amd_sdw_manager_probe(struct platform_device *pdev) default: return -EINVAL; } + amd_manager->max_ports = amd_manager->num_dout_ports + amd_manager->num_din_ports; + amd_manager->port_offset_map = devm_kcalloc(dev, amd_manager->max_ports, + sizeof(int), GFP_KERNEL); + if (!amd_manager->port_offset_map) + return -ENOMEM; prop = &amd_manager->bus.prop; prop->mclk_freq = AMD_SDW_BUS_BASE_FREQ; diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index fe31773d5210..470360a2723c 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -66,8 +66,10 @@ struct sdw_amd_dai_runtime { * @status: peripheral devices status array * @num_din_ports: number of input ports * @num_dout_ports: number of output ports + * @max_ports: total number of input ports and output ports * @cols_index: Column index in frame shape * 
@rows_index: Rows index in frame shape + * @port_offset_map: dynamic array to map port block offset * @instance: SoundWire manager instance * @quirks: SoundWire manager quirks * @wake_en_mask: wake enable mask per SoundWire manager @@ -92,10 +94,12 @@ struct amd_sdw_manager { int num_din_ports; int num_dout_ports; + int max_ports; int cols_index; int rows_index; + int *port_offset_map; u32 instance; u32 quirks; u32 wake_en_mask; -- cgit v1.2.3 From 493740d790cce709d285cd1022d16d05439b7d5b Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Fri, 6 Mar 2026 11:31:54 +0530 Subject: drm/buddy: Improve offset-aligned allocation handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Large alignment requests previously forced the buddy allocator to search by alignment order, which often caused higher-order free blocks to be split even when a suitably aligned smaller region already existed within them. This led to excessive fragmentation, especially for workloads requesting small sizes with large alignment constraints. This change prioritizes the requested allocation size during the search and uses an augmented RB-tree field (subtree_max_alignment) to efficiently locate free blocks that satisfy both size and offset-alignment requirements. As a result, the allocator can directly select an aligned sub-region without splitting larger blocks unnecessarily. A practical example is the VKCTS test dEQP-VK.memory.allocation.basic.size_8KiB.reverse.count_4000, which repeatedly allocates 8 KiB buffers with a 256 KiB alignment. Previously, such allocations caused large blocks to be split aggressively, despite smaller aligned regions being sufficient. With this change, those aligned regions are reused directly, significantly reducing fragmentation. This improvement is visible in the amdgpu VRAM buddy allocator state (/sys/kernel/debug/dri/1/amdgpu_vram_mm). 
After the change, higher-order blocks are preserved and the number of low-order fragments is substantially reduced. Before: order- 5 free: 1936 MiB, blocks: 15490 order- 4 free: 967 MiB, blocks: 15486 order- 3 free: 483 MiB, blocks: 15485 order- 2 free: 241 MiB, blocks: 15486 order- 1 free: 241 MiB, blocks: 30948 After: order- 5 free: 493 MiB, blocks: 3941 order- 4 free: 246 MiB, blocks: 3943 order- 3 free: 123 MiB, blocks: 4101 order- 2 free: 61 MiB, blocks: 4101 order- 1 free: 61 MiB, blocks: 8018 By avoiding unnecessary splits, this change improves allocator efficiency and helps maintain larger contiguous free regions under heavy offset-aligned allocation workloads. v2:(Matthew) - Update augmented information along the path to the inserted node. v3: - Move the patch to gpu/buddy.c file. v4:(Matthew) - Use the helper instead of calling _ffs directly - Remove gpu_buddy_block_order(block) >= order check and drop order - Drop !node check as all callers handle this already - Return larger than any other possible alignment for __ffs64(0) - Replace __ffs with __ffs64 v5:(Matthew) - Drop subtree_max_alignment initialization at gpu_block_alloc() Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Reviewed-by: Matthew Auld Link: https://patch.msgid.link/20260306060155.2114-1-Arunpravin.PaneerSelvam@amd.com --- drivers/gpu/buddy.c | 272 ++++++++++++++++++++++++++++++++++++++-------- include/linux/gpu_buddy.h | 2 + 2 files changed, 229 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/drivers/gpu/buddy.c b/drivers/gpu/buddy.c index da5a1222f46b..52686672e99f 100644 --- a/drivers/gpu/buddy.c +++ b/drivers/gpu/buddy.c @@ -53,6 +53,25 @@ gpu_buddy_block_is_split(struct gpu_buddy_block *block) return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT; } +static unsigned int gpu_buddy_block_offset_alignment(struct gpu_buddy_block *block) +{ + u64 offset = gpu_buddy_block_offset(block); + + if (!offset) + /* + * __ffs64(0) is undefined; offset 0 is 
maximally aligned, so return + * a value greater than any possible alignment. + */ + return 64 + 1; + + return __ffs64(offset); +} + +RB_DECLARE_CALLBACKS_MAX(static, gpu_buddy_augment_cb, + struct gpu_buddy_block, rb, + unsigned int, subtree_max_alignment, + gpu_buddy_block_offset_alignment); + static struct gpu_buddy_block *gpu_block_alloc(struct gpu_buddy *mm, struct gpu_buddy_block *parent, unsigned int order, @@ -106,26 +125,42 @@ static bool rbtree_is_empty(struct rb_root *root) return RB_EMPTY_ROOT(root); } -static bool gpu_buddy_block_offset_less(const struct gpu_buddy_block *block, - const struct gpu_buddy_block *node) -{ - return gpu_buddy_block_offset(block) < gpu_buddy_block_offset(node); -} - -static bool rbtree_block_offset_less(struct rb_node *block, - const struct rb_node *node) -{ - return gpu_buddy_block_offset_less(rbtree_get_free_block(block), - rbtree_get_free_block(node)); -} - static void rbtree_insert(struct gpu_buddy *mm, struct gpu_buddy_block *block, enum gpu_buddy_free_tree tree) { - rb_add(&block->rb, - &mm->free_trees[tree][gpu_buddy_block_order(block)], - rbtree_block_offset_less); + struct rb_node **link, *parent = NULL; + unsigned int block_alignment, order; + struct gpu_buddy_block *node; + struct rb_root *root; + + order = gpu_buddy_block_order(block); + block_alignment = gpu_buddy_block_offset_alignment(block); + + root = &mm->free_trees[tree][order]; + link = &root->rb_node; + + while (*link) { + parent = *link; + node = rbtree_get_free_block(parent); + /* + * Manual augmentation update during insertion traversal. Required + * because rb_insert_augmented() only calls rotate callback during + * rotations. This ensures all ancestors on the insertion path have + * correct subtree_max_alignment values. 
+ */ + if (node->subtree_max_alignment < block_alignment) + node->subtree_max_alignment = block_alignment; + + if (gpu_buddy_block_offset(block) < gpu_buddy_block_offset(node)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + block->subtree_max_alignment = block_alignment; + rb_link_node(&block->rb, parent, link); + rb_insert_augmented(&block->rb, root, &gpu_buddy_augment_cb); } static void rbtree_remove(struct gpu_buddy *mm, @@ -138,7 +173,7 @@ static void rbtree_remove(struct gpu_buddy *mm, tree = get_block_tree(block); root = &mm->free_trees[tree][order]; - rb_erase(&block->rb, root); + rb_erase_augmented(&block->rb, root, &gpu_buddy_augment_cb); RB_CLEAR_NODE(&block->rb); } @@ -811,6 +846,127 @@ err_undo: return ERR_PTR(err); } +static bool +gpu_buddy_can_offset_align(u64 size, u64 min_block_size) +{ + return size < min_block_size && is_power_of_2(size); +} + +static bool gpu_buddy_subtree_can_satisfy(struct rb_node *node, + unsigned int alignment) +{ + struct gpu_buddy_block *block; + + block = rbtree_get_free_block(node); + return block->subtree_max_alignment >= alignment; +} + +static struct gpu_buddy_block * +gpu_buddy_find_block_aligned(struct gpu_buddy *mm, + enum gpu_buddy_free_tree tree, + unsigned int order, + unsigned int alignment, + unsigned long flags) +{ + struct rb_root *root = &mm->free_trees[tree][order]; + struct rb_node *rb = root->rb_node; + + while (rb) { + struct gpu_buddy_block *block = rbtree_get_free_block(rb); + struct rb_node *left_node = rb->rb_left, *right_node = rb->rb_right; + + if (right_node) { + if (gpu_buddy_subtree_can_satisfy(right_node, alignment)) { + rb = right_node; + continue; + } + } + + if (gpu_buddy_block_offset_alignment(block) >= alignment) + return block; + + if (left_node) { + if (gpu_buddy_subtree_can_satisfy(left_node, alignment)) { + rb = left_node; + continue; + } + } + + break; + } + + return NULL; +} + +static struct gpu_buddy_block * +gpu_buddy_offset_aligned_allocation(struct 
gpu_buddy *mm, + u64 size, + u64 min_block_size, + unsigned long flags) +{ + struct gpu_buddy_block *block = NULL; + unsigned int order, tmp, alignment; + struct gpu_buddy_block *buddy; + enum gpu_buddy_free_tree tree; + unsigned long pages; + int err; + + alignment = ilog2(min_block_size); + pages = size >> ilog2(mm->chunk_size); + order = fls(pages) - 1; + + tree = (flags & GPU_BUDDY_CLEAR_ALLOCATION) ? + GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE; + + for (tmp = order; tmp <= mm->max_order; ++tmp) { + block = gpu_buddy_find_block_aligned(mm, tree, tmp, + alignment, flags); + if (!block) { + tree = (tree == GPU_BUDDY_CLEAR_TREE) ? + GPU_BUDDY_DIRTY_TREE : GPU_BUDDY_CLEAR_TREE; + block = gpu_buddy_find_block_aligned(mm, tree, tmp, + alignment, flags); + } + + if (block) + break; + } + + if (!block) + return ERR_PTR(-ENOSPC); + + while (gpu_buddy_block_order(block) > order) { + struct gpu_buddy_block *left, *right; + + err = split_block(mm, block); + if (unlikely(err)) + goto err_undo; + + left = block->left; + right = block->right; + + if (gpu_buddy_block_offset_alignment(right) >= alignment) + block = right; + else + block = left; + } + + return block; + +err_undo: + /* + * We really don't want to leave around a bunch of split blocks, since + * bigger is better, so make sure we merge everything back before we + * free the allocated blocks. 
+ */ + buddy = __get_buddy(block); + if (buddy && + (gpu_buddy_block_is_free(block) && + gpu_buddy_block_is_free(buddy))) + __gpu_buddy_free(mm, block, false); + return ERR_PTR(err); +} + static int __alloc_range(struct gpu_buddy *mm, struct list_head *dfs, u64 start, u64 size, @@ -1080,6 +1236,7 @@ EXPORT_SYMBOL(gpu_buddy_block_trim); static struct gpu_buddy_block * __gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, + u64 size, u64 min_block_size, unsigned int order, unsigned long flags) { @@ -1087,6 +1244,11 @@ __gpu_buddy_alloc_blocks(struct gpu_buddy *mm, /* Allocate traversing within the range */ return __gpu_buddy_alloc_range_bias(mm, start, end, order, flags); + else if (size < min_block_size) + /* Allocate from an offset-aligned region without size rounding */ + return gpu_buddy_offset_aligned_allocation(mm, size, + min_block_size, + flags); else /* Allocate from freetree */ return alloc_from_freetree(mm, order, flags); @@ -1158,8 +1320,11 @@ int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION) { size = roundup_pow_of_two(size); min_block_size = size; - /* Align size value to min_block_size */ - } else if (!IS_ALIGNED(size, min_block_size)) { + /* + * Normalize the requested size to min_block_size for regular allocations. + * Offset-aligned allocations intentionally skip size rounding. + */ + } else if (!gpu_buddy_can_offset_align(size, min_block_size)) { size = round_up(size, min_block_size); } @@ -1179,43 +1344,60 @@ int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, do { order = min(order, (unsigned int)fls(pages) - 1); BUG_ON(order > mm->max_order); - BUG_ON(order < min_order); + /* + * Regular allocations must not allocate blocks smaller than min_block_size. + * Offset-aligned allocations deliberately bypass this constraint. 
+ */ + BUG_ON(size >= min_block_size && order < min_order); do { + unsigned int fallback_order; + block = __gpu_buddy_alloc_blocks(mm, start, end, + size, + min_block_size, order, flags); if (!IS_ERR(block)) break; - if (order-- == min_order) { - /* Try allocation through force merge method */ - if (mm->clear_avail && - !__force_merge(mm, start, end, min_order)) { - block = __gpu_buddy_alloc_blocks(mm, start, - end, - min_order, - flags); - if (!IS_ERR(block)) { - order = min_order; - break; - } - } + if (size < min_block_size) { + fallback_order = order; + } else if (order == min_order) { + fallback_order = min_order; + } else { + order--; + continue; + } - /* - * Try contiguous block allocation through - * try harder method. - */ - if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION && - !(flags & GPU_BUDDY_RANGE_ALLOCATION)) - return __alloc_contig_try_harder(mm, - original_size, - original_min_size, - blocks); - err = -ENOSPC; - goto err_free; + /* Try allocation through force merge method */ + if (mm->clear_avail && + !__force_merge(mm, start, end, fallback_order)) { + block = __gpu_buddy_alloc_blocks(mm, start, + end, + size, + min_block_size, + fallback_order, + flags); + if (!IS_ERR(block)) { + order = fallback_order; + break; + } } + + /* + * Try contiguous block allocation through + * try harder method. 
+ */ + if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION && + !(flags & GPU_BUDDY_RANGE_ALLOCATION)) + return __alloc_contig_try_harder(mm, + original_size, + original_min_size, + blocks); + err = -ENOSPC; + goto err_free; } while (1); mark_allocated(mm, block); diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index f1fb6eff604a..5fa917ba5450 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -11,6 +11,7 @@ #include #include #include +#include /** * GPU_BUDDY_RANGE_ALLOCATION - Allocate within a specific address range @@ -128,6 +129,7 @@ struct gpu_buddy_block { }; /* private: */ struct list_head tmp_link; + unsigned int subtree_max_alignment; }; /* Order-zero must be at least SZ_4K */ -- cgit v1.2.3 From 5f88899ec7531e1680b1003f32584d7da5922902 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Tue, 3 Mar 2026 10:25:00 +0000 Subject: dmaengine: Document cyclic transfer for dmaengine_prep_peripheral_dma_vec() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document that the DMA_PREP_REPEAT flag can be used with the dmaengine_prep_peripheral_dma_vec() to mark a transfer as cyclic similar to dmaengine_prep_dma_cyclic(). 
Reviewed-by: Frank Li Signed-off-by: Nuno Sá Link: https://patch.msgid.link/20260303-axi-dac-cyclic-support-v2-1-0db27b4be95a@analog.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 99efe2b9b4ea..b3d251c9734e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -996,7 +996,8 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single( * @vecs: The array of DMA vectors that should be transferred * @nents: The number of DMA vectors in the array * @dir: Specifies the direction of the data transfer - * @flags: DMA engine flags + * @flags: DMA engine flags - DMA_PREP_REPEAT can be used to mark a cyclic + * DMA transfer */ static inline struct dma_async_tx_descriptor *dmaengine_prep_peripheral_dma_vec( struct dma_chan *chan, const struct dma_vec *vecs, size_t nents, -- cgit v1.2.3 From 70fbea9f1a44d80a4c573c225f119022d6e21360 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:12:13 -0800 Subject: dmaengine: ti-cppi5: fix all kernel-doc warnings Add missing struct member, function parameter, and enum value descriptions. Add missing function Returns: sections. Use correct function name in kernel-doc to avoid mismatched prototypes. 
These repair all kernel-doc warnings in ti-cppi5.h: Warning: include/linux/dma/ti-cppi5.h:27 struct member 'pkt_info1' not described in 'cppi5_desc_hdr_t' Warning: include/linux/dma/ti-cppi5.h:27 struct member 'pkt_info2' not described in 'cppi5_desc_hdr_t' Warning: include/linux/dma/ti-cppi5.h:50 struct member 'epib' not described in 'cppi5_host_desc_t' Warning: include/linux/dma/ti-cppi5.h:142 struct member 'epib' not described in 'cppi5_monolithic_desc_t' Warning: include/linux/dma/ti-cppi5.h:413 function parameter 'pkt_len' not described in 'cppi5_hdesc_set_pktlen' Warning: include/linux/dma/ti-cppi5.h:436 function parameter 'ps_flags' not described in 'cppi5_hdesc_set_psflags' Warning: include/linux/dma/ti-cppi5.h:509 function parameter 'hbuf_desc' not described in 'cppi5_hdesc_link_hbdesc' Warning: include/linux/dma/ti-cppi5.h:839 struct member 'dicnt3' not described in 'cppi5_tr_type15_t' Warning: include/linux/dma/ti-cppi5.h:970 function parameter 'desc_hdr' not described in 'cppi5_trdesc_init' Warning: include/linux/dma/ti-cppi5.h:184 No description found for return value of 'cppi5_desc_is_tdcm' Warning: include/linux/dma/ti-cppi5.h:198 No description found for return value of 'cppi5_desc_get_type' Warning: include/linux/dma/ti-cppi5.h:210 No description found for return value of 'cppi5_desc_get_errflags' Warning: include/linux/dma/ti-cppi5.h:448 expecting prototype for cppi5_hdesc_get_errflags(). Prototype was for cppi5_hdesc_get_pkttype() instead Warning: include/linux/dma/ti-cppi5.h:460 expecting prototype for cppi5_hdesc_get_errflags(). Prototype was for cppi5_hdesc_set_pkttype() instead Warning: include/linux/dma/ti-cppi5.h:1053 expecting prototype for cppi5_tr_cflag_set(). 
Prototype was for cppi5_tr_csf_set() instead Warning: include/linux/dma/ti-cppi5.h:651 Enum value 'CPPI5_TR_TYPE_MAX' not described in enum 'cppi5_tr_types' Warning: include/linux/dma/ti-cppi5.h:676 Enum value 'CPPI5_TR_EVENT_SIZE_MAX' not described in enum 'cppi5_tr_event_size' Warning: include/linux/dma/ti-cppi5.h:693 Enum value 'CPPI5_TR_TRIGGER_MAX' not described in enum 'cppi5_tr_trigger' Warning: include/linux/dma/ti-cppi5.h:714 Enum value 'CPPI5_TR_TRIGGER_TYPE_MAX' not described in enum 'cppi5_tr_trigger_type' Warning: include/linux/dma/ti-cppi5.h:890 Enum value 'CPPI5_TR_RESPONSE_STATUS_MAX' not described in enum 'cppi5_tr_resp_status_type' Warning: include/linux/dma/ti-cppi5.h:906 Enum value 'CPPI5_TR_RESPONSE_STATUS_SUBMISSION_MAX' not described in enum 'cppi5_tr_resp_status_submission' Warning: include/linux/dma/ti-cppi5.h:934 Enum value 'CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_MAX' not described in enum 'cppi5_tr_resp_status_unsupported' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301011213.3063688-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dma/ti-cppi5.h | 53 +++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/dma/ti-cppi5.h b/include/linux/dma/ti-cppi5.h index c53c0f6e3b1a..3fe19b75ddf7 100644 --- a/include/linux/dma/ti-cppi5.h +++ b/include/linux/dma/ti-cppi5.h @@ -16,8 +16,8 @@ * struct cppi5_desc_hdr_t - Descriptor header, present in all types of * descriptors * @pkt_info0: Packet info word 0 (n/a in Buffer desc) - * @pkt_info0: Packet info word 1 (n/a in Buffer desc) - * @pkt_info0: Packet info word 2 (n/a in Buffer desc) + * @pkt_info1: Packet info word 1 (n/a in Buffer desc) + * @pkt_info2: Packet info word 2 (n/a in Buffer desc) * @src_dst_tag: Packet info word 3 (n/a in Buffer desc) */ struct cppi5_desc_hdr_t { @@ -35,7 +35,7 @@ struct cppi5_desc_hdr_t { * @buf_info1: word 8: Buffer valid data length * 
@org_buf_len: word 9: Original buffer length * @org_buf_ptr: word 10/11: Original buffer pointer - * @epib[0]: Extended Packet Info Data (optional, 4 words), and/or + * @epib: Extended Packet Info Data (optional, 4 words), and/or * Protocol Specific Data (optional, 0-128 bytes in * multiples of 4), and/or * Other Software Data (0-N bytes, optional) @@ -132,7 +132,7 @@ struct cppi5_desc_epib_t { /** * struct cppi5_monolithic_desc_t - Monolithic-mode packet descriptor * @hdr: Descriptor header - * @epib[0]: Extended Packet Info Data (optional, 4 words), and/or + * @epib: Extended Packet Info Data (optional, 4 words), and/or * Protocol Specific Data (optional, 0-128 bytes in * multiples of 4), and/or * Other Software Data (0-N bytes, optional) @@ -179,7 +179,7 @@ static inline void cppi5_desc_dump(void *desc, u32 size) * cppi5_desc_is_tdcm - check if the paddr indicates Teardown Complete Message * @paddr: Physical address of the packet popped from the ring * - * Returns true if the address indicates TDCM + * Returns: true if the address indicates TDCM */ static inline bool cppi5_desc_is_tdcm(dma_addr_t paddr) { @@ -190,7 +190,7 @@ static inline bool cppi5_desc_is_tdcm(dma_addr_t paddr) * cppi5_desc_get_type - get descriptor type * @desc_hdr: packet descriptor/TR header * - * Returns descriptor type: + * Returns: descriptor type: * CPPI5_INFO0_DESC_TYPE_VAL_HOST * CPPI5_INFO0_DESC_TYPE_VAL_MONO * CPPI5_INFO0_DESC_TYPE_VAL_TR @@ -205,7 +205,7 @@ static inline u32 cppi5_desc_get_type(struct cppi5_desc_hdr_t *desc_hdr) * cppi5_desc_get_errflags - get Error Flags from Desc * @desc_hdr: packet/TR descriptor header * - * Returns Error Flags from Packet/TR Descriptor + * Returns: Error Flags from Packet/TR Descriptor */ static inline u32 cppi5_desc_get_errflags(struct cppi5_desc_hdr_t *desc_hdr) { @@ -307,7 +307,7 @@ static inline void cppi5_desc_set_tags_ids(struct cppi5_desc_hdr_t *desc_hdr, * @psdata_size: PSDATA size * @sw_data_size: SWDATA size * - * Returns required 
Host Packet Descriptor size + * Returns: required Host Packet Descriptor size * 0 - if PSDATA > CPPI5_INFO0_HDESC_PSDATA_MAX_SIZE */ static inline u32 cppi5_hdesc_calc_size(bool epib, u32 psdata_size, @@ -381,6 +381,8 @@ cppi5_hdesc_update_psdata_size(struct cppi5_host_desc_t *desc, u32 psdata_size) /** * cppi5_hdesc_get_psdata_size - get PSdata size in bytes * @desc: Host packet descriptor + * + * Returns: PSdata size in bytes */ static inline u32 cppi5_hdesc_get_psdata_size(struct cppi5_host_desc_t *desc) { @@ -398,7 +400,7 @@ static inline u32 cppi5_hdesc_get_psdata_size(struct cppi5_host_desc_t *desc) * cppi5_hdesc_get_pktlen - get Packet Length from HDesc * @desc: Host packet descriptor * - * Returns Packet Length from Host Packet Descriptor + * Returns: Packet Length from Host Packet Descriptor */ static inline u32 cppi5_hdesc_get_pktlen(struct cppi5_host_desc_t *desc) { @@ -408,6 +410,7 @@ static inline u32 cppi5_hdesc_get_pktlen(struct cppi5_host_desc_t *desc) /** * cppi5_hdesc_set_pktlen - set Packet Length in HDesc * @desc: Host packet descriptor + * @pkt_len: Packet length to set */ static inline void cppi5_hdesc_set_pktlen(struct cppi5_host_desc_t *desc, u32 pkt_len) @@ -420,7 +423,7 @@ static inline void cppi5_hdesc_set_pktlen(struct cppi5_host_desc_t *desc, * cppi5_hdesc_get_psflags - get Protocol Specific Flags from HDesc * @desc: Host packet descriptor * - * Returns Protocol Specific Flags from Host Packet Descriptor + * Returns: Protocol Specific Flags from Host Packet Descriptor */ static inline u32 cppi5_hdesc_get_psflags(struct cppi5_host_desc_t *desc) { @@ -431,6 +434,7 @@ static inline u32 cppi5_hdesc_get_psflags(struct cppi5_host_desc_t *desc) /** * cppi5_hdesc_set_psflags - set Protocol Specific Flags in HDesc * @desc: Host packet descriptor + * @ps_flags: Protocol Specific flags to set */ static inline void cppi5_hdesc_set_psflags(struct cppi5_host_desc_t *desc, u32 ps_flags) @@ -442,8 +446,10 @@ static inline void 
cppi5_hdesc_set_psflags(struct cppi5_host_desc_t *desc, } /** - * cppi5_hdesc_get_errflags - get Packet Type from HDesc + * cppi5_hdesc_get_pkttype - get Packet Type from HDesc * @desc: Host packet descriptor + * + * Returns: Packet type */ static inline u32 cppi5_hdesc_get_pkttype(struct cppi5_host_desc_t *desc) { @@ -452,7 +458,7 @@ static inline u32 cppi5_hdesc_get_pkttype(struct cppi5_host_desc_t *desc) } /** - * cppi5_hdesc_get_errflags - set Packet Type in HDesc + * cppi5_hdesc_set_pkttype - set Packet Type in HDesc * @desc: Host packet descriptor * @pkt_type: Packet Type */ @@ -501,7 +507,7 @@ static inline void cppi5_hdesc_reset_to_original(struct cppi5_host_desc_t *desc) /** * cppi5_hdesc_link_hbdesc - link Host Buffer Descriptor to HDesc * @desc: Host Packet Descriptor - * @buf_desc: Host Buffer Descriptor physical address + * @hbuf_desc: Host Buffer Descriptor physical address * * add and link Host Buffer Descriptor to HDesc */ @@ -527,7 +533,7 @@ static inline void cppi5_hdesc_reset_hbdesc(struct cppi5_host_desc_t *desc) * cppi5_hdesc_epib_present - check if EPIB present * @desc_hdr: packet descriptor/TR header * - * Returns true if EPIB present in the packet + * Returns: true if EPIB present in the packet */ static inline bool cppi5_hdesc_epib_present(struct cppi5_desc_hdr_t *desc_hdr) { @@ -538,7 +544,7 @@ static inline bool cppi5_hdesc_epib_present(struct cppi5_desc_hdr_t *desc_hdr) * cppi5_hdesc_get_psdata - Get pointer on PSDATA * @desc: Host packet descriptor * - * Returns pointer on PSDATA in HDesc. + * Returns: pointer on PSDATA in HDesc. * NULL - if ps_data placed at the start of data buffer. */ static inline void *cppi5_hdesc_get_psdata(struct cppi5_host_desc_t *desc) @@ -568,7 +574,7 @@ static inline void *cppi5_hdesc_get_psdata(struct cppi5_host_desc_t *desc) * cppi5_hdesc_get_swdata - Get pointer on swdata * @desc: Host packet descriptor * - * Returns pointer on SWDATA in HDesc. + * Returns: pointer on SWDATA in HDesc. * NOTE. 
It's caller responsibility to be sure hdesc actually has swdata. */ static inline void *cppi5_hdesc_get_swdata(struct cppi5_host_desc_t *desc) @@ -648,6 +654,7 @@ enum cppi5_tr_types { CPPI5_TR_TYPE11, /* type12-14: Reserved */ CPPI5_TR_TYPE15 = 15, + /* private: */ CPPI5_TR_TYPE_MAX }; @@ -673,6 +680,7 @@ enum cppi5_tr_event_size { CPPI5_TR_EVENT_SIZE_ICNT1_DEC, CPPI5_TR_EVENT_SIZE_ICNT2_DEC, CPPI5_TR_EVENT_SIZE_ICNT3_DEC, + /* private: */ CPPI5_TR_EVENT_SIZE_MAX }; @@ -690,6 +698,7 @@ enum cppi5_tr_trigger { CPPI5_TR_TRIGGER_GLOBAL0, CPPI5_TR_TRIGGER_GLOBAL1, CPPI5_TR_TRIGGER_LOCAL_EVENT, + /* private: */ CPPI5_TR_TRIGGER_MAX }; @@ -711,6 +720,7 @@ enum cppi5_tr_trigger_type { CPPI5_TR_TRIGGER_TYPE_ICNT2_DEC, CPPI5_TR_TRIGGER_TYPE_ICNT3_DEC, CPPI5_TR_TRIGGER_TYPE_ALL, + /* private: */ CPPI5_TR_TRIGGER_TYPE_MAX }; @@ -815,7 +825,7 @@ struct cppi5_tr_type3_t { * destination * @dicnt1: Total loop iteration count for level 1 for destination * @dicnt2: Total loop iteration count for level 2 for destination - * @sicnt3: Total loop iteration count for level 3 (outermost) for + * @dicnt3: Total loop iteration count for level 3 (outermost) for * destination */ struct cppi5_tr_type15_t { @@ -887,6 +897,7 @@ enum cppi5_tr_resp_status_type { CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_ERR, CPPI5_TR_RESPONSE_STATUS_TRANSFER_EXCEPTION, CPPI5_TR_RESPONSE_STATUS__TEARDOWN_FLUSH, + /* private: */ CPPI5_TR_RESPONSE_STATUS_MAX }; @@ -903,6 +914,7 @@ enum cppi5_tr_resp_status_submission { CPPI5_TR_RESPONSE_STATUS_SUBMISSION_ICNT0, CPPI5_TR_RESPONSE_STATUS_SUBMISSION_FIFO_FULL, CPPI5_TR_RESPONSE_STATUS_SUBMISSION_OWN, + /* private: */ CPPI5_TR_RESPONSE_STATUS_SUBMISSION_MAX }; @@ -931,6 +943,7 @@ enum cppi5_tr_resp_status_unsupported { CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_DFMT, CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_SECTR, CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_AMODE_SPECIFIC, + /* private: */ CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_MAX }; @@ -939,7 +952,7 @@ enum cppi5_tr_resp_status_unsupported 
{ * @tr_count: number of TR records * @tr_size: Nominal size of TR record (max) [16, 32, 64, 128] * - * Returns required TR Descriptor size + * Returns: required TR Descriptor size */ static inline size_t cppi5_trdesc_calc_size(u32 tr_count, u32 tr_size) { @@ -955,7 +968,7 @@ static inline size_t cppi5_trdesc_calc_size(u32 tr_count, u32 tr_size) /** * cppi5_trdesc_init - Init TR Descriptor - * @desc: TR Descriptor + * @desc_hdr: TR Descriptor * @tr_count: number of TR records * @tr_size: Nominal size of TR record (max) [16, 32, 64, 128] * @reload_idx: Absolute index to jump to on the 2nd and following passes @@ -1044,7 +1057,7 @@ static inline void cppi5_tr_set_trigger(cppi5_tr_flags_t *flags, } /** - * cppi5_tr_cflag_set - Update the Configuration specific flags + * cppi5_tr_csf_set - Update the Configuration specific flags * @flags: Pointer to the TR's flags * @csf: Configuration specific flags * -- cgit v1.2.3 From 7b84a00dd3528d980f1f35fd2c5015f72dc3f62a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:12:03 -0800 Subject: dmaengine: qcom: qcom-gpi-dma.h: fix all kernel-doc warnings Add missing enum descriptions and spell one struct member correctly to avoid kernel-doc warnings: Warning: include/linux/dma/qcom-gpi-dma.h:15 Enum value 'SPI_TX' not described in enum 'spi_transfer_cmd' Warning: include/linux/dma/qcom-gpi-dma.h:15 Enum value 'SPI_RX' not described in enum 'spi_transfer_cmd' Warning: include/linux/dma/qcom-gpi-dma.h:15 Enum value 'SPI_DUPLEX' not described in enum 'spi_transfer_cmd' Warning: include/linux/dma/qcom-gpi-dma.h:80 struct member 'multi_msg' not described in 'gpi_i2c_config' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301011203.3062658-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dma/qcom-gpi-dma.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h index 
6680dd1a43c6..332be28427e4 100644 --- a/include/linux/dma/qcom-gpi-dma.h +++ b/include/linux/dma/qcom-gpi-dma.h @@ -8,6 +8,9 @@ /** * enum spi_transfer_cmd - spi transfer commands + * @SPI_TX: SPI peripheral TX command + * @SPI_RX: SPI peripheral RX command + * @SPI_DUPLEX: SPI peripheral Duplex command */ enum spi_transfer_cmd { SPI_TX = 1, @@ -64,7 +67,7 @@ enum i2c_op { * @set_config: set peripheral config * @rx_len: receive length for buffer * @op: i2c cmd - * @muli-msg: is part of multi i2c r-w msgs + * @multi_msg: is part of multi i2c r-w msgs */ struct gpi_i2c_config { u8 set_config; -- cgit v1.2.3 From 4d94ce88c77e74830a5b9d02ecb8286039ffa494 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Feb 2026 09:17:00 +1100 Subject: VFS: unexport lock_rename(), lock_rename_child(), unlock_rename() These three functions are now only used in namei.c, so they don't need to be exported. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20260224222542.3458677-16-neilb@ownmail.net Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 7 +++++++ fs/namei.c | 9 +++------ include/linux/namei.h | 3 --- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 1dd31ab417a2..d02aa57e4477 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1368,3 +1368,10 @@ lifetime, consider using inode_set_cached_link() instead. lookup_one_qstr_excl() is no longer exported - use start_creating() or similar. +--- + +**mandatory** + +lock_rename(), lock_rename_child(), unlock_rename() are no +longer available. Use start_renaming() or similar. 
+ diff --git a/fs/namei.c b/fs/namei.c index a5daa62399d7..77189335bbcc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3775,7 +3775,7 @@ static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) /* * p1 and p2 should be directories on the same fs. */ -struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) +static struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); @@ -3785,12 +3785,11 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) mutex_lock(&p1->d_sb->s_vfs_rename_mutex); return lock_two_directories(p1, p2); } -EXPORT_SYMBOL(lock_rename); /* * c1 and p2 should be on the same fs. */ -struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) +static struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) { if (READ_ONCE(c1->d_parent) == p2) { /* @@ -3827,9 +3826,8 @@ struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } -EXPORT_SYMBOL(lock_rename_child); -void unlock_rename(struct dentry *p1, struct dentry *p2) +static void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { @@ -3837,7 +3835,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } -EXPORT_SYMBOL(unlock_rename); /** * __start_renaming - lookup and lock names for rename diff --git a/include/linux/namei.h b/include/linux/namei.h index c7a7288cdd25..2ad6dd9987b9 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -165,9 +165,6 @@ extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); -extern struct dentry *lock_rename(struct dentry *, struct dentry *); -extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); -extern void unlock_rename(struct dentry *, 
struct dentry *); int start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last); int start_renaming_dentry(struct renamedata *rd, int lookup_flags, -- cgit v1.2.3 From 5645f805927c9bd4443e6143e487ef3ffea34aaf Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 6 Mar 2026 14:22:00 +0100 Subject: gpio: Document line value semantics It is not clearly documented that the GPIO driver API expects the driver to get/set the physical level of the GPIO line and the consumer API will get/set the logic level. Document this in relevant places. Reported-by: David Jander Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20260306-gpio-doc-levels-v1-1-19928739e400@kernel.org Signed-off-by: Bartosz Golaszewski --- Documentation/driver-api/gpio/driver.rst | 27 +++++++++++++++++++++++++++ include/linux/gpio/driver.h | 10 ++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst index 85d86f92c41b..a4f160b95089 100644 --- a/Documentation/driver-api/gpio/driver.rst +++ b/Documentation/driver-api/gpio/driver.rst @@ -87,6 +87,33 @@ atomic context on realtime kernels (inside hard IRQ handlers and similar contexts). Normally this should not be required. +GPIO level semantics +-------------------- + +The gpio_chip .get/set[_multiple]() line values are clamped to the boolean +space [0, 1], low level or high level. + +Low and high values are defined as physical low on the line in/out to the +connector such as a physical pad, pin or rail. + +The GPIO library has internal logic to handle lines that are active low, such +as indicated by overstrike or #name in a schematic, and the driver should not +try to second-guess the logic value of a line. + +The way GPIO values are handled by the consumers is that the library presents the *logical* value to the consumer. 
A line is *asserted* if its *logical* +value is 1, and *de-asserted* if its logical value is 0. If inversion is +required, this is handled by gpiolib and configured using hardware descriptions +such as device tree or ACPI that can clearly indicate if a line is active +high or low. + +Since electronics commonly insert inverters as driving stages or protection +buffers in front of a GPIO line it is necessary that this semantic is part +of the hardware description, so that consumers such as kernel drivers need +not worry about this, and can for example assert a RESET line tied to a GPIO +pin by setting it to logic 1 even if it is physically active low. + + GPIO electrical configuration ----------------------------- diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 5f5ddcbfa445..17511434ed07 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -343,11 +343,17 @@ struct gpio_irq_chip { * @direction_output: configures signal "offset" as output, returns 0 on * success or a negative error number. This can be omitted on input-only * or output-only gpio chips. - * @get: returns value for signal "offset", 0=low, 1=high, or negative error + * @get: returns value for signal "offset", 0=low, 1=high, or negative error. + * the low and high values are defined as physical low on the line + * in/out to the connector such as a physical pad, pin or rail. The GPIO + * library has internal logic to handle lines that are active low, such + * as indicated by overstrike or #name in a schematic, and the driver + * should not try to second-guess the logic value of a line. * @get_multiple: reads values for multiple signals defined by "mask" and * stores them in "bits", returns 0 on success or negative error * @set: assigns output value for signal "offset", returns 0 on success or - * negative error value + * negative error value. The output value follows the same semantic + * rules as for @get. 
* @set_multiple: assigns output values for multiple signals defined by * "mask", returns 0 on success or negative error value * @set_config: optional hook for all kinds of settings. Uses the same -- cgit v1.2.3 From ad9d28e68f4f9d15b9bde15e1ab79a4f85eff60e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:47 +0100 Subject: reset: gpio: simplify fallback device matching The of_args field of struct reset_controller_dev was introduced to allow the reset-gpio driver to pass the phandle arguments back to reset core. The thing is: it doesn't even have to do it. The core sets the platform data of the auxiliary device *AND* has access to it later on during the lookup. This means the field is unneeded and all can happen entirely in reset core. Remove the field from the public header and don't set it in reset-gpio.c. Retrieve the platform data in reset core when needed instead. Reviewed-by: Philipp Zabel Signed-off-by: Bartosz Golaszewski Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 15 ++++++--------- drivers/reset/reset-gpio.c | 5 ----- include/linux/reset-controller.h | 4 ---- 3 files changed, 6 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index 3845e77a8d32..954df36a242e 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -94,9 +94,6 @@ static const char *rcdev_name(struct reset_controller_dev *rcdev) if (rcdev->of_node) return rcdev->of_node->full_name; - if (rcdev->of_args) - return rcdev->of_args->np->full_name; - return NULL; } @@ -125,9 +122,6 @@ static int of_reset_simple_xlate(struct reset_controller_dev *rcdev, */ int reset_controller_register(struct reset_controller_dev *rcdev) { - if (rcdev->of_node && rcdev->of_args) - return -EINVAL; - if (!rcdev->of_xlate) { rcdev->of_reset_n_cells = 1; rcdev->of_xlate = of_reset_simple_xlate; @@ -1006,13 +1000,16 @@ static struct reset_controller_dev *__reset_find_rcdev(const struct of_phandle_a bool 
gpio_fallback) { struct reset_controller_dev *rcdev; + struct of_phandle_args *rc_args; lockdep_assert_held(&reset_list_mutex); list_for_each_entry(rcdev, &reset_controller_list, list) { - if (gpio_fallback) { - if (rcdev->of_args && of_phandle_args_equal(args, - rcdev->of_args)) + if (gpio_fallback && rcdev->dev && + device_is_compatible(rcdev->dev, "reset-gpio")) { + rc_args = dev_get_platdata(rcdev->dev); + + if (of_phandle_args_equal(args, rc_args)) return rcdev; } else { if (args->np == rcdev->of_node) diff --git a/drivers/reset/reset-gpio.c b/drivers/reset/reset-gpio.c index ad5bfe27aaef..6e1c4f990bc0 100644 --- a/drivers/reset/reset-gpio.c +++ b/drivers/reset/reset-gpio.c @@ -56,12 +56,8 @@ static int reset_gpio_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { struct device *dev = &adev->dev; - struct of_phandle_args *platdata = dev_get_platdata(dev); struct reset_gpio_priv *priv; - if (!platdata) - return -EINVAL; - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -76,7 +72,6 @@ static int reset_gpio_probe(struct auxiliary_device *adev, priv->rc.ops = &reset_gpio_ops; priv->rc.owner = THIS_MODULE; priv->rc.dev = dev; - priv->rc.of_args = platdata; /* Cells to match GPIO specifier, but it's not really used */ priv->rc.of_reset_n_cells = 2; diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 46514cb1b9e0..aa95b460fdf8 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -35,9 +35,6 @@ struct of_phandle_args; * @reset_control_head: head of internal list of requested reset controls * @dev: corresponding driver model device struct * @of_node: corresponding device tree node as phandle target - * @of_args: for reset-gpios controllers: corresponding phandle args with - * of_node and GPIO number complementing of_node; either this or - * of_node should be present * @of_reset_n_cells: number of cells in reset line specifiers * @of_xlate: 
translation function to translate from specifier as found in the * device tree to id as given to the reset control ops, defaults @@ -51,7 +48,6 @@ struct reset_controller_dev { struct list_head reset_control_head; struct device *dev; struct device_node *of_node; - const struct of_phandle_args *of_args; int of_reset_n_cells; int (*of_xlate)(struct reset_controller_dev *rcdev, const struct of_phandle_args *reset_spec); -- cgit v1.2.3 From 44a0acb2caca3bfd0ca459fbf0b19be75f1819c0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:53 +0100 Subject: reset: protect struct reset_controller_dev with its own mutex Currently we use a single, global mutex - misleadingly names reset_list_mutex - to protect the global list of reset devices, per-controller list of reset control handles and also internal fields of struct reset_control. Locking can be made a lot more fine-grained if we use a separate mutex for serializing operations on the list AND accessing the reset controller device. 
Signed-off-by: Bartosz Golaszewski Reviewed-by: Philipp Zabel Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 44 ++++++++++++++++++++++++---------------- include/linux/reset-controller.h | 3 +++ 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index e6c12fbebca9..acd9d10b1ceb 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -131,6 +131,7 @@ int reset_controller_register(struct reset_controller_dev *rcdev) } INIT_LIST_HEAD(&rcdev->reset_control_head); + mutex_init(&rcdev->lock); guard(mutex)(&reset_list_mutex); @@ -143,6 +144,8 @@ EXPORT_SYMBOL_GPL(reset_controller_register); static void reset_controller_remove(struct reset_controller_dev *rcdev, struct reset_control *rstc) { + lockdep_assert_held(&rcdev->lock); + list_del(&rstc->list); module_put(rcdev->owner); put_device(rcdev->dev); @@ -156,19 +159,22 @@ void reset_controller_unregister(struct reset_controller_dev *rcdev) { struct reset_control *rstc, *pos; - guard(mutex)(&reset_list_mutex); - - list_del(&rcdev->list); + scoped_guard(mutex, &reset_list_mutex) + list_del(&rcdev->list); - /* - * Numb but don't free the remaining reset control handles that are - * still held by consumers. - */ - list_for_each_entry_safe(rstc, pos, &rcdev->reset_control_head, list) { - rcu_assign_pointer(rstc->rcdev, NULL); - synchronize_srcu(&rstc->srcu); - reset_controller_remove(rcdev, rstc); + scoped_guard(mutex, &rcdev->lock) { + /* + * Numb but don't free the remaining reset control handles that are + * still held by consumers. 
+ */ + list_for_each_entry_safe(rstc, pos, &rcdev->reset_control_head, list) { + rcu_assign_pointer(rstc->rcdev, NULL); + synchronize_srcu(&rstc->srcu); + reset_controller_remove(rcdev, rstc); + } } + + mutex_destroy(&rcdev->lock); } EXPORT_SYMBOL_GPL(reset_controller_unregister); @@ -712,10 +718,12 @@ int reset_control_acquire(struct reset_control *rstc) if (!rcdev) return -ENODEV; - list_for_each_entry(rc, &rcdev->reset_control_head, list) { - if (rstc != rc && rstc->id == rc->id) { - if (rc->acquired) - return -EBUSY; + scoped_guard(mutex, &rcdev->lock) { + list_for_each_entry(rc, &rcdev->reset_control_head, list) { + if (rstc != rc && rstc->id == rc->id) { + if (rc->acquired) + return -EBUSY; + } } } @@ -806,7 +814,7 @@ __reset_control_get_internal(struct reset_controller_dev *rcdev, struct reset_control *rstc; int ret; - lockdep_assert_held(&reset_list_mutex); + lockdep_assert_held(&rcdev->lock); /* Expect callers to filter out OPTIONAL and DEASSERTED bits */ if (WARN_ON(flags & ~(RESET_CONTROL_FLAGS_BIT_SHARED | @@ -868,8 +876,10 @@ static void __reset_control_release(struct kref *kref) scoped_guard(srcu, &rstc->srcu) { rcdev = rcu_replace_pointer(rstc->rcdev, NULL, true); - if (rcdev) + if (rcdev) { + guard(mutex)(&rcdev->lock); reset_controller_remove(rcdev, rstc); + } } synchronize_srcu(&rstc->srcu); diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index aa95b460fdf8..185d2a9bd7cd 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -3,6 +3,7 @@ #define _LINUX_RESET_CONTROLLER_H_ #include +#include struct reset_controller_dev; @@ -40,6 +41,7 @@ struct of_phandle_args; * device tree to id as given to the reset control ops, defaults * to :c:func:`of_reset_simple_xlate`. 
* @nr_resets: number of reset controls in this reset controller device + * @lock: protects the reset control list from concurrent access */ struct reset_controller_dev { const struct reset_control_ops *ops; @@ -52,6 +54,7 @@ struct reset_controller_dev { int (*of_xlate)(struct reset_controller_dev *rcdev, const struct of_phandle_args *reset_spec); unsigned int nr_resets; + struct mutex lock; }; #if IS_ENABLED(CONFIG_RESET_CONTROLLER) -- cgit v1.2.3 From ba8dbbb14b7e6734afbb5ba37d0679831aa3d590 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:56 +0100 Subject: reset: convert the core API to using firmware nodes In order to simplify the commit converting the internals of reset core to using firmware nodes, first convert the user-facing API. Modify the signature of the core consumer functions but leave the specialized wrappers as is to avoid modifying users for now. No functional change intended. Reviewed-by: Philipp Zabel Signed-off-by: Bartosz Golaszewski Signed-off-by: Philipp Zabel --- Documentation/driver-api/reset.rst | 1 - drivers/reset/core.c | 33 ++++++++++++++++------------- include/linux/reset.h | 43 +++++++++++++++++++++++++------------- 3 files changed, 46 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/reset.rst b/Documentation/driver-api/reset.rst index f773100daaa4..7a6571849664 100644 --- a/Documentation/driver-api/reset.rst +++ b/Documentation/driver-api/reset.rst @@ -198,7 +198,6 @@ query the reset line status using reset_control_status(). 
reset_control_rearm reset_control_put of_reset_control_get_count - of_reset_control_array_get devm_reset_control_array_get reset_control_get_count diff --git a/drivers/reset/core.c b/drivers/reset/core.c index f1b644a86ad0..0da5079ea9db 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -1061,7 +1061,7 @@ static int __reset_add_reset_gpio_device(struct device_node *np, rgpio_dev->of_args = *args; /* * We keep the device_node reference, but of_args.np is put at the end - * of __of_reset_control_get(), so get it one more time. + * of __fwnode_reset_control_get(), so get it one more time. * Hold reference as long as rgpio_dev memory is valid. */ of_node_get(rgpio_dev->of_args.np); @@ -1115,18 +1115,19 @@ static struct reset_controller_dev *__reset_find_rcdev(const struct of_phandle_a } struct reset_control * -__of_reset_control_get(struct device_node *node, const char *id, int index, - enum reset_control_flags flags) +__fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int index, + enum reset_control_flags flags) { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; bool gpio_fallback = false; + struct device_node *node = to_of_node(fwnode); struct reset_control *rstc = ERR_PTR(-EINVAL); struct reset_controller_dev *rcdev; struct of_phandle_args args; int rstc_id; int ret; - if (!node) + if (!fwnode) return ERR_PTR(-EINVAL); if (id) { @@ -1193,7 +1194,7 @@ out_put: return rstc; } -EXPORT_SYMBOL_GPL(__of_reset_control_get); +EXPORT_SYMBOL_GPL(__fwnode_reset_control_get); struct reset_control *__reset_control_get(struct device *dev, const char *id, int index, enum reset_control_flags flags) @@ -1201,12 +1202,13 @@ struct reset_control *__reset_control_get(struct device *dev, const char *id, bool shared = flags & RESET_CONTROL_FLAGS_BIT_SHARED; bool acquired = flags & RESET_CONTROL_FLAGS_BIT_ACQUIRED; bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; + struct fwnode_handle *fwnode = dev_fwnode(dev); if (WARN_ON(shared && 
acquired)) return ERR_PTR(-EINVAL); - if (dev->of_node) - return __of_reset_control_get(dev->of_node, id, index, flags); + if (fwnode) + return __fwnode_reset_control_get(fwnode, id, index, flags); return optional ? NULL : ERR_PTR(-ENOENT); } @@ -1468,23 +1470,24 @@ static int fwnode_reset_control_get_count(struct fwnode_handle *fwnode) } /** - * of_reset_control_array_get - Get a list of reset controls using - * device node. + * fwnode_reset_control_array_get - Get a list of reset controls using + * a firmware node. * - * @np: device node for the device that requests the reset controls array + * @fwnode: firmware node for the device that requests the reset controls array * @flags: whether reset controls are shared, optional, acquired * * Returns pointer to allocated reset_control on success or error on failure */ struct reset_control * -of_reset_control_array_get(struct device_node *np, enum reset_control_flags flags) +fwnode_reset_control_array_get(struct fwnode_handle *fwnode, + enum reset_control_flags flags) { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; struct reset_control_array *resets; struct reset_control *rstc; int num, i; - num = fwnode_reset_control_get_count(of_fwnode_handle(np)); + num = fwnode_reset_control_get_count(fwnode); if (num < 0) return optional ? 
NULL : ERR_PTR(num); @@ -1494,7 +1497,7 @@ of_reset_control_array_get(struct device_node *np, enum reset_control_flags flag resets->num_rstcs = num; for (i = 0; i < num; i++) { - rstc = __of_reset_control_get(np, NULL, i, flags); + rstc = __fwnode_reset_control_get(fwnode, NULL, i, flags); if (IS_ERR(rstc)) goto err_rst; resets->rstc[i] = rstc; @@ -1511,7 +1514,7 @@ err_rst: return rstc; } -EXPORT_SYMBOL_GPL(of_reset_control_array_get); +EXPORT_SYMBOL_GPL(fwnode_reset_control_array_get); /** * devm_reset_control_array_get - Resource managed reset control array get @@ -1535,7 +1538,7 @@ devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags) if (!ptr) return ERR_PTR(-ENOMEM); - rstc = of_reset_control_array_get(dev->of_node, flags); + rstc = fwnode_reset_control_array_get(dev_fwnode(dev), flags); if (IS_ERR_OR_NULL(rstc)) { devres_free(ptr); return rstc; diff --git a/include/linux/reset.h b/include/linux/reset.h index 44f9e3415f92..9c391cf0c822 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -5,10 +5,12 @@ #include #include #include +#include #include struct device; struct device_node; +struct fwnode_handle; struct reset_control; /** @@ -84,7 +86,7 @@ int reset_control_bulk_deassert(int num_rstcs, struct reset_control_bulk_data *r int reset_control_bulk_acquire(int num_rstcs, struct reset_control_bulk_data *rstcs); void reset_control_bulk_release(int num_rstcs, struct reset_control_bulk_data *rstcs); -struct reset_control *__of_reset_control_get(struct device_node *node, +struct reset_control *__fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int index, enum reset_control_flags flags); struct reset_control *__reset_control_get(struct device *dev, const char *id, int index, enum reset_control_flags flags); @@ -103,7 +105,8 @@ int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs, struct reset_control *devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags); 
-struct reset_control *of_reset_control_array_get(struct device_node *np, enum reset_control_flags); +struct reset_control *fwnode_reset_control_array_get(struct fwnode_handle *fwnode, + enum reset_control_flags); int reset_control_get_count(struct device *dev); @@ -152,8 +155,8 @@ static inline int __device_reset(struct device *dev, bool optional) return optional ? 0 : -ENOTSUPP; } -static inline struct reset_control *__of_reset_control_get( - struct device_node *node, +static inline struct reset_control *__fwnode_reset_control_get( + struct fwnode_handle *fwnode, const char *id, int index, enum reset_control_flags flags) { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; @@ -242,7 +245,7 @@ devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags) } static inline struct reset_control * -of_reset_control_array_get(struct device_node *np, enum reset_control_flags flags) +fwnode_reset_control_array_get(struct fwnode_handle *fwnode, enum reset_control_flags flags) { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; @@ -500,7 +503,8 @@ reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs, static inline struct reset_control *of_reset_control_get_exclusive( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, RESET_CONTROL_EXCLUSIVE); + return __fwnode_reset_control_get(of_fwnode_handle(node), id, 0, + RESET_CONTROL_EXCLUSIVE); } /** @@ -520,7 +524,8 @@ static inline struct reset_control *of_reset_control_get_exclusive( static inline struct reset_control *of_reset_control_get_optional_exclusive( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE); + return __fwnode_reset_control_get(of_fwnode_handle(node), id, 0, + RESET_CONTROL_OPTIONAL_EXCLUSIVE); } /** @@ -545,7 +550,8 @@ static inline struct reset_control *of_reset_control_get_optional_exclusive( static inline struct reset_control 
*of_reset_control_get_shared( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, RESET_CONTROL_SHARED); + return __fwnode_reset_control_get(of_fwnode_handle(node), id, 0, + RESET_CONTROL_SHARED); } /** @@ -562,7 +568,8 @@ static inline struct reset_control *of_reset_control_get_shared( static inline struct reset_control *of_reset_control_get_exclusive_by_index( struct device_node *node, int index) { - return __of_reset_control_get(node, NULL, index, RESET_CONTROL_EXCLUSIVE); + return __fwnode_reset_control_get(of_fwnode_handle(node), NULL, index, + RESET_CONTROL_EXCLUSIVE); } /** @@ -590,7 +597,8 @@ static inline struct reset_control *of_reset_control_get_exclusive_by_index( static inline struct reset_control *of_reset_control_get_shared_by_index( struct device_node *node, int index) { - return __of_reset_control_get(node, NULL, index, RESET_CONTROL_SHARED); + return __fwnode_reset_control_get(of_fwnode_handle(node), NULL, index, + RESET_CONTROL_SHARED); } /** @@ -1032,30 +1040,35 @@ devm_reset_control_array_get_optional_shared(struct device *dev) static inline struct reset_control * of_reset_control_array_get_exclusive(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_EXCLUSIVE); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_EXCLUSIVE); } static inline struct reset_control * of_reset_control_array_get_exclusive_released(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_EXCLUSIVE_RELEASED); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_EXCLUSIVE_RELEASED); } static inline struct reset_control * of_reset_control_array_get_shared(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_SHARED); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_SHARED); } static inline struct reset_control * 
of_reset_control_array_get_optional_exclusive(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_OPTIONAL_EXCLUSIVE); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_OPTIONAL_EXCLUSIVE); } static inline struct reset_control * of_reset_control_array_get_optional_shared(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_OPTIONAL_SHARED); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_OPTIONAL_SHARED); } #endif -- cgit v1.2.3 From 9035073d0ef1de813c6335239250248bfe0a64aa Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:57 +0100 Subject: reset: convert reset core to using firmware nodes With everything else now in place, we can convert the remaining parts of the reset subsystem to becoming fwnode-agnostic - meaning it will work with all kinds of firmware nodes, not only devicetree. To that end: extend struct reset_controller_dev with fields taking information relevant for using firmware nodes (which mirrors what we already do for OF-nodes) and limit using of_ APIs only to where it's absolutely necessary (mostly around the of_xlate callback). For backward compatibility of existing drivers we still support OF-nodes but firmware nodes become the preferred method. 
Signed-off-by: Bartosz Golaszewski Reviewed-by: Philipp Zabel Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 166 +++++++++++++++++++++++---------------- include/linux/reset-controller.h | 14 +++- 2 files changed, 112 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index 0da5079ea9db..e625cf59cfb0 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -81,13 +81,13 @@ struct reset_control_array { /** * struct reset_gpio_lookup - lookup key for ad-hoc created reset-gpio devices - * @of_args: phandle to the reset controller with all the args like GPIO number + * @ref_args: Reference to the reset controller with all the args like GPIO number * @swnode: Software node containing the reference to the GPIO provider * @list: list entry for the reset_gpio_lookup_list * @adev: Auxiliary device representing the reset controller */ struct reset_gpio_lookup { - struct of_phandle_args of_args; + struct fwnode_reference_args ref_args; struct fwnode_handle *swnode; struct list_head list; struct auxiliary_device adev; @@ -98,24 +98,24 @@ static const char *rcdev_name(struct reset_controller_dev *rcdev) if (rcdev->dev) return dev_name(rcdev->dev); - if (rcdev->of_node) - return rcdev->of_node->full_name; + if (rcdev->fwnode) + return fwnode_get_name(rcdev->fwnode); return NULL; } /** - * of_reset_simple_xlate - translate reset_spec to the reset line number + * fwnode_reset_simple_xlate - translate reset_spec to the reset line number * @rcdev: a pointer to the reset controller device - * @reset_spec: reset line specifier as found in the device tree + * @reset_spec: reset line specifier as found in firmware * - * This static translation function is used by default if of_xlate in - * :c:type:`reset_controller_dev` is not set. It is useful for all reset - * controllers with 1:1 mapping, where reset lines can be indexed by number - * without gaps. 
+ * This static translation function is used by default if neither fwnode_xlate + nor of_xlate in :c:type:`reset_controller_dev` is set. It is useful for + * all reset controllers with 1:1 mapping, where reset lines can be indexed by + * number without gaps. */ -static int of_reset_simple_xlate(struct reset_controller_dev *rcdev, - const struct of_phandle_args *reset_spec) +static int fwnode_reset_simple_xlate(struct reset_controller_dev *rcdev, + const struct fwnode_reference_args *reset_spec) { if (reset_spec->args[0] >= rcdev->nr_resets) return -EINVAL; @@ -129,9 +129,23 @@ static int of_reset_simple_xlate(struct reset_controller_dev *rcdev, */ int reset_controller_register(struct reset_controller_dev *rcdev) { - if (!rcdev->of_xlate) { - rcdev->of_reset_n_cells = 1; - rcdev->of_xlate = of_reset_simple_xlate; + if ((rcdev->of_node && rcdev->fwnode) || (rcdev->of_xlate && rcdev->fwnode_xlate)) + return -EINVAL; + + if (!rcdev->of_node && !rcdev->fwnode) { + rcdev->fwnode = dev_fwnode(rcdev->dev); + if (!rcdev->fwnode) + return -EINVAL; + } + + if (rcdev->of_node) { + rcdev->fwnode = of_fwnode_handle(rcdev->of_node); + rcdev->fwnode_reset_n_cells = rcdev->of_reset_n_cells; + } + + if (rcdev->fwnode && !rcdev->fwnode_xlate) { + rcdev->fwnode_reset_n_cells = 1; + rcdev->fwnode_xlate = fwnode_reset_simple_xlate; } INIT_LIST_HEAD(&rcdev->reset_control_head); @@ -931,7 +945,7 @@ static int reset_create_gpio_aux_device(struct reset_gpio_lookup *rgpio_dev, adev->id = id; adev->name = "gpio"; adev->dev.parent = parent; - adev->dev.platform_data = &rgpio_dev->of_args; + adev->dev.platform_data = &rgpio_dev->ref_args; adev->dev.release = reset_gpio_aux_device_release; device_set_node(&adev->dev, rgpio_dev->swnode); @@ -951,18 +965,18 @@ static int reset_create_gpio_aux_device(struct reset_gpio_lookup *rgpio_dev, return 0; } -static void reset_gpio_add_devlink(struct device_node *np, +static void reset_gpio_add_devlink(struct fwnode_handle *fwnode, struct 
reset_gpio_lookup *rgpio_dev) { struct device *consumer; /* - * We must use get_dev_from_fwnode() and not of_find_device_by_node() + * We must use get_dev_from_fwnode() and not ref_find_device_by_node() * because the latter only considers the platform bus while we want to * get consumers of any kind that can be associated with firmware * nodes: auxiliary, soundwire, etc. */ - consumer = get_dev_from_fwnode(of_fwnode_handle(np)); + consumer = get_dev_from_fwnode(fwnode); if (consumer) { if (!device_link_add(consumer, &rgpio_dev->adev.dev, DL_FLAG_AUTOREMOVE_CONSUMER)) @@ -982,15 +996,23 @@ static void reset_gpio_add_devlink(struct device_node *np, */ } +/* TODO: move it out into drivers/base/ */ +static bool fwnode_reference_args_equal(const struct fwnode_reference_args *left, + const struct fwnode_reference_args *right) +{ + return left->fwnode == right->fwnode && left->nargs == right->nargs && + !memcmp(left->args, right->args, sizeof(left->args[0]) * left->nargs); +} + /* * @np: OF-node associated with the consumer - * @args: phandle to the GPIO provider with all the args like GPIO number + * @args: Reference to the GPIO provider with all the args like GPIO number */ -static int __reset_add_reset_gpio_device(struct device_node *np, - const struct of_phandle_args *args) +static int __reset_add_reset_gpio_device(struct fwnode_handle *fwnode, + const struct fwnode_reference_args *args) { struct property_entry properties[3] = { }; - unsigned int offset, of_flags, lflags; + unsigned int offset, flags, lflags; struct reset_gpio_lookup *rgpio_dev; struct device *parent; int ret, prop = 0; @@ -1001,7 +1023,7 @@ static int __reset_add_reset_gpio_device(struct device_node *np, * args[1]: GPIO flags * TODO: Handle other cases. 
*/ - if (args->args_count != 2) + if (args->nargs != 2) return -ENOENT; /* @@ -1012,7 +1034,7 @@ static int __reset_add_reset_gpio_device(struct device_node *np, lockdep_assert_not_held(&reset_list_mutex); offset = args->args[0]; - of_flags = args->args[1]; + flags = args->args[1]; /* * Later we map GPIO flags between OF and Linux, however not all @@ -1022,33 +1044,31 @@ static int __reset_add_reset_gpio_device(struct device_node *np, * FIXME: Find a better way of translating OF flags to GPIO lookup * flags. */ - if (of_flags > GPIO_ACTIVE_LOW) { + if (flags > GPIO_ACTIVE_LOW) { pr_err("reset-gpio code does not support GPIO flags %u for GPIO %u\n", - of_flags, offset); + flags, offset); return -EINVAL; } struct gpio_device *gdev __free(gpio_device_put) = - gpio_device_find_by_fwnode(of_fwnode_handle(args->np)); + gpio_device_find_by_fwnode(args->fwnode); if (!gdev) return -EPROBE_DEFER; guard(mutex)(&reset_gpio_lookup_mutex); list_for_each_entry(rgpio_dev, &reset_gpio_lookup_list, list) { - if (args->np == rgpio_dev->of_args.np) { - if (of_phandle_args_equal(args, &rgpio_dev->of_args)) { - /* - * Already on the list, create the device link - * and stop here. - */ - reset_gpio_add_devlink(np, rgpio_dev); - return 0; - } + if (fwnode_reference_args_equal(args, &rgpio_dev->ref_args)) { + /* + * Already on the list, create the device link + * and stop here. 
+ */ + reset_gpio_add_devlink(fwnode, rgpio_dev); + return 0; } } - lflags = GPIO_PERSISTENT | (of_flags & GPIO_ACTIVE_LOW); + lflags = GPIO_PERSISTENT | (flags & GPIO_ACTIVE_LOW); parent = gpio_device_to_device(gdev); properties[prop++] = PROPERTY_ENTRY_STRING("compatible", "reset-gpio"); properties[prop++] = PROPERTY_ENTRY_GPIO("reset-gpios", parent->fwnode, offset, lflags); @@ -1058,43 +1078,43 @@ static int __reset_add_reset_gpio_device(struct device_node *np, if (!rgpio_dev) return -ENOMEM; - rgpio_dev->of_args = *args; + rgpio_dev->ref_args = *args; /* - * We keep the device_node reference, but of_args.np is put at the end - * of __fwnode_reset_control_get(), so get it one more time. + * We keep the fwnode_handle reference, but ref_args.fwnode is put at + * the end of __fwnode_reset_control_get(), so get it one more time. * Hold reference as long as rgpio_dev memory is valid. */ - of_node_get(rgpio_dev->of_args.np); + fwnode_handle_get(rgpio_dev->ref_args.fwnode); rgpio_dev->swnode = fwnode_create_software_node(properties, NULL); if (IS_ERR(rgpio_dev->swnode)) { ret = PTR_ERR(rgpio_dev->swnode); - goto err_put_of_node; + goto err_put_fwnode; } ret = reset_create_gpio_aux_device(rgpio_dev, parent); if (ret) goto err_del_swnode; - reset_gpio_add_devlink(np, rgpio_dev); + reset_gpio_add_devlink(fwnode, rgpio_dev); list_add(&rgpio_dev->list, &reset_gpio_lookup_list); return 0; err_del_swnode: fwnode_remove_software_node(rgpio_dev->swnode); -err_put_of_node: - of_node_put(rgpio_dev->of_args.np); +err_put_fwnode: + fwnode_handle_put(rgpio_dev->ref_args.fwnode); kfree(rgpio_dev); return ret; } -static struct reset_controller_dev *__reset_find_rcdev(const struct of_phandle_args *args, - bool gpio_fallback) +static struct reset_controller_dev * +__reset_find_rcdev(const struct fwnode_reference_args *args, bool gpio_fallback) { + struct fwnode_reference_args *rc_args; struct reset_controller_dev *rcdev; - struct of_phandle_args *rc_args; 
lockdep_assert_held(&reset_list_mutex); @@ -1103,10 +1123,10 @@ static struct reset_controller_dev *__reset_find_rcdev(const struct of_phandle_a device_is_compatible(rcdev->dev, "reset-gpio")) { rc_args = dev_get_platdata(rcdev->dev); - if (of_phandle_args_equal(args, rc_args)) + if (fwnode_reference_args_equal(args, rc_args)) return rcdev; } else { - if (args->np == rcdev->of_node) + if (args->fwnode == rcdev->fwnode) return rcdev; } } @@ -1120,27 +1140,26 @@ __fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int ind { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; bool gpio_fallback = false; - struct device_node *node = to_of_node(fwnode); struct reset_control *rstc = ERR_PTR(-EINVAL); struct reset_controller_dev *rcdev; - struct of_phandle_args args; - int rstc_id; + struct fwnode_reference_args args; + struct of_phandle_args of_args; + int rstc_id = -EINVAL; int ret; if (!fwnode) return ERR_PTR(-EINVAL); if (id) { - index = of_property_match_string(node, - "reset-names", id); + index = fwnode_property_match_string(fwnode, "reset-names", id); if (index == -EILSEQ) return ERR_PTR(index); if (index < 0) return optional ? NULL : ERR_PTR(-ENOENT); } - ret = of_parse_phandle_with_args(node, "resets", "#reset-cells", - index, &args); + ret = fwnode_property_get_reference_args(fwnode, "resets", "#reset-cells", + 0, index, &args); if (ret == -EINVAL) return ERR_PTR(ret); if (ret) { @@ -1151,16 +1170,16 @@ __fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int ind * There can be only one reset-gpio for regular devices, so * don't bother with the "reset-gpios" phandle index. */ - ret = of_parse_phandle_with_args(node, "reset-gpios", "#gpio-cells", - 0, &args); + ret = fwnode_property_get_reference_args(fwnode, "reset-gpios", + "#gpio-cells", 0, 0, &args); if (ret) return optional ? 
NULL : ERR_PTR(ret); gpio_fallback = true; - ret = __reset_add_reset_gpio_device(node, &args); + ret = __reset_add_reset_gpio_device(fwnode, &args); if (ret) { - of_node_put(args.np); + fwnode_handle_put(args.fwnode); return ERR_PTR(ret); } } @@ -1173,15 +1192,30 @@ __fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int ind goto out_put; } - if (WARN_ON(args.args_count != rcdev->of_reset_n_cells)) { + if (WARN_ON(args.nargs != rcdev->fwnode_reset_n_cells)) { rstc = ERR_PTR(-EINVAL); goto out_put; } - rstc_id = rcdev->of_xlate(rcdev, &args); + if (rcdev->of_xlate && is_of_node(fwnode)) { + ret = of_parse_phandle_with_args(to_of_node(fwnode), + gpio_fallback ? "reset-gpios" : "resets", + gpio_fallback ? "#gpio-cells" : "#reset-cells", + gpio_fallback ? 0 : index, + &of_args); + if (ret) { + rstc = ERR_PTR(ret); + goto out_put; + } + + rstc_id = rcdev->of_xlate(rcdev, &of_args); + of_node_put(of_args.np); + } else if (rcdev->fwnode_xlate) { + rstc_id = rcdev->fwnode_xlate(rcdev, &args); + } if (rstc_id < 0) { rstc = ERR_PTR(rstc_id); - goto out_put; + goto out_put; } flags &= ~RESET_CONTROL_FLAGS_BIT_OPTIONAL; @@ -1190,7 +1224,7 @@ __fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int ind rstc = __reset_control_get_internal(rcdev, rstc_id, flags); out_put: - of_node_put(args.np); + fwnode_handle_put(args.fwnode); return rstc; } diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 185d2a9bd7cd..52a5a4e81f18 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -5,6 +5,8 @@ #include #include +struct fwnode_handle; +struct fwnode_reference_args; struct reset_controller_dev; /** @@ -38,8 +40,12 @@ struct of_phandle_args; * @of_node: corresponding device tree node as phandle target * @of_reset_n_cells: number of cells in reset line specifiers * @of_xlate: translation function to translate from specifier as found in the - * device tree to id as given to the reset 
control ops, defaults - * to :c:func:`of_reset_simple_xlate`. + * device tree to id as given to the reset control ops + * @fwnode: firmware node associated with this device + * @fwnode_reset_n_cells: number of cells in reset line specifiers + * @fwnode_xlate: translation function to translate from firmware specifier to + * id as given to the reset control ops, defaults to + * :c:func:`fwnode_reset_simple_xlate` * @nr_resets: number of reset controls in this reset controller device * @lock: protects the reset control list from concurrent access */ @@ -53,6 +59,10 @@ struct reset_controller_dev { int of_reset_n_cells; int (*of_xlate)(struct reset_controller_dev *rcdev, const struct of_phandle_args *reset_spec); + struct fwnode_handle *fwnode; + int fwnode_reset_n_cells; + int (*fwnode_xlate)(struct reset_controller_dev *rcdev, + const struct fwnode_reference_args *reset_spec); unsigned int nr_resets; struct mutex lock; }; -- cgit v1.2.3 From b6420bd5aa0c374331bad6c0fa2eb5f0f87cf5a0 Mon Sep 17 00:00:00 2001 From: Jialu Xu Date: Sat, 7 Mar 2026 11:06:26 +0800 Subject: gpio: remove of_get_named_gpio() and All in-tree consumers have been converted to the descriptor-based API. Remove the deprecated of_get_named_gpio() helper, delete the header, and drop the corresponding entry from MAINTAINERS. Also remove the completed TODO item for this cleanup. 
Signed-off-by: Jialu Xu Reviewed-by: Linus Walleij Link: https://patch.msgid.link/02ABDA1F9E3FAF1F+20260307030623.3495092-6-xujialu@vimux.org Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 1 - drivers/gpio/TODO | 28 ---------------------------- drivers/gpio/gpiolib-of.c | 27 --------------------------- include/linux/of_gpio.h | 38 -------------------------------------- 4 files changed, 94 deletions(-) delete mode 100644 include/linux/of_gpio.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 55af015174a5..24b3f8d2a64c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10959,7 +10959,6 @@ F: drivers/gpio/ F: include/dt-bindings/gpio/ F: include/linux/gpio.h F: include/linux/gpio/ -F: include/linux/of_gpio.h K: (devm_)?gpio_(request|free|direction|get|set) K: GPIOD_FLAGS_BIT_NONEXCLUSIVE K: devm_gpiod_unhinge diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index 5acaeab029ec..7ce80fde1f17 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -58,34 +58,6 @@ Work items: ------------------------------------------------------------------------------- -Get rid of - -This header and helpers appeared at one point when there was no proper -driver infrastructure for doing simpler MMIO GPIO devices and there was -no core support for parsing device tree GPIOs from the core library with -the [devm_]gpiod_get() calls we have today that will implicitly go into -the device tree back-end. It is legacy and should not be used in new code. - -Work items: - -- Change all consumer drivers that #include to - #include and stop doing custom parsing of the - GPIO lines from the device tree. This can be tricky and often involves - changing board files, etc. - -- Pull semantics for legacy device tree (OF) GPIO lookups into - gpiolib-of.c: in some cases subsystems are doing custom flags and - lookups for polarity inversion, open drain and what not. 
As we now - handle this with generic OF bindings, pull all legacy handling into - gpiolib so the library API becomes narrow and deep and handle all - legacy bindings internally. (See e.g. commits 6953c57ab172, - 6a537d48461d etc) - -- Delete when all the above is complete and everything - uses or instead. - -------------------------------------------------------------------------------- - Collect drivers Collect GPIO drivers from arch/* and other places that should be placed diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 3bdd9af67447..c512d735e85f 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -446,32 +445,6 @@ out: return desc; } -/** - * of_get_named_gpio() - Get a GPIO number to use with GPIO API - * @np: device node to get GPIO from - * @propname: Name of property containing gpio specifier(s) - * @index: index of the GPIO - * - * **DEPRECATED** This function is deprecated and must not be used in new code. - * - * Returns: - * GPIO number to use with Linux generic GPIO API, or one of the errno - * value on the error condition. - */ -int of_get_named_gpio(const struct device_node *np, const char *propname, - int index) -{ - struct gpio_desc *desc; - - desc = of_get_named_gpiod_flags(np, propname, index, NULL); - - if (IS_ERR(desc)) - return PTR_ERR(desc); - else - return desc_to_gpio(desc); -} -EXPORT_SYMBOL_GPL(of_get_named_gpio); - /* Converts gpio_lookup_flags into bitmask of GPIO_* values */ static unsigned long of_convert_gpio_flags(enum of_gpio_flags flags) { diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h deleted file mode 100644 index d0f66a5e1b2a..000000000000 --- a/include/linux/of_gpio.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * OF helpers for the GPIO API - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. 
- * - * Author: Anton Vorontsov - */ - -#ifndef __LINUX_OF_GPIO_H -#define __LINUX_OF_GPIO_H - -#include -#include -#include /* FIXME: Shouldn't be here */ -#include - -struct device_node; - -#ifdef CONFIG_OF_GPIO - -extern int of_get_named_gpio(const struct device_node *np, - const char *list_name, int index); - -#else /* CONFIG_OF_GPIO */ - -#include - -/* Drivers may not strictly depend on the GPIO support, so let them link. */ -static inline int of_get_named_gpio(const struct device_node *np, - const char *propname, int index) -{ - return -ENOSYS; -} - -#endif /* CONFIG_OF_GPIO */ - -#endif /* __LINUX_OF_GPIO_H */ -- cgit v1.2.3 From 9840bb66e7e5dffd72b03201318f154a10b06b4a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 7 Mar 2026 14:54:31 -0500 Subject: vfs: remove externs from fs.h on functions modified by i_ino widening Christoph says, in response to one of the patches in the i_ino widening series, which changes the prototype of several functions in fs.h: "Can you please drop all these pointless externs while you're at it?" Remove extern keyword from functions touched by that patch (and a few that happened to be nearby). Also add missing argument names to declarations that lacked them. 
Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260307-iino-u64-v2-1-a1a1696e0653@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 62 +++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index e820c14f9c5a..23f36a2613a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2912,27 +2912,27 @@ static inline bool name_contains_dotdot(const char *name) #include /* needed for stackable file system support */ -extern loff_t default_llseek(struct file *file, loff_t offset, int whence); +loff_t default_llseek(struct file *file, loff_t offset, int whence); -extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); +loff_t vfs_llseek(struct file *file, loff_t offset, int whence); -extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t); +int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp); static inline int inode_init_always(struct super_block *sb, struct inode *inode) { return inode_init_always_gfp(sb, inode, GFP_NOFS); } -extern void inode_init_once(struct inode *); -extern void address_space_init_once(struct address_space *mapping); -extern struct inode * igrab(struct inode *); -extern ino_t iunique(struct super_block *, ino_t); -extern int inode_needs_sync(struct inode *inode); -extern int inode_just_drop(struct inode *inode); +void inode_init_once(struct inode *inode); +void address_space_init_once(struct address_space *mapping); +struct inode *igrab(struct inode *inode); +ino_t iunique(struct super_block *sb, ino_t max_reserved); +int inode_needs_sync(struct inode *inode); +int inode_just_drop(struct inode *inode); static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } -extern void d_mark_dontcache(struct inode *inode); 
+void d_mark_dontcache(struct inode *inode); struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data, @@ -2944,31 +2944,31 @@ struct inode *ilookup(struct super_block *sb, u64 ino); struct inode *inode_insert5(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); -struct inode *iget5_locked(struct super_block *, u64, +struct inode *iget5_locked(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *); -struct inode *iget5_locked_rcu(struct super_block *, u64, + int (*set)(struct inode *, void *), void *data); +struct inode *iget5_locked_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *); -struct inode *iget_locked(struct super_block *, u64); -struct inode *find_inode_nowait(struct super_block *, u64, + int (*set)(struct inode *, void *), void *data); +struct inode *iget_locked(struct super_block *sb, u64 ino); +struct inode *find_inode_nowait(struct super_block *sb, u64 hashval, int (*match)(struct inode *, u64, void *), void *data); -struct inode *find_inode_rcu(struct super_block *, u64, - int (*)(struct inode *, void *), void *); -struct inode *find_inode_by_ino_rcu(struct super_block *, u64); -int insert_inode_locked4(struct inode *, u64, - int (*test)(struct inode *, void *), void *); -extern int insert_inode_locked(struct inode *); +struct inode *find_inode_rcu(struct super_block *sb, u64 hashval, + int (*test)(struct inode *, void *), void *data); +struct inode *find_inode_by_ino_rcu(struct super_block *sb, u64 ino); +int insert_inode_locked4(struct inode *inode, u64 hashval, + int (*test)(struct inode *, void *), void *data); +int insert_inode_locked(struct inode *inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC -extern void lockdep_annotate_inode_mutex_key(struct inode *inode); +void 
lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif -extern void unlock_new_inode(struct inode *); -extern void discard_new_inode(struct inode *); -extern unsigned int get_next_ino(void); -extern void evict_inodes(struct super_block *sb); +void unlock_new_inode(struct inode *inode); +void discard_new_inode(struct inode *inode); +unsigned int get_next_ino(void); +void evict_inodes(struct super_block *sb); void dump_mapping(const struct address_space *); /* @@ -3013,21 +3013,21 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) -void __insert_inode_hash(struct inode *, u64 hashval); +void __insert_inode_hash(struct inode *inode, u64 hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } -extern void __remove_inode_hash(struct inode *); +void __remove_inode_hash(struct inode *inode); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } -extern void inode_sb_list_add(struct inode *inode); -extern void inode_lru_list_add(struct inode *inode); +void inode_sb_list_add(struct inode *inode); +void inode_lru_list_add(struct inode *inode); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); -- cgit v1.2.3 From a6fe20d67dc7d512f9b5dc11c5777fb1e1ff70ce Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Fri, 6 Mar 2026 15:28:10 +0000 Subject: mfd: cs42l43: Add support for the B variant Introducing CS42L43B codec, a variant of CS42L43 which can be driven by the same driver. 
Changes in CS42L43 driver specific for CS42L43B: - Decimator 1 and 2 are dedicated to ADC, can't be selected for PDM - Decimators 3 and 4 are connected to PDM1 - Added Decimator 5 and 6 for PDM2 - Supports SoundWire Clock Gearing - Updated ROM requiring no patching - Reduced RAM space - Each ISRC has 4 decimators now Signed-off-by: Maciej Strozek Acked-by: Lee Jones Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20260306152829.3130530-4-mstrozek@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/mfd/cs42l43-i2c.c | 7 ++- drivers/mfd/cs42l43-sdw.c | 4 +- drivers/mfd/cs42l43.c | 93 ++++++++++++++++++++++++++++++++++------ drivers/mfd/cs42l43.h | 2 +- include/linux/mfd/cs42l43-regs.h | 76 ++++++++++++++++++++++++++++++++ include/linux/mfd/cs42l43.h | 1 + 6 files changed, 166 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/mfd/cs42l43-i2c.c b/drivers/mfd/cs42l43-i2c.c index a2ab001a600a..0a0ab5e549a5 100644 --- a/drivers/mfd/cs42l43-i2c.c +++ b/drivers/mfd/cs42l43-i2c.c @@ -47,6 +47,7 @@ static int cs42l43_i2c_probe(struct i2c_client *i2c) cs42l43->irq = i2c->irq; /* A device on an I2C is always attached by definition. 
*/ cs42l43->attached = true; + cs42l43->variant_id = (long)device_get_match_data(cs42l43->dev); cs42l43->regmap = devm_regmap_init_i2c(i2c, &cs42l43_i2c_regmap); if (IS_ERR(cs42l43->regmap)) @@ -58,7 +59,8 @@ static int cs42l43_i2c_probe(struct i2c_client *i2c) #if IS_ENABLED(CONFIG_OF) static const struct of_device_id cs42l43_of_match[] = { - { .compatible = "cirrus,cs42l43", }, + { .compatible = "cirrus,cs42l43", .data = (void *)CS42L43_DEVID_VAL }, + { .compatible = "cirrus,cs42l43b", .data = (void *)CS42L43B_DEVID_VAL }, {} }; MODULE_DEVICE_TABLE(of, cs42l43_of_match); @@ -66,7 +68,8 @@ MODULE_DEVICE_TABLE(of, cs42l43_of_match); #if IS_ENABLED(CONFIG_ACPI) static const struct acpi_device_id cs42l43_acpi_match[] = { - { "CSC4243", 0 }, + { "CSC4243", CS42L43_DEVID_VAL }, + { "CSC2A3B", CS42L43B_DEVID_VAL }, {} }; MODULE_DEVICE_TABLE(acpi, cs42l43_acpi_match); diff --git a/drivers/mfd/cs42l43-sdw.c b/drivers/mfd/cs42l43-sdw.c index 023f7e1a30f8..794c98378175 100644 --- a/drivers/mfd/cs42l43-sdw.c +++ b/drivers/mfd/cs42l43-sdw.c @@ -178,6 +178,7 @@ static int cs42l43_sdw_probe(struct sdw_slave *sdw, const struct sdw_device_id * cs42l43->dev = dev; cs42l43->sdw = sdw; + cs42l43->variant_id = (long)id->driver_data; cs42l43->regmap = devm_regmap_init_sdw(sdw, &cs42l43_sdw_regmap); if (IS_ERR(cs42l43->regmap)) @@ -188,7 +189,8 @@ static int cs42l43_sdw_probe(struct sdw_slave *sdw, const struct sdw_device_id * } static const struct sdw_device_id cs42l43_sdw_id[] = { - SDW_SLAVE_ENTRY(0x01FA, 0x4243, 0), + SDW_SLAVE_ENTRY(0x01FA, 0x4243, (void *) CS42L43_DEVID_VAL), + SDW_SLAVE_ENTRY(0x01FA, 0x2A3B, (void *) CS42L43B_DEVID_VAL), {} }; MODULE_DEVICE_TABLE(sdw, cs42l43_sdw_id); diff --git a/drivers/mfd/cs42l43.c b/drivers/mfd/cs42l43.c index 107cfb983fec..166881751e69 100644 --- a/drivers/mfd/cs42l43.c +++ b/drivers/mfd/cs42l43.c @@ -115,9 +115,14 @@ const struct reg_default cs42l43_reg_default[CS42L43_N_DEFAULTS] = { { CS42L43_DECIM_HPF_WNF_CTRL2, 0x00000001 }, { 
CS42L43_DECIM_HPF_WNF_CTRL3, 0x00000001 }, { CS42L43_DECIM_HPF_WNF_CTRL4, 0x00000001 }, + { CS42L43B_DECIM_HPF_WNF_CTRL5, 0x00000001 }, + { CS42L43B_DECIM_HPF_WNF_CTRL6, 0x00000001 }, { CS42L43_DMIC_PDM_CTRL, 0x00000000 }, { CS42L43_DECIM_VOL_CTRL_CH1_CH2, 0x20122012 }, { CS42L43_DECIM_VOL_CTRL_CH3_CH4, 0x20122012 }, + { CS42L43B_DECIM_VOL_CTRL_CH1_CH2, 0x20122012 }, + { CS42L43B_DECIM_VOL_CTRL_CH3_CH4, 0x20122012 }, + { CS42L43B_DECIM_VOL_CTRL_CH5_CH6, 0x20122012 }, { CS42L43_INTP_VOLUME_CTRL1, 0x00000180 }, { CS42L43_INTP_VOLUME_CTRL2, 0x00000180 }, { CS42L43_AMP1_2_VOL_RAMP, 0x00000022 }, @@ -155,8 +160,12 @@ const struct reg_default cs42l43_reg_default[CS42L43_N_DEFAULTS] = { { CS42L43_SWIRE_DP2_CH2_INPUT, 0x00000000 }, { CS42L43_SWIRE_DP3_CH1_INPUT, 0x00000000 }, { CS42L43_SWIRE_DP3_CH2_INPUT, 0x00000000 }, + { CS42L43B_SWIRE_DP3_CH3_INPUT, 0x00000000 }, + { CS42L43B_SWIRE_DP3_CH4_INPUT, 0x00000000 }, { CS42L43_SWIRE_DP4_CH1_INPUT, 0x00000000 }, { CS42L43_SWIRE_DP4_CH2_INPUT, 0x00000000 }, + { CS42L43B_SWIRE_DP4_CH3_INPUT, 0x00000000 }, + { CS42L43B_SWIRE_DP4_CH4_INPUT, 0x00000000 }, { CS42L43_ASRC_INT1_INPUT1, 0x00000000 }, { CS42L43_ASRC_INT2_INPUT1, 0x00000000 }, { CS42L43_ASRC_INT3_INPUT1, 0x00000000 }, @@ -169,10 +178,14 @@ const struct reg_default cs42l43_reg_default[CS42L43_N_DEFAULTS] = { { CS42L43_ISRC1INT2_INPUT1, 0x00000000 }, { CS42L43_ISRC1DEC1_INPUT1, 0x00000000 }, { CS42L43_ISRC1DEC2_INPUT1, 0x00000000 }, + { CS42L43B_ISRC1DEC3_INPUT1, 0x00000000 }, + { CS42L43B_ISRC1DEC4_INPUT1, 0x00000000 }, { CS42L43_ISRC2INT1_INPUT1, 0x00000000 }, { CS42L43_ISRC2INT2_INPUT1, 0x00000000 }, { CS42L43_ISRC2DEC1_INPUT1, 0x00000000 }, { CS42L43_ISRC2DEC2_INPUT1, 0x00000000 }, + { CS42L43B_ISRC2DEC3_INPUT1, 0x00000000 }, + { CS42L43B_ISRC2DEC4_INPUT1, 0x00000000 }, { CS42L43_EQ1MIX_INPUT1, 0x00800000 }, { CS42L43_EQ1MIX_INPUT2, 0x00800000 }, { CS42L43_EQ1MIX_INPUT3, 0x00800000 }, @@ -269,6 +282,8 @@ EXPORT_SYMBOL_NS_GPL(cs42l43_reg_default, "MFD_CS42L43"); bool 
cs42l43_readable_register(struct device *dev, unsigned int reg) { + struct cs42l43 *cs42l43 = dev_get_drvdata(dev); + switch (reg) { case CS42L43_DEVID: case CS42L43_REVID: @@ -292,7 +307,6 @@ bool cs42l43_readable_register(struct device *dev, unsigned int reg) case CS42L43_ADC_B_CTRL1 ... CS42L43_ADC_B_CTRL2: case CS42L43_DECIM_HPF_WNF_CTRL1 ... CS42L43_DECIM_HPF_WNF_CTRL4: case CS42L43_DMIC_PDM_CTRL: - case CS42L43_DECIM_VOL_CTRL_CH1_CH2 ... CS42L43_DECIM_VOL_CTRL_CH3_CH4: case CS42L43_INTP_VOLUME_CTRL1 ... CS42L43_INTP_VOLUME_CTRL2: case CS42L43_AMP1_2_VOL_RAMP: case CS42L43_ASP_CTRL: @@ -387,8 +401,16 @@ bool cs42l43_readable_register(struct device *dev, unsigned int reg) case CS42L43_BOOT_CONTROL: case CS42L43_BLOCK_EN: case CS42L43_SHUTTER_CONTROL: - case CS42L43_MCU_SW_REV ... CS42L43_MCU_RAM_MAX: - return true; + case CS42L43B_MCU_SW_REV ... CS42L43B_MCU_RAM_MAX: + return true; // registers present on all variants + case CS42L43_MCU_SW_REV ... CS42L43B_MCU_SW_REV - 1: + case CS42L43B_MCU_RAM_MAX + 1 ... CS42L43_MCU_RAM_MAX: + case CS42L43_DECIM_VOL_CTRL_CH1_CH2 ... CS42L43_DECIM_VOL_CTRL_CH3_CH4: + return cs42l43->variant_id == CS42L43_DEVID_VAL; // regs only in CS42L43 variant + case CS42L43B_DECIM_VOL_CTRL_CH1_CH2 ... CS42L43B_DECIM_HPF_WNF_CTRL6: + case CS42L43B_SWIRE_DP3_CH3_INPUT ... CS42L43B_SWIRE_DP4_CH4_INPUT: + case CS42L43B_ISRC1DEC3_INPUT1 ... 
CS42L43B_ISRC2DEC4_INPUT1: + return cs42l43->variant_id == CS42L43B_DEVID_VAL; // regs only in CS42L43B variant default: return false; } @@ -597,15 +619,27 @@ static int cs42l43_wait_for_attach(struct cs42l43 *cs42l43) static int cs42l43_mcu_stage_2_3(struct cs42l43 *cs42l43, bool shadow) { unsigned int need_reg = CS42L43_NEED_CONFIGS; + unsigned int boot_reg; unsigned int val; int ret; - if (shadow) - need_reg = CS42L43_FW_SH_BOOT_CFG_NEED_CONFIGS; + switch (cs42l43->variant_id) { + case CS42L43_DEVID_VAL: + if (shadow) + need_reg = CS42L43_FW_SH_BOOT_CFG_NEED_CONFIGS; + boot_reg = CS42L43_BOOT_STATUS; + break; + case CS42L43B_DEVID_VAL: + need_reg = CS42L43B_NEED_CONFIGS; + boot_reg = CS42L43B_BOOT_STATUS; + break; + default: + return -EINVAL; + } regmap_write(cs42l43->regmap, need_reg, 0); - ret = regmap_read_poll_timeout(cs42l43->regmap, CS42L43_BOOT_STATUS, + ret = regmap_read_poll_timeout(cs42l43->regmap, boot_reg, val, (val == CS42L43_MCU_BOOT_STAGE3), CS42L43_MCU_POLL_US, CS42L43_MCU_CMD_TIMEOUT_US); if (ret) { @@ -644,13 +678,25 @@ static int cs42l43_mcu_stage_3_2(struct cs42l43 *cs42l43) */ static int cs42l43_mcu_disable(struct cs42l43 *cs42l43) { - unsigned int val; + unsigned int val, cfg_reg, ctrl_reg; int ret; - regmap_write(cs42l43->regmap, CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_REG, - CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_DISABLE_VAL); - regmap_write(cs42l43->regmap, CS42L43_FW_MISSION_CTRL_MM_CTRL_SELECTION, - CS42L43_FW_MM_CTRL_MCU_SEL_MASK); + switch (cs42l43->variant_id) { + case CS42L43_DEVID_VAL: + cfg_reg = CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_REG; + ctrl_reg = CS42L43_FW_MISSION_CTRL_MM_CTRL_SELECTION; + break; + case CS42L43B_DEVID_VAL: + cfg_reg = CS42L43B_FW_MISSION_CTRL_MM_MCU_CFG_REG; + ctrl_reg = CS42L43B_FW_MISSION_CTRL_MM_CTRL_SELECTION; + break; + default: + return -EINVAL; + } + + regmap_write(cs42l43->regmap, cfg_reg, CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_DISABLE_VAL); + regmap_write(cs42l43->regmap, ctrl_reg, 
CS42L43_FW_MM_CTRL_MCU_SEL_MASK); + regmap_write(cs42l43->regmap, CS42L43_MCU_SW_INTERRUPT, CS42L43_CONTROL_IND_MASK); regmap_write(cs42l43->regmap, CS42L43_MCU_SW_INTERRUPT, 0); @@ -740,18 +786,32 @@ static int cs42l43_mcu_update_step(struct cs42l43 *cs42l43) { unsigned int mcu_rev, bios_rev, boot_status, secure_cfg; bool patched, shadow; + int boot_status_reg, mcu_sw_rev_reg; int ret; + switch (cs42l43->variant_id) { + case CS42L43_DEVID_VAL: + boot_status_reg = CS42L43_BOOT_STATUS; + mcu_sw_rev_reg = CS42L43_MCU_SW_REV; + break; + case CS42L43B_DEVID_VAL: + boot_status_reg = CS42L43B_BOOT_STATUS; + mcu_sw_rev_reg = CS42L43B_MCU_SW_REV; + break; + default: + return -EINVAL; + } + /* Clear any stale software interrupt bits. */ regmap_read(cs42l43->regmap, CS42L43_SOFT_INT, &mcu_rev); - ret = regmap_read(cs42l43->regmap, CS42L43_BOOT_STATUS, &boot_status); + ret = regmap_read(cs42l43->regmap, boot_status_reg, &boot_status); if (ret) { dev_err(cs42l43->dev, "Failed to read boot status: %d\n", ret); return ret; } - ret = regmap_read(cs42l43->regmap, CS42L43_MCU_SW_REV, &mcu_rev); + ret = regmap_read(cs42l43->regmap, mcu_sw_rev_reg, &mcu_rev); if (ret) { dev_err(cs42l43->dev, "Failed to read firmware revision: %d\n", ret); return ret; @@ -918,6 +978,13 @@ static void cs42l43_boot_work(struct work_struct *work) switch (devid) { case CS42L43_DEVID_VAL: + case CS42L43B_DEVID_VAL: + if (devid != cs42l43->variant_id) { + dev_err(cs42l43->dev, + "Device ID (0x%06x) does not match variant ID (0x%06lx)\n", + devid, cs42l43->variant_id); + goto err; + } break; default: dev_err(cs42l43->dev, "Unrecognised devid: 0x%06x\n", devid); diff --git a/drivers/mfd/cs42l43.h b/drivers/mfd/cs42l43.h index f3da783930f5..a0068f6572e2 100644 --- a/drivers/mfd/cs42l43.h +++ b/drivers/mfd/cs42l43.h @@ -9,7 +9,7 @@ #ifndef CS42L43_CORE_INT_H #define CS42L43_CORE_INT_H -#define CS42L43_N_DEFAULTS 176 +#define CS42L43_N_DEFAULTS 189 struct dev_pm_ops; struct device; diff --git 
a/include/linux/mfd/cs42l43-regs.h b/include/linux/mfd/cs42l43-regs.h index c39a49269cb7..68831f113589 100644 --- a/include/linux/mfd/cs42l43-regs.h +++ b/include/linux/mfd/cs42l43-regs.h @@ -1181,4 +1181,80 @@ /* CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_REG */ #define CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_DISABLE_VAL 0xF05AA50F +/* CS42L43B VARIANT REGISTERS */ +#define CS42L43B_DEVID_VAL 0x0042A43B + +#define CS42L43B_DECIM_VOL_CTRL_CH1_CH2 0x00008280 +#define CS42L43B_DECIM_VOL_CTRL_CH3_CH4 0x00008284 + +#define CS42L43B_DECIM_VOL_CTRL_CH5_CH6 0x00008290 +#define CS42L43B_DECIM_VOL_CTRL_UPDATE 0x0000829C + +#define CS42L43B_DECIM_HPF_WNF_CTRL5 0x000082A0 +#define CS42L43B_DECIM_HPF_WNF_CTRL6 0x000082A4 + +#define CS42L43B_SWIRE_DP3_CH3_INPUT 0x0000C320 +#define CS42L43B_SWIRE_DP3_CH4_INPUT 0x0000C330 +#define CS42L43B_SWIRE_DP4_CH3_INPUT 0x0000C340 +#define CS42L43B_SWIRE_DP4_CH4_INPUT 0x0000C350 + +#define CS42L43B_ISRC1DEC3_INPUT1 0x0000C780 +#define CS42L43B_ISRC1DEC4_INPUT1 0x0000C790 +#define CS42L43B_ISRC2DEC3_INPUT1 0x0000C7A0 +#define CS42L43B_ISRC2DEC4_INPUT1 0x0000C7B0 + +#define CS42L43B_FW_MISSION_CTRL_NEED_CONFIGS 0x00117E00 +#define CS42L43B_FW_MISSION_CTRL_HAVE_CONFIGS 0x00117E04 +#define CS42L43B_FW_MISSION_CTRL_PATCH_START_ADDR_REG 0x00117E08 +#define CS42L43B_FW_MISSION_CTRL_MM_CTRL_SELECTION 0x00117E0C +#define CS42L43B_FW_MISSION_CTRL_MM_MCU_CFG_REG 0x00117E10 + +#define CS42L43B_MCU_SW_REV 0x00117314 +#define CS42L43B_PATCH_START_ADDR 0x00117318 +#define CS42L43B_CONFIG_SELECTION 0x0011731C +#define CS42L43B_NEED_CONFIGS 0x00117320 +#define CS42L43B_BOOT_STATUS 0x00117330 + +#define CS42L43B_FW_MISSION_CTRL_NEED_CONFIGS 0x00117E00 +#define CS42L43B_FW_MISSION_CTRL_HAVE_CONFIGS 0x00117E04 +#define CS42L43B_FW_MISSION_CTRL_PATCH_START_ADDR_REG 0x00117E08 +#define CS42L43B_FW_MISSION_CTRL_MM_CTRL_SELECTION 0x00117E0C +#define CS42L43B_FW_MISSION_CTRL_MM_MCU_CFG_REG 0x00117E10 + +#define CS42L43B_MCU_RAM_MAX 0x00117FFF + +/* 
CS42L43B_DECIM_DECIM_VOL_CTRL_CH5_CH6 */ +#define CS42L43B_DECIM6_MUTE_MASK 0x80000000 +#define CS42L43B_DECIM6_MUTE_SHIFT 31 +#define CS42L43B_DECIM6_VOL_MASK 0x3FC00000 +#define CS42L43B_DECIM6_VOL_SHIFT 22 +#define CS42L43B_DECIM6_PATH1_VOL_FALL_RATE_MASK 0x00380000 +#define CS42L43B_DECIM6_PATH1_VOL_FALL_RATE_SHIFT 19 +#define CS42L43B_DECIM6_PATH1_VOL_RISE_RATE_MASK 0x00070000 +#define CS42L43B_DECIM6_PATH1_VOL_RISE_RATE_SHIFT 16 +#define CS42L43B_DECIM5_MUTE_MASK 0x00008000 +#define CS42L43B_DECIM5_MUTE_SHIFT 15 +#define CS42L43B_DECIM5_VOL_MASK 0x00003FC0 +#define CS42L43B_DECIM5_VOL_SHIFT 6 +#define CS42L43B_DECIM5_PATH1_VOL_FALL_RATE_MASK 0x00000038 +#define CS42L43B_DECIM5_PATH1_VOL_FALL_RATE_SHIFT 3 +#define CS42L43B_DECIM5_PATH1_VOL_RISE_RATE_MASK 0x00000007 +#define CS42L43B_DECIM5_PATH1_VOL_RISE_RATE_SHIFT 0 + +/* CS42L43B_DECIM_VOL_CTRL_UPDATE */ +#define CS42L43B_DECIM6_PATH1_VOL_TRIG_MASK 0x00000800 +#define CS42L43B_DECIM6_PATH1_VOL_TRIG_SHIFT 11 +#define CS42L43B_DECIM5_PATH1_VOL_TRIG_MASK 0x00000100 +#define CS42L43B_DECIM5_PATH1_VOL_TRIG_SHIFT 8 +#define CS42L43B_DECIM4_VOL_UPDATE_MASK 0x00000020 +#define CS42L43B_DECIM4_VOL_UPDATE_SHIFT 5 + +/* CS42L43_ISRC1_CTRL..CS42L43_ISRC2_CTRL */ +#define CS42L43B_ISRC_DEC4_EN_MASK 0x00000008 +#define CS42L43B_ISRC_DEC4_EN_SHIFT 3 +#define CS42L43B_ISRC_DEC4_EN_WIDTH 1 +#define CS42L43B_ISRC_DEC3_EN_MASK 0x00000004 +#define CS42L43B_ISRC_DEC3_EN_SHIFT 2 +#define CS42L43B_ISRC_DEC3_EN_WIDTH 1 + #endif /* CS42L43_CORE_REGS_H */ diff --git a/include/linux/mfd/cs42l43.h b/include/linux/mfd/cs42l43.h index 2239d8585e78..ff0f7e365a19 100644 --- a/include/linux/mfd/cs42l43.h +++ b/include/linux/mfd/cs42l43.h @@ -98,6 +98,7 @@ struct cs42l43 { bool sdw_pll_active; bool attached; bool hw_lock; + long variant_id; }; #endif /* CS42L43_CORE_EXT_H */ -- cgit v1.2.3 From c76350e7add86344beae4cd69fffdf63284a4bf5 Mon Sep 17 00:00:00 2001 From: Gary Yang Date: Mon, 2 Mar 2026 14:44:05 +0800 Subject: dt-bindings: soc: 
cix: document the syscon on Sky1 SoC There are two system control on Cix sky1 Soc. One is located in S0 domain, and the other is located in S5 domain. The system control contains resets, usb typeC and more. At this point, only the reset controller is embedded as usb typeC uses it by phandle. Signed-off-by: Gary Yang Reviewed-by: Krzysztof Kozlowski Signed-off-by: Philipp Zabel --- .../bindings/soc/cix/cix,sky1-system-control.yaml | 42 ++++++ .../dt-bindings/reset/cix,sky1-s5-system-control.h | 163 +++++++++++++++++++++ .../dt-bindings/reset/cix,sky1-system-control.h | 41 ++++++ 3 files changed, 246 insertions(+) create mode 100644 Documentation/devicetree/bindings/soc/cix/cix,sky1-system-control.yaml create mode 100644 include/dt-bindings/reset/cix,sky1-s5-system-control.h create mode 100644 include/dt-bindings/reset/cix,sky1-system-control.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/soc/cix/cix,sky1-system-control.yaml b/Documentation/devicetree/bindings/soc/cix/cix,sky1-system-control.yaml new file mode 100644 index 000000000000..a01a515222c6 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/cix/cix,sky1-system-control.yaml @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/soc/cix/cix,sky1-system-control.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Cix Sky1 SoC system control register region + +maintainers: + - Gary Yang + +description: + An wide assortment of registers of the system controller on Sky1 SoC, + including resets, usb, wakeup sources and so on. 
+ +properties: + compatible: + items: + - enum: + - cix,sky1-system-control + - cix,sky1-s5-system-control + - const: syscon + + reg: + maxItems: 1 + + '#reset-cells': + const: 1 + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + syscon@4160000 { + compatible = "cix,sky1-system-control", "syscon"; + reg = <0x4160000 0x100>; + #reset-cells = <1>; + }; diff --git a/include/dt-bindings/reset/cix,sky1-s5-system-control.h b/include/dt-bindings/reset/cix,sky1-s5-system-control.h new file mode 100644 index 000000000000..808bbcbe0c98 --- /dev/null +++ b/include/dt-bindings/reset/cix,sky1-s5-system-control.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Author: Jerry Zhu */ +#ifndef DT_BINDING_RESET_CIX_SKY1_S5_SYSTEM_CONTROL_H +#define DT_BINDING_RESET_CIX_SKY1_S5_SYSTEM_CONTROL_H + +/* reset for csu_pm */ +#define SKY1_CSU_PM_RESET_N 0 +#define SKY1_SENSORFUSION_RESET_N 1 +#define SKY1_SENSORFUSION_NOC_RESET_N 2 + +/* reset group0 for s0 domain modules */ +#define SKY1_DDRC_RESET_N 3 +#define SKY1_GIC_RESET_N 4 +#define SKY1_CI700_RESET_N 5 +#define SKY1_SYS_NI700_RESET_N 6 +#define SKY1_MM_NI700_RESET_N 7 +#define SKY1_PCIE_NI700_RESET_N 8 +#define SKY1_GPU_RESET_N 9 +#define SKY1_NPUTOP_RESET_N 10 +#define SKY1_NPUCORE0_RESET_N 11 +#define SKY1_NPUCORE1_RESET_N 12 +#define SKY1_NPUCORE2_RESET_N 13 +#define SKY1_VPU_RESET_N 14 +#define SKY1_ISP_SRESET_N 15 +#define SKY1_ISP_ARESET_N 16 +#define SKY1_ISP_HRESET_N 17 +#define SKY1_ISP_GDCRESET_N 18 +#define SKY1_DPU_RESET0_N 19 +#define SKY1_DPU_RESET1_N 20 +#define SKY1_DPU_RESET2_N 21 +#define SKY1_DPU_RESET3_N 22 +#define SKY1_DPU_RESET4_N 23 +#define SKY1_DP_RESET0_N 24 +#define SKY1_DP_RESET1_N 25 +#define SKY1_DP_RESET2_N 26 +#define SKY1_DP_RESET3_N 27 +#define SKY1_DP_RESET4_N 28 +#define SKY1_DP_PHY_RST_N 29 + +/* reset group1 for s0 domain modules */ +#define SKY1_AUDIO_HIFI5_RESET_N 30 +#define SKY1_AUDIO_HIFI5_NOC_RESET_N 31 +#define 
SKY1_CSIDPHY_PRST0_N 32 +#define SKY1_CSIDPHY_CMNRST0_N 33 +#define SKY1_CSI0_RST_N 34 +#define SKY1_CSIDPHY_PRST1_N 35 +#define SKY1_CSIDPHY_CMNRST1_N 36 +#define SKY1_CSI1_RST_N 37 +#define SKY1_CSI2_RST_N 38 +#define SKY1_CSI3_RST_N 39 +#define SKY1_CSIBRDGE0_RST_N 40 +#define SKY1_CSIBRDGE1_RST_N 41 +#define SKY1_CSIBRDGE2_RST_N 42 +#define SKY1_CSIBRDGE3_RST_N 43 +#define SKY1_GMAC0_RST_N 44 +#define SKY1_GMAC1_RST_N 45 +#define SKY1_PCIE0_RESET_N 46 +#define SKY1_PCIE1_RESET_N 47 +#define SKY1_PCIE2_RESET_N 48 +#define SKY1_PCIE3_RESET_N 49 +#define SKY1_PCIE4_RESET_N 50 + +/* reset group1 for usb phys */ +#define SKY1_USB_DP_PHY0_PRST_N 51 +#define SKY1_USB_DP_PHY1_PRST_N 52 +#define SKY1_USB_DP_PHY2_PRST_N 53 +#define SKY1_USB_DP_PHY3_PRST_N 54 +#define SKY1_USB_DP_PHY0_RST_N 55 +#define SKY1_USB_DP_PHY1_RST_N 56 +#define SKY1_USB_DP_PHY2_RST_N 57 +#define SKY1_USB_DP_PHY3_RST_N 58 +#define SKY1_USBPHY_SS_PST_N 59 +#define SKY1_USBPHY_SS_RST_N 60 +#define SKY1_USBPHY_HS0_PRST_N 61 +#define SKY1_USBPHY_HS1_PRST_N 62 +#define SKY1_USBPHY_HS2_PRST_N 63 +#define SKY1_USBPHY_HS3_PRST_N 64 +#define SKY1_USBPHY_HS4_PRST_N 65 +#define SKY1_USBPHY_HS5_PRST_N 66 +#define SKY1_USBPHY_HS6_PRST_N 67 +#define SKY1_USBPHY_HS7_PRST_N 68 +#define SKY1_USBPHY_HS8_PRST_N 69 +#define SKY1_USBPHY_HS9_PRST_N 70 + +/* reset group1 for usb controllers */ +#define SKY1_USBC_SS0_PRST_N 71 +#define SKY1_USBC_SS1_PRST_N 72 +#define SKY1_USBC_SS2_PRST_N 73 +#define SKY1_USBC_SS3_PRST_N 74 +#define SKY1_USBC_SS4_PRST_N 75 +#define SKY1_USBC_SS5_PRST_N 76 +#define SKY1_USBC_SS0_RST_N 77 +#define SKY1_USBC_SS1_RST_N 78 +#define SKY1_USBC_SS2_RST_N 79 +#define SKY1_USBC_SS3_RST_N 80 +#define SKY1_USBC_SS4_RST_N 81 +#define SKY1_USBC_SS5_RST_N 82 +#define SKY1_USBC_HS0_PRST_N 83 +#define SKY1_USBC_HS1_PRST_N 84 +#define SKY1_USBC_HS2_PRST_N 85 +#define SKY1_USBC_HS3_PRST_N 86 +#define SKY1_USBC_HS0_RST_N 87 +#define SKY1_USBC_HS1_RST_N 88 +#define SKY1_USBC_HS2_RST_N 89 +#define 
SKY1_USBC_HS3_RST_N 90 + +/* reset group0 for rcsu */ +#define SKY1_AUDIO_RCSU_RESET_N 91 +#define SKY1_CI700_RCSU_RESET_N 92 +#define SKY1_CSI_RCSU0_RESET_N 93 +#define SKY1_CSI_RCSU1_RESET_N 94 +#define SKY1_CSU_PM_RCSU_RESET_N 95 +#define SKY1_DDR_BROADCAST_RCSU_RESET_N 96 +#define SKY1_DDR_CTRL_RCSU_0_RESET_N 97 +#define SKY1_DDR_CTRL_RCSU_1_RESET_N 98 +#define SKY1_DDR_CTRL_RCSU_2_RESET_N 99 +#define SKY1_DDR_CTRL_RCSU_3_RESET_N 100 +#define SKY1_DDR_TZC400_RCSU_0_RESET_N 101 +#define SKY1_DDR_TZC400_RCSU_1_RESET_N 102 +#define SKY1_DDR_TZC400_RCSU_2_RESET_N 103 +#define SKY1_DDR_TZC400_RCSU_3_RESET_N 104 +#define SKY1_DP0_RCSU_RESET_N 105 +#define SKY1_DP1_RCSU_RESET_N 106 +#define SKY1_DP2_RCSU_RESET_N 107 +#define SKY1_DP3_RCSU_RESET_N 108 +#define SKY1_DP4_RCSU_RESET_N 109 +#define SKY1_DPU0_RCSU_RESET_N 110 +#define SKY1_DPU1_RCSU_RESET_N 111 +#define SKY1_DPU2_RCSU_RESET_N 112 +#define SKY1_DPU3_RCSU_RESET_N 113 +#define SKY1_DPU4_RCSU_RESET_N 114 +#define SKY1_DSU_RCSU_RESET_N 115 +#define SKY1_FCH_RCSU_RESET_N 116 +#define SKY1_GICD_RCSU_RESET_N 117 +#define SKY1_GMAC_RCSU_RESET_N 118 +#define SKY1_GPU_RCSU_RESET_N 119 +#define SKY1_ISP_RCSU0_RESET_N 120 +#define SKY1_ISP_RCSU1_RESET_N 121 +#define SKY1_NI700_MMHUB_RCSU_RESET_N 122 + +/* reset group1 for rcsu */ +#define SKY1_NPU_RCSU_RESET_N 123 +#define SKY1_NI700_PCIE_RCSU_RESET_N 124 +#define SKY1_PCIE_X421_RCSU_RESET_N 125 +#define SKY1_PCIE_X8_RCSU_RESET_N 126 +#define SKY1_SF_RCSU_RESET_N 127 +#define SKY1_RCSU_SMMU_MMHUB_RESET_N 128 +#define SKY1_RCSU_SMMU_PCIEHUB_RESET_N 129 +#define SKY1_RCSU_SYSHUB_RESET_N 130 +#define SKY1_NI700_SMN_RCSU_RESET_N 131 +#define SKY1_NI700_SYSHUB_RCSU_RESET_N 132 +#define SKY1_RCSU_USB2_HOST0_RESET_N 133 +#define SKY1_RCSU_USB2_HOST1_RESET_N 134 +#define SKY1_RCSU_USB2_HOST2_RESET_N 135 +#define SKY1_RCSU_USB2_HOST3_RESET_N 136 +#define SKY1_RCSU_USB3_TYPEA_DRD_RESET_N 137 +#define SKY1_RCSU_USB3_TYPEC_DRD_RESET_N 138 +#define SKY1_RCSU_USB3_TYPEC_HOST0_RESET_N 
139 +#define SKY1_RCSU_USB3_TYPEC_HOST1_RESET_N 140 +#define SKY1_RCSU_USB3_TYPEC_HOST2_RESET_N 141 +#define SKY1_VPU_RCSU_RESET_N 142 + +#endif diff --git a/include/dt-bindings/reset/cix,sky1-system-control.h b/include/dt-bindings/reset/cix,sky1-system-control.h new file mode 100644 index 000000000000..7a16fc4ef3b5 --- /dev/null +++ b/include/dt-bindings/reset/cix,sky1-system-control.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Author: Jerry Zhu */ +#ifndef DT_BINDING_RESET_CIX_SKY1_SYSTEM_CONTROL_H +#define DT_BINDING_RESET_CIX_SKY1_SYSTEM_CONTROL_H + +/* func reset for sky1 fch */ +#define SW_I3C0_RST_FUNC_G_N 0 +#define SW_I3C0_RST_FUNC_I_N 1 +#define SW_I3C1_RST_FUNC_G_N 2 +#define SW_I3C1_RST_FUNC_I_N 3 +#define SW_UART0_RST_FUNC_N 4 +#define SW_UART1_RST_FUNC_N 5 +#define SW_UART2_RST_FUNC_N 6 +#define SW_UART3_RST_FUNC_N 7 +#define SW_TIMER_RST_FUNC_N 8 + +/* apb reset for sky1 fch */ +#define SW_I3C0_RST_APB_N 9 +#define SW_I3C1_RST_APB_N 10 +#define SW_DMA_RST_AXI_N 11 +#define SW_UART0_RST_APB_N 12 +#define SW_UART1_RST_APB_N 13 +#define SW_UART2_RST_APB_N 14 +#define SW_UART3_RST_APB_N 15 +#define SW_SPI0_RST_APB_N 16 +#define SW_SPI1_RST_APB_N 17 +#define SW_I2C0_RST_APB_N 18 +#define SW_I2C1_RST_APB_N 19 +#define SW_I2C2_RST_APB_N 20 +#define SW_I2C3_RST_APB_N 21 +#define SW_I2C4_RST_APB_N 22 +#define SW_I2C5_RST_APB_N 23 +#define SW_I2C6_RST_APB_N 24 +#define SW_I2C7_RST_APB_N 25 +#define SW_GPIO_RST_APB_N 26 + +/* fch rst for xspi */ +#define SW_XSPI_REG_RST_N 27 +#define SW_XSPI_SYS_RST_N 28 + +#endif -- cgit v1.2.3 From 3a005126c9d7f30093627a6f329656c358e16b3a Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 25 Feb 2026 16:41:37 -0500 Subject: dmaengine: of_dma: Add devm_of_dma_controller_register() Add a managed API, devm_of_dma_controller_register(), to simplify DMA engine controller registration by automatically handling resource cleanup. 
Signed-off-by: Frank Li Link: https://patch.msgid.link/20260225-mxsdma-module-v3-1-8f798b13baa6@nxp.com Signed-off-by: Vinod Koul --- include/linux/of_dma.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h index fd706cdf255c..16b08234d03b 100644 --- a/include/linux/of_dma.h +++ b/include/linux/of_dma.h @@ -38,6 +38,26 @@ extern int of_dma_controller_register(struct device_node *np, void *data); extern void of_dma_controller_free(struct device_node *np); +static void __of_dma_controller_free(void *np) +{ + of_dma_controller_free(np); +} + +static inline int +devm_of_dma_controller_register(struct device *dev, struct device_node *np, + struct dma_chan *(*of_dma_xlate) + (struct of_phandle_args *, struct of_dma *), + void *data) +{ + int ret; + + ret = of_dma_controller_register(np, of_dma_xlate, data); + if (ret) + return ret; + + return devm_add_action_or_reset(dev, __of_dma_controller_free, np); +} + extern int of_dma_router_register(struct device_node *np, void *(*of_dma_route_allocate) (struct of_phandle_args *, struct of_dma *), @@ -64,6 +84,15 @@ static inline void of_dma_controller_free(struct device_node *np) { } +static inline int +devm_of_dma_controller_register(struct device *dev, struct device_node *np, + struct dma_chan *(*of_dma_xlate) + (struct of_phandle_args *, struct of_dma *), + void *data) +{ + return -ENODEV; +} + static inline int of_dma_router_register(struct device_node *np, void *(*of_dma_route_allocate) (struct of_phandle_args *, struct of_dma *), -- cgit v1.2.3 From 9ded47ad003f09a94b6a710b5c47f4aa5ceb7429 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 24 Feb 2026 09:25:54 +0100 Subject: fbdev: defio: Disconnect deferred I/O from the lifetime of struct fb_info Hold state of deferred I/O in struct fb_deferred_io_state. 
Allocate an instance as part of initializing deferred I/O and remove it only after the final mapping has been closed. If the fb_info and the contained deferred I/O meanwhile goes away, clear struct fb_deferred_io_state.info to invalidate the mapping. Any access will then result in a SIGBUS signal. Fixes a long-standing problem, where a device hot-unplug happens while user space still has an active mapping of the graphics memory. The hot- unplug frees the instance of struct fb_info. Accessing the memory will operate on undefined state. Signed-off-by: Thomas Zimmermann Fixes: 60b59beafba8 ("fbdev: mm: Deferred IO support") Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: stable@vger.kernel.org # v2.6.22+ Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fb_defio.c | 178 ++++++++++++++++++++++++++++-------- include/linux/fb.h | 4 +- 2 files changed, 145 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index ca48b89a323d..93bd2f696fa4 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -24,6 +24,75 @@ #include #include +/* + * struct fb_deferred_io_state + */ + +struct fb_deferred_io_state { + struct kref ref; + + struct mutex lock; /* mutex that protects the pageref list */ + /* fields protected by lock */ + struct fb_info *info; +}; + +static struct fb_deferred_io_state *fb_deferred_io_state_alloc(void) +{ + struct fb_deferred_io_state *fbdefio_state; + + fbdefio_state = kzalloc_obj(*fbdefio_state); + if (!fbdefio_state) + return NULL; + + kref_init(&fbdefio_state->ref); + mutex_init(&fbdefio_state->lock); + + return fbdefio_state; +} + +static void fb_deferred_io_state_release(struct fb_deferred_io_state *fbdefio_state) +{ + mutex_destroy(&fbdefio_state->lock); + + kfree(fbdefio_state); +} + +static void fb_deferred_io_state_get(struct fb_deferred_io_state *fbdefio_state) +{ + 
kref_get(&fbdefio_state->ref); +} + +static void __fb_deferred_io_state_release(struct kref *ref) +{ + struct fb_deferred_io_state *fbdefio_state = + container_of(ref, struct fb_deferred_io_state, ref); + + fb_deferred_io_state_release(fbdefio_state); +} + +static void fb_deferred_io_state_put(struct fb_deferred_io_state *fbdefio_state) +{ + kref_put(&fbdefio_state->ref, __fb_deferred_io_state_release); +} + +/* + * struct vm_operations_struct + */ + +static void fb_deferred_io_vm_open(struct vm_area_struct *vma) +{ + struct fb_deferred_io_state *fbdefio_state = vma->vm_private_data; + + fb_deferred_io_state_get(fbdefio_state); +} + +static void fb_deferred_io_vm_close(struct vm_area_struct *vma) +{ + struct fb_deferred_io_state *fbdefio_state = vma->vm_private_data; + + fb_deferred_io_state_put(fbdefio_state); +} + static struct page *fb_deferred_io_get_page(struct fb_info *info, unsigned long offs) { struct fb_deferred_io *fbdefio = info->fbdefio; @@ -121,25 +190,46 @@ static void fb_deferred_io_pageref_put(struct fb_deferred_io_pageref *pageref, /* this is to find and return the vmalloc-ed fb pages */ static vm_fault_t fb_deferred_io_fault(struct vm_fault *vmf) { + struct fb_info *info; unsigned long offset; struct page *page; - struct fb_info *info = vmf->vma->vm_private_data; + vm_fault_t ret; + struct fb_deferred_io_state *fbdefio_state = vmf->vma->vm_private_data; + + mutex_lock(&fbdefio_state->lock); + + info = fbdefio_state->info; + if (!info) { + ret = VM_FAULT_SIGBUS; /* our device is gone */ + goto err_mutex_unlock; + } offset = vmf->pgoff << PAGE_SHIFT; - if (offset >= info->fix.smem_len) - return VM_FAULT_SIGBUS; + if (offset >= info->fix.smem_len) { + ret = VM_FAULT_SIGBUS; + goto err_mutex_unlock; + } page = fb_deferred_io_get_page(info, offset); - if (!page) - return VM_FAULT_SIGBUS; + if (!page) { + ret = VM_FAULT_SIGBUS; + goto err_mutex_unlock; + } if (!vmf->vma->vm_file) fb_err(info, "no mapping available\n"); BUG_ON(!info->fbdefio->mapping); + 
mutex_unlock(&fbdefio_state->lock); + vmf->page = page; + return 0; + +err_mutex_unlock: + mutex_unlock(&fbdefio_state->lock); + return ret; } int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasync) @@ -166,15 +256,24 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); * Adds a page to the dirty list. Call this from struct * vm_operations_struct.page_mkwrite. */ -static vm_fault_t fb_deferred_io_track_page(struct fb_info *info, unsigned long offset, - struct page *page) +static vm_fault_t fb_deferred_io_track_page(struct fb_deferred_io_state *fbdefio_state, + unsigned long offset, struct page *page) { - struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_info *info; + struct fb_deferred_io *fbdefio; struct fb_deferred_io_pageref *pageref; vm_fault_t ret; /* protect against the workqueue changing the page list */ - mutex_lock(&fbdefio->lock); + mutex_lock(&fbdefio_state->lock); + + info = fbdefio_state->info; + if (!info) { + ret = VM_FAULT_SIGBUS; /* our device is gone */ + goto err_mutex_unlock; + } + + fbdefio = info->fbdefio; pageref = fb_deferred_io_pageref_get(info, offset, page); if (WARN_ON_ONCE(!pageref)) { @@ -192,50 +291,38 @@ static vm_fault_t fb_deferred_io_track_page(struct fb_info *info, unsigned long */ lock_page(pageref->page); - mutex_unlock(&fbdefio->lock); + mutex_unlock(&fbdefio_state->lock); /* come back after delay to process the deferred IO */ schedule_delayed_work(&info->deferred_work, fbdefio->delay); return VM_FAULT_LOCKED; err_mutex_unlock: - mutex_unlock(&fbdefio->lock); + mutex_unlock(&fbdefio_state->lock); return ret; } -/* - * fb_deferred_io_page_mkwrite - Mark a page as written for deferred I/O - * @fb_info: The fbdev info structure - * @vmf: The VM fault - * - * This is a callback we get when userspace first tries to - * write to the page. We schedule a workqueue. That workqueue - * will eventually mkclean the touched pages and execute the - * deferred framebuffer IO. 
Then if userspace touches a page - * again, we repeat the same scheme. - * - * Returns: - * VM_FAULT_LOCKED on success, or a VM_FAULT error otherwise. - */ -static vm_fault_t fb_deferred_io_page_mkwrite(struct fb_info *info, struct vm_fault *vmf) +static vm_fault_t fb_deferred_io_page_mkwrite(struct fb_deferred_io_state *fbdefio_state, + struct vm_fault *vmf) { unsigned long offset = vmf->pgoff << PAGE_SHIFT; struct page *page = vmf->page; file_update_time(vmf->vma->vm_file); - return fb_deferred_io_track_page(info, offset, page); + return fb_deferred_io_track_page(fbdefio_state, offset, page); } -/* vm_ops->page_mkwrite handler */ static vm_fault_t fb_deferred_io_mkwrite(struct vm_fault *vmf) { - struct fb_info *info = vmf->vma->vm_private_data; + struct fb_deferred_io_state *fbdefio_state = vmf->vma->vm_private_data; - return fb_deferred_io_page_mkwrite(info, vmf); + return fb_deferred_io_page_mkwrite(fbdefio_state, vmf); } static const struct vm_operations_struct fb_deferred_io_vm_ops = { + .open = fb_deferred_io_vm_open, + .close = fb_deferred_io_vm_close, .fault = fb_deferred_io_fault, .page_mkwrite = fb_deferred_io_mkwrite, }; @@ -252,7 +339,10 @@ int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); if (!(info->flags & FBINFO_VIRTFB)) vm_flags_set(vma, VM_IO); - vma->vm_private_data = info; + vma->vm_private_data = info->fbdefio_state; + + fb_deferred_io_state_get(info->fbdefio_state); /* released in vma->vm_ops->close() */ + return 0; } EXPORT_SYMBOL_GPL(fb_deferred_io_mmap); @@ -263,9 +353,10 @@ static void fb_deferred_io_work(struct work_struct *work) struct fb_info *info = container_of(work, struct fb_info, deferred_work.work); struct fb_deferred_io_pageref *pageref, *next; struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; /* here we wrprotect the page's mappings, then do all deferred IO. 
*/ - mutex_lock(&fbdefio->lock); + mutex_lock(&fbdefio_state->lock); #ifdef CONFIG_MMU list_for_each_entry(pageref, &fbdefio->pagereflist, list) { struct page *page = pageref->page; @@ -283,12 +374,13 @@ static void fb_deferred_io_work(struct work_struct *work) list_for_each_entry_safe(pageref, next, &fbdefio->pagereflist, list) fb_deferred_io_pageref_put(pageref, info); - mutex_unlock(&fbdefio->lock); + mutex_unlock(&fbdefio_state->lock); } int fb_deferred_io_init(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_deferred_io_state *fbdefio_state; struct fb_deferred_io_pageref *pagerefs; unsigned long npagerefs; int ret; @@ -298,7 +390,11 @@ int fb_deferred_io_init(struct fb_info *info) if (WARN_ON(!info->fix.smem_len)) return -EINVAL; - mutex_init(&fbdefio->lock); + fbdefio_state = fb_deferred_io_state_alloc(); + if (!fbdefio_state) + return -ENOMEM; + fbdefio_state->info = info; + INIT_DELAYED_WORK(&info->deferred_work, fb_deferred_io_work); INIT_LIST_HEAD(&fbdefio->pagereflist); if (fbdefio->delay == 0) /* set a default of 1 s */ @@ -315,10 +411,12 @@ int fb_deferred_io_init(struct fb_info *info) info->npagerefs = npagerefs; info->pagerefs = pagerefs; + info->fbdefio_state = fbdefio_state; + return 0; err: - mutex_destroy(&fbdefio->lock); + fb_deferred_io_state_release(fbdefio_state); return ret; } EXPORT_SYMBOL_GPL(fb_deferred_io_init); @@ -352,11 +450,19 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_release); void fb_deferred_io_cleanup(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; fb_deferred_io_lastclose(info); + info->fbdefio_state = NULL; + + mutex_lock(&fbdefio_state->lock); + fbdefio_state->info = NULL; + mutex_unlock(&fbdefio_state->lock); + + fb_deferred_io_state_put(fbdefio_state); + kvfree(info->pagerefs); - mutex_destroy(&fbdefio->lock); fbdefio->mapping = NULL; } EXPORT_SYMBOL_GPL(fb_deferred_io_cleanup); diff --git 
a/include/linux/fb.h b/include/linux/fb.h index 6d4a58084fd5..aed17567fe50 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -218,13 +218,14 @@ struct fb_deferred_io { unsigned long delay; bool sort_pagereflist; /* sort pagelist by offset */ int open_count; /* number of opened files; protected by fb_info lock */ - struct mutex lock; /* mutex that protects the pageref list */ struct list_head pagereflist; /* list of pagerefs for touched pages */ struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); }; + +struct fb_deferred_io_state; #endif /* @@ -487,6 +488,7 @@ struct fb_info { unsigned long npagerefs; struct fb_deferred_io_pageref *pagerefs; struct fb_deferred_io *fbdefio; + struct fb_deferred_io_state *fbdefio_state; #endif const struct fb_ops *fbops; -- cgit v1.2.3 From 648bfb62da4e7a970f6b153bb8cdab1703877fcd Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 24 Feb 2026 09:25:56 +0100 Subject: fbdev: defio: Move variable state into struct fb_deferred_io_state Move variable fields from struct fb_deferred_io into struct fb_deferred_io_state. These fields are internal to the defio code and should not be exposed to drivers. At some later point, struct fb_defered_io might become const in all defio code. 
Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fb_defio.c | 37 ++++++++++++++++++++++--------------- include/linux/fb.h | 3 --- 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 56030eb42f71..35ac13727da1 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -25,6 +25,8 @@ #include #include +struct address_space; + /* * struct fb_deferred_io_state */ @@ -32,9 +34,13 @@ struct fb_deferred_io_state { struct kref ref; + int open_count; /* number of opened files; protected by fb_info lock */ + struct address_space *mapping; /* page cache object for fb device */ + struct mutex lock; /* mutex that protects the pageref list */ /* fields protected by lock */ struct fb_info *info; + struct list_head pagereflist; /* list of pagerefs for touched pages */ }; static struct fb_deferred_io_state *fb_deferred_io_state_alloc(void) @@ -48,11 +54,14 @@ static struct fb_deferred_io_state *fb_deferred_io_state_alloc(void) kref_init(&fbdefio_state->ref); mutex_init(&fbdefio_state->lock); + INIT_LIST_HEAD(&fbdefio_state->pagereflist); + return fbdefio_state; } static void fb_deferred_io_state_release(struct fb_deferred_io_state *fbdefio_state) { + WARN_ON(!list_empty(&fbdefio_state->pagereflist)); mutex_destroy(&fbdefio_state->lock); kfree(fbdefio_state); @@ -147,7 +156,8 @@ static struct fb_deferred_io_pageref *fb_deferred_io_pageref_get(struct fb_info struct page *page) { struct fb_deferred_io *fbdefio = info->fbdefio; - struct list_head *pos = &fbdefio->pagereflist; + struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; + struct list_head *pos = &fbdefio_state->pagereflist; struct fb_deferred_io_pageref *pageref, *cur; pageref = fb_deferred_io_pageref_lookup(info, offset, page); @@ -171,7 +181,7 @@ static struct fb_deferred_io_pageref *fb_deferred_io_pageref_get(struct 
fb_info * pages. If possible, drivers should try to work with * unsorted page lists instead. */ - list_for_each_entry(cur, &fbdefio->pagereflist, list) { + list_for_each_entry(cur, &fbdefio_state->pagereflist, list) { if (cur->offset > pageref->offset) break; } @@ -222,7 +232,7 @@ static vm_fault_t fb_deferred_io_fault(struct vm_fault *vmf) if (!vmf->vma->vm_file) fb_err(info, "no mapping available\n"); - BUG_ON(!info->fbdefio->mapping); + fb_WARN_ON_ONCE(info, !fbdefio_state->mapping); mutex_unlock(&fbdefio_state->lock); @@ -364,20 +374,20 @@ static void fb_deferred_io_work(struct work_struct *work) /* here we wrprotect the page's mappings, then do all deferred IO. */ mutex_lock(&fbdefio_state->lock); #ifdef CONFIG_MMU - list_for_each_entry(pageref, &fbdefio->pagereflist, list) { + list_for_each_entry(pageref, &fbdefio_state->pagereflist, list) { struct page *page = pageref->page; pgoff_t pgoff = pageref->offset >> PAGE_SHIFT; - mapping_wrprotect_range(fbdefio->mapping, pgoff, + mapping_wrprotect_range(fbdefio_state->mapping, pgoff, page_to_pfn(page), 1); } #endif /* driver's callback with pagereflist */ - fbdefio->deferred_io(info, &fbdefio->pagereflist); + fbdefio->deferred_io(info, &fbdefio_state->pagereflist); /* clear the list */ - list_for_each_entry_safe(pageref, next, &fbdefio->pagereflist, list) + list_for_each_entry_safe(pageref, next, &fbdefio_state->pagereflist, list) fb_deferred_io_pageref_put(pageref, info); mutex_unlock(&fbdefio_state->lock); @@ -402,7 +412,6 @@ int fb_deferred_io_init(struct fb_info *info) fbdefio_state->info = info; INIT_DELAYED_WORK(&info->deferred_work, fb_deferred_io_work); - INIT_LIST_HEAD(&fbdefio->pagereflist); if (fbdefio->delay == 0) /* set a default of 1 s */ fbdefio->delay = HZ; @@ -431,11 +440,11 @@ void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file) { - struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; - 
fbdefio->mapping = file->f_mapping; + fbdefio_state->mapping = file->f_mapping; file->f_mapping->a_ops = &fb_deferred_io_aops; - fbdefio->open_count++; + fbdefio_state->open_count++; } EXPORT_SYMBOL_GPL(fb_deferred_io_open); @@ -446,16 +455,15 @@ static void fb_deferred_io_lastclose(struct fb_info *info) void fb_deferred_io_release(struct fb_info *info) { - struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; - if (!--fbdefio->open_count) + if (!--fbdefio_state->open_count) fb_deferred_io_lastclose(info); } EXPORT_SYMBOL_GPL(fb_deferred_io_release); void fb_deferred_io_cleanup(struct fb_info *info) { - struct fb_deferred_io *fbdefio = info->fbdefio; struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; fb_deferred_io_lastclose(info); @@ -469,6 +477,5 @@ void fb_deferred_io_cleanup(struct fb_info *info) fb_deferred_io_state_put(fbdefio_state); kvfree(info->pagerefs); - fbdefio->mapping = NULL; } EXPORT_SYMBOL_GPL(fb_deferred_io_cleanup); diff --git a/include/linux/fb.h b/include/linux/fb.h index aed17567fe50..2791777f3a50 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -217,9 +217,6 @@ struct fb_deferred_io { /* delay between mkwrite and deferred handler */ unsigned long delay; bool sort_pagereflist; /* sort pagelist by offset */ - int open_count; /* number of opened files; protected by fb_info lock */ - struct list_head pagereflist; /* list of pagerefs for touched pages */ - struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); -- cgit v1.2.3 From 02fe86e5fc22faee9b29e761d9fdd17fdfe583e6 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 24 Feb 2026 09:25:57 +0100 Subject: fbdev: defio: Move pageref array to struct fb_deferred_io_state The pageref array stores all pageref structures for a device's defio 
helpers. Move it into struct fb_deferred_io_state to not expose it to drivers. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fb_defio.c | 55 ++++++++++++++++++------------------- include/linux/fb.h | 2 -- 2 files changed, 27 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 35ac13727da1..a12dd25ab697 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -41,28 +41,46 @@ struct fb_deferred_io_state { /* fields protected by lock */ struct fb_info *info; struct list_head pagereflist; /* list of pagerefs for touched pages */ + unsigned long npagerefs; + struct fb_deferred_io_pageref *pagerefs; }; -static struct fb_deferred_io_state *fb_deferred_io_state_alloc(void) +static struct fb_deferred_io_state *fb_deferred_io_state_alloc(unsigned long len) { struct fb_deferred_io_state *fbdefio_state; + struct fb_deferred_io_pageref *pagerefs; + unsigned long npagerefs; fbdefio_state = kzalloc_obj(*fbdefio_state); if (!fbdefio_state) return NULL; + npagerefs = DIV_ROUND_UP(len, PAGE_SIZE); + + /* alloc a page ref for each page of the display memory */ + pagerefs = kvzalloc_objs(*pagerefs, npagerefs); + if (!pagerefs) + goto err_kfree; + fbdefio_state->npagerefs = npagerefs; + fbdefio_state->pagerefs = pagerefs; + kref_init(&fbdefio_state->ref); mutex_init(&fbdefio_state->lock); INIT_LIST_HEAD(&fbdefio_state->pagereflist); return fbdefio_state; + +err_kfree: + kfree(fbdefio_state); + return NULL; } static void fb_deferred_io_state_release(struct fb_deferred_io_state *fbdefio_state) { WARN_ON(!list_empty(&fbdefio_state->pagereflist)); mutex_destroy(&fbdefio_state->lock); + kvfree(fbdefio_state->pagerefs); kfree(fbdefio_state); } @@ -125,18 +143,19 @@ static struct page *fb_deferred_io_get_page(struct fb_info *info, unsigned long return page; } -static struct fb_deferred_io_pageref 
*fb_deferred_io_pageref_lookup(struct fb_info *info, - unsigned long offset, - struct page *page) +static struct fb_deferred_io_pageref * +fb_deferred_io_pageref_lookup(struct fb_deferred_io_state *fbdefio_state, unsigned long offset, + struct page *page) { + struct fb_info *info = fbdefio_state->info; unsigned long pgoff = offset >> PAGE_SHIFT; struct fb_deferred_io_pageref *pageref; - if (fb_WARN_ON_ONCE(info, pgoff >= info->npagerefs)) + if (fb_WARN_ON_ONCE(info, pgoff >= fbdefio_state->npagerefs)) return NULL; /* incorrect allocation size */ /* 1:1 mapping between pageref and page offset */ - pageref = &info->pagerefs[pgoff]; + pageref = &fbdefio_state->pagerefs[pgoff]; if (pageref->page) goto out; @@ -160,7 +179,7 @@ static struct fb_deferred_io_pageref *fb_deferred_io_pageref_get(struct fb_info struct list_head *pos = &fbdefio_state->pagereflist; struct fb_deferred_io_pageref *pageref, *cur; - pageref = fb_deferred_io_pageref_lookup(info, offset, page); + pageref = fb_deferred_io_pageref_lookup(fbdefio_state, offset, page); if (!pageref) return NULL; @@ -397,16 +416,13 @@ int fb_deferred_io_init(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; struct fb_deferred_io_state *fbdefio_state; - struct fb_deferred_io_pageref *pagerefs; - unsigned long npagerefs; - int ret; BUG_ON(!fbdefio); if (WARN_ON(!info->fix.smem_len)) return -EINVAL; - fbdefio_state = fb_deferred_io_state_alloc(); + fbdefio_state = fb_deferred_io_state_alloc(info->fix.smem_len); if (!fbdefio_state) return -ENOMEM; fbdefio_state->info = info; @@ -415,24 +431,9 @@ int fb_deferred_io_init(struct fb_info *info) if (fbdefio->delay == 0) /* set a default of 1 s */ fbdefio->delay = HZ; - npagerefs = DIV_ROUND_UP(info->fix.smem_len, PAGE_SIZE); - - /* alloc a page ref for each page of the display memory */ - pagerefs = kvzalloc_objs(*pagerefs, npagerefs); - if (!pagerefs) { - ret = -ENOMEM; - goto err; - } - info->npagerefs = npagerefs; - info->pagerefs = pagerefs; - 
info->fbdefio_state = fbdefio_state; return 0; - -err: - fb_deferred_io_state_release(fbdefio_state); - return ret; } EXPORT_SYMBOL_GPL(fb_deferred_io_init); @@ -475,7 +476,5 @@ void fb_deferred_io_cleanup(struct fb_info *info) mutex_unlock(&fbdefio_state->lock); fb_deferred_io_state_put(fbdefio_state); - - kvfree(info->pagerefs); } EXPORT_SYMBOL_GPL(fb_deferred_io_cleanup); diff --git a/include/linux/fb.h b/include/linux/fb.h index 2791777f3a50..b27943719fab 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -482,8 +482,6 @@ struct fb_info { #ifdef CONFIG_FB_DEFERRED_IO struct delayed_work deferred_work; - unsigned long npagerefs; - struct fb_deferred_io_pageref *pagerefs; struct fb_deferred_io *fbdefio; struct fb_deferred_io_state *fbdefio_state; #endif -- cgit v1.2.3 From 993bcaf32c494014f56357f6cdd87fdfaa4e4d11 Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Thu, 26 Feb 2026 15:21:11 +0200 Subject: mux: Add helper functions for getting optional and selected mux-state In-tree phy-can-transceiver and phy_rcar_gen3_usb2 have already implemented local versions of devm_mux_state_get_optional. The omap-i2c driver gets and selects an optional mux in its probe function without using any helper. Add new helper functions covering both aforementioned use-cases: - mux_control_get_optional: Get a mux-control if specified in dt, return NULL otherwise. - devm_mux_state_get_optional: Get a mux-state if specified in dt, return NULL otherwise. - devm_mux_state_get_selected: Get and select a mux-state specified in dt, return error otherwise. - devm_mux_state_get_optional_selected: Get and select a mux-state if specified in dt, return error or NULL. Existing mux_get helper function is changed to take an extra argument indicating whether the mux is optional. In this case no error is printed, and NULL returned in case of ENOENT. Calling code is adapted to handle NULL return case, and to pass optional argument as required. 
To support automatic deselect for _selected helper, a new structure is created storing an exit pointer similar to clock core which is called on release. To facilitate code sharing between optional/mandatory/selected helpers, a new internal helper function is added to handle quiet (optional) and verbose (mandatory) errors, as well as storing the correct callback for devm release: __devm_mux_state_get Due to this structure devm_mux_state_get_*_selected can no longer print a useful error message when select fails. Instead callers should print errors where needed. Commit e153fdea9db04 ("phy: can-transceiver: Re-instate "mux-states" property presence check") noted that "mux_get() always prints an error message in case of an error, including when the property is not present, confusing the user." The first error message covers the case that a mux name is not matched in dt. The second error message is based on of_parse_phandle_with_args return value. In optional case no error is printed and NULL is returned. This ensures that the new helper functions will not confuse the user either. With the addition of optional helper functions it became clear that drivers should compile and link even if CONFIG_MULTIPLEXER was not enabled. Add stubs for all symbols exported by mux core. Acked-by: Wolfram Sang Signed-off-by: Josua Mayer Signed-off-by: Ulf Hansson --- drivers/mux/core.c | 194 +++++++++++++++++++++++++++++++++++++------ include/linux/mux/consumer.h | 108 +++++++++++++++++++++++- 2 files changed, 271 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/mux/core.c b/drivers/mux/core.c index f09ee8782e3d..23538de2c91b 100644 --- a/drivers/mux/core.c +++ b/drivers/mux/core.c @@ -46,6 +46,16 @@ static const struct class mux_class = { .name = "mux", }; +/** + * struct devm_mux_state_state - Tracks managed resources for mux-state objects. + * @mstate: Pointer to a mux state. + * @exit: An optional callback to execute before free. 
+ */ +struct devm_mux_state_state { + struct mux_state *mstate; + int (*exit)(struct mux_state *mstate); +}; + static DEFINE_IDA(mux_ida); static int __init mux_init(void) @@ -516,17 +526,19 @@ static struct mux_chip *of_find_mux_chip_by_node(struct device_node *np) return dev ? to_mux_chip(dev) : NULL; } -/* +/** * mux_get() - Get the mux-control for a device. * @dev: The device that needs a mux-control. * @mux_name: The name identifying the mux-control. * @state: Pointer to where the requested state is returned, or NULL when * the required multiplexer states are handled by other means. + * @optional: Whether to return NULL and silence errors when mux doesn't exist. * - * Return: A pointer to the mux-control, or an ERR_PTR with a negative errno. + * Return: Pointer to the mux-control on success, an ERR_PTR with a negative + * errno on error, or NULL if optional is true and mux doesn't exist. */ static struct mux_control *mux_get(struct device *dev, const char *mux_name, - unsigned int *state) + unsigned int *state, bool optional) { struct device_node *np = dev->of_node; struct of_phandle_args args; @@ -542,7 +554,9 @@ static struct mux_control *mux_get(struct device *dev, const char *mux_name, else index = of_property_match_string(np, "mux-control-names", mux_name); - if (index < 0) { + if (index < 0 && optional) { + return NULL; + } else if (index < 0) { dev_err(dev, "mux controller '%s' not found\n", mux_name); return ERR_PTR(index); @@ -558,8 +572,12 @@ static struct mux_control *mux_get(struct device *dev, const char *mux_name, "mux-controls", "#mux-control-cells", index, &args); if (ret) { + if (optional && ret == -ENOENT) + return NULL; + dev_err(dev, "%pOF: failed to get mux-%s %s(%i)\n", - np, state ? "state" : "control", mux_name ?: "", index); + np, state ? 
"state" : "control", + mux_name ?: "", index); return ERR_PTR(ret); } @@ -617,10 +635,29 @@ static struct mux_control *mux_get(struct device *dev, const char *mux_name, */ struct mux_control *mux_control_get(struct device *dev, const char *mux_name) { - return mux_get(dev, mux_name, NULL); + struct mux_control *mux = mux_get(dev, mux_name, NULL, false); + + if (!mux) + return ERR_PTR(-ENOENT); + + return mux; } EXPORT_SYMBOL_GPL(mux_control_get); +/** + * mux_control_get_optional() - Get the optional mux-control for a device. + * @dev: The device that needs a mux-control. + * @mux_name: The name identifying the mux-control. + * + * Return: Pointer to the mux-control on success, an ERR_PTR with a negative + * errno on error, or NULL if mux doesn't exist. + */ +struct mux_control *mux_control_get_optional(struct device *dev, const char *mux_name) +{ + return mux_get(dev, mux_name, NULL, true); +} +EXPORT_SYMBOL_GPL(mux_control_get_optional); + /** * mux_control_put() - Put away the mux-control for good. * @mux: The mux-control to put away. @@ -670,14 +707,16 @@ struct mux_control *devm_mux_control_get(struct device *dev, } EXPORT_SYMBOL_GPL(devm_mux_control_get); -/* +/** * mux_state_get() - Get the mux-state for a device. * @dev: The device that needs a mux-state. * @mux_name: The name identifying the mux-state. + * @optional: Whether to return NULL and silence errors when mux doesn't exist. * - * Return: A pointer to the mux-state, or an ERR_PTR with a negative errno. + * Return: Pointer to the mux-state on success, an ERR_PTR with a negative + * errno on error, or NULL if optional is true and mux doesn't exist. 
*/ -static struct mux_state *mux_state_get(struct device *dev, const char *mux_name) +static struct mux_state *mux_state_get(struct device *dev, const char *mux_name, bool optional) { struct mux_state *mstate; @@ -685,12 +724,15 @@ static struct mux_state *mux_state_get(struct device *dev, const char *mux_name) if (!mstate) return ERR_PTR(-ENOMEM); - mstate->mux = mux_get(dev, mux_name, &mstate->state); + mstate->mux = mux_get(dev, mux_name, &mstate->state, optional); if (IS_ERR(mstate->mux)) { int err = PTR_ERR(mstate->mux); kfree(mstate); return ERR_PTR(err); + } else if (!mstate->mux) { + kfree(mstate); + return optional ? NULL : ERR_PTR(-ENOENT); } return mstate; @@ -710,9 +752,66 @@ static void mux_state_put(struct mux_state *mstate) static void devm_mux_state_release(struct device *dev, void *res) { - struct mux_state *mstate = *(struct mux_state **)res; + struct devm_mux_state_state *devm_state = res; + + if (devm_state->exit) + devm_state->exit(devm_state->mstate); + + mux_state_put(devm_state->mstate); +} + +/** + * __devm_mux_state_get() - Get the optional mux-state for a device, + * with resource management. + * @dev: The device that needs a mux-state. + * @mux_name: The name identifying the mux-state. + * @optional: Whether to return NULL and silence errors when mux doesn't exist. + * @init: Optional function pointer for mux-state object initialisation. + * @exit: Optional function pointer for mux-state object cleanup on release. + * + * Return: Pointer to the mux-state on success, an ERR_PTR with a negative + * errno on error, or NULL if optional is true and mux doesn't exist. 
+ */ +static struct mux_state *__devm_mux_state_get(struct device *dev, const char *mux_name, + bool optional, + int (*init)(struct mux_state *mstate), + int (*exit)(struct mux_state *mstate)) +{ + struct devm_mux_state_state *devm_state; + struct mux_state *mstate; + int ret; + + mstate = mux_state_get(dev, mux_name, optional); + if (IS_ERR(mstate)) + return ERR_CAST(mstate); + else if (optional && !mstate) + return NULL; + else if (!mstate) + return ERR_PTR(-ENOENT); + + devm_state = devres_alloc(devm_mux_state_release, sizeof(*devm_state), GFP_KERNEL); + if (!devm_state) { + ret = -ENOMEM; + goto err_devres_alloc; + } + + if (init) { + ret = init(mstate); + if (ret) + goto err_mux_state_init; + } + + devm_state->mstate = mstate; + devm_state->exit = exit; + devres_add(dev, devm_state); + return mstate; + +err_mux_state_init: + devres_free(devm_state); +err_devres_alloc: mux_state_put(mstate); + return ERR_PTR(ret); } /** @@ -722,28 +821,69 @@ static void devm_mux_state_release(struct device *dev, void *res) * @mux_name: The name identifying the mux-control. * * Return: Pointer to the mux-state, or an ERR_PTR with a negative errno. + * + * The mux-state will automatically be freed on release. */ -struct mux_state *devm_mux_state_get(struct device *dev, - const char *mux_name) +struct mux_state *devm_mux_state_get(struct device *dev, const char *mux_name) { - struct mux_state **ptr, *mstate; - - ptr = devres_alloc(devm_mux_state_release, sizeof(*ptr), GFP_KERNEL); - if (!ptr) - return ERR_PTR(-ENOMEM); + return __devm_mux_state_get(dev, mux_name, false, NULL, NULL); +} +EXPORT_SYMBOL_GPL(devm_mux_state_get); - mstate = mux_state_get(dev, mux_name); - if (IS_ERR(mstate)) { - devres_free(ptr); - return mstate; - } +/** + * devm_mux_state_get_optional() - Get the optional mux-state for a device, + * with resource management. + * @dev: The device that needs a mux-state. + * @mux_name: The name identifying the mux-state. 
+ * + * Return: Pointer to the mux-state on success, an ERR_PTR with a negative + * errno on error, or NULL if mux doesn't exist. + * + * The mux-state will automatically be freed on release. + */ +struct mux_state *devm_mux_state_get_optional(struct device *dev, const char *mux_name) +{ + return __devm_mux_state_get(dev, mux_name, true, NULL, NULL); +} +EXPORT_SYMBOL_GPL(devm_mux_state_get_optional); - *ptr = mstate; - devres_add(dev, ptr); +/** + * devm_mux_state_get_selected() - Get the mux-state for a device, with + * resource management. + * @dev: The device that needs a mux-state. + * @mux_name: The name identifying the mux-state. + * + * Return: Pointer to the mux-state, or an ERR_PTR with a negative errno. + * + * The returned mux-state (if valid) is already selected. + * + * The mux-state will automatically be deselected and freed on release. + */ +struct mux_state *devm_mux_state_get_selected(struct device *dev, const char *mux_name) +{ + return __devm_mux_state_get(dev, mux_name, false, mux_state_select, mux_state_deselect); +} +EXPORT_SYMBOL_GPL(devm_mux_state_get_selected); - return mstate; +/** + * devm_mux_state_get_optional_selected() - Get the optional mux-state for + * a device, with resource management. + * @dev: The device that needs a mux-state. + * @mux_name: The name identifying the mux-state. + * + * Return: Pointer to the mux-state on success, an ERR_PTR with a negative + * errno on error, or NULL if mux doesn't exist. + * + * The returned mux-state (if valid) is already selected. + * + * The mux-state will automatically be deselected and freed on release. 
+ */ +struct mux_state *devm_mux_state_get_optional_selected(struct device *dev, + const char *mux_name) +{ + return __devm_mux_state_get(dev, mux_name, true, mux_state_select, mux_state_deselect); } -EXPORT_SYMBOL_GPL(devm_mux_state_get); +EXPORT_SYMBOL_GPL(devm_mux_state_get_optional_selected); /* * Using subsys_initcall instead of module_init here to try to ensure - for diff --git a/include/linux/mux/consumer.h b/include/linux/mux/consumer.h index 2e25c838f831..a961861a503b 100644 --- a/include/linux/mux/consumer.h +++ b/include/linux/mux/consumer.h @@ -16,6 +16,8 @@ struct device; struct mux_control; struct mux_state; +#if IS_ENABLED(CONFIG_MULTIPLEXER) + unsigned int mux_control_states(struct mux_control *mux); int __must_check mux_control_select_delay(struct mux_control *mux, unsigned int state, @@ -54,11 +56,109 @@ int mux_control_deselect(struct mux_control *mux); int mux_state_deselect(struct mux_state *mstate); struct mux_control *mux_control_get(struct device *dev, const char *mux_name); +struct mux_control *mux_control_get_optional(struct device *dev, const char *mux_name); void mux_control_put(struct mux_control *mux); -struct mux_control *devm_mux_control_get(struct device *dev, - const char *mux_name); -struct mux_state *devm_mux_state_get(struct device *dev, - const char *mux_name); +struct mux_control *devm_mux_control_get(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get_optional(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get_selected(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get_optional_selected(struct device *dev, const char *mux_name); + +#else + +static inline unsigned int mux_control_states(struct mux_control *mux) +{ + return 0; +} +static inline int __must_check mux_control_select_delay(struct mux_control *mux, + unsigned int state, unsigned int delay_us) +{ + 
return -EOPNOTSUPP; +} +static inline int __must_check mux_state_select_delay(struct mux_state *mstate, + unsigned int delay_us) +{ + return -EOPNOTSUPP; +} +static inline int __must_check mux_control_try_select_delay(struct mux_control *mux, + unsigned int state, + unsigned int delay_us) +{ + return -EOPNOTSUPP; +} +static inline int __must_check mux_state_try_select_delay(struct mux_state *mstate, + unsigned int delay_us) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_control_select(struct mux_control *mux, + unsigned int state) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_state_select(struct mux_state *mstate) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_control_try_select(struct mux_control *mux, + unsigned int state) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_state_try_select(struct mux_state *mstate) +{ + return -EOPNOTSUPP; +} + +static inline int mux_control_deselect(struct mux_control *mux) +{ + return -EOPNOTSUPP; +} +static inline int mux_state_deselect(struct mux_state *mstate) +{ + return -EOPNOTSUPP; +} + +static inline struct mux_control *mux_control_get(struct device *dev, const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_control *mux_control_get_optional(struct device *dev, + const char *mux_name) +{ + return NULL; +} +static inline void mux_control_put(struct mux_control *mux) {} + +static inline struct mux_control *devm_mux_control_get(struct device *dev, const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_state *devm_mux_state_get(struct device *dev, const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_state *devm_mux_state_get_optional(struct device *dev, + const char *mux_name) +{ + return NULL; +} +static inline struct mux_state *devm_mux_state_get_selected(struct device *dev, + const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct 
mux_state *devm_mux_state_get_optional_selected(struct device *dev, + const char *mux_name) +{ + return NULL; +} + +#endif /* CONFIG_MULTIPLEXER */ #endif /* _LINUX_MUX_CONSUMER_H */ -- cgit v1.2.3 From d9d2455e77d0f36a22b9dbaba8b6354dd1378101 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Feb 2026 23:31:20 +0000 Subject: io_uring/zcrx: move zcrx uapi into separate header Split out zcrx uapi into a separate file. It'll be easier to manage it this way, and that reduces the size of a not so small io_uring.h. Since there are users that expect that zcrx definitions come with io_uring.h, it includes the new file. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 96 +-------------------------------- include/uapi/linux/io_uring/zcrx.h | 108 +++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+), 94 deletions(-) create mode 100644 include/uapi/linux/io_uring/zcrx.h (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1ff16141c8a5..17475c2045fb 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -10,6 +10,8 @@ #include #include +#include + /* * this file is shared with liburing and that has to autodetect * if linux/time_types.h is available or not, it can @@ -1050,100 +1052,6 @@ struct io_timespec { __u64 tv_nsec; }; -/* Zero copy receive refill queue entry */ -struct io_uring_zcrx_rqe { - __u64 off; - __u32 len; - __u32 __pad; -}; - -struct io_uring_zcrx_cqe { - __u64 off; - __u64 __pad; -}; - -/* The bit from which area id is encoded into offsets */ -#define IORING_ZCRX_AREA_SHIFT 48 -#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) - -struct io_uring_zcrx_offsets { - __u32 head; - __u32 tail; - __u32 rqes; - __u32 __resv2; - __u64 __resv[2]; -}; - -enum io_uring_zcrx_area_flags { - IORING_ZCRX_AREA_DMABUF = 1, -}; - -struct io_uring_zcrx_area_reg { - __u64 addr; - __u64 len; - __u64 
rq_area_token; - __u32 flags; - __u32 dmabuf_fd; - __u64 __resv2[2]; -}; - -enum zcrx_reg_flags { - ZCRX_REG_IMPORT = 1, -}; - -enum zcrx_features { - /* - * The user can ask for the desired rx page size by passing the - * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. - */ - ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0, -}; - -/* - * Argument for IORING_REGISTER_ZCRX_IFQ - */ -struct io_uring_zcrx_ifq_reg { - __u32 if_idx; - __u32 if_rxq; - __u32 rq_entries; - __u32 flags; - - __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ - __u64 region_ptr; /* struct io_uring_region_desc * */ - - struct io_uring_zcrx_offsets offsets; - __u32 zcrx_id; - __u32 rx_buf_len; - __u64 __resv[3]; -}; - -enum zcrx_ctrl_op { - ZCRX_CTRL_FLUSH_RQ, - ZCRX_CTRL_EXPORT, - - __ZCRX_CTRL_LAST, -}; - -struct zcrx_ctrl_flush_rq { - __u64 __resv[6]; -}; - -struct zcrx_ctrl_export { - __u32 zcrx_fd; - __u32 __resv1[11]; -}; - -struct zcrx_ctrl { - __u32 zcrx_id; - __u32 op; /* see enum zcrx_ctrl_op */ - __u64 __resv[2]; - - union { - struct zcrx_ctrl_export zc_export; - struct zcrx_ctrl_flush_rq zc_flush; - }; -}; - #ifdef __cplusplus } #endif diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h new file mode 100644 index 000000000000..3163a4b8aeb0 --- /dev/null +++ b/include/uapi/linux/io_uring/zcrx.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ +/* + * Header file for the io_uring zerocopy receive (zcrx) interface. + * + * Copyright (C) 2026 Pavel Begunkov + * Copyright (C) 2026 David Wei + * Copyright (C) Meta Platforms, Inc. 
+ */ +#ifndef LINUX_IO_ZCRX_H +#define LINUX_IO_ZCRX_H + +#include + +/* Zero copy receive refill queue entry */ +struct io_uring_zcrx_rqe { + __u64 off; + __u32 len; + __u32 __pad; +}; + +struct io_uring_zcrx_cqe { + __u64 off; + __u64 __pad; +}; + +/* The bit from which area id is encoded into offsets */ +#define IORING_ZCRX_AREA_SHIFT 48 +#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) + +struct io_uring_zcrx_offsets { + __u32 head; + __u32 tail; + __u32 rqes; + __u32 __resv2; + __u64 __resv[2]; +}; + +enum io_uring_zcrx_area_flags { + IORING_ZCRX_AREA_DMABUF = 1, +}; + +struct io_uring_zcrx_area_reg { + __u64 addr; + __u64 len; + __u64 rq_area_token; + __u32 flags; + __u32 dmabuf_fd; + __u64 __resv2[2]; +}; + +enum zcrx_reg_flags { + ZCRX_REG_IMPORT = 1, +}; + +enum zcrx_features { + /* + * The user can ask for the desired rx page size by passing the + * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. + */ + ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0, +}; + +/* + * Argument for IORING_REGISTER_ZCRX_IFQ + */ +struct io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u32 zcrx_id; + __u32 rx_buf_len; + __u64 __resv[3]; +}; + +enum zcrx_ctrl_op { + ZCRX_CTRL_FLUSH_RQ, + ZCRX_CTRL_EXPORT, + + __ZCRX_CTRL_LAST, +}; + +struct zcrx_ctrl_flush_rq { + __u64 __resv[6]; +}; + +struct zcrx_ctrl_export { + __u32 zcrx_fd; + __u32 __resv1[11]; +}; + +struct zcrx_ctrl { + __u32 zcrx_id; + __u32 op; /* see enum zcrx_ctrl_op */ + __u64 __resv[2]; + + union { + struct zcrx_ctrl_export zc_export; + struct zcrx_ctrl_flush_rq zc_flush; + }; +}; + +#endif /* LINUX_IO_ZCRX_H */ -- cgit v1.2.3 From d8345a21902af5d754f2c2aadf877de989e3cac3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 2 Mar 2026 13:10:37 +0000 Subject: io_uring/timeout: immediate 
timeout arg One of the things the user always has to keep in mind is that any user pointers they put into an SQE are not going to be read by the kernel until submission happens, and the user has to ensure the pointee stays alive until then. For example, the snippet below will lead to UAF of the on stack variable ts. Instead of passing the timeout value as a pointer, allow storing it immediately in the SQE. The user has to set a new flag called IORING_TIMEOUT_IMMEDIATE_ARG, in which case sqe->addr for timeout or sqe->addr2 for timeout update requests will be interpreted as a time value in nanoseconds. void prep_timeout(struct io_uring_sqe *sqe) { struct __kernel_timespec ts = {...}; prep_timeout(sqe, &ts); } void submit() { sqe = get_sqe(); prep_timeout(sqe); io_uring_submit(); } Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++++ io_uring/timeout.c | 20 +++++++++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 17475c2045fb..17ac1b785440 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -343,6 +343,10 @@ enum io_uring_op { /* * sqe->timeout_flags + * + * IORING_TIMEOUT_IMMEDIATE_ARG: If set, sqe->addr stores the timeout + * value in nanoseconds instead of + * pointing to a timespec. 
*/ #define IORING_TIMEOUT_ABS (1U << 0) #define IORING_TIMEOUT_UPDATE (1U << 1) @@ -351,6 +355,7 @@ enum io_uring_op { #define IORING_LINK_TIMEOUT_UPDATE (1U << 4) #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) #define IORING_TIMEOUT_MULTISHOT (1U << 6) +#define IORING_TIMEOUT_IMMEDIATE_ARG (1U << 7) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 4b67746ea3ca..8eddf8add7a2 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -35,10 +35,17 @@ struct io_timeout_rem { bool ltimeout; }; -static int io_parse_user_time(ktime_t *time, u64 arg) +static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags) { struct timespec64 ts; + if (flags & IORING_TIMEOUT_IMMEDIATE_ARG) { + *time = ns_to_ktime(arg); + if (*time < 0) + return -EINVAL; + return 0; + } + if (get_timespec64(&ts, u64_to_user_ptr(arg))) return -EFAULT; if (ts.tv_sec < 0 || ts.tv_nsec < 0) @@ -475,9 +482,11 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) tr->ltimeout = true; - if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK | + IORING_TIMEOUT_ABS | + IORING_TIMEOUT_IMMEDIATE_ARG)) return -EINVAL; - ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2)); + ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2), tr->flags); if (ret) return ret; } else if (tr->flags) { @@ -545,7 +554,8 @@ static int __io_timeout_prep(struct io_kiocb *req, flags = READ_ONCE(sqe->timeout_flags); if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | IORING_TIMEOUT_ETIME_SUCCESS | - IORING_TIMEOUT_MULTISHOT)) + IORING_TIMEOUT_MULTISHOT | + IORING_TIMEOUT_IMMEDIATE_ARG)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if 
(hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) @@ -574,7 +584,7 @@ static int __io_timeout_prep(struct io_kiocb *req, data->req = req; data->flags = flags; - ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr)); + ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr), flags); if (ret) return ret; -- cgit v1.2.3 From 97af961568c8682c44506c9ad4b26c8a5455ec1d Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 9 Mar 2026 12:45:43 +0000 Subject: ASoC: cs35l56: Put OTP register defines in correct address order Move the defines for the OTP registers to keep the register defines in order of increasing address. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260309124543.1135247-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index ae1e1489b671..51b153bb8598 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -32,9 +32,6 @@ struct snd_ctl_elem_value; #define CS35L56_UPDATE_REGS 0x0002A0C #define CS35L56_REFCLK_INPUT 0x0002C04 #define CS35L56_GLOBAL_SAMPLE_RATE 0x0002C0C -#define CS35L56_OTP_MEM_53 0x00300D4 -#define CS35L56_OTP_MEM_54 0x00300D8 -#define CS35L56_OTP_MEM_55 0x00300DC #define CS35L56_ASP1_ENABLES1 0x0004800 #define CS35L56_ASP1_CONTROL1 0x0004804 #define CS35L56_ASP1_CONTROL2 0x0004808 @@ -86,6 +83,9 @@ struct snd_ctl_elem_value; #define CS35L56_DIE_STS1 0x0017040 #define CS35L56_DIE_STS2 0x0017044 #define CS35L56_DSP_RESTRICT_STS1 0x00190F0 +#define CS35L56_OTP_MEM_53 0x00300D4 +#define CS35L56_OTP_MEM_54 0x00300D8 +#define CS35L56_OTP_MEM_55 0x00300DC #define CS35L56_DSP1_XMEM_PACKED_0 0x2000000 #define CS35L56_DSP1_XMEM_PACKED_6143 0x2005FFC #define CS35L56_DSP1_XMEM_UNPACKED32_0 0x2400000 -- cgit v1.2.3 From 7ea25eaad5ae3a6c837a3df9bdb822194f002565 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 
05:20:01 -0800 Subject: block: factor out a bio_integrity_action helper Split the logic to see if a bio needs integrity metadata from bio_integrity_prep into a reusable helper than can be called from file system code. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 64 ++++++------------------------------------- block/bio-integrity.c | 48 ++++++++++++++++++++++++++++++++ block/blk-mq.c | 6 ++-- drivers/nvdimm/btt.c | 6 ++-- include/linux/bio-integrity.h | 5 ++-- include/linux/blk-integrity.h | 23 ++++++++++++++++ 6 files changed, 89 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 44dcdf7520c5..e16f669dbf1e 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -50,11 +50,6 @@ static bool bip_should_check(struct bio_integrity_payload *bip) return bip->bip_flags & BIP_CHECK_FLAGS; } -static bool bi_offload_capable(struct blk_integrity *bi) -{ - return bi->metadata_size == bi->pi_tuple_size; -} - /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio @@ -84,69 +79,27 @@ bool __bio_integrity_endio(struct bio *bio) /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare + * @action: preparation action needed (BI_ACT_*) + * + * Allocate the integrity payload. For writes, generate the integrity metadata + * and for reads, setup the completion handler to verify the metadata. * - * Checks if the bio already has an integrity payload attached. If it does, the - * payload has been generated by another kernel subsystem, and we just pass it - * through. - * Otherwise allocates integrity payload and for writes the integrity metadata - * will be generated. For reads, the completion handler will verify the - * metadata. 
+ * This is used for bios that do not have user integrity payloads attached. */ -bool bio_integrity_prep(struct bio *bio) +void bio_integrity_prep(struct bio *bio, unsigned int action) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_data *bid; - bool set_flags = true; - gfp_t gfp = GFP_NOIO; - - if (!bi) - return true; - - if (!bio_sectors(bio)) - return true; - - /* Already protected? */ - if (bio_integrity(bio)) - return true; - - switch (bio_op(bio)) { - case REQ_OP_READ: - if (bi->flags & BLK_INTEGRITY_NOVERIFY) { - if (bi_offload_capable(bi)) - return true; - set_flags = false; - } - break; - case REQ_OP_WRITE: - /* - * Zero the memory allocated to not leak uninitialized kernel - * memory to disk for non-integrity metadata where nothing else - * initializes the memory. - */ - if (bi->flags & BLK_INTEGRITY_NOGENERATE) { - if (bi_offload_capable(bi)) - return true; - set_flags = false; - gfp |= __GFP_ZERO; - } else if (bi->metadata_size > bi->pi_tuple_size) - gfp |= __GFP_ZERO; - break; - default: - return true; - } - - if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) - return true; bid = mempool_alloc(&bid_pool, GFP_NOIO); bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); bid->bio = bio; bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; - bio_integrity_alloc_buf(bio, gfp & __GFP_ZERO); + bio_integrity_alloc_buf(bio, action & BI_ACT_ZERO); bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); - if (set_flags) { + if (action & BI_ACT_CHECK) { if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bid->bip.bip_flags |= BIP_IP_CHECKSUM; if (bi->csum_type) @@ -160,7 +113,6 @@ bool bio_integrity_prep(struct bio *bio) blk_integrity_generate(bio); else bid->saved_bio_iter = bio->bi_iter; - return true; } EXPORT_SYMBOL(bio_integrity_prep); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 20f5d301d32d..0955be90038b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -7,6 +7,7 @@ */ #include +#include #include "blk.h" struct 
bio_integrity_alloc { @@ -16,6 +17,53 @@ struct bio_integrity_alloc { static mempool_t integrity_buf_pool; +static bool bi_offload_capable(struct blk_integrity *bi) +{ + return bi->metadata_size == bi->pi_tuple_size; +} + +unsigned int __bio_integrity_action(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return 0; + + switch (bio_op(bio)) { + case REQ_OP_READ: + if (bi->flags & BLK_INTEGRITY_NOVERIFY) { + if (bi_offload_capable(bi)) + return 0; + return BI_ACT_BUFFER; + } + return BI_ACT_BUFFER | BI_ACT_CHECK; + case REQ_OP_WRITE: + /* + * Flush masquerading as write? + */ + if (!bio_sectors(bio)) + return 0; + + /* + * Zero the memory allocated to not leak uninitialized kernel + * memory to disk for non-integrity metadata where nothing else + * initializes the memory. + */ + if (bi->flags & BLK_INTEGRITY_NOGENERATE) { + if (bi_offload_capable(bi)) + return 0; + return BI_ACT_BUFFER | BI_ACT_ZERO; + } + + if (bi->metadata_size > bi->pi_tuple_size) + return BI_ACT_BUFFER | BI_ACT_CHECK | BI_ACT_ZERO; + return BI_ACT_BUFFER | BI_ACT_CHECK; + default: + return 0; + } +} +EXPORT_SYMBOL_GPL(__bio_integrity_action); + void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); diff --git a/block/blk-mq.c b/block/blk-mq.c index 9af8c3dec3f6..0b311a797178 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3143,6 +3143,7 @@ void blk_mq_submit_bio(struct bio *bio) struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); + unsigned int integrity_action; struct blk_mq_hw_ctx *hctx; unsigned int nr_segs; struct request *rq; @@ -3195,8 +3196,9 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio) goto queue_exit; - if (!bio_integrity_prep(bio)) - goto queue_exit; + integrity_action = bio_integrity_action(bio); + if 
(integrity_action) + bio_integrity_prep(bio, integrity_action); blk_mq_bio_issue_init(q, bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index b6bef092f8b8..fdcb080a4314 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1435,14 +1435,16 @@ static void btt_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct btt *btt = bio->bi_bdev->bd_disk->private_data; + unsigned int integrity_action; struct bvec_iter iter; unsigned long start; struct bio_vec bvec; int err = 0; bool do_acct; - if (!bio_integrity_prep(bio)) - return; + integrity_action = bio_integrity_action(bio); + if (integrity_action) + bio_integrity_prep(bio, integrity_action); do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 21e4652dcfd2..276cbbdd2c9d 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -78,7 +78,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); -bool bio_integrity_prep(struct bio *bio); +void bio_integrity_prep(struct bio *bio, unsigned int action); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); @@ -104,9 +104,8 @@ static inline void bio_integrity_unmap_user(struct bio *bio) { } -static inline bool bio_integrity_prep(struct bio *bio) +static inline void bio_integrity_prep(struct bio *bio, unsigned int action) { - return true; } static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 
c15b1ac62765..fd3f3c8c0fcd 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -180,4 +180,27 @@ static inline struct bio_vec rq_integrity_vec(struct request *rq) } #endif /* CONFIG_BLK_DEV_INTEGRITY */ +enum bio_integrity_action { + BI_ACT_BUFFER = (1u << 0), /* allocate buffer */ + BI_ACT_CHECK = (1u << 1), /* generate / verify PI */ + BI_ACT_ZERO = (1u << 2), /* zero buffer */ +}; + +/** + * bio_integrity_action - return the integrity action needed for a bio + * @bio: bio to operate on + * + * Returns the mask of integrity actions (BI_ACT_*) that need to be performed + * for @bio. + */ +unsigned int __bio_integrity_action(struct bio *bio); +static inline unsigned int bio_integrity_action(struct bio *bio) +{ + if (!blk_get_integrity(bio->bi_bdev->bd_disk)) + return 0; + if (bio_integrity(bio)) + return 0; + return __bio_integrity_action(bio); +} + #endif /* _LINUX_BLK_INTEGRITY_H */ -- cgit v1.2.3 From a936655697cd8d1bab2fd5189e2c33dd6356a266 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:02 -0800 Subject: block: factor out a bio_integrity_setup_default helper Add a helper to set the seed and check flag based on useful defaults from the profile. Note that this includes a small behavior change, as we now only set the seed if any action is set, which is fine as nothing will look at it otherwise. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. 
Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 14 ++------------ block/bio-integrity.c | 16 ++++++++++++++++ include/linux/bio-integrity.h | 1 + 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index e16f669dbf1e..b64c71a7fc82 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -88,7 +88,6 @@ bool __bio_integrity_endio(struct bio *bio) */ void bio_integrity_prep(struct bio *bio, unsigned int action) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_data *bid; bid = mempool_alloc(&bid_pool, GFP_NOIO); @@ -96,17 +95,8 @@ void bio_integrity_prep(struct bio *bio, unsigned int action) bid->bio = bio; bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; bio_integrity_alloc_buf(bio, action & BI_ACT_ZERO); - - bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); - - if (action & BI_ACT_CHECK) { - if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bid->bip.bip_flags |= BIP_IP_CHECKSUM; - if (bi->csum_type) - bid->bip.bip_flags |= BIP_CHECK_GUARD; - if (bi->flags & BLK_INTEGRITY_REF_TAG) - bid->bip.bip_flags |= BIP_CHECK_REFTAG; - } + if (action & BI_ACT_CHECK) + bio_integrity_setup_default(bio); /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip)) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 0955be90038b..e79eaf047794 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -101,6 +101,22 @@ void bio_integrity_free_buf(struct bio_integrity_payload *bip) kfree(bvec_virt(bv)); } +void bio_integrity_setup_default(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + + bip_set_seed(bip, bio->bi_iter.bi_sector); + + if (bi->csum_type) { + bip->bip_flags |= BIP_CHECK_GUARD; + if (bi->csum_type == 
BLK_INTEGRITY_CSUM_IP) + bip->bip_flags |= BIP_IP_CHECKSUM; + } + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bip->bip_flags |= BIP_CHECK_REFTAG; +} + /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 276cbbdd2c9d..232b86b9bbcb 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -143,5 +143,6 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); void bio_integrity_free_buf(struct bio_integrity_payload *bip); +void bio_integrity_setup_default(struct bio *bio); #endif /* _LINUX_BIO_INTEGRITY_H */ -- cgit v1.2.3 From 7afe93946dff63aa57c6db81f5eb43ac8233364e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:03 -0800 Subject: block: add a bdev_has_integrity_csum helper Factor out a helper to see if the block device has an integrity checksum from bdev_stable_writes so that it can be reused for other checks. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. 
Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d463b9b5a0a5..dec0acaed6e6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1477,14 +1477,18 @@ static inline bool bdev_synchronous(struct block_device *bdev) return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; } -static inline bool bdev_stable_writes(struct block_device *bdev) +static inline bool bdev_has_integrity_csum(struct block_device *bdev) { - struct request_queue *q = bdev_get_queue(bdev); + struct queue_limits *lim = bdev_limits(bdev); - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && - q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) - return true; - return q->limits.features & BLK_FEAT_STABLE_WRITES; + return IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + lim->integrity.csum_type != BLK_INTEGRITY_CSUM_NONE; +} + +static inline bool bdev_stable_writes(struct block_device *bdev) +{ + return bdev_has_integrity_csum(bdev) || + (bdev_limits(bdev)->features & BLK_FEAT_STABLE_WRITES); } static inline bool blk_queue_write_cache(struct request_queue *q) -- cgit v1.2.3 From 8c56ef10150ed7650cf4105539242c94c156148c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:05 -0800 Subject: block: make max_integrity_io_size public File systems that generate integrity will need this, so move it out of the block private or blk-mq specific headers. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. 
Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/blk-settings.c | 13 ------------- include/linux/blk-integrity.h | 5 ----- include/linux/blkdev.h | 18 ++++++++++++++++++ 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/block/blk-settings.c b/block/blk-settings.c index a9e65dc090da..dabfab97fbab 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -123,19 +123,6 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) return 0; } -/* - * Maximum size of I/O that needs a block layer integrity buffer. Limited - * by the number of intervals for which we can fit the integrity buffer into - * the buffer size. Because the buffer is a single segment it is also limited - * by the maximum segment size. - */ -static inline unsigned int max_integrity_io_size(struct queue_limits *lim) -{ - return min_t(unsigned int, lim->max_segment_size, - (BLK_INTEGRITY_MAX_SIZE / lim->integrity.metadata_size) << - lim->integrity.interval_exp); -} - static int blk_validate_integrity_limits(struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index fd3f3c8c0fcd..ea6d7d322ae3 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -8,11 +8,6 @@ struct request; -/* - * Maximum contiguous integrity buffer allocation. - */ -#define BLK_INTEGRITY_MAX_SIZE SZ_2M - enum blk_integrity_flags { BLK_INTEGRITY_NOVERIFY = 1 << 0, BLK_INTEGRITY_NOGENERATE = 1 << 1, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dec0acaed6e6..11857ae13d10 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1881,6 +1881,24 @@ static inline int bio_split_rw_at(struct bio *bio, return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); } +/* + * Maximum contiguous integrity buffer allocation. 
+ */ +#define BLK_INTEGRITY_MAX_SIZE SZ_2M + +/* + * Maximum size of I/O that needs a block layer integrity buffer. Limited + * by the number of intervals for which we can fit the integrity buffer into + * the buffer size. Because the buffer is a single segment it is also limited + * by the maximum segment size. + */ +static inline unsigned int max_integrity_io_size(struct queue_limits *lim) +{ + return min_t(unsigned int, lim->max_segment_size, + (BLK_INTEGRITY_MAX_SIZE / lim->integrity.metadata_size) << + lim->integrity.interval_exp); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 0bde8a12b5540572a7fd6d2867bee6de15e4f289 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:06 -0800 Subject: block: add fs_bio_integrity helpers Add a set of helpers for file system initiated integrity information. These include mempool backed allocations and verifying based on a passed in sector and size which is often available from file system completion routines. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. 
Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/bio-integrity-fs.c | 81 +++++++++++++++++++++++++++++++++++++++++++ include/linux/bio-integrity.h | 6 ++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 block/bio-integrity-fs.c (limited to 'include') diff --git a/block/Makefile b/block/Makefile index c65f4da93702..7dce2e44276c 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,7 +26,7 @@ bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o \ - bio-integrity-auto.o + bio-integrity-auto.o bio-integrity-fs.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/bio-integrity-fs.c b/block/bio-integrity-fs.c new file mode 100644 index 000000000000..acb1e5f270d2 --- /dev/null +++ b/block/bio-integrity-fs.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 Christoph Hellwig. 
+ */ +#include +#include +#include "blk.h" + +struct fs_bio_integrity_buf { + struct bio_integrity_payload bip; + struct bio_vec bvec; +}; + +static struct kmem_cache *fs_bio_integrity_cache; +static mempool_t fs_bio_integrity_pool; + +unsigned int fs_bio_integrity_alloc(struct bio *bio) +{ + struct fs_bio_integrity_buf *iib; + unsigned int action; + + action = bio_integrity_action(bio); + if (!action) + return 0; + + iib = mempool_alloc(&fs_bio_integrity_pool, GFP_NOIO); + bio_integrity_init(bio, &iib->bip, &iib->bvec, 1); + + bio_integrity_alloc_buf(bio, action & BI_ACT_ZERO); + if (action & BI_ACT_CHECK) + bio_integrity_setup_default(bio); + return action; +} + +void fs_bio_integrity_free(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + bio_integrity_free_buf(bip); + mempool_free(container_of(bip, struct fs_bio_integrity_buf, bip), + &fs_bio_integrity_pool); + + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; +} + +void fs_bio_integrity_generate(struct bio *bio) +{ + if (fs_bio_integrity_alloc(bio)) + bio_integrity_generate(bio); +} +EXPORT_SYMBOL_GPL(fs_bio_integrity_generate); + +int fs_bio_integrity_verify(struct bio *bio, sector_t sector, unsigned int size) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + + /* + * Reinitialize bip->bip_iter. + * + * This is for use in the submitter after the driver is done with the + * bio. Requires the submitter to remember the sector and the size. 
+ */ + memset(&bip->bip_iter, 0, sizeof(bip->bip_iter)); + bip->bip_iter.bi_sector = sector; + bip->bip_iter.bi_size = bio_integrity_bytes(bi, size >> SECTOR_SHIFT); + return blk_status_to_errno(bio_integrity_verify(bio, &bip->bip_iter)); +} + +static int __init fs_bio_integrity_init(void) +{ + fs_bio_integrity_cache = kmem_cache_create("fs_bio_integrity", + sizeof(struct fs_bio_integrity_buf), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + if (mempool_init_slab_pool(&fs_bio_integrity_pool, BIO_POOL_SIZE, + fs_bio_integrity_cache)) + panic("fs_bio_integrity: can't create pool\n"); + return 0; +} +fs_initcall(fs_bio_integrity_init); diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 232b86b9bbcb..af5178434ec6 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -145,4 +145,10 @@ void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); void bio_integrity_free_buf(struct bio_integrity_payload *bip); void bio_integrity_setup_default(struct bio *bio); +unsigned int fs_bio_integrity_alloc(struct bio *bio); +void fs_bio_integrity_free(struct bio *bio); +void fs_bio_integrity_generate(struct bio *bio); +int fs_bio_integrity_verify(struct bio *bio, sector_t sector, + unsigned int size); + #endif /* _LINUX_BIO_INTEGRITY_H */ -- cgit v1.2.3 From a9aa6045abde87b94168c3ba034b953417e27272 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:07 -0800 Subject: block: pass a maxlen argument to bio_iov_iter_bounce Allow the file system to limit the size processed in a single bounce operation. This is needed when generating integrity data so that the size of a single integrity segment can't overflow. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. 
Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/bio.c | 17 ++++++++++------- fs/iomap/direct-io.c | 2 +- include/linux/bio.h | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index d80d5d26804e..784d2a66d3ae 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1327,9 +1327,10 @@ static void bio_free_folios(struct bio *bio) } } -static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter) +static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, + size_t maxlen) { - size_t total_len = iov_iter_count(iter); + size_t total_len = min(maxlen, iov_iter_count(iter)); if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EINVAL; @@ -1367,9 +1368,10 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter) return 0; } -static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter) +static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, + size_t maxlen) { - size_t len = min(iov_iter_count(iter), SZ_1M); + size_t len = min3(iov_iter_count(iter), maxlen, SZ_1M); struct folio *folio; folio = folio_alloc_greedy(GFP_KERNEL, &len); @@ -1408,6 +1410,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter) * bio_iov_iter_bounce - bounce buffer data from an iter into a bio * @bio: bio to send * @iter: iter to read from / write into + * @maxlen: maximum size to bounce * * Helper for direct I/O implementations that need to bounce buffer because * we need to checksum the data or perform other operations that require @@ -1415,11 +1418,11 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter) * copies the data into it. Needs to be paired with bio_iov_iter_unbounce() * called on completion. 
*/ -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter) +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen) { if (op_is_write(bio_op(bio))) - return bio_iov_iter_bounce_write(bio, iter); - return bio_iov_iter_bounce_read(bio, iter); + return bio_iov_iter_bounce_write(bio, iter, maxlen); + return bio_iov_iter_bounce_read(bio, iter, maxlen); } static void bvec_unpin(struct bio_vec *bv, bool mark_dirty) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 95254aa1b654..21d4fad2eeb8 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -338,7 +338,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, bio->bi_end_io = iomap_dio_bio_end_io; if (dio->flags & IOMAP_DIO_BOUNCE) - ret = bio_iov_iter_bounce(bio, dio->submit.iter); + ret = bio_iov_iter_bounce(bio, dio->submit.iter, BIO_MAX_SIZE); else ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1); diff --git a/include/linux/bio.h b/include/linux/bio.h index 36a3f2275ecd..9693a0d6fefe 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -474,7 +474,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter); +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen); void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, -- cgit v1.2.3 From b9e0180b2e6a48532eb80e5cd8793157196586cf Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:43 +0100 Subject: fbdev: Declare src parameter of fb_pad_ helpers as constant Fbdev's padding helpers do not modify the source buffer. Declare the parameter as 'const'. Fbcon's font-rendering code calls these helpers with the font data. 
Declaring src as const will allow for making the font data constant as well. While at it, also remove the extern qualifier from the function declarations in the header file. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fbmem.c | 6 +++--- include/linux/fb.h | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index cf199038f069..30f2b59c47bf 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -91,14 +91,14 @@ EXPORT_SYMBOL(fb_get_color_depth); /* * Data padding functions. */ -void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height) +void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 s_pitch, u32 height) { __fb_pad_aligned_buffer(dst, d_pitch, src, s_pitch, height); } EXPORT_SYMBOL(fb_pad_aligned_buffer); -void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx, u32 height, - u32 shift_high, u32 shift_low, u32 mod) +void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 idx, u32 height, + u32 shift_high, u32 shift_low, u32 mod) { u8 mask = (u8) (0xff << shift_high), tmp; int i, j; diff --git a/include/linux/fb.h b/include/linux/fb.h index b27943719fab..5178a33c752c 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -602,9 +602,9 @@ extern int register_framebuffer(struct fb_info *fb_info); extern void unregister_framebuffer(struct fb_info *fb_info); extern int devm_register_framebuffer(struct device *dev, struct fb_info *fb_info); extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size); -extern void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx, - u32 height, u32 shift_high, u32 shift_low, u32 mod); -extern void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height); +void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, const 
u8 *src, u32 idx, u32 height, + u32 shift_high, u32 shift_low, u32 mod); +void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 s_pitch, u32 height); extern void fb_set_suspend(struct fb_info *info, int state); extern int fb_get_color_depth(struct fb_var_screeninfo *var, struct fb_fix_screeninfo *fix); @@ -630,8 +630,8 @@ static inline struct device *dev_of_fbinfo(const struct fb_info *info) #endif } -static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, - u8 *src, u32 s_pitch, u32 height) +static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 s_pitch, + u32 height) { u32 i, j; -- cgit v1.2.3 From 982f8b002aadef2b5169147b3a60a9eb62f908df Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:44 +0100 Subject: vt: Remove trailing whitespaces Fix coding style. No functional changes. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/console_struct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 13b35637bd5a..ebdb9750d348 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -120,7 +120,7 @@ struct vc_data { unsigned short vc_complement_mask; /* [#] Xor mask for mouse pointer */ unsigned short vc_s_complement_mask; /* Saved mouse pointer mask */ unsigned long vc_pos; /* Cursor address */ - /* fonts */ + /* fonts */ unsigned short vc_hi_font_mask; /* [#] Attribute set for upper 256 chars of font or 0 if not supported */ struct console_font vc_font; /* Current VC font set */ unsigned short vc_video_erase_char; /* Background erase character */ -- cgit v1.2.3 From 61912c607fa9955dcf3fc018b227baa98a6776dc Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:45 +0100 Subject: vt: Store font in struct vc_font Replace struct console_font with struct vc_font for the type of the vc_font field of struct vc_data. 
Struct console_font is UAPI, which prevents further changes. Hence a new data type is required. Struct console_font has a documented vertical pitch of 32 bytes. This is not the case after the font data has been loaded into the kernel. Changing the type of vc_font addresses this inconsistency. The font data is now declared as constant, as it might come from the kernel's read-only section. There's some fallout throughout the console code where non-const variables refer to it. Fix them. A later update will declare the font data to a dedicated data type. v3: - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/core/bitblit.c | 11 +++++------ drivers/video/fbdev/core/fbcon.c | 4 ++-- drivers/video/fbdev/core/fbcon.h | 4 ++-- include/linux/console_struct.h | 29 +++++++++++++++++++++++++++-- 4 files changed, 36 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/core/bitblit.c b/drivers/video/fbdev/core/bitblit.c index 085ffb44c51a..7478accea8ec 100644 --- a/drivers/video/fbdev/core/bitblit.c +++ b/drivers/video/fbdev/core/bitblit.c @@ -22,8 +22,7 @@ /* * Accelerated handlers. */ -static void update_attr(u8 *dst, u8 *src, int attribute, - struct vc_data *vc) +static void update_attr(u8 *dst, const u8 *src, int attribute, struct vc_data *vc) { int i, offset = (vc->vc_font.height < 10) ? 1 : 2; int width = DIV_ROUND_UP(vc->vc_font.width, 8); @@ -81,7 +80,7 @@ static inline void bit_putcs_aligned(struct vc_data *vc, struct fb_info *info, u16 charmask = vc->vc_hi_font_mask ? 
0x1ff : 0xff; unsigned int charcnt = vc->vc_font.charcount; u32 idx = vc->vc_font.width >> 3; - u8 *src; + const u8 *src; while (cnt--) { u16 ch = scr_readw(s++) & charmask; @@ -120,7 +119,7 @@ static inline void bit_putcs_unaligned(struct vc_data *vc, u32 shift_low = 0, mod = vc->vc_font.width % 8; u32 shift_high = 8; u32 idx = vc->vc_font.width >> 3; - u8 *src; + const u8 *src; while (cnt--) { u16 ch = scr_readw(s++) & charmask; @@ -267,7 +266,7 @@ static void bit_cursor(struct vc_data *vc, struct fb_info *info, bool enable, int y = real_y(par->p, vc->state.y); int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1; - char *src; + const u8 *src; cursor.set = 0; @@ -278,7 +277,7 @@ static void bit_cursor(struct vc_data *vc, struct fb_info *info, bool enable, attribute = get_attribute(info, c); src = vc->vc_font.data + ((c & charmask) * (w * vc->vc_font.height)); - if (par->cursor_state.image.data != src || + if (par->cursor_state.image.data != (const char *)src || par->cursor_reset) { par->cursor_state.image.data = src; cursor.set |= FB_CUR_SETIMAGE; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 666261ae59d8..247bb90c08d3 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2286,7 +2286,7 @@ static bool fbcon_blank(struct vc_data *vc, enum vesa_blank_mode blank, static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigned int vpitch) { - u8 *fontdata = vc->vc_font.data; + const u8 *fontdata = vc->vc_font.data; u8 *data = font->data; int i, j; @@ -2417,7 +2417,7 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, struct fbcon_par *par = info->fbcon_par; struct fbcon_display *p = &fb_display[vc->vc_num]; int resize, ret, old_userfont, old_width, old_height, old_charcount; - u8 *old_data = vc->vc_font.data; + const u8 *old_data = vc->vc_font.data; resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); vc->vc_font.data = 
(void *)(p->fontdata = data); diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index fca14e9b729b..3f4386a40237 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -82,8 +82,8 @@ struct fbcon_par { int rotate; int cur_rotate; char *cursor_data; - u8 *fontbuffer; - u8 *fontdata; + u8 *fontbuffer; + const u8 *fontdata; u8 *cursor_src; u32 cursor_size; u32 fd_size; diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index ebdb9750d348..ea0cdf4278a3 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -13,8 +13,9 @@ #ifndef _LINUX_CONSOLE_STRUCT_H #define _LINUX_CONSOLE_STRUCT_H -#include +#include #include +#include #include struct uni_pagedict; @@ -58,6 +59,30 @@ struct vc_state { bool reverse; }; +/** + * struct vc_font - Describes a font + * @width: The width of a single glyph in bits + * @height: The height of a single glyph in scanlines + * @charcount: The number of glyphs in the font + * @data: The raw font data + * + * Font data is organized as an array of glyphs. Each glyph is a bitmap with + * set bits indicating the foreground color. Unset bits indicate background + * color. The fields @width and @height store a single glyph's number of + * horizontal bits and vertical scanlines. If width is not a multiple of 8, + * there are trailing bits to fill up the byte. These bits should not be drawn. + * + * The field @data points to the first glyph's first byte. The value @charcount + * gives the number of glyphs in the font. There are no empty scanlines between + * two adjacent glyphs. + */ +struct vc_font { + unsigned int width; + unsigned int height; + unsigned int charcount; + const unsigned char *data; +}; + /* * Example: vc_data of a console that was scrolled 3 lines down. 
* @@ -122,7 +147,7 @@ struct vc_data { unsigned long vc_pos; /* Cursor address */ /* fonts */ unsigned short vc_hi_font_mask; /* [#] Attribute set for upper 256 chars of font or 0 if not supported */ - struct console_font vc_font; /* Current VC font set */ + struct vc_font vc_font; /* Current VC font set */ unsigned short vc_video_erase_char; /* Background erase character */ /* VT terminal data */ unsigned int vc_state; /* Escape sequence parser state */ -- cgit v1.2.3 From e370d84b79ad28ecf9a9e1dad967aa64dbfbd8d8 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:46 +0100 Subject: vt: Calculate font-buffer size with vc_font_size() In fbcon, fbcon_resize() computes the size of the font buffer from the values stored in vc_font. Move these calculations to the dedicated helpers vc_font_pitch() and vc_font_size(). Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fbcon.c | 9 ++------- include/linux/console_struct.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 247bb90c08d3..103e91c8d874 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2037,7 +2037,6 @@ static void updatescrollmode(struct fbcon_display *p, } #define PITCH(w) (((w) + 7) >> 3) -#define CALC_FONTSZ(h, p, c) ((h) * (p) * (c)) /* size = height * pitch * charcount */ static int fbcon_resize(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user) @@ -2049,8 +2048,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, int x_diff, y_diff, virt_w, virt_h, virt_fw, virt_fh; if (p->userfont && FNTSIZE(vc->vc_font.data)) { - int size; - int pitch = PITCH(vc->vc_font.width); + unsigned int size = vc_font_size(&vc->vc_font); /* * If user font, ensure that a possible change to user font @@ -2059,10 +2057,7 @@ static int 
fbcon_resize(struct vc_data *vc, unsigned int width, * charcount can change and cannot be used to determine the * font data allocated size. */ - if (pitch <= 0) - return -EINVAL; - size = CALC_FONTSZ(vc->vc_font.height, pitch, vc->vc_font.charcount); - if (size > FNTSIZE(vc->vc_font.data)) + if (!size || size > FNTSIZE(vc->vc_font.data)) return -EINVAL; } diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index ea0cdf4278a3..771cba16cb54 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -83,6 +83,34 @@ struct vc_font { const unsigned char *data; }; +/** + * vc_font_pitch - Calculates the number of bytes between two adjacent scanlines + * @font: The VC font + * + * Returns: + * The number of bytes between two adjacent scanlines in the font data + */ +static inline unsigned int vc_font_pitch(const struct vc_font *font) +{ + return DIV_ROUND_UP(font->width, 8); +} + +/** + * vc_font_size - Calculates the size of the font data in bytes + * @font: The VC font + * + * vc_font_size() calculates the number of bytes of font data in the + * font specified by @font. The function calculates the size from the + * font parameters. + * + * Returns: + * The size of the font data in bytes. + */ +static inline unsigned int vc_font_size(const struct vc_font *font) +{ + return font->height * vc_font_pitch(font) * font->charcount; +} + /* * Example: vc_data of a console that was scrolled 3 lines down. * -- cgit v1.2.3 From 773ac24c44614ad1d5a96258534160c4a0d48d72 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:48 +0100 Subject: lib/fonts: Remove FNTCHARCNT() The character count in the font data is unused. The internal fonts also do not set it. Remove FNTCHARCNT(). 
Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 1 - include/linux/font.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 337e04236d6d..259a63fc7789 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -516,7 +516,6 @@ static int newport_set_font(int unit, const struct console_font *op, new_data += FONT_EXTRA_WORDS * sizeof(int); FNTSIZE(new_data) = size; - FNTCHARCNT(new_data) = op->charcount; REFCOUNT(new_data) = 0; /* usage counter */ FNTSUM(new_data) = 0; diff --git a/include/linux/font.h b/include/linux/font.h index fd8625cd76b2..d929c5fa32ca 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -68,7 +68,6 @@ extern const struct font_desc *get_default_font(int xres, int yres, /* Extra word getters */ #define REFCOUNT(fd) (((int *)(fd))[-1]) #define FNTSIZE(fd) (((int *)(fd))[-2]) -#define FNTCHARCNT(fd) (((int *)(fd))[-3]) #define FNTSUM(fd) (((int *)(fd))[-4]) #define FONT_EXTRA_WORDS 4 -- cgit v1.2.3 From 04bd5abc8cbebc1bd7e02471a8e3af51b8aad029 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:49 +0100 Subject: lib/fonts: Store font data as font_data_t; update consoles Store font data as pointer to font_data_t instead of unsigned char. Update consoles. Pointers to font data refer to the raw data. There is a hidden header before the data that contains additional state. Document the existing layout and semantics of font_data_t. The data field in struct vc_font can be used by any console. Therefore it still points to plain data without the additional header. Fbcon sets its value from struct fbcon_display.fontdata. Hence, update the size test in fbcon_resize() to use struct fbcon_display.fontdata instead of struct vc_font.data. 
v3: - fix typos (Helge) v2: - 'Font lookup' -> 'Font description' in Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 17 +++++++------- drivers/video/fbdev/core/fbcon.c | 44 ++++++++++++++++++++-------------- drivers/video/fbdev/core/fbcon.h | 3 ++- include/linux/font.h | 47 ++++++++++++++++++++++++++++++++++++- 4 files changed, 84 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 259a63fc7789..857b85df360c 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -33,9 +33,9 @@ #define NEWPORT_LEN 0x10000 -#define FONT_DATA ((unsigned char *)font_vga_8x16.data) +#define FONT_DATA font_vga_8x16.data -static unsigned char *font_data[MAX_NR_CONSOLES]; +static font_data_t *font_data[MAX_NR_CONSOLES]; static struct newport_regs *npregs; static unsigned long newport_addr; @@ -370,9 +370,9 @@ static void newport_clear(struct vc_data *vc, unsigned int sy, unsigned int sx, static void newport_putc(struct vc_data *vc, u16 charattr, unsigned int ypos, unsigned int xpos) { - unsigned char *p; + const unsigned char *p; - p = &font_data[vc->vc_num][(charattr & 0xff) << 4]; + p = &font_data_buf(font_data[vc->vc_num])[(charattr & 0xff) << 4]; charattr = (charattr >> 8) & 0xff; xpos <<= 3; ypos <<= 4; @@ -400,7 +400,7 @@ static void newport_putcs(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos) { - unsigned char *p; + const unsigned char *p; unsigned int i; u16 charattr; @@ -424,7 +424,7 @@ static void newport_putcs(struct vc_data *vc, const u16 *s, NPORT_DMODE0_L32); for (i = 0; i < count; i++, xpos += 8) { - p = &font_data[vc->vc_num][(scr_readw(s++) & 0xff) << 4]; + p = &font_data_buf(font_data[vc->vc_num])[(scr_readw(s++) & 0xff) << 4]; newport_wait(npregs); @@ -503,7 +503,8 @@ static int newport_set_font(int unit, const struct console_font *op, int h 
= op->height; int size = h * op->charcount; int i; - unsigned char *new_data, *data = op->data, *p; + font_data_t *new_data; + unsigned char *data = op->data, *p; /* ladis: when I grow up, there will be a day... and more sizes will * be supported ;-) */ @@ -519,7 +520,7 @@ static int newport_set_font(int unit, const struct console_font *op, REFCOUNT(new_data) = 0; /* usage counter */ FNTSUM(new_data) = 0; - p = new_data; + p = (unsigned char *)font_data_buf(new_data); for (i = 0; i < op->charcount; i++) { memcpy(p, data, h); data += 32; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 103e91c8d874..8d7840b9ebad 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -1019,8 +1019,10 @@ static const char *fbcon_startup(void) info->pixmap.blit_y); vc->vc_font.width = font->width; vc->vc_font.height = font->height; - vc->vc_font.data = (void *)(p->fontdata = font->data); + vc->vc_font.data = font_data_buf(font->data); vc->vc_font.charcount = font->charcount; + + p->fontdata = font->data; } cols = FBCON_SWAP(par->rotate, info->var.xres, info->var.yres); @@ -1078,11 +1080,12 @@ static void fbcon_init(struct vc_data *vc, bool init) if (t->fontdata) { struct vc_data *fvc = vc_cons[fg_console].d; - vc->vc_font.data = (void *)(p->fontdata = - fvc->vc_font.data); + vc->vc_font.data = fvc->vc_font.data; vc->vc_font.width = fvc->vc_font.width; vc->vc_font.height = fvc->vc_font.height; vc->vc_font.charcount = fvc->vc_font.charcount; + + p->fontdata = t->fontdata; p->userfont = t->userfont; if (p->userfont) @@ -1097,8 +1100,10 @@ static void fbcon_init(struct vc_data *vc, bool init) info->pixmap.blit_y); vc->vc_font.width = font->width; vc->vc_font.height = font->height; - vc->vc_font.data = (void *)(p->fontdata = font->data); + vc->vc_font.data = font_data_buf(font->data); vc->vc_font.charcount = font->charcount; + + p->fontdata = font->data; } } @@ -1409,11 +1414,12 @@ static void fbcon_set_disp(struct 
fb_info *info, struct fb_var_screeninfo *var, svc = *default_mode; t = &fb_display[svc->vc_num]; - if (!vc->vc_font.data) { - vc->vc_font.data = (void *)(p->fontdata = t->fontdata); + if (!p->fontdata) { + vc->vc_font.data = font_data_buf(t->fontdata); vc->vc_font.width = (*default_mode)->vc_font.width; vc->vc_font.height = (*default_mode)->vc_font.height; vc->vc_font.charcount = (*default_mode)->vc_font.charcount; + p->fontdata = t->fontdata; p->userfont = t->userfont; if (p->userfont) REFCOUNT(p->fontdata)++; @@ -2047,7 +2053,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, struct fb_var_screeninfo var = info->var; int x_diff, y_diff, virt_w, virt_h, virt_fw, virt_fh; - if (p->userfont && FNTSIZE(vc->vc_font.data)) { + if (p->userfont && FNTSIZE(p->fontdata)) { unsigned int size = vc_font_size(&vc->vc_font); /* @@ -2057,7 +2063,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, * charcount can change and cannot be used to determine the * font data allocated size. 
*/ - if (!size || size > FNTSIZE(vc->vc_font.data)) + if (!size || size > FNTSIZE(p->fontdata)) return -EINVAL; } @@ -2281,7 +2287,8 @@ static bool fbcon_blank(struct vc_data *vc, enum vesa_blank_mode blank, static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigned int vpitch) { - const u8 *fontdata = vc->vc_font.data; + struct fbcon_display *p = &fb_display[vc->vc_num]; + font_data_t *fontdata = p->fontdata; u8 *data = font->data; int i, j; @@ -2406,16 +2413,18 @@ static void set_vc_hi_font(struct vc_data *vc, bool set) } static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, - const u8 * data, int userfont) + font_data_t *data, int userfont) { struct fb_info *info = fbcon_info_from_console(vc->vc_num); struct fbcon_par *par = info->fbcon_par; struct fbcon_display *p = &fb_display[vc->vc_num]; int resize, ret, old_userfont, old_width, old_height, old_charcount; + font_data_t *old_fontdata = p->fontdata; const u8 *old_data = vc->vc_font.data; resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); - vc->vc_font.data = (void *)(p->fontdata = data); + p->fontdata = data; + vc->vc_font.data = font_data_buf(p->fontdata); old_userfont = p->userfont; if ((p->userfont = userfont)) REFCOUNT(data)++; @@ -2448,12 +2457,12 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, update_screen(vc); } - if (old_userfont && (--REFCOUNT(old_data) == 0)) - kfree(old_data - FONT_EXTRA_WORDS * sizeof(int)); + if (old_userfont && (--REFCOUNT(old_fontdata) == 0)) + kfree(old_fontdata - FONT_EXTRA_WORDS * sizeof(int)); return 0; err_out: - p->fontdata = old_data; + p->fontdata = old_fontdata; vc->vc_font.data = old_data; if (userfont) { @@ -2483,7 +2492,8 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, int h = font->height; int size, alloc_size; int i, csum; - u8 *new_data, *data = font->data; + font_data_t *new_data; + u8 *data = font->data; int pitch = PITCH(font->width); 
/* Is there a reason why fbconsole couldn't handle any charcount >256? @@ -2522,13 +2532,13 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, if (!new_data) return -ENOMEM; - memset(new_data, 0, FONT_EXTRA_WORDS * sizeof(int)); + memset((u8 *)new_data, 0, FONT_EXTRA_WORDS * sizeof(int)); new_data += FONT_EXTRA_WORDS * sizeof(int); FNTSIZE(new_data) = size; REFCOUNT(new_data) = 0; /* usage counter */ for (i=0; i< charcount; i++) { - memcpy(new_data + i*h*pitch, data + i*vpitch*pitch, h*pitch); + memcpy((u8 *)new_data + i * h * pitch, data + i * vpitch * pitch, h * pitch); } /* Since linux has a nice crc32 function use it for counting font diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index 3f4386a40237..d26ee7860cf5 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -11,6 +11,7 @@ #ifndef _VIDEO_FBCON_H #define _VIDEO_FBCON_H +#include #include #include #include @@ -25,7 +26,7 @@ struct fbcon_display { /* Filled in by the low-level console driver */ - const u_char *fontdata; + font_data_t *fontdata; int userfont; /* != 0 if fontdata kmalloc()ed */ #ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION u_short scrollmode; /* Scroll Method, use fb_scrollmode() */ diff --git a/include/linux/font.h b/include/linux/font.h index d929c5fa32ca..746a0996a018 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -13,12 +13,57 @@ #include +/* + * font_data_t and helpers + */ + +/** + * font_data_t - Raw font data + * + * Values of type font_data_t store a pointer to raw font data. The format + * is monochrome. Each bit sets a pixel of a stored glyph. Font data does + * not store geometry information for the individual glyphs. Users of the + * font have to store glyph size, pitch and character count separately. + * + * Font data in font_data_t is not equivalent to raw u8. Each pointer stores + * an additional hidden header before the font data. 
The layout is + * + * +------+-----------------------------+ + * | -16 | CRC32 Checksum (optional) | + * | -12 | | + * | -8 | Number of data bytes | + * | -4 | Reference count | + * +------+-----------------------------+ + * | 0 | Data buffer | + * | ... | | + * +------+-----------------------------+ + * + * Use helpers to access font_data_t. Use font_data_buf() to get the stored data. + */ +typedef const unsigned char font_data_t; + +/** + * font_data_buf() - Returns the font data as raw bytes + * @fd: The font data + * + * Returns: + * The raw font data. The provided buffer is read-only. + */ +static inline const unsigned char *font_data_buf(font_data_t *fd) +{ + return (const unsigned char *)fd; +} + +/* + * Font description + */ + struct font_desc { int idx; const char *name; unsigned int width, height; unsigned int charcount; - const void *data; + font_data_t *data; int pref; }; -- cgit v1.2.3 From e2e000a0b22036a72474088d3399097ff47ace02 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:50 +0100 Subject: lib/fonts: Read font size with font_data_size() Add font_data_size() and update consoles to use it. 
Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 2 +- drivers/video/fbdev/core/fbcon.c | 14 +++++++------- include/linux/font.h | 2 ++ lib/fonts/fonts.c | 21 +++++++++++++++++++++ 4 files changed, 31 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 857b85df360c..00ace2940aa0 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -530,7 +530,7 @@ static int newport_set_font(int unit, const struct console_font *op, /* check if font is already used by other console */ for (i = 0; i < MAX_NR_CONSOLES; i++) { if (font_data[i] != FONT_DATA - && FNTSIZE(font_data[i]) == size + && font_data_size(font_data[i]) == size && !memcmp(font_data[i], new_data, size)) { kfree(new_data - FONT_EXTRA_WORDS * sizeof(int)); /* current font is the same as the new one */ diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 8d7840b9ebad..fa8f3e4196de 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2053,7 +2053,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, struct fb_var_screeninfo var = info->var; int x_diff, y_diff, virt_w, virt_h, virt_fw, virt_fh; - if (p->userfont && FNTSIZE(p->fontdata)) { + if (p->userfont && font_data_size(p->fontdata)) { unsigned int size = vc_font_size(&vc->vc_font); /* @@ -2063,7 +2063,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, * charcount can change and cannot be used to determine the * font data allocated size. 
*/ - if (!size || size > FNTSIZE(p->fontdata)) + if (!size || size > font_data_size(p->fontdata)) return -EINVAL; } @@ -2302,7 +2302,7 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigne if (font->width <= 8) { j = vc->vc_font.height; - if (font->charcount * j > FNTSIZE(fontdata)) + if (font->charcount * j > font_data_size(fontdata)) return -EINVAL; for (i = 0; i < font->charcount; i++) { @@ -2313,7 +2313,7 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigne } } else if (font->width <= 16) { j = vc->vc_font.height * 2; - if (font->charcount * j > FNTSIZE(fontdata)) + if (font->charcount * j > font_data_size(fontdata)) return -EINVAL; for (i = 0; i < font->charcount; i++) { @@ -2323,7 +2323,7 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigne fontdata += j; } } else if (font->width <= 24) { - if (font->charcount * (vc->vc_font.height * sizeof(u32)) > FNTSIZE(fontdata)) + if (font->charcount * (vc->vc_font.height * sizeof(u32)) > font_data_size(fontdata)) return -EINVAL; for (i = 0; i < font->charcount; i++) { @@ -2338,7 +2338,7 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigne } } else { j = vc->vc_font.height * 4; - if (font->charcount * j > FNTSIZE(fontdata)) + if (font->charcount * j > font_data_size(fontdata)) return -EINVAL; for (i = 0; i < font->charcount; i++) { @@ -2553,7 +2553,7 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, if (fb_display[i].userfont && fb_display[i].fontdata && FNTSUM(fb_display[i].fontdata) == csum && - FNTSIZE(fb_display[i].fontdata) == size && + font_data_size(fb_display[i].fontdata) == size && tmp->vc_font.width == w && !memcmp(fb_display[i].fontdata, new_data, size)) { kfree(new_data - FONT_EXTRA_WORDS * sizeof(int)); diff --git a/include/linux/font.h b/include/linux/font.h index 746a0996a018..5b8557813c5c 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ 
-54,6 +54,8 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) return (const unsigned char *)fd; } +unsigned int font_data_size(font_data_t *fd); + /* * Font description */ diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index a7f118b30171..8c9a6762061c 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -20,6 +20,27 @@ #endif #include +/* + * Helpers for font_data_t + */ + +/** + * font_data_size - Return size of the font data in bytes + * @fd: Font data + * + * Returns: + * The number of bytes in the given font data. + */ +unsigned int font_data_size(font_data_t *fd) +{ + return FNTSIZE(fd); +} +EXPORT_SYMBOL_GPL(font_data_size); + +/* + * Font lookup + */ + static const struct font_desc *fonts[] = { #ifdef CONFIG_FONT_8x8 &font_vga_8x8, -- cgit v1.2.3 From 1de371b1f1b02dc82da598f9707089229699a604 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:51 +0100 Subject: lib/fonts: Manage font-data lifetime with font_data_get/_put() Add font_data_get() and font_data_put(). Update consoles to use them over REFCOUNT() and plain kfree(). Newly allocated font data starts with a reference count of 1. Loading the font puts the previously loaded font. If the reference count reaches zero, font_data_put() frees the font data. The kernel stores a refcount of zero for internal font data. Invoking font_data_get() and font_data_put() tests this internally and returns success without further operation. From the caller's perspective, getting and putting works the same for all font data. Fbcon used the userfont flag to distinguish between internal fonts and fonts loaded by user space. Only the latter were refcounted. With the new helper's automatic handling of internal font data, remove the userfont flag from fbcon. Newport_con uses a default font, FONT_DATA, until user space loads custom font data. Remove all special cases for FONT_DATA, as the get and put calls' read-only handling also covers this case. 
v3: - fix module linker error wrt font symbols (Nathan, Arnd) - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 23 ++++------- drivers/video/fbdev/core/fbcon.c | 65 +++++++++++++++---------------- drivers/video/fbdev/core/fbcon.h | 1 - include/linux/font.h | 2 + lib/fonts/fonts.c | 77 ++++++++++++++++++++++++++++++++++++- 5 files changed, 115 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 00ace2940aa0..dbbb787fc46e 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -517,7 +517,7 @@ static int newport_set_font(int unit, const struct console_font *op, new_data += FONT_EXTRA_WORDS * sizeof(int); FNTSIZE(new_data) = size; - REFCOUNT(new_data) = 0; /* usage counter */ + REFCOUNT(new_data) = 1; /* usage counter */ FNTSUM(new_data) = 0; p = (unsigned char *)font_data_buf(new_data); @@ -532,21 +532,17 @@ static int newport_set_font(int unit, const struct console_font *op, if (font_data[i] != FONT_DATA && font_data_size(font_data[i]) == size && !memcmp(font_data[i], new_data, size)) { - kfree(new_data - FONT_EXTRA_WORDS * sizeof(int)); + font_data_put(new_data); /* current font is the same as the new one */ if (i == unit) return 0; new_data = font_data[i]; + font_data_get(new_data); break; } } - /* old font is user font */ - if (font_data[unit] != FONT_DATA) { - if (--REFCOUNT(font_data[unit]) == 0) - kfree(font_data[unit] - - FONT_EXTRA_WORDS * sizeof(int)); - } - REFCOUNT(new_data)++; + + font_data_put(font_data[unit]); font_data[unit] = new_data; return 0; @@ -554,12 +550,9 @@ static int newport_set_font(int unit, const struct console_font *op, static int newport_set_def_font(int unit, struct console_font *op) { - if (font_data[unit] != FONT_DATA) { - if (--REFCOUNT(font_data[unit]) == 0) - kfree(font_data[unit] - - FONT_EXTRA_WORDS * sizeof(int)); - font_data[unit] 
= FONT_DATA; - } + font_data_put(font_data[unit]); + font_data[unit] = FONT_DATA; + font_data_get(font_data[unit]); return 0; } diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index fa8f3e4196de..ac59480c98cb 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -1023,6 +1023,7 @@ static const char *fbcon_startup(void) vc->vc_font.charcount = font->charcount; p->fontdata = font->data; + font_data_get(p->fontdata); } cols = FBCON_SWAP(par->rotate, info->var.xres, info->var.yres); @@ -1086,10 +1087,7 @@ static void fbcon_init(struct vc_data *vc, bool init) vc->vc_font.charcount = fvc->vc_font.charcount; p->fontdata = t->fontdata; - p->userfont = t->userfont; - - if (p->userfont) - REFCOUNT(p->fontdata)++; + font_data_get(p->fontdata); } else { const struct font_desc *font = NULL; @@ -1104,6 +1102,7 @@ static void fbcon_init(struct vc_data *vc, bool init) vc->vc_font.charcount = font->charcount; p->fontdata = font->data; + font_data_get(p->fontdata); } } @@ -1194,10 +1193,10 @@ static void fbcon_init(struct vc_data *vc, bool init) static void fbcon_free_font(struct fbcon_display *p) { - if (p->userfont && p->fontdata && (--REFCOUNT(p->fontdata) == 0)) - kfree(p->fontdata - FONT_EXTRA_WORDS * sizeof(int)); - p->fontdata = NULL; - p->userfont = 0; + if (p->fontdata) { + font_data_put(p->fontdata); + p->fontdata = NULL; + } } static void set_vc_hi_font(struct vc_data *vc, bool set); @@ -1420,9 +1419,7 @@ static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, vc->vc_font.height = (*default_mode)->vc_font.height; vc->vc_font.charcount = (*default_mode)->vc_font.charcount; p->fontdata = t->fontdata; - p->userfont = t->userfont; - if (p->userfont) - REFCOUNT(p->fontdata)++; + font_data_get(p->fontdata); } var->activate = FB_ACTIVATE_NOW; @@ -2053,7 +2050,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, struct fb_var_screeninfo var = info->var; int x_diff, y_diff, 
virt_w, virt_h, virt_fw, virt_fh; - if (p->userfont && font_data_size(p->fontdata)) { + if (font_data_size(p->fontdata)) { unsigned int size = vc_font_size(&vc->vc_font); /* @@ -2413,21 +2410,20 @@ static void set_vc_hi_font(struct vc_data *vc, bool set) } static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, - font_data_t *data, int userfont) + font_data_t *data) { struct fb_info *info = fbcon_info_from_console(vc->vc_num); struct fbcon_par *par = info->fbcon_par; struct fbcon_display *p = &fb_display[vc->vc_num]; - int resize, ret, old_userfont, old_width, old_height, old_charcount; + int resize, ret, old_width, old_height, old_charcount; font_data_t *old_fontdata = p->fontdata; const u8 *old_data = vc->vc_font.data; + font_data_get(data); + resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); p->fontdata = data; vc->vc_font.data = font_data_buf(p->fontdata); - old_userfont = p->userfont; - if ((p->userfont = userfont)) - REFCOUNT(data)++; old_width = vc->vc_font.width; old_height = vc->vc_font.height; @@ -2457,24 +2453,20 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, update_screen(vc); } - if (old_userfont && (--REFCOUNT(old_fontdata) == 0)) - kfree(old_fontdata - FONT_EXTRA_WORDS * sizeof(int)); + if (old_fontdata) + font_data_put(old_fontdata); + return 0; err_out: p->fontdata = old_fontdata; vc->vc_font.data = old_data; - - if (userfont) { - p->userfont = old_userfont; - if (--REFCOUNT(data) == 0) - kfree(data - FONT_EXTRA_WORDS * sizeof(int)); - } - vc->vc_font.width = old_width; vc->vc_font.height = old_height; vc->vc_font.charcount = old_charcount; + font_data_put(data); + return ret; } @@ -2491,9 +2483,9 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, int w = font->width; int h = font->height; int size, alloc_size; - int i, csum; + int i, csum, ret; font_data_t *new_data; - u8 *data = font->data; + const u8 *data = font->data; int pitch = 
PITCH(font->width); /* Is there a reason why fbconsole couldn't handle any charcount >256? @@ -2536,7 +2528,7 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, new_data += FONT_EXTRA_WORDS * sizeof(int); FNTSIZE(new_data) = size; - REFCOUNT(new_data) = 0; /* usage counter */ + REFCOUNT(new_data) = 1; /* usage counter */ for (i=0; i< charcount; i++) { memcpy((u8 *)new_data + i * h * pitch, data + i * vpitch * pitch, h * pitch); } @@ -2550,18 +2542,21 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, for (i = first_fb_vc; i <= last_fb_vc; i++) { struct vc_data *tmp = vc_cons[i].d; - if (fb_display[i].userfont && - fb_display[i].fontdata && + if (fb_display[i].fontdata && FNTSUM(fb_display[i].fontdata) == csum && font_data_size(fb_display[i].fontdata) == size && tmp->vc_font.width == w && !memcmp(fb_display[i].fontdata, new_data, size)) { - kfree(new_data - FONT_EXTRA_WORDS * sizeof(int)); - new_data = (u8 *)fb_display[i].fontdata; + font_data_get(fb_display[i].fontdata); + font_data_put(new_data); + new_data = fb_display[i].fontdata; break; } } - return fbcon_do_set_font(vc, font->width, font->height, charcount, new_data, 1); + ret = fbcon_do_set_font(vc, font->width, font->height, charcount, new_data); + font_data_put(new_data); + + return ret; } static int fbcon_set_def_font(struct vc_data *vc, struct console_font *font, @@ -2578,7 +2573,7 @@ static int fbcon_set_def_font(struct vc_data *vc, struct console_font *font, font->width = f->width; font->height = f->height; - return fbcon_do_set_font(vc, f->width, f->height, f->charcount, f->data, 0); + return fbcon_do_set_font(vc, f->width, f->height, f->charcount, f->data); } static u16 palette_red[16]; diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index d26ee7860cf5..1e3c1ef84762 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -27,7 +27,6 @@ struct fbcon_display { /* Filled in by 
the low-level console driver */ font_data_t *fontdata; - int userfont; /* != 0 if fontdata kmalloc()ed */ #ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION u_short scrollmode; /* Scroll Method, use fb_scrollmode() */ #endif diff --git a/include/linux/font.h b/include/linux/font.h index 5b8557813c5c..dd319d0f0201 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -54,6 +54,8 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) return (const unsigned char *)fd; } +void font_data_get(font_data_t *fd); +bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); /* diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index 8c9a6762061c..d25efd8d6c31 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -12,18 +12,91 @@ * for more details. */ +#include +#include #include -#include +#include #include +#include + #if defined(__mc68000__) #include #endif -#include /* * Helpers for font_data_t */ +static struct font_data *to_font_data_struct(font_data_t *fd) +{ + return container_of(fd, struct font_data, data[0]); +} + +static bool font_data_is_internal(font_data_t *fd) +{ + return !REFCOUNT(fd); /* internal fonts have no reference counting */ +} + +static void font_data_free(font_data_t *fd) +{ + kfree(to_font_data_struct(fd)); +} + +/** + * font_data_get - Acquires a reference on font data + * @fd: Font data + * + * Font data from user space is reference counted. The helper + * font_data_get() increases the reference counter by one. Invoke + * font_data_put() to release the reference. + * + * Internal font data is located in read-only memory. In this case + * the helper returns success without modifying the counter field. + * It is still required to call font_data_put() on internal font data. 
+ */ +void font_data_get(font_data_t *fd) +{ + if (font_data_is_internal(fd)) + return; /* never ref static data */ + + if (WARN_ON(!REFCOUNT(fd))) + return; /* should never be 0 */ + ++REFCOUNT(fd); +} +EXPORT_SYMBOL_GPL(font_data_get); + +/** + * font_data_put - Release a reference on font data + * @fd: Font data + * + * Font data from user space is reference counted. The helper + * font_data_put() decreases the reference counter by one. If this was + * the final reference, it frees the allocated memory. + * + * Internal font data is located in read-only memory. In this case + * the helper returns success without modifying the counter field. + * + * Returns: + * True if the font data's memory buffer has been freed, false otherwise. + */ +bool font_data_put(font_data_t *fd) +{ + unsigned int count; + + if (font_data_is_internal(fd)) + return false; /* never unref static data */ + + if (WARN_ON(!REFCOUNT(fd))) + return false; /* should never be 0 */ + + count = --REFCOUNT(fd); + if (!count) + font_data_free(fd); + + return !count; +} +EXPORT_SYMBOL_GPL(font_data_put); + /** * font_data_size - Return size of the font data in bytes * @fd: Font data -- cgit v1.2.3 From 1e3c49aa03fbfeea595ca0c9a4ebaf1e9cc078af Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:52 +0100 Subject: lib/fonts: Compare font data for equality with font_data_is_equal() Add font_data_is_equal() and update consoles to use it. Font data is equal if it has the same size and contains the same values on all bytes. Only fbcon uses a crc32 checksum. If set in both operands the checksums have to be equal. The new helper also guarantees to not compare internal fonts against fonts from user space. Internal fonts cannot be ref-counted, so making them equal to user-space fonts with the same byte sequence results in undefined behavior. The test only compares data buffers. Their interpretation is up to each console. Therefore remove a width test in fbcon_set_font(). 
v3: - rebase onto font_data_{get,put}() Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 4 +--- drivers/video/fbdev/core/fbcon.c | 7 +------ include/linux/font.h | 1 + lib/fonts/fonts.c | 26 ++++++++++++++++++++++++++ 4 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index dbbb787fc46e..db0228bce00e 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -529,9 +529,7 @@ static int newport_set_font(int unit, const struct console_font *op, /* check if font is already used by other console */ for (i = 0; i < MAX_NR_CONSOLES; i++) { - if (font_data[i] != FONT_DATA - && font_data_size(font_data[i]) == size - && !memcmp(font_data[i], new_data, size)) { + if (font_data_is_equal(font_data[i], new_data)) { font_data_put(new_data); /* current font is the same as the new one */ if (i == unit) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index ac59480c98cb..00255ac92e42 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2540,13 +2540,8 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, FNTSUM(new_data) = csum; /* Check if the same font is on some other console already */ for (i = first_fb_vc; i <= last_fb_vc; i++) { - struct vc_data *tmp = vc_cons[i].d; - if (fb_display[i].fontdata && - FNTSUM(fb_display[i].fontdata) == csum && - font_data_size(fb_display[i].fontdata) == size && - tmp->vc_font.width == w && - !memcmp(fb_display[i].fontdata, new_data, size)) { + font_data_is_equal(fb_display[i].fontdata, new_data)) { font_data_get(fb_display[i].fontdata); font_data_put(new_data); new_data = fb_display[i].fontdata; diff --git a/include/linux/font.h b/include/linux/font.h index dd319d0f0201..58bf3c64cabb 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -57,6 
+57,7 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) void font_data_get(font_data_t *fd); bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); +bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs); /* * Font description diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index d25efd8d6c31..3fb76d185647 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -110,6 +110,32 @@ unsigned int font_data_size(font_data_t *fd) } EXPORT_SYMBOL_GPL(font_data_size); +/** + * font_data_is_equal - Compares font data for equality + * @lhs: Left-hand side font data + * @rhs: Right-hand-side font data + * + * Font data is equal if it contains the same sequence of values. The + * helper also uses the checksum, if both arguments contain it. Font data + * coming from different origins, internal or from user space, is never + * equal. Allowing this would break reference counting. + * + * Returns: + * True if the given font data is equal, false otherwise. + */ +bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs) +{ + if (font_data_is_internal(lhs) != font_data_is_internal(rhs)) + return false; + if (font_data_size(lhs) != font_data_size(rhs)) + return false; + if (FNTSUM(lhs) && FNTSUM(rhs) && FNTSUM(lhs) != FNTSUM(rhs)) + return false; + + return !memcmp(lhs, rhs, FNTSIZE(lhs)); +} +EXPORT_SYMBOL_GPL(font_data_is_equal); + /* * Font lookup */ -- cgit v1.2.3 From 514d0de7cf403144d3e6c5b9fabb1ce4c15974ca Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:53 +0100 Subject: lib/fonts: Create font_data_t from struct console_font with font_data_import() Add font_data_import() and update consoles to use it. The implementation of font_data_import() is based on code from fbcon, which supports overflow checks and crc32 checksums. Fbcon uses the crc32 checksum. Newport_con now implements the same overflow checks as fbcon. As before, this console does not support checksums, which are optional. 
Newport_con can now also handle input font data with a vertical pitch other than 32 bytes. (The vertical pitch is the offset between two glyphs in the font data.) As an internal change, remove the const qualifier from the data field of struct font_data. This allows font_data_import() to write the data without type casting. For all users of the font data via font_data_t, the stored data is still read only. v3: - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 22 +++---------- drivers/video/fbdev/core/fbcon.c | 38 +++-------------------- include/linux/font.h | 6 +++- lib/fonts/fonts.c | 62 +++++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index db0228bce00e..e88ff3a93b77 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -501,31 +501,17 @@ static int newport_set_font(int unit, const struct console_font *op, { int w = op->width; int h = op->height; - int size = h * op->charcount; int i; font_data_t *new_data; - unsigned char *data = op->data, *p; /* ladis: when I grow up, there will be a day... 
and more sizes will * be supported ;-) */ - if ((w != 8) || (h != 16) || (vpitch != 32) - || (op->charcount != 256 && op->charcount != 512)) + if (w != 8 || h != 16 || (op->charcount != 256 && op->charcount != 512)) return -EINVAL; - if (!(new_data = kmalloc(FONT_EXTRA_WORDS * sizeof(int) + size, - GFP_USER))) return -ENOMEM; - - new_data += FONT_EXTRA_WORDS * sizeof(int); - FNTSIZE(new_data) = size; - REFCOUNT(new_data) = 1; /* usage counter */ - FNTSUM(new_data) = 0; - - p = (unsigned char *)font_data_buf(new_data); - for (i = 0; i < op->charcount; i++) { - memcpy(p, data, h); - data += 32; - p += h; - } + new_data = font_data_import(op, vpitch, NULL); + if (IS_ERR(new_data)) + return PTR_ERR(new_data); /* check if font is already used by other console */ for (i = 0; i < MAX_NR_CONSOLES; i++) { diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 00255ac92e42..53677c09a0ec 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2039,8 +2039,6 @@ static void updatescrollmode(struct fbcon_display *p, updatescrollmode_accel(p, info, vc); } -#define PITCH(w) (((w) + 7) >> 3) - static int fbcon_resize(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user) { @@ -2424,7 +2422,6 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); p->fontdata = data; vc->vc_font.data = font_data_buf(p->fontdata); - old_width = vc->vc_font.width; old_height = vc->vc_font.height; old_charcount = vc->vc_font.charcount; @@ -2482,11 +2479,8 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, unsigned charcount = font->charcount; int w = font->width; int h = font->height; - int size, alloc_size; - int i, csum, ret; + int i, ret; font_data_t *new_data; - const u8 *data = font->data; - int pitch = PITCH(font->width); /* Is there a reason why fbconsole couldn't handle any charcount >256? 
* If not this check should be changed to charcount < 256 */ @@ -2510,34 +2504,10 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, if (fbcon_invalid_charcount(info, charcount)) return -EINVAL; - /* Check for integer overflow in font size calculation */ - if (check_mul_overflow(h, pitch, &size) || - check_mul_overflow(size, charcount, &size)) - return -EINVAL; - - /* Check for overflow in allocation size calculation */ - if (check_add_overflow(FONT_EXTRA_WORDS * sizeof(int), size, &alloc_size)) - return -EINVAL; - - new_data = kmalloc(alloc_size, GFP_USER); - - if (!new_data) - return -ENOMEM; - - memset((u8 *)new_data, 0, FONT_EXTRA_WORDS * sizeof(int)); - - new_data += FONT_EXTRA_WORDS * sizeof(int); - FNTSIZE(new_data) = size; - REFCOUNT(new_data) = 1; /* usage counter */ - for (i=0; i< charcount; i++) { - memcpy((u8 *)new_data + i * h * pitch, data + i * vpitch * pitch, h * pitch); - } - - /* Since linux has a nice crc32 function use it for counting font - * checksums. 
*/ - csum = crc32(0, new_data, size); + new_data = font_data_import(font, vpitch, crc32); + if (IS_ERR(new_data)) + return PTR_ERR(new_data); - FNTSUM(new_data) = csum; /* Check if the same font is on some other console already */ for (i = first_fb_vc; i <= last_fb_vc; i++) { if (fb_display[i].fontdata && diff --git a/include/linux/font.h b/include/linux/font.h index 58bf3c64cabb..3eb4818402c5 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -13,6 +13,8 @@ #include +struct console_font; + /* * font_data_t and helpers */ @@ -54,6 +56,8 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) return (const unsigned char *)fd; } +font_data_t *font_data_import(const struct console_font *font, unsigned int vpitch, + u32 (*calc_csum)(u32, const void *, size_t)); void font_data_get(font_data_t *fd); bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); @@ -124,7 +128,7 @@ extern const struct font_desc *get_default_font(int xres, int yres, struct font_data { unsigned int extra[FONT_EXTRA_WORDS]; - const unsigned char data[]; + unsigned char data[]; } __packed; #endif /* _VIDEO_FONT_H */ diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index 3fb76d185647..16e75c3d2a0f 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -14,7 +14,9 @@ #include #include +#include #include +#include #include #include #include @@ -23,6 +25,8 @@ #include #endif +#define console_font_pitch(font) DIV_ROUND_UP((font)->width, 8) + /* * Helpers for font_data_t */ @@ -42,6 +46,64 @@ static void font_data_free(font_data_t *fd) kfree(to_font_data_struct(fd)); } +/** + * font_data_import - Allocates and initializes font data from user space + * @font: A font from user space + * @vpitch: The size of a single glyph in @font in bytes + * @calc_csum: An optional helper to calculate a checksum + * + * Font data from user space must be translated to the kernel's format. The + * font's glyph geometry and data is provided in @font. 
The parameter @vpitch + * gives the number of bytes per glyph, including trailing bytes. + * + * The parameter @calc_csum is optional. Fbcon passes crc32() to calculate the + * font data's checksum. + * + * Returns: + * Newly initialized font data on success, or a pointer-encoded errno value otherwise. + */ +font_data_t *font_data_import(const struct console_font *font, unsigned int vpitch, + u32 (*calc_csum)(u32, const void *, size_t)) +{ + unsigned int pitch = console_font_pitch(font); + unsigned int h = font->height; + unsigned int charcount = font->charcount; + const unsigned char *data = font->data; + u32 csum = 0; + struct font_data *font_data; + int size, alloc_size; + unsigned int i; + font_data_t *fd; + + /* Check for integer overflow in font-size calculation */ + if (check_mul_overflow(h, pitch, &size) || + check_mul_overflow(size, charcount, &size)) + return ERR_PTR(-EINVAL); + + /* Check for overflow in allocation size calculation */ + if (check_add_overflow(sizeof(*font_data), size, &alloc_size)) + return ERR_PTR(-EINVAL); + + font_data = kmalloc(alloc_size, GFP_USER); + if (!font_data) + return ERR_PTR(-ENOMEM); + memset(font_data->extra, 0, sizeof(font_data->extra)); + + for (i = 0; i < charcount; ++i) + memcpy(font_data->data + i * h * pitch, data + i * vpitch * pitch, h * pitch); + + if (calc_csum) + csum = calc_csum(0, font_data->data, size); + + fd = font_data->data; + REFCOUNT(fd) = 1; /* start with reference acquired */ + FNTSIZE(fd) = size; + FNTSUM(fd) = csum; + + return fd; +} +EXPORT_SYMBOL_GPL(font_data_import); + /** * font_data_get - Acquires a reference on font data * @fd: Font data -- cgit v1.2.3 From c37bd7c8d36f760c064de2639423866dc0270997 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:54 +0100 Subject: lib/fonts: Store font data for user space with font_data_export() Add font_data_export() and update consoles to use it. The helper font_data_export() is based on code in fbcon_get_font(). 
It extends the size of a single glyph to match the requested vpitch, which is usually 32 bytes for fonts from user space. Internal fonts have a pitch according to the glyph's height. The implementation of font_data_export() differs in several ways from the original code. The original implementation distinguished between different pitches of the font data. This is not necessary as the pitch is a parameter in the copying. There was also special handling for a font pitch of 3 bytes, which got expanded to 4 bytes (with trailing bits on each scanline). The logic originated from long before git history exists even in the historical tree. So it is not clear why this was implemented. It is not what user space expects. The setfont utility loads fonts with 3-byte pitches and expects to read such fonts with a 3-byte pitch. For any font width, the font pitch is always the width extended to the next multiple of 8. See [1] for the user-space font-reading code. With the changes to handling the font pitches, font_data_export() replaces the original code's various special cases with a single copying logic. 
v3: - fix typos (Helge) Signed-off-by: Thomas Zimmermann Link: https://github.com/legionus/kbd/blob/v2.9.0/src/libkfont/kdfontop.c#L73 # [1] Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fbcon.c | 57 ++-------------------------------------- include/linux/font.h | 1 + lib/fonts/fonts.c | 40 ++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 53677c09a0ec..8641b0b3edc4 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2282,68 +2282,15 @@ static bool fbcon_blank(struct vc_data *vc, enum vesa_blank_mode blank, static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigned int vpitch) { - struct fbcon_display *p = &fb_display[vc->vc_num]; - font_data_t *fontdata = p->fontdata; - u8 *data = font->data; - int i, j; + const struct fbcon_display *p = &fb_display[vc->vc_num]; font->width = vc->vc_font.width; font->height = vc->vc_font.height; if (font->height > vpitch) return -ENOSPC; font->charcount = vc->vc_hi_font_mask ? 
512 : 256; - if (!font->data) - return 0; - - if (font->width <= 8) { - j = vc->vc_font.height; - if (font->charcount * j > font_data_size(fontdata)) - return -EINVAL; - for (i = 0; i < font->charcount; i++) { - memcpy(data, fontdata, j); - memset(data + j, 0, vpitch - j); - data += vpitch; - fontdata += j; - } - } else if (font->width <= 16) { - j = vc->vc_font.height * 2; - if (font->charcount * j > font_data_size(fontdata)) - return -EINVAL; - - for (i = 0; i < font->charcount; i++) { - memcpy(data, fontdata, j); - memset(data + j, 0, 2*vpitch - j); - data += 2*vpitch; - fontdata += j; - } - } else if (font->width <= 24) { - if (font->charcount * (vc->vc_font.height * sizeof(u32)) > font_data_size(fontdata)) - return -EINVAL; - - for (i = 0; i < font->charcount; i++) { - for (j = 0; j < vc->vc_font.height; j++) { - *data++ = fontdata[0]; - *data++ = fontdata[1]; - *data++ = fontdata[2]; - fontdata += sizeof(u32); - } - memset(data, 0, 3 * (vpitch - j)); - data += 3 * (vpitch - j); - } - } else { - j = vc->vc_font.height * 4; - if (font->charcount * j > font_data_size(fontdata)) - return -EINVAL; - - for (i = 0; i < font->charcount; i++) { - memcpy(data, fontdata, j); - memset(data + j, 0, 4 * vpitch - j); - data += 4 * vpitch; - fontdata += j; - } - } - return 0; + return font_data_export(p->fontdata, font, vpitch); } /* set/clear vc_hi_font_mask and update vc attrs accordingly */ diff --git a/include/linux/font.h b/include/linux/font.h index 3eb4818402c5..d80db66a5c17 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -62,6 +62,7 @@ void font_data_get(font_data_t *fd); bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs); +int font_data_export(font_data_t *fd, struct console_font *font, unsigned int vpitch); /* * Font description diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index 16e75c3d2a0f..a861b375e35d 100644 --- a/lib/fonts/fonts.c +++ 
b/lib/fonts/fonts.c @@ -198,6 +198,46 @@ bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs) } EXPORT_SYMBOL_GPL(font_data_is_equal); +/** + * font_data_export - Stores font data for user space + * @fd: Font data + * @font: A font for user space + * @vpitch: The size of a single glyph in @font in bytes + * + * Store the font data given in @fd to the font in @font. Values and + * pointers in @font are pre-initialized. This helper mostly checks some + * corner cases and translates glyph sizes according to the value given + * @vpitch. + * + * Returns: + * 0 on success, or a negative errno code otherwise. + */ +int font_data_export(font_data_t *fd, struct console_font *font, unsigned int vpitch) +{ + const unsigned char *font_data = font_data_buf(fd); + unsigned char *data = font->data; + unsigned int pitch = console_font_pitch(font); + unsigned int glyphsize, i; + + if (!font->width || !font->height || !font->charcount || !font->data) + return 0; + + glyphsize = font->height * pitch; + + if (font->charcount * glyphsize > font_data_size(fd)) + return -EINVAL; + + for (i = 0; i < font->charcount; i++) { + memcpy(data, font_data, glyphsize); + memset(data + glyphsize, 0, pitch * vpitch - glyphsize); + data += pitch * vpitch; + font_data += glyphsize; + } + + return 0; +} +EXPORT_SYMBOL_GPL(font_data_export); + /* * Font lookup */ -- cgit v1.2.3 From db65872b38dc9f18a62669d6ae1e4ec7868a85a9 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:55 +0100 Subject: lib/fonts: Remove internal symbols and macros from public header file Define access macros for font_data_t in fonts.c. Define struct font_data and declare most of the font symbols in the internal header font.h, where they can only be seen by the font code. Also move font indices into internal font.h. They appear to be unused though. There is m86k assembly code that operates on the idx field, so leave them in place for now. 
List all built-in fonts in a separate section in the public header file. v2: - do not add config guards around font symbols (Helge) - keep declaration of built-in fonts in public header Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 57 ++++++++++++++-------------------------------- lib/fonts/font.h | 38 +++++++++++++++++++++++++++++++ lib/fonts/font_10x18.c | 2 +- lib/fonts/font_6x10.c | 3 ++- lib/fonts/font_6x11.c | 2 +- lib/fonts/font_6x8.c | 3 ++- lib/fonts/font_7x14.c | 2 +- lib/fonts/font_8x16.c | 3 ++- lib/fonts/font_8x8.c | 2 +- lib/fonts/font_acorn_8x8.c | 2 +- lib/fonts/font_mini_4x6.c | 2 +- lib/fonts/font_pearl_8x8.c | 2 +- lib/fonts/font_sun12x22.c | 3 ++- lib/fonts/font_sun8x16.c | 3 ++- lib/fonts/font_ter10x18.c | 4 +++- lib/fonts/font_ter16x32.c | 4 +++- lib/fonts/fonts.c | 8 ++++++- 17 files changed, 85 insertions(+), 55 deletions(-) create mode 100644 lib/fonts/font.h (limited to 'include') diff --git a/include/linux/font.h b/include/linux/font.h index d80db66a5c17..5401f07dd6ce 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -77,36 +77,6 @@ struct font_desc { int pref; }; -#define VGA8x8_IDX 0 -#define VGA8x16_IDX 1 -#define PEARL8x8_IDX 2 -#define VGA6x11_IDX 3 -#define FONT7x14_IDX 4 -#define FONT10x18_IDX 5 -#define SUN8x16_IDX 6 -#define SUN12x22_IDX 7 -#define ACORN8x8_IDX 8 -#define MINI4x6_IDX 9 -#define FONT6x10_IDX 10 -#define TER16x32_IDX 11 -#define FONT6x8_IDX 12 -#define TER10x18_IDX 13 - -extern const struct font_desc font_vga_8x8, - font_vga_8x16, - font_pearl_8x8, - font_vga_6x11, - font_7x14, - font_10x18, - font_sun_8x16, - font_sun_12x22, - font_acorn_8x8, - font_mini_4x6, - font_6x10, - font_ter_16x32, - font_6x8, - font_ter_10x18; - /* Find a font with a specific name */ extern const struct font_desc *find_font(const char *name); @@ -120,16 +90,23 @@ extern const struct font_desc *get_default_font(int xres, int yres, /* Max. 
length for the name of a predefined font */ #define MAX_FONT_NAME 32 -/* Extra word getters */ -#define REFCOUNT(fd) (((int *)(fd))[-1]) -#define FNTSIZE(fd) (((int *)(fd))[-2]) -#define FNTSUM(fd) (((int *)(fd))[-4]) - -#define FONT_EXTRA_WORDS 4 +/* + * Built-in fonts + */ -struct font_data { - unsigned int extra[FONT_EXTRA_WORDS]; - unsigned char data[]; -} __packed; +extern const struct font_desc font_10x18; +extern const struct font_desc font_6x10; +extern const struct font_desc font_6x8; +extern const struct font_desc font_7x14; +extern const struct font_desc font_acorn_8x8; +extern const struct font_desc font_mini_4x6; +extern const struct font_desc font_pearl_8x8; +extern const struct font_desc font_sun_12x22; +extern const struct font_desc font_sun_8x16; +extern const struct font_desc font_ter_10x18; +extern const struct font_desc font_ter_16x32; +extern const struct font_desc font_vga_6x11; +extern const struct font_desc font_vga_8x16; +extern const struct font_desc font_vga_8x8; #endif /* _VIDEO_FONT_H */ diff --git a/lib/fonts/font.h b/lib/fonts/font.h new file mode 100644 index 000000000000..4f1adf0b6b54 --- /dev/null +++ b/lib/fonts/font.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LIB_FONTS_FONT_H +#define _LIB_FONTS_FONT_H + +#include + +/* + * Font data + */ + +#define FONT_EXTRA_WORDS 4 + +struct font_data { + unsigned int extra[FONT_EXTRA_WORDS]; + unsigned char data[]; +} __packed; + +/* + * Built-in fonts + */ + +#define VGA8x8_IDX 0 +#define VGA8x16_IDX 1 +#define PEARL8x8_IDX 2 +#define VGA6x11_IDX 3 +#define FONT7x14_IDX 4 +#define FONT10x18_IDX 5 +#define SUN8x16_IDX 6 +#define SUN12x22_IDX 7 +#define ACORN8x8_IDX 8 +#define MINI4x6_IDX 9 +#define FONT6x10_IDX 10 +#define TER16x32_IDX 11 +#define FONT6x8_IDX 12 +#define TER10x18_IDX 13 + +#endif diff --git a/lib/fonts/font_10x18.c b/lib/fonts/font_10x18.c index 5d940db626e7..10edebc4bb74 100644 --- a/lib/fonts/font_10x18.c +++ b/lib/fonts/font_10x18.c @@ -4,7 +4,7 
@@ * by Jurriaan Kalkman 06-2005 * ********************************/ -#include +#include "font.h" #define FONTDATAMAX 9216 diff --git a/lib/fonts/font_6x10.c b/lib/fonts/font_6x10.c index e65df019e0d2..660d3a371b30 100644 --- a/lib/fonts/font_6x10.c +++ b/lib/fonts/font_6x10.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include + +#include "font.h" #define FONTDATAMAX 2560 diff --git a/lib/fonts/font_6x11.c b/lib/fonts/font_6x11.c index bd76b3f6b635..671487ccc172 100644 --- a/lib/fonts/font_6x11.c +++ b/lib/fonts/font_6x11.c @@ -5,7 +5,7 @@ /* */ /**********************************************/ -#include +#include "font.h" #define FONTDATAMAX (11*256) diff --git a/lib/fonts/font_6x8.c b/lib/fonts/font_6x8.c index 06ace7792521..5811ee07f4d8 100644 --- a/lib/fonts/font_6x8.c +++ b/lib/fonts/font_6x8.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include + +#include "font.h" #define FONTDATAMAX 2048 diff --git a/lib/fonts/font_7x14.c b/lib/fonts/font_7x14.c index a2f561c9fa04..0c7475d643c8 100644 --- a/lib/fonts/font_7x14.c +++ b/lib/fonts/font_7x14.c @@ -4,7 +4,7 @@ /* by Jurriaan Kalkman 05-2005 */ /**************************************/ -#include +#include "font.h" #define FONTDATAMAX 3584 diff --git a/lib/fonts/font_8x16.c b/lib/fonts/font_8x16.c index 06ae14088514..523e95c75569 100644 --- a/lib/fonts/font_8x16.c +++ b/lib/fonts/font_8x16.c @@ -5,9 +5,10 @@ /* */ /**********************************************/ -#include #include +#include "font.h" + #define FONTDATAMAX 4096 static const struct font_data fontdata_8x16 = { diff --git a/lib/fonts/font_8x8.c b/lib/fonts/font_8x8.c index 69570b8c31af..e5b697fc9675 100644 --- a/lib/fonts/font_8x8.c +++ b/lib/fonts/font_8x8.c @@ -5,7 +5,7 @@ /* */ /**********************************************/ -#include +#include "font.h" #define FONTDATAMAX 2048 diff --git a/lib/fonts/font_acorn_8x8.c b/lib/fonts/font_acorn_8x8.c index af5fa72aa8b7..36c51016769d 100644 --- a/lib/fonts/font_acorn_8x8.c +++ 
b/lib/fonts/font_acorn_8x8.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Acorn-like font definition, with PC graphics characters */ -#include +#include "font.h" #define FONTDATAMAX 2048 diff --git a/lib/fonts/font_mini_4x6.c b/lib/fonts/font_mini_4x6.c index cc21dc70cfd1..dc919c160dde 100644 --- a/lib/fonts/font_mini_4x6.c +++ b/lib/fonts/font_mini_4x6.c @@ -39,7 +39,7 @@ __END__; MSBit to LSBit = left to right. */ -#include +#include "font.h" #define FONTDATAMAX 1536 diff --git a/lib/fonts/font_pearl_8x8.c b/lib/fonts/font_pearl_8x8.c index ae98ca17982e..2438b374acea 100644 --- a/lib/fonts/font_pearl_8x8.c +++ b/lib/fonts/font_pearl_8x8.c @@ -10,7 +10,7 @@ /* */ /**********************************************/ -#include +#include "font.h" #define FONTDATAMAX 2048 diff --git a/lib/fonts/font_sun12x22.c b/lib/fonts/font_sun12x22.c index 91daf5ab8b6b..2afbc144bea8 100644 --- a/lib/fonts/font_sun12x22.c +++ b/lib/fonts/font_sun12x22.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include + +#include "font.h" #define FONTDATAMAX 11264 diff --git a/lib/fonts/font_sun8x16.c b/lib/fonts/font_sun8x16.c index 81bb4eeae04e..2b7b2d8e548a 100644 --- a/lib/fonts/font_sun8x16.c +++ b/lib/fonts/font_sun8x16.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include + +#include "font.h" #define FONTDATAMAX 4096 diff --git a/lib/fonts/font_ter10x18.c b/lib/fonts/font_ter10x18.c index 80356e9d56c7..3f30b4a211ab 100644 --- a/lib/fonts/font_ter10x18.c +++ b/lib/fonts/font_ter10x18.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#include + #include +#include "font.h" + #define FONTDATAMAX 9216 static const struct font_data fontdata_ter10x18 = { diff --git a/lib/fonts/font_ter16x32.c b/lib/fonts/font_ter16x32.c index 5baedc573dd6..93616cffe642 100644 --- a/lib/fonts/font_ter16x32.c +++ b/lib/fonts/font_ter16x32.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#include + #include +#include "font.h" + #define FONTDATAMAX 16384 static const struct 
font_data fontdata_ter16x32 = { diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index a861b375e35d..5938f542906b 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -13,7 +13,6 @@ */ #include -#include #include #include #include @@ -25,12 +24,19 @@ #include #endif +#include "font.h" + #define console_font_pitch(font) DIV_ROUND_UP((font)->width, 8) /* * Helpers for font_data_t */ +/* Extra word getters */ +#define REFCOUNT(fd) (((int *)(fd))[-1]) +#define FNTSIZE(fd) (((int *)(fd))[-2]) +#define FNTSUM(fd) (((int *)(fd))[-4]) + static struct font_data *to_font_data_struct(font_data_t *fd) { return container_of(fd, struct font_data, data[0]); -- cgit v1.2.3 From 7d776a36277ff2685ffc3dc7eff32002d0333ac9 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:47 +0000 Subject: ring-buffer: Add page statistics to the meta-page Add two fields pages_touched and pages_lost to the ring-buffer meta-page. Those fields are useful to get the number of used pages in the ring-buffer. Link: https://patch.msgid.link/20260309162516.2623589-2-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/uapi/linux/trace_mmap.h | 8 ++++---- kernel/trace/ring_buffer.c | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h index c102ef35d11e..e8185889a1c8 100644 --- a/include/uapi/linux/trace_mmap.h +++ b/include/uapi/linux/trace_mmap.h @@ -17,8 +17,8 @@ * @entries: Number of entries in the ring-buffer. * @overrun: Number of entries lost in the ring-buffer. * @read: Number of entries that have been read. - * @Reserved1: Internal use only. - * @Reserved2: Internal use only. + * @pages_lost: Number of pages overwritten by the writer. + * @pages_touched: Number of pages written by the writer. 
*/ struct trace_buffer_meta { __u32 meta_page_size; @@ -39,8 +39,8 @@ struct trace_buffer_meta { __u64 overrun; __u64 read; - __u64 Reserved1; - __u64 Reserved2; + __u64 pages_lost; + __u64 pages_touched; }; #define TRACE_MMAP_IOCTL_GET_READER _IO('R', 0x20) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 17d0ea0cc3e6..82b4df579670 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6154,6 +6154,8 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) meta->entries = local_read(&cpu_buffer->entries); meta->overrun = local_read(&cpu_buffer->overrun); meta->read = cpu_buffer->read; + meta->pages_lost = local_read(&cpu_buffer->pages_lost); + meta->pages_touched = local_read(&cpu_buffer->pages_touched); /* Some archs do not have data cache coherency between kernel and user-space */ flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); -- cgit v1.2.3 From 2e67fabd8b77c4f482df9b211bca1b495c6c2c24 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:49 +0000 Subject: ring-buffer: Introduce ring-buffer remotes Add ring-buffer remotes to support entities outside of the kernel (such as firmware or a hypervisor) that writes events into a ring-buffer using the tracefs format Require a description of the ring-buffer pages (struct trace_buffer_desc) and callbacks (swap_reader_page and reset) to set up the ring-buffer on the kernel side. Expect the remote entity to maintain and update the meta-page. 
Link: https://patch.msgid.link/20260309162516.2623589-4-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 58 +++++++++++ kernel/trace/ring_buffer.c | 233 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 283 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index d862fa610270..994f52b34344 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -251,4 +251,62 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu); int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); + +struct ring_buffer_desc { + int cpu; + unsigned int nr_page_va; /* excludes the meta page */ + unsigned long meta_va; + unsigned long page_va[] __counted_by(nr_page_va); +}; + +struct trace_buffer_desc { + int nr_cpus; + size_t struct_len; + char __data[]; /* list of ring_buffer_desc */ +}; + +static inline struct ring_buffer_desc *__next_ring_buffer_desc(struct ring_buffer_desc *desc) +{ + size_t len = struct_size(desc, page_va, desc->nr_page_va); + + return (struct ring_buffer_desc *)((void *)desc + len); +} + +static inline struct ring_buffer_desc *__first_ring_buffer_desc(struct trace_buffer_desc *desc) +{ + return (struct ring_buffer_desc *)(&desc->__data[0]); +} + +static inline size_t trace_buffer_desc_size(size_t buffer_size, unsigned int nr_cpus) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + struct ring_buffer_desc *rbdesc; + + return size_add(offsetof(struct trace_buffer_desc, __data), + size_mul(nr_cpus, struct_size(rbdesc, page_va, nr_pages))); +} + +#define for_each_ring_buffer_desc(__pdesc, __cpu, __trace_pdesc) \ + for (__pdesc = __first_ring_buffer_desc(__trace_pdesc), __cpu = 0; \ + 
(__cpu) < (__trace_pdesc)->nr_cpus; \ + (__cpu)++, __pdesc = __next_ring_buffer_desc(__pdesc)) + +struct ring_buffer_remote { + struct trace_buffer_desc *desc; + int (*swap_reader_page)(unsigned int cpu, void *priv); + int (*reset)(unsigned int cpu, void *priv); + void *priv; +}; + +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu); + +struct trace_buffer * +__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key); + +#define ring_buffer_alloc_remote(remote) \ +({ \ + static struct lock_class_key __key; \ + __ring_buffer_alloc_remote(remote, &__key); \ +}) #endif /* _LINUX_RING_BUFFER_H */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3d2804a7e8ab..88218377fa29 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -559,6 +559,8 @@ struct ring_buffer_per_cpu { struct trace_buffer_meta *meta_page; struct ring_buffer_cpu_meta *ring_meta; + struct ring_buffer_remote *remote; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -581,6 +583,8 @@ struct trace_buffer { struct ring_buffer_per_cpu **buffers; + struct ring_buffer_remote *remote; + struct hlist_node node; u64 (*clock)(void); @@ -2238,6 +2242,40 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, } } +static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu) +{ + struct ring_buffer_desc *desc, *end; + size_t len; + int i; + + if (!trace_desc) + return NULL; + + if (cpu >= trace_desc->nr_cpus) + return NULL; + + end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len); + desc = __first_ring_buffer_desc(trace_desc); + len = struct_size(desc, page_va, desc->nr_page_va); + desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu)); + + if (desc < end && desc->cpu == cpu) + return desc; + + /* Missing CPUs, need to linear search */ + 
for_each_ring_buffer_desc(desc, i, trace_desc) { + if (desc->cpu == cpu) + return desc; + } + + return NULL; +} + +static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, int page_id) +{ + return page_id > desc->nr_page_va ? NULL : (void *)desc->page_va[page_id]; +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { @@ -2245,6 +2283,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; + struct ring_buffer_desc *desc = NULL; long i; /* @@ -2273,6 +2312,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); + if (buffer->remote) { + desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu); + if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1))) + return -EINVAL; + } + for (i = 0; i < nr_pages; i++) { bpage = alloc_cpu_page(cpu_buffer->cpu); @@ -2297,6 +2342,16 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; + } else if (desc) { + void *p = ring_buffer_desc_page(desc, i + 1); + + if (WARN_ON(!p)) + goto free_pages; + + bpage->page = p; + bpage->range = 1; /* bpage->page can't be freed */ + bpage->id = i + 1; + cpu_buffer->subbuf_ids[i + 1] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); @@ -2394,6 +2449,30 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; + } else if (buffer->remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu); + + if (!desc) + goto fail_free_reader; + + cpu_buffer->remote = buffer->remote; + 
cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va; + cpu_buffer->nr_pages = nr_pages; + cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, + sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL); + if (!cpu_buffer->subbuf_ids) + goto fail_free_reader; + + /* Remote buffers are read-only and immutable */ + atomic_inc(&cpu_buffer->record_disabled); + atomic_inc(&cpu_buffer->resize_disabled); + + bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id); + if (!bpage->page) + goto fail_free_reader; + + bpage->range = 1; + cpu_buffer->subbuf_ids[0] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu, order); @@ -2453,6 +2532,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) irq_work_sync(&cpu_buffer->irq_work.work); + if (cpu_buffer->remote) + kfree(cpu_buffer->subbuf_ids); + free_buffer_page(cpu_buffer->reader_page); if (head) { @@ -2475,7 +2557,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, unsigned long scratch_size, - struct lock_class_key *key) + struct lock_class_key *key, + struct ring_buffer_remote *remote) { struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; @@ -2515,6 +2598,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; + cpu = raw_smp_processor_id(); + /* If start/end are specified, then that overrides size */ if (start && end) { unsigned long buffers_start; @@ -2570,6 +2655,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_end = end; rb_range_meta_init(buffer, nr_pages, scratch_size); + } else if (remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu); + + buffer->remote = remote; + /* The writer is remote. 
This ring-buffer is read-only */ + atomic_inc(&buffer->record_disabled); + nr_pages = desc->nr_page_va - 1; + if (nr_pages < 2) + goto fail_free_buffers; } else { /* need at least two pages */ @@ -2578,7 +2672,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, nr_pages = 2; } - cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) @@ -2620,7 +2713,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0, 0, key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2647,7 +2740,18 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag struct lock_class_key *key) { return alloc_buffer(size, flags, order, start, start + range_size, - scratch_size, key); + scratch_size, key, NULL); +} + +/** + * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote + * @remote: Contains a description of the ring-buffer pages and remote callbacks. + * @key: ring buffer reader_lock_key. 
+ */ +struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key) +{ + return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote); } void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) @@ -5274,6 +5378,16 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_overruns); +static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries)); + local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun)); + local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched)); + local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost)); + + return rb_num_of_entries(cpu_buffer); +} + static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -5428,7 +5542,43 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, } static struct buffer_page * -rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *new_reader, *prev_reader; + + if (!rb_read_remote_meta_page(cpu_buffer)) + return NULL; + + /* More to read on the reader page */ + if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) { + if (!cpu_buffer->reader_page->read) + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + return cpu_buffer->reader_page; + } + + prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu, + cpu_buffer->remote->priv)); + /* nr_pages doesn't include the reader page */ + if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages)) + return NULL; + + new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + 
WARN_ON_ONCE(prev_reader == new_reader); + + cpu_buffer->reader_page->page = new_reader->page; + cpu_buffer->reader_page->id = new_reader->id; + cpu_buffer->reader_page->read = 0; + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events; + + return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL; +} + +static struct buffer_page * +__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); @@ -5598,6 +5748,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) return reader; } +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) : + __rb_get_reader_page(cpu_buffer); +} + static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; @@ -5998,7 +6155,7 @@ ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return NULL; iter = kzalloc_obj(*iter, flags); @@ -6166,6 +6323,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; + if (cpu_buffer->remote) { + if (!cpu_buffer->remote->reset) + return; + + cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv); + rb_read_remote_meta_page(cpu_buffer); + + /* Read related values, not covered by the meta-page */ + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; + cpu_buffer->last_overrun = 0; + cpu_buffer->reader_page->read = 0; + + return; + } + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page @@ -6396,6 +6570,46 @@ bool ring_buffer_empty_cpu(struct 
trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + if (rb_read_remote_meta_page(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + + return 0; + } + + guard(cpus_read_lock)(); + + /* + * Make sure all the ring buffers are up to date before we start reading + * them. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + rb_read_remote_meta_page(cpu_buffer); + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + if (rb_num_of_entries(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + } + + return 0; +} + #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers @@ -6634,6 +6848,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, unsigned int commit; unsigned int read; u64 save_timestamp; + bool force_memcpy; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -1; @@ -6671,6 +6886,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; + force_memcpy = cpu_buffer->mapped || cpu_buffer->remote; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -6680,7 +6897,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || - cpu_buffer->mapped) { + force_memcpy) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -7259,7 +7476,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, 
unsigned long flags; int err; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; -- cgit v1.2.3 From 96e43537af5461b26f50904c6055046ba65d742f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:51 +0000 Subject: tracing: Introduce trace remotes A trace remote relies on ring-buffer remotes to read and control compatible tracing buffers, written by an entity such as firmware or a hypervisor. Add a Tracefs directory remotes/ that contains all instances of trace remotes. Each instance follows the same hierarchy as any other to ease the support by existing user-space tools. This currently does not provide any event support, which will come later. Link: https://patch.msgid.link/20260309162516.2623589-6-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 36 +++ kernel/trace/Kconfig | 3 + kernel/trace/Makefile | 1 + kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 6 + kernel/trace/trace_remote.c | 619 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 666 insertions(+), 1 deletion(-) create mode 100644 include/linux/trace_remote.h create mode 100644 kernel/trace/trace_remote.c (limited to 'include') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h new file mode 100644 index 000000000000..65b7e7b8267c --- /dev/null +++ b/include/linux/trace_remote.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_TRACE_REMOTE_H +#define _LINUX_TRACE_REMOTE_H + +#include + +/** + * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote + * @load_trace_buffer: Called before Tracefs accesses the trace buffer for the first + * time.
Must return a &trace_buffer_desc + * (most likely filled with trace_remote_alloc_buffer()) + * @unload_trace_buffer: + * Called once Tracefs has no use for the trace buffer + * (most likely call trace_remote_free_buffer()) + * @enable_tracing: Called on Tracefs tracing_on. It is expected from the + * remote to allow writing. + * @swap_reader_page: Called when Tracefs consumes a new page from a + * ring-buffer. It is expected from the remote to isolate a + * new reader-page from the @cpu ring-buffer. + */ +struct trace_remote_callbacks { + struct trace_buffer_desc *(*load_trace_buffer)(unsigned long size, void *priv); + void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); + int (*enable_tracing)(bool enable, void *priv); + int (*swap_reader_page)(unsigned int cpu, void *priv); +}; + +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); + +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask); + +void trace_remote_free_buffer(struct trace_buffer_desc *desc); + +#endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 49de13cae428..384dd36c8e29 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1281,4 +1281,7 @@ config HIST_TRIGGERS_DEBUG source "kernel/trace/rv/Kconfig" +config TRACE_REMOTE + bool + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 04096c21d06b..318923ce39f5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -128,4 +128,5 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ +obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebd996f8710e..e33cb3c39f6e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8589,7 +8589,7 @@ static struct dentry *tracing_dentry_percpu(struct 
trace_array *tr, int cpu) return tr->percpu_dir; } -static struct dentry * +struct dentry * trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, void *data, long cpu, const struct file_operations *fops) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b8f3804586a0..d6af4405b341 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -689,6 +689,12 @@ struct dentry *trace_create_file(const char *name, struct dentry *parent, void *data, const struct file_operations *fops); +struct dentry *trace_create_cpu_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + long cpu, + const struct file_operations *fops); /** diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c new file mode 100644 index 000000000000..8b06f730376e --- /dev/null +++ b/kernel/trace/trace_remote.c @@ -0,0 +1,619 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +#define TRACEFS_DIR "remotes" +#define TRACEFS_MODE_WRITE 0640 +#define TRACEFS_MODE_READ 0440 + +struct trace_remote_iterator { + struct trace_remote *remote; + struct trace_seq seq; + struct delayed_work poll_work; + unsigned long lost_events; + u64 ts; + int cpu; + int evt_cpu; +}; + +struct trace_remote { + struct trace_remote_callbacks *cbs; + void *priv; + struct trace_buffer *trace_buffer; + struct trace_buffer_desc *trace_buffer_desc; + unsigned long trace_buffer_size; + struct ring_buffer_remote rb_remote; + struct mutex lock; + unsigned int nr_readers; + unsigned int poll_ms; + bool tracing_on; +}; + +static bool trace_remote_loaded(struct trace_remote *remote) +{ + return !!remote->trace_buffer; +} + +static int trace_remote_load(struct trace_remote *remote) +{ + struct ring_buffer_remote *rb_remote = &remote->rb_remote; + struct trace_buffer_desc *desc; + + 
lockdep_assert_held(&remote->lock); + + if (trace_remote_loaded(remote)) + return 0; + + desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + rb_remote->desc = desc; + rb_remote->swap_reader_page = remote->cbs->swap_reader_page; + rb_remote->priv = remote->priv; + remote->trace_buffer = ring_buffer_alloc_remote(rb_remote); + if (!remote->trace_buffer) { + remote->cbs->unload_trace_buffer(desc, remote->priv); + return -ENOMEM; + } + + remote->trace_buffer_desc = desc; + + return 0; +} + +static void trace_remote_try_unload(struct trace_remote *remote) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + /* The buffer is being read or writable */ + if (remote->nr_readers || remote->tracing_on) + return; + + /* The buffer has readable data */ + if (!ring_buffer_empty(remote->trace_buffer)) + return; + + ring_buffer_free(remote->trace_buffer); + remote->trace_buffer = NULL; + remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv); +} + +static int trace_remote_enable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (remote->tracing_on) + return 0; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + ret = remote->cbs->enable_tracing(true, remote->priv); + if (ret) { + trace_remote_try_unload(remote); + return ret; + } + + remote->tracing_on = true; + + return 0; +} + +static int trace_remote_disable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (!remote->tracing_on) + return 0; + + ret = remote->cbs->enable_tracing(false, remote->priv); + if (ret) + return ret; + + ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS); + remote->tracing_on = false; + trace_remote_try_unload(remote); + + return 0; +} + +static ssize_t +tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + 
struct trace_remote *remote = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = val ? trace_remote_enable_tracing(remote) : trace_remote_disable_tracing(remote); + if (ret) + return ret; + + return cnt; +} +static int tracing_on_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%d\n", remote->tracing_on); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on); + +static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_remote *remote = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* KiB to Bytes */ + if (!val || check_shl_overflow(val, 10, &val)) + return -EINVAL; + + guard(mutex)(&remote->lock); + + if (trace_remote_loaded(remote)) + return -EBUSY; + + remote->trace_buffer_size = val; + + return cnt; +} + +static int buffer_size_kb_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%lu (%s)\n", remote->trace_buffer_size >> 10, + trace_remote_loaded(remote) ? 
"loaded" : "unloaded"); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb); + +static int trace_remote_get(struct trace_remote *remote, int cpu) +{ + int ret; + + if (remote->nr_readers == UINT_MAX) + return -EBUSY; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + remote->nr_readers++; + + return 0; +} + +static void trace_remote_put(struct trace_remote *remote) +{ + if (WARN_ON(!remote->nr_readers)) + return; + + remote->nr_readers--; + if (remote->nr_readers) + return; + + trace_remote_try_unload(remote); +} + +static void __poll_remote(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct trace_remote_iterator *iter; + + iter = container_of(dwork, struct trace_remote_iterator, poll_work); + ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu); + schedule_delayed_work((struct delayed_work *)work, + msecs_to_jiffies(iter->remote->poll_ms)); +} + +static struct trace_remote_iterator *trace_remote_iter(struct trace_remote *remote, int cpu) +{ + struct trace_remote_iterator *iter = NULL; + int ret; + + lockdep_assert_held(&remote->lock); + + + ret = trace_remote_get(remote, cpu); + if (ret) + return ERR_PTR(ret); + + /* Test the CPU */ + ret = ring_buffer_poll_remote(remote->trace_buffer, cpu); + if (ret) + goto err; + + iter = kzalloc_obj(*iter); + if (iter) { + iter->remote = remote; + iter->cpu = cpu; + trace_seq_init(&iter->seq); + INIT_DELAYED_WORK(&iter->poll_work, __poll_remote); + schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms)); + + return iter; + } + ret = -ENOMEM; + +err: + kfree(iter); + trace_remote_put(remote); + + return ERR_PTR(ret); +} + +static void trace_remote_iter_free(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote; + + if (!iter) + return; + + remote = iter->remote; + + lockdep_assert_held(&remote->lock); + + kfree(iter); + trace_remote_put(remote); +} + +static bool trace_remote_iter_read_event(struct trace_remote_iterator 
*iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + int cpu = iter->cpu; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + return false; + + if (!ring_buffer_peek(trace_buffer, cpu, &iter->ts, &iter->lost_events)) + return false; + + iter->evt_cpu = cpu; + return true; + } + + iter->ts = U64_MAX; + for_each_possible_cpu(cpu) { + unsigned long lost_events; + u64 ts; + + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + continue; + + if (!ring_buffer_peek(trace_buffer, cpu, &ts, &lost_events)) + continue; + + if (ts >= iter->ts) + continue; + + iter->ts = ts; + iter->evt_cpu = cpu; + iter->lost_events = lost_events; + } + + return iter->ts != U64_MAX; +} + +static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) +{ + unsigned long usecs_rem; + u64 ts = iter->ts; + + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->evt_cpu, iter->lost_events); + + do_div(ts, 1000); + usecs_rem = do_div(ts, USEC_PER_SEC); + + trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu, + ts, usecs_rem); + + return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0; +} + +static int trace_pipe_open(struct inode *inode, struct file *filp) +{ + struct trace_remote *remote = inode->i_private; + struct trace_remote_iterator *iter; + int cpu = RING_BUFFER_ALL_CPUS; + + if (inode->i_cdev) + cpu = (long)inode->i_cdev - 1; + + guard(mutex)(&remote->lock); + iter = trace_remote_iter(remote, cpu); + filp->private_data = iter; + + return IS_ERR(iter) ? 
PTR_ERR(iter) : 0; +} + +static int trace_pipe_release(struct inode *inode, struct file *filp) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_remote *remote = iter->remote; + + guard(mutex)(&remote->lock); + + trace_remote_iter_free(iter); + + return 0; +} + +static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + int ret; + +copy_to_user: + ret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (ret != -EBUSY) + return ret; + + trace_seq_init(&iter->seq); + + ret = ring_buffer_wait(trace_buffer, iter->cpu, 0, NULL, NULL); + if (ret < 0) + return ret; + + while (trace_remote_iter_read_event(iter)) { + int prev_len = iter->seq.seq.len; + + if (trace_remote_iter_print_event(iter)) { + iter->seq.seq.len = prev_len; + break; + } + + ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL); + } + + goto copy_to_user; +} + +static const struct file_operations trace_pipe_fops = { + .open = trace_pipe_open, + .read = trace_pipe_read, + .release = trace_pipe_release, +}; + +static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote) +{ + struct dentry *remote_d, *percpu_d, *d; + static struct dentry *root; + static DEFINE_MUTEX(lock); + bool root_inited = false; + int cpu; + + guard(mutex)(&lock); + + if (!root) { + root = tracefs_create_dir(TRACEFS_DIR, NULL); + if (!root) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n"); + return -ENOMEM; + } + root_inited = true; + } + + remote_d = tracefs_create_dir(name, root); + if (!remote_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name); + goto err; + } + + d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops); + if (!d) + goto err; + + d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote, + &buffer_size_kb_fops); + if 
(!d) + goto err; + + d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops); + if (!d) + goto err; + + percpu_d = tracefs_create_dir("per_cpu", remote_d); + if (!percpu_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name); + goto err; + } + + for_each_possible_cpu(cpu) { + struct dentry *cpu_d; + char cpu_name[16]; + + snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu); + cpu_d = tracefs_create_dir(cpu_name, percpu_d); + if (!cpu_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/percpu/cpu%d\n", + name, cpu); + goto err; + } + + d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu, + &trace_pipe_fops); + if (!d) + goto err; + } + + return 0; + +err: + if (root_inited) { + tracefs_remove(root); + root = NULL; + } else { + tracefs_remove(remote_d); + } + + return -ENOMEM; +} + +/** + * trace_remote_register() - Register a Tracefs remote + * @name: Name of the remote, used for the Tracefs remotes/ directory. + * @cbs: Set of callbacks used to control the remote. + * @priv: Private data, passed to each callback from @cbs. + * @events: Array of events. &remote_event.name and &remote_event.id must be + * filled by the caller. + * @nr_events: Number of events in the @events array. + * + * A trace remote is an entity, outside of the kernel (most likely firmware or + * hypervisor) capable of writing events into a Tracefs compatible ring-buffer. + * The kernel would then act as a reader. + * + * The registered remote will be found under the Tracefs directory + * remotes/. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv) +{ + struct trace_remote *remote; + + remote = kzalloc_obj(*remote); + if (!remote) + return -ENOMEM; + + remote->cbs = cbs; + remote->priv = priv; + remote->trace_buffer_size = 7 << 10; + remote->poll_ms = 100; + mutex_init(&remote->lock); + + if (trace_remote_init_tracefs(name, remote)) { + kfree(remote); + return -ENOMEM; + } + + return 0; +} +EXPORT_SYMBOL_GPL(trace_remote_register); + +/** + * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer() + * @desc: Descriptor of the per-CPU ring-buffers, originally filled by + * trace_remote_alloc_buffer() + * + * Most likely called from &trace_remote_callbacks.unload_trace_buffer. + */ +void trace_remote_free_buffer(struct trace_buffer_desc *desc) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + unsigned int id; + + free_page(rb_desc->meta_va); + + for (id = 0; id < rb_desc->nr_page_va; id++) + free_page(rb_desc->page_va[id]); + } +} +EXPORT_SYMBOL_GPL(trace_remote_free_buffer); + +/** + * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer + * @desc: Uninitialized trace_buffer_desc + * @desc_size: Size of the trace_buffer_desc. Must be at least equal to + * trace_buffer_desc_size() + * @buffer_size: Size in bytes of each per-CPU ring-buffer + * @cpumask: CPUs to allocate a ring-buffer for + * + * Helper to dynamically allocate a set of pages (enough to cover @buffer_size) + * for each CPU from @cpumask and fill @desc. Most likely called from + * &trace_remote_callbacks.load_trace_buffer. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + void *desc_end = desc + desc_size; + struct ring_buffer_desc *rb_desc; + int cpu, ret = -ENOMEM; + + if (desc_size < struct_size(desc, __data, 0)) + return -EINVAL; + + desc->nr_cpus = 0; + desc->struct_len = struct_size(desc, __data, 0); + + rb_desc = (struct ring_buffer_desc *)&desc->__data[0]; + + for_each_cpu(cpu, cpumask) { + unsigned int id; + + if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) { + ret = -EINVAL; + goto err; + } + + rb_desc->cpu = cpu; + rb_desc->nr_page_va = 0; + rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->meta_va) + goto err; + + for (id = 0; id < nr_pages; id++) { + rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->page_va[id]) + goto err; + + rb_desc->nr_page_va++; + } + desc->nr_cpus++; + desc->struct_len += offsetof(struct ring_buffer_desc, page_va); + desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va); + rb_desc = __next_ring_buffer_desc(rb_desc); + } + + return 0; + +err: + trace_remote_free_buffer(desc); + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer); -- cgit v1.2.3 From 9af4ab0e11e336e2671d303ffcc6578e3546d9fc Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:52 +0000 Subject: tracing: Add reset to trace remotes Allow to reset the trace remote buffer by writing to the Tracefs "trace" file. This is similar to the regular Tracefs interface. 
Link: https://patch.msgid.link/20260309162516.2623589-7-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 3 +++ kernel/trace/trace_remote.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 65b7e7b8267c..10ca03dc192b 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -17,6 +17,8 @@ * remote to allow writing. * @swap_reader_page: Called when Tracefs consumes a new page from a * ring-buffer. It is expected from the remote to isolate a + * @reset: Called on `echo 0 > trace`. It is expected from the + * remote to reset all ring-buffer pages. * new reader-page from the @cpu ring-buffer. */ struct trace_remote_callbacks { @@ -24,6 +26,7 @@ struct trace_remote_callbacks { void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); int (*enable_tracing)(bool enable, void *priv); int (*swap_reader_page)(unsigned int cpu, void *priv); + int (*reset)(unsigned int cpu, void *priv); }; int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c index 8b06f730376e..a7b94736dd38 100644 --- a/kernel/trace/trace_remote.c +++ b/kernel/trace/trace_remote.c @@ -63,6 +63,7 @@ static int trace_remote_load(struct trace_remote *remote) rb_remote->desc = desc; rb_remote->swap_reader_page = remote->cbs->swap_reader_page; rb_remote->priv = remote->priv; + rb_remote->reset = remote->cbs->reset; remote->trace_buffer = ring_buffer_alloc_remote(rb_remote); if (!remote->trace_buffer) { remote->cbs->unload_trace_buffer(desc, remote->priv); @@ -138,6 +139,21 @@ static int trace_remote_disable_tracing(struct trace_remote *remote) return 0; } +static void trace_remote_reset(struct trace_remote *remote, int 
cpu) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + if (cpu == RING_BUFFER_ALL_CPUS) + ring_buffer_reset(remote->trace_buffer); + else + ring_buffer_reset_cpu(remote->trace_buffer, cpu); + + trace_remote_try_unload(remote); +} + static ssize_t tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -414,6 +430,26 @@ static const struct file_operations trace_pipe_fops = { .release = trace_pipe_release, }; +static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_remote *remote = inode->i_private; + int cpu = RING_BUFFER_ALL_CPUS; + + if (inode->i_cdev) + cpu = (long)inode->i_cdev - 1; + + guard(mutex)(&remote->lock); + + trace_remote_reset(remote, cpu); + + return cnt; +} + +static const struct file_operations trace_fops = { + .write = trace_write, +}; + static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote) { struct dentry *remote_d, *percpu_d, *d; @@ -452,6 +488,10 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo if (!d) goto err; + d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops); + if (!d) + goto err; + percpu_d = tracefs_create_dir("per_cpu", remote_d); if (!percpu_d) { pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name); @@ -474,6 +514,11 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo &trace_pipe_fops); if (!d) goto err; + + d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu, + &trace_fops); + if (!d) + goto err; } return 0; -- cgit v1.2.3 From bf2ba0f8ca1af14aaaa765cbb93caf564d383aad Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:54 +0000 Subject: tracing: Add init callback to trace remotes Add a .init callback so the trace remote callers can add entries to the tracefs
directory. Link: https://patch.msgid.link/20260309162516.2623589-9-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 4 ++++ kernel/trace/trace_remote.c | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 10ca03dc192b..090c58b7d92b 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -3,10 +3,13 @@ #ifndef _LINUX_TRACE_REMOTE_H #define _LINUX_TRACE_REMOTE_H +#include #include /** * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote + * @init: Called once the remote has been registered. Allows the + * caller to extend the Tracefs remote directory * @load_trace_buffer: Called before Tracefs accesses the trace buffer for the first * time. Must return a &trace_buffer_desc * (most likely filled with trace_remote_alloc_buffer()) @@ -22,6 +25,7 @@ * new reader-page from the @cpu ring-buffer. */ struct trace_remote_callbacks { + int (*init)(struct dentry *d, void *priv); struct trace_buffer_desc *(*load_trace_buffer)(unsigned long size, void *priv); void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); int (*enable_tracing)(bool enable, void *priv); diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c index 039ba71c3b3e..294d051dcef1 100644 --- a/kernel/trace/trace_remote.c +++ b/kernel/trace/trace_remote.c @@ -863,6 +863,7 @@ err: int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv) { struct trace_remote *remote; + int ret; remote = kzalloc_obj(*remote); if (!remote) @@ -880,7 +881,11 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, return -ENOMEM; } - return 0; + ret = cbs->init ? 
cbs->init(remote->dentry, priv) : 0; + if (ret) + pr_err("Init failed for trace remote '%s' (%d)\n", name, ret); + + return ret; } EXPORT_SYMBOL_GPL(trace_remote_register); -- cgit v1.2.3 From 072529158e604cc964feb78dcf094c6975828146 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:55 +0000 Subject: tracing: Add events to trace remotes An event is predefined point in the writer code that allows to log data. Following the same scheme as kernel events, add remote events, described to user-space within the events/ tracefs directory found in the corresponding trace remote. Remote events are expected to be described during the trace remote registration. Add also a .enable_event callback for trace_remote to toggle the event logging, if supported. Link: https://patch.msgid.link/20260309162516.2623589-10-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 7 +- include/linux/trace_remote_event.h | 23 ++++ kernel/trace/trace_remote.c | 264 ++++++++++++++++++++++++++++++++++++- 3 files changed, 288 insertions(+), 6 deletions(-) create mode 100644 include/linux/trace_remote_event.h (limited to 'include') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 090c58b7d92b..fcd1d46ea466 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -5,6 +5,7 @@ #include #include +#include /** * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote @@ -23,6 +24,8 @@ * @reset: Called on `echo 0 > trace`. It is expected from the * remote to reset all ring-buffer pages. * new reader-page from the @cpu ring-buffer. + * @enable_event: Called on events/event_name/enable. It is expected from + * the remote to allow the writing event @id. 
*/ struct trace_remote_callbacks { int (*init)(struct dentry *d, void *priv); @@ -31,9 +34,11 @@ struct trace_remote_callbacks { int (*enable_tracing)(bool enable, void *priv); int (*swap_reader_page)(unsigned int cpu, void *priv); int (*reset)(unsigned int cpu, void *priv); + int (*enable_event)(unsigned short id, bool enable, void *priv); }; -int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events); int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, const struct cpumask *cpumask); diff --git a/include/linux/trace_remote_event.h b/include/linux/trace_remote_event.h new file mode 100644 index 000000000000..a4449008a075 --- /dev/null +++ b/include/linux/trace_remote_event.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_TRACE_REMOTE_EVENTS_H +#define _LINUX_TRACE_REMOTE_EVENTS_H + +struct trace_remote; +struct trace_event_fields; + +struct remote_event_hdr { + unsigned short id; +}; + +#define REMOTE_EVENT_NAME_MAX 30 +struct remote_event { + char name[REMOTE_EVENT_NAME_MAX]; + unsigned short id; + bool enabled; + struct trace_remote *remote; + struct trace_event_fields *fields; + char *print_fmt; + void (*print)(void *evt, struct trace_seq *seq); +}; +#endif diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c index 294d051dcef1..0d0af53c0ce9 100644 --- a/kernel/trace/trace_remote.c +++ b/kernel/trace/trace_remote.c @@ -31,6 +31,7 @@ struct trace_remote_iterator { u64 ts; struct ring_buffer_iter *rb_iter; struct ring_buffer_iter **rb_iters; + struct remote_event_hdr *evt; int cpu; int evt_cpu; loff_t pos; @@ -42,6 +43,10 @@ struct trace_remote { void *priv; struct trace_buffer *trace_buffer; struct trace_buffer_desc *trace_buffer_desc; + struct dentry *dentry; + struct eventfs_inode *eventfs; + 
struct remote_event *events; + unsigned long nr_events; unsigned long trace_buffer_size; struct ring_buffer_remote rb_remote; struct mutex lock; @@ -168,7 +173,8 @@ static void trace_remote_reset(struct trace_remote *remote, int cpu) static ssize_t tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_remote *remote = filp->private_data; + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; unsigned long val; int ret; @@ -197,7 +203,8 @@ DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on); static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_remote *remote = filp->private_data; + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; unsigned long val; int ret; @@ -484,16 +491,19 @@ __peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) { struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + struct ring_buffer_event *rb_evt; int cpu = iter->cpu; if (cpu != RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(trace_buffer, cpu)) return false; - if (!__peek_event(iter, cpu, &iter->ts, &iter->lost_events)) + rb_evt = __peek_event(iter, cpu, &iter->ts, &iter->lost_events); + if (!rb_evt) return false; iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); return true; } @@ -505,7 +515,8 @@ static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) if (ring_buffer_empty_cpu(trace_buffer, cpu)) continue; - if (!__peek_event(iter, cpu, &ts, &lost_events)) + rb_evt = __peek_event(iter, cpu, &ts, &lost_events); + if (!rb_evt) continue; if (ts >= iter->ts) @@ -513,6 +524,7 @@ static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) iter->ts = ts; iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); iter->lost_events = 
lost_events; } @@ -533,8 +545,11 @@ static void trace_remote_iter_move(struct trace_remote_iterator *iter) } } +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id); + static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) { + struct remote_event *evt; unsigned long usecs_rem; u64 ts = iter->ts; @@ -548,6 +563,12 @@ static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu, ts, usecs_rem); + evt = trace_remote_find_event(iter->remote, iter->evt->id); + if (!evt) + trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id); + else + evt->print(iter->evt, &iter->seq); + return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0; } @@ -829,6 +850,8 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo goto err; } + remote->dentry = remote_d; + return 0; err: @@ -842,6 +865,9 @@ err: return -ENOMEM; } +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events); + /** * trace_remote_register() - Register a Tracefs remote * @name: Name of the remote, used for the Tracefs remotes/ directory. @@ -860,7 +886,8 @@ err: * * Return: 0 on success, negative error code on failure. */ -int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv) +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events) { struct trace_remote *remote; int ret; @@ -881,6 +908,13 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, return -ENOMEM; } + ret = trace_remote_register_events(name, remote, events, nr_events); + if (ret) { + pr_err("Failed to register events for trace remote '%s' (%d)\n", + name, ret); + return ret; + } + ret = cbs->init ? 
cbs->init(remote->dentry, priv) : 0; if (ret) pr_err("Init failed for trace remote '%s' (%d)\n", name, ret); @@ -976,3 +1010,223 @@ err: return ret; } EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer); + +static int +trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (evt->enabled == enable) + return 0; + + ret = remote->cbs->enable_event(evt->id, enable, remote->priv); + if (ret) + return ret; + + evt->enabled = enable; + + return 0; +} + +static int remote_event_enable_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->enabled); + + return 0; +} + +static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct remote_event *evt = seq->private; + struct trace_remote *remote = evt->remote; + u8 enable; + int ret; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = trace_remote_enable_event(remote, evt, enable); + if (ret) + return ret; + + return count; +} +DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable); + +static int remote_event_id_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_id); + +static int remote_event_format_show(struct seq_file *s, void *unused) +{ + size_t offset = sizeof(struct remote_event_hdr); + struct remote_event *evt = s->private; + struct trace_event_fields *field; + + seq_printf(s, "name: %s\n", evt->name); + seq_printf(s, "ID: %d\n", evt->id); + seq_puts(s, + "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n"); + + field = &evt->fields[0]; + while (field->name) { + seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n", + field->type, 
field->name, offset, field->size, + field->is_signed); + offset += field->size; + field++; + } + + if (field != &evt->fields[0]) + seq_puts(s, "\n"); + + seq_printf(s, "print fmt: %s\n", evt->print_fmt); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_format); + +static int remote_event_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_event_enable_fops; + return 1; + } + + if (!strcmp(name, "id")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_id_fops; + return 1; + } + + if (!strcmp(name, "format")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_format_fops; + return 1; + } + + return 0; +} + +static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote, + struct remote_event *evt) +{ + struct eventfs_inode *eventfs = remote->eventfs; + static struct eventfs_entry entries[] = { + { + .name = "enable", + .callback = remote_event_callback, + }, { + .name = "id", + .callback = remote_event_callback, + }, { + .name = "format", + .callback = remote_event_callback, + } + }; + bool eventfs_create = false; + + if (!eventfs) { + eventfs = eventfs_create_events_dir("events", remote->dentry, NULL, 0, NULL); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + /* + * Create similar hierarchy as local events even if a single system is supported at + * the moment + */ + eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + remote->eventfs = eventfs; + eventfs_create = true; + } + + eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt); + if (IS_ERR(eventfs)) { + if (eventfs_create) { + eventfs_remove_events_dir(remote->eventfs); + remote->eventfs = NULL; + } + return PTR_ERR(eventfs); + } + + return 0; +} + +static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events, + 
size_t nr_events) +{ + int i; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + if (evt->remote) + return -EEXIST; + + evt->remote = remote; + + /* We need events to be sorted for efficient lookup */ + if (i && evt->id <= events[i - 1].id) + return -EINVAL; + } + + remote->events = events; + remote->nr_events = nr_events; + + return 0; +} + +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events) +{ + int i, ret; + + ret = trace_remote_attach_events(remote, events, nr_events); + if (ret) + return ret; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + ret = trace_remote_init_eventfs(remote_name, remote, evt); + if (ret) + pr_warn("Failed to init eventfs for event '%s' (%d)", + evt->name, ret); + } + + return 0; +} + +static int __cmp_events(const void *key, const void *data) +{ + const struct remote_event *evt = data; + int id = (int)((long)key); + + return id - (int)evt->id; +} + +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id) +{ + return bsearch((const void *)(unsigned long)id, remote->events, remote->nr_events, + sizeof(*remote->events), __cmp_events); +} -- cgit v1.2.3 From 5f3efd1dcebc35d44cce39630ae00980a45d9247 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:57 +0000 Subject: tracing: Add helpers to create trace remote events Declaring remote events can be cumbersome let's add a set of macros to simplify developers life. 
The declaration of a remote event is very similar to kernel's events: REMOTE_EVENT(name, id, RE_STRUCT( re_field(u64 foo) ), RE_PRINTK("foo=%llu", __entry->foo) ) Link: https://patch.msgid.link/20260309162516.2623589-12-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote_event.h | 10 +++++ include/trace/define_remote_events.h | 73 ++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 include/trace/define_remote_events.h (limited to 'include') diff --git a/include/linux/trace_remote_event.h b/include/linux/trace_remote_event.h index a4449008a075..c8ae1e1f5e72 100644 --- a/include/linux/trace_remote_event.h +++ b/include/linux/trace_remote_event.h @@ -5,6 +5,7 @@ struct trace_remote; struct trace_event_fields; +struct trace_seq; struct remote_event_hdr { unsigned short id; @@ -20,4 +21,13 @@ struct remote_event { char *print_fmt; void (*print)(void *evt, struct trace_seq *seq); }; + +#define RE_STRUCT(__args...) __args +#define re_field(__type, __field) __type __field; + +#define REMOTE_EVENT_FORMAT(__name, __struct) \ + struct remote_event_format_##__name { \ + struct remote_event_hdr hdr; \ + __struct \ + } #endif diff --git a/include/trace/define_remote_events.h b/include/trace/define_remote_events.h new file mode 100644 index 000000000000..676e803dc144 --- /dev/null +++ b/include/trace/define_remote_events.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include + +#define REMOTE_EVENT_INCLUDE(__file) __stringify(../../__file) + +#ifdef REMOTE_EVENT_SECTION +# define __REMOTE_EVENT_SECTION(__name) __used __section(REMOTE_EVENT_SECTION"."#__name) +#else +# define __REMOTE_EVENT_SECTION(__name) +#endif + +#define REMOTE_PRINTK_COUNT_ARGS(__args...) 
\ + __COUNT_ARGS(, ##__args, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0) + +#define __remote_printk0() \ + trace_seq_putc(seq, '\n') + +#define __remote_printk1(__fmt) \ + trace_seq_puts(seq, " " __fmt "\n") \ + +#define __remote_printk2(__fmt, __args...) \ +do { \ + trace_seq_putc(seq, ' '); \ + trace_seq_printf(seq, __fmt, __args); \ + trace_seq_putc(seq, '\n'); \ +} while (0) + +/* Apply the appropriate trace_seq sequence according to the number of arguments */ +#define remote_printk(__args...) \ + CONCATENATE(__remote_printk, REMOTE_PRINTK_COUNT_ARGS(__args))(__args) + +#define RE_PRINTK(__args...) __args + +#define REMOTE_EVENT(__name, __id, __struct, __printk) \ + REMOTE_EVENT_FORMAT(__name, __struct); \ + static void remote_event_print_##__name(void *evt, struct trace_seq *seq) \ + { \ + struct remote_event_format_##__name __maybe_unused *__entry = evt; \ + trace_seq_puts(seq, #__name); \ + remote_printk(__printk); \ + } +#include REMOTE_EVENT_INCLUDE(REMOTE_EVENT_INCLUDE_FILE) + +#undef REMOTE_EVENT +#undef RE_PRINTK +#undef re_field +#define re_field(__type, __field) \ + { \ + .type = #__type, .name = #__field, \ + .size = sizeof(__type), .align = __alignof__(__type), \ + .is_signed = is_signed_type(__type), \ + }, +#define __entry REC +#define RE_PRINTK(__fmt, __args...) 
"\"" __fmt "\", " __stringify(__args) +#define REMOTE_EVENT(__name, __id, __struct, __printk) \ + static struct trace_event_fields remote_event_fields_##__name[] = { \ + __struct \ + {} \ + }; \ + static char remote_event_print_fmt_##__name[] = __printk; \ + static struct remote_event __REMOTE_EVENT_SECTION(__name) \ + remote_event_##__name = { \ + .name = #__name, \ + .id = __id, \ + .fields = remote_event_fields_##__name, \ + .print_fmt = remote_event_print_fmt_##__name, \ + .print = remote_event_print_##__name, \ + } +#include REMOTE_EVENT_INCLUDE(REMOTE_EVENT_INCLUDE_FILE) -- cgit v1.2.3 From 93ae1b76fff9e745f870a2f2cd32f472328c4a8f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:58 +0000 Subject: ring-buffer: Export buffer_data_page and macros In preparation for allowing the writing of ring-buffer compliant pages outside of ring_buffer.c, move buffer_data_page and timestamps encoding macros into the publicly available ring_buffer_types.h. Link: https://patch.msgid.link/20260309162516.2623589-13-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer_types.h | 41 +++++++++++++++++++++++++++++++++++++++ kernel/trace/ring_buffer.c | 36 +--------------------------------- 2 files changed, 42 insertions(+), 35 deletions(-) create mode 100644 include/linux/ring_buffer_types.h (limited to 'include') diff --git a/include/linux/ring_buffer_types.h b/include/linux/ring_buffer_types.h new file mode 100644 index 000000000000..54577021a49d --- /dev/null +++ b/include/linux/ring_buffer_types.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RING_BUFFER_TYPES_H +#define _LINUX_RING_BUFFER_TYPES_H + +#include + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* + * We need to fit the time_stamp delta into 27 bits. 
+ */ +static inline bool test_time_stamp(u64 delta) +{ + return !!(delta & TS_DELTA_TEST); +} + +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) +#define RB_ALIGNMENT 4U +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ + +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) + +struct buffer_data_page { + u64 time_stamp; /* page time stamp */ + local_t commit; /* write committed index */ + unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ +}; +#endif diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 605142e06863..96e0d80d492b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * * Copyright (C) 2008 Steven Rostedt */ +#include #include #include #include @@ -157,23 +158,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s) /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) -#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) - /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... 
RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -316,10 +300,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) -#define TS_SHIFT 27 -#define TS_MASK ((1ULL << TS_SHIFT) - 1) -#define TS_DELTA_TEST (~TS_MASK) - static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -338,12 +318,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) #define RB_MISSED_MASK (3 << 30) -struct buffer_data_page { - u64 time_stamp; /* page time stamp */ - local_t commit; /* write committed index */ - unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ -}; - struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ @@ -437,14 +411,6 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order) return dpage; } -/* - * We need to fit the time_stamp delta into 27 bits. - */ -static inline bool test_time_stamp(u64 delta) -{ - return !!(delta & TS_DELTA_TEST); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; -- cgit v1.2.3 From 34e5b958bdad0f9cf16306368bbc2dc5b2a50143 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:59 +0000 Subject: tracing: Introduce simple_ring_buffer Add a simple implementation of the kernel ring-buffer. This intends to be used later by ring-buffer remotes such as the pKVM hypervisor, hence the need for a cut down version (write only) without any dependency. 
Link: https://patch.msgid.link/20260309162516.2623589-14-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/simple_ring_buffer.h | 57 +++++ kernel/trace/Kconfig | 3 + kernel/trace/Makefile | 1 + kernel/trace/simple_ring_buffer.c | 464 +++++++++++++++++++++++++++++++++++++ 4 files changed, 525 insertions(+) create mode 100644 include/linux/simple_ring_buffer.h create mode 100644 kernel/trace/simple_ring_buffer.c (limited to 'include') diff --git a/include/linux/simple_ring_buffer.h b/include/linux/simple_ring_buffer.h new file mode 100644 index 000000000000..2c4c0ae336bc --- /dev/null +++ b/include/linux/simple_ring_buffer.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SIMPLE_RING_BUFFER_H +#define _LINUX_SIMPLE_RING_BUFFER_H + +#include +#include +#include +#include + +/* + * Ideally those struct would stay private but the caller needs to know + * the allocation size for simple_ring_buffer_init(). 
+ */ +struct simple_buffer_page { + struct list_head link; + struct buffer_data_page *page; + u64 entries; + u32 write; + u32 id; +}; + +struct simple_rb_per_cpu { + struct simple_buffer_page *tail_page; + struct simple_buffer_page *reader_page; + struct simple_buffer_page *head_page; + struct simple_buffer_page *bpages; + struct trace_buffer_meta *meta; + u32 nr_pages; + +#define SIMPLE_RB_UNAVAILABLE 0 +#define SIMPLE_RB_READY 1 +#define SIMPLE_RB_WRITING 2 + u32 status; + + u64 last_overrun; + u64 write_stamp; + + struct simple_rb_cbs *cbs; +}; + +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc); + +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer); + +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp); + +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer); + +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable); + +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer); + +int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer); + +#endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 384dd36c8e29..edbdd7b38f61 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1284,4 +1284,7 @@ source "kernel/trace/rv/Kconfig" config TRACE_REMOTE bool +config SIMPLE_RING_BUFFER + bool + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 318923ce39f5..2e39b09398b3 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -129,4 +129,5 @@ obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o +obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o libftrace-y := ftrace.o diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c new file mode 100644 index 
000000000000..15df9781411b --- /dev/null +++ b/kernel/trace/simple_ring_buffer.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort + */ + +#include +#include + +#include +#include + +enum simple_rb_link_type { + SIMPLE_RB_LINK_NORMAL = 0, + SIMPLE_RB_LINK_HEAD = 1, + SIMPLE_RB_LINK_HEAD_MOVING +}; + +#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING) + +static void simple_bpage_set_head_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + link &= SIMPLE_RB_LINK_MASK; + link |= SIMPLE_RB_LINK_HEAD; + + /* + * Paired with simple_rb_find_head() to order access between the head + * link and overrun. It ensures we always report an up-to-date value + * after swapping the reader page. + */ + smp_store_release(&bpage->link.next, (struct list_head *)link); +} + +static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage, + struct simple_buffer_page *dst, + enum simple_rb_link_type new_type) +{ + unsigned long *link = (unsigned long *)(&bpage->link.next); + unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD; + unsigned long new = (unsigned long)(&dst->link) | new_type; + + return try_cmpxchg(link, &old, new); +} + +static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK)); +} + +static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link) +{ + unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK; + + return container_of((struct list_head *)ptr, struct simple_buffer_page, link); +} + +static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage) +{ + return simple_bpage_from_link(bpage->link.next); +} + +static void simple_bpage_reset(struct simple_buffer_page *bpage) +{ + 
bpage->write = 0; + bpage->entries = 0; + + local_set(&bpage->page->commit, 0); +} + +static void simple_bpage_init(struct simple_buffer_page *bpage, unsigned long page) +{ + INIT_LIST_HEAD(&bpage->link); + bpage->page = (struct buffer_data_page *)page; + + simple_bpage_reset(bpage); +} + +#define simple_rb_meta_inc(__meta, __inc) \ + WRITE_ONCE((__meta), (__meta + __inc)) + +static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer) +{ + return !!cpu_buffer->bpages; +} + +static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer) +{ + int retry = cpu_buffer->nr_pages * 2; + struct simple_buffer_page *head; + + head = cpu_buffer->head_page; + + while (retry--) { + unsigned long link; + +spin: + /* See smp_store_release in simple_bpage_set_head_link() */ + link = (unsigned long)smp_load_acquire(&head->link.prev->next); + + switch (link & ~SIMPLE_RB_LINK_MASK) { + /* Found the head */ + case SIMPLE_RB_LINK_HEAD: + cpu_buffer->head_page = head; + return 0; + /* The writer caught the head, we can spin, that won't be long */ + case SIMPLE_RB_LINK_HEAD_MOVING: + goto spin; + } + + head = simple_bpage_next_page(head); + } + + return -EBUSY; +} + +/** + * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader + * @cpu_buffer: A simple_rb_per_cpu + * + * This function enables consuming reading. It ensures the current head page will not be overwritten + * and can be safely read. + * + * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded or -EBUSY if we failed to catch the + * head page. 
+ */ +int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *last, *head, *reader; + unsigned long overrun; + int retry = 8; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + reader = cpu_buffer->reader_page; + + do { + /* Run after the writer to find the head */ + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + head = cpu_buffer->head_page; + + /* Connect the reader page around the header page */ + reader->link.next = head->link.next; + reader->link.prev = head->link.prev; + + /* The last page before the head */ + last = simple_bpage_from_link(head->link.prev); + + /* The reader page points to the new header page */ + simple_bpage_set_head_link(reader); + + overrun = cpu_buffer->meta->overrun; + } while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--); + + if (!retry) + return -EINVAL; + + cpu_buffer->head_page = simple_bpage_from_link(reader->link.next); + cpu_buffer->head_page->link.prev = &reader->link; + cpu_buffer->reader_page = head; + cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun; + cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id; + cpu_buffer->last_overrun = overrun; + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page); + +static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *tail, *new_tail; + + tail = cpu_buffer->tail_page; + new_tail = simple_bpage_next_page(tail); + + if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) { + /* + * Oh no! we've caught the head. There is none anymore and + * swap_reader will spin until we set the new one. Overrun must + * be written first, to make sure we report the correct number + * of lost events. 
+ */ + simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries); + simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1); + + simple_bpage_set_head_link(new_tail); + simple_bpage_set_normal_link(tail); + } + + simple_bpage_reset(new_tail); + cpu_buffer->tail_page = new_tail; + + simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1); + + return new_tail; +} + +static unsigned long rb_event_size(unsigned long length) +{ + struct ring_buffer_event *event; + + return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]); +} + +static struct ring_buffer_event * +rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + + return (struct ring_buffer_event *)((unsigned long)event + 8); +} + +static struct ring_buffer_event * +simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp) +{ + unsigned long ts_ext_size = 0, event_size = rb_event_size(length); + struct simple_buffer_page *tail = cpu_buffer->tail_page; + struct ring_buffer_event *event; + u32 write, prev_write; + u64 time_delta; + + time_delta = timestamp - cpu_buffer->write_stamp; + + if (test_time_stamp(time_delta)) + ts_ext_size = 8; + + prev_write = tail->write; + write = prev_write + event_size + ts_ext_size; + + if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE))) + tail = simple_rb_move_tail(cpu_buffer); + + if (!tail->entries) { + tail->page->time_stamp = timestamp; + time_delta = 0; + ts_ext_size = 0; + write = event_size; + prev_write = 0; + } + + tail->write = write; + tail->entries++; + + cpu_buffer->write_stamp = timestamp; + + event = (struct ring_buffer_event *)(tail->page->data + prev_write); + if (ts_ext_size) { + event = rb_event_add_ts_extend(event, time_delta); + time_delta = 0; + } + + event->type_len = 0; + event->time_delta = time_delta; + event->array[0] = event_size - RB_EVNT_HDR_SIZE; + + return event; +} + 
+/** + * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @length: Size of the entry in bytes + * @timestamp: Timestamp of the entry + * + * Returns the address of the entry where to write data or NULL + */ +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp) +{ + struct ring_buffer_event *rb_event; + + if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY) + return NULL; + + rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp); + + return &rb_event->array[1]; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve); + +/** + * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve() + * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved + */ +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->tail_page->page->commit, + cpu_buffer->tail_page->write); + simple_rb_meta_inc(cpu_buffer->meta->entries, 1); + + /* + * Paired with simple_rb_enable_tracing() to ensure data is + * written to the ring-buffer before teardown. + */ + smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_commit); + +static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + u32 prev_status; + + if (enable) + return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY); + + /* Wait for the buffer to be released */ + do { + prev_status = cmpxchg_acquire(&cpu_buffer->status, + SIMPLE_RB_READY, + SIMPLE_RB_UNAVAILABLE); + } while (prev_status == SIMPLE_RB_WRITING); + + return prev_status; +} + +/** + * simple_ring_buffer_reset - Reset @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * + * This will not clear the content of the data, only reset counters and pointers + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded. 
+ */ +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *bpage; + u32 prev_status; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + prev_status = simple_rb_enable_tracing(cpu_buffer, false); + + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + bpage = cpu_buffer->tail_page = cpu_buffer->head_page; + do { + simple_bpage_reset(bpage); + bpage = simple_bpage_next_page(bpage); + } while (bpage != cpu_buffer->head_page); + + simple_bpage_reset(cpu_buffer->reader_page); + + cpu_buffer->last_overrun = 0; + cpu_buffer->write_stamp = 0; + + cpu_buffer->meta->reader.read = 0; + cpu_buffer->meta->reader.lost_events = 0; + cpu_buffer->meta->entries = 0; + cpu_buffer->meta->overrun = 0; + cpu_buffer->meta->read = 0; + cpu_buffer->meta->pages_lost = 0; + cpu_buffer->meta->pages_touched = 0; + + if (prev_status == SIMPLE_RB_READY) + simple_rb_enable_tracing(cpu_buffer, true); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); + +/** + * simple_ring_buffer_init - Init @cpu_buffer based on @desc + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. 
+ * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va + * @desc: A ring_buffer_desc + * + * Returns 0 on success or -EINVAL if the content of @desc is invalid + */ +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc) +{ + struct simple_buffer_page *bpage = bpages; + int i; + + /* At least 1 reader page and two pages in the ring-buffer */ + if (desc->nr_page_va < 3) + return -EINVAL; + + memset(cpu_buffer, 0, sizeof(*cpu_buffer)); + + cpu_buffer->bpages = bpages; + + cpu_buffer->meta = (void *)desc->meta_va; + memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); + cpu_buffer->meta->meta_page_size = PAGE_SIZE; + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; + + /* The reader page is not part of the ring initially */ + simple_bpage_init(bpage, desc->page_va[0]); + bpage->id = 0; + + cpu_buffer->nr_pages = 1; + + cpu_buffer->reader_page = bpage; + cpu_buffer->tail_page = bpage + 1; + cpu_buffer->head_page = bpage + 1; + + for (i = 1; i < desc->nr_page_va; i++) { + simple_bpage_init(++bpage, desc->page_va[i]); + + bpage->link.next = &(bpage + 1)->link; + bpage->link.prev = &(bpage - 1)->link; + bpage->id = i; + + cpu_buffer->nr_pages = i + 1; + } + + /* Close the ring */ + bpage->link.next = &cpu_buffer->tail_page->link; + cpu_buffer->tail_page->link.prev = &bpage->link; + + /* The last init'ed page points to the head page */ + simple_bpage_set_head_link(bpage); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +/** + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. 
+ */ +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +{ + if (!simple_rb_loaded(cpu_buffer)) + return; + + simple_rb_enable_tracing(cpu_buffer, false); + + cpu_buffer->bpages = NULL; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); + +/** + * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @enable: True to enable tracing, False to disable it + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded + */ +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + simple_rb_enable_tracing(cpu_buffer, enable); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing); -- cgit v1.2.3 From 635923081c792c830fb87e680d6dd5f348926b3f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:25:03 +0000 Subject: tracing: load/unload page callbacks for simple_ring_buffer Add load/unload callback used for each admitted page in the ring-buffer. This will be later useful for the pKVM hypervisor which uses a different VA space and need to dynamically map/unmap the ring-buffer pages. 
Link: https://patch.msgid.link/20260309162516.2623589-18-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/simple_ring_buffer.h | 8 ++++ kernel/trace/simple_ring_buffer.c | 91 ++++++++++++++++++++++++++++++-------- 2 files changed, 80 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/simple_ring_buffer.h b/include/linux/simple_ring_buffer.h index 2c4c0ae336bc..21aec556293e 100644 --- a/include/linux/simple_ring_buffer.h +++ b/include/linux/simple_ring_buffer.h @@ -54,4 +54,12 @@ int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer); int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer); +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)); #endif diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c index 15df9781411b..02af2297ae5a 100644 --- a/kernel/trace/simple_ring_buffer.c +++ b/kernel/trace/simple_ring_buffer.c @@ -71,7 +71,7 @@ static void simple_bpage_reset(struct simple_buffer_page *bpage) local_set(&bpage->page->commit, 0); } -static void simple_bpage_init(struct simple_buffer_page *bpage, unsigned long page) +static void simple_bpage_init(struct simple_buffer_page *bpage, void *page) { INIT_LIST_HEAD(&bpage->link); bpage->page = (struct buffer_data_page *)page; @@ -372,18 +372,15 @@ int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) } EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); -/** - * simple_ring_buffer_init - Init @cpu_buffer based on @desc - * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. 
- * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va - * @desc: A ring_buffer_desc - * - * Returns 0 on success or -EINVAL if the content of @desc is invalid - */ -int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, - const struct ring_buffer_desc *desc) +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)) { struct simple_buffer_page *bpage = bpages; + int ret = 0; + void *page; int i; /* At least 1 reader page and two pages in the ring-buffer */ @@ -392,15 +389,22 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ memset(cpu_buffer, 0, sizeof(*cpu_buffer)); - cpu_buffer->bpages = bpages; + cpu_buffer->meta = load_page(desc->meta_va); + if (!cpu_buffer->meta) + return -EINVAL; - cpu_buffer->meta = (void *)desc->meta_va; memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); cpu_buffer->meta->meta_page_size = PAGE_SIZE; cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; /* The reader page is not part of the ring initially */ - simple_bpage_init(bpage, desc->page_va[0]); + page = load_page(desc->page_va[0]); + if (!page) { + unload_page(cpu_buffer->meta); + return -EINVAL; + } + + simple_bpage_init(bpage, page); bpage->id = 0; cpu_buffer->nr_pages = 1; @@ -410,7 +414,13 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ cpu_buffer->head_page = bpage + 1; for (i = 1; i < desc->nr_page_va; i++) { - simple_bpage_init(++bpage, desc->page_va[i]); + page = load_page(desc->page_va[i]); + if (!page) { + ret = -EINVAL; + break; + } + + simple_bpage_init(++bpage, page); bpage->link.next = &(bpage + 1)->link; bpage->link.prev = &(bpage - 1)->link; @@ -419,6 +429,14 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ 
cpu_buffer->nr_pages = i + 1; } + if (ret) { + for (i--; i >= 0; i--) + unload_page((void *)desc->page_va[i]); + unload_page(cpu_buffer->meta); + + return ret; + } + /* Close the ring */ bpage->link.next = &cpu_buffer->tail_page->link; cpu_buffer->tail_page->link.prev = &bpage->link; @@ -426,23 +444,58 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ /* The last init'ed page points to the head page */ simple_bpage_set_head_link(bpage); + cpu_buffer->bpages = bpages; + return 0; } -EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +static void *__load_page(unsigned long page) +{ + return (void *)page; +} + +static void __unload_page(void *page) { } /** - * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion - * @cpu_buffer: A simple_rb_per_cpu that will be deleted. + * simple_ring_buffer_init - Init @cpu_buffer based on @desc + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. + * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va + * @desc: A ring_buffer_desc + * + * Returns 0 on success or -EINVAL if the content of @desc is invalid */ -void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc) +{ + return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)) { + int p; + if (!simple_rb_loaded(cpu_buffer)) return; simple_rb_enable_tracing(cpu_buffer, false); + unload_page(cpu_buffer->meta); + for (p = 0; p < cpu_buffer->nr_pages; p++) + unload_page(cpu_buffer->bpages[p].page); + cpu_buffer->bpages = NULL; } + +/** + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. 
+ */ +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +{ + return simple_ring_buffer_unload_mm(cpu_buffer, __unload_page); +} EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); /** -- cgit v1.2.3 From c116737e972ea74f4468a1bd0703d623a3c0ee4a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Mon, 9 Mar 2026 14:15:28 +0100 Subject: workqueue: Add system_dfl_long_wq for long unbound works Currently there are users of queue_delayed_work() who specify system_long_wq, the per-cpu workqueue. This workqueue should be used for long per-cpu works, but queue_delayed_work() queue the work using: queue_delayed_work_on(WORK_CPU_UNBOUND, ...); This would end up calling __queue_delayed_work() that does: if (housekeeping_enabled(HK_TYPE_TIMER)) { // [....] } else { if (likely(cpu == WORK_CPU_UNBOUND)) add_timer_global(timer); else add_timer_on(timer, cpu); } So when cpu == WORK_CPU_UNBOUND the timer is global and is not using a specific CPU. Later, when __queue_work() is called: if (req_cpu == WORK_CPU_UNBOUND) { if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); else cpu = raw_smp_processor_id(); } Because the wq is not unbound, it takes the CPU where the timer fired and enqueue the work on that CPU. The consequence of all of this is that the work can run anywhere, depending on where the timer fired. Introduce system_dfl_long_wq in order to change, in a future step, users that are still calling: queue_delayed_work(system_long_wq, ...); with the new system_dfl_long_wq instead, so that the work may benefit from scheduler task placement. 
Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 6 ++++++ kernel/workqueue.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fc5744402a66..8e0855d56e74 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -440,6 +440,9 @@ enum wq_consts { * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * + * system_dfl_long_wq is similar to system_dfl_wq but it may host long running + * works. + * * system_dfl_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and @@ -468,6 +471,7 @@ extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; extern struct workqueue_struct *system_bh_wq; extern struct workqueue_struct *system_bh_highpri_wq; +extern struct workqueue_struct *system_dfl_long_wq; void workqueue_softirq_action(bool highpri); void workqueue_softirq_dead(unsigned int cpu); @@ -783,6 +787,8 @@ extern void __warn_flushing_systemwide_wq(void) _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ + (__builtin_constant_p(_wq == system_dfl_long_wq) && \ + _wq == system_dfl_long_wq) || \ (__builtin_constant_p(_wq == system_dfl_wq) && \ _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2f95cb0d2f1b..2d8ff903f113 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -530,6 +530,8 @@ struct workqueue_struct *system_bh_wq; EXPORT_SYMBOL_GPL(system_bh_wq); struct workqueue_struct *system_bh_highpri_wq; EXPORT_SYMBOL_GPL(system_bh_highpri_wq); +struct workqueue_struct *system_dfl_long_wq 
__ro_after_init; +EXPORT_SYMBOL_GPL(system_dfl_long_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); @@ -7954,11 +7956,12 @@ void __init workqueue_init_early(void) system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); + system_dfl_long_wq = alloc_workqueue("events_dfl_long", WQ_UNBOUND, WQ_MAX_ACTIVE); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq || - !system_bh_wq || !system_bh_highpri_wq); + !system_bh_wq || !system_bh_highpri_wq || !system_dfl_long_wq); } static void __init wq_cpu_intensive_thresh_init(void) -- cgit v1.2.3 From 878004e2852bc22ce0687c5597d6fe3909fb59f3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Mar 2026 14:10:32 -0800 Subject: iopoll: fix function parameter names in read_poll_timeout_atomic() Correct the function parameter names to avoid kernel-doc warnings and to emphasize this function is atomic (non-sleeping). 
Warning: include/linux/iopoll.h:169 function parameter 'sleep_us' not described in 'read_poll_timeout_atomic' Warning: ../include/linux/iopoll.h:169 function parameter 'sleep_before_read' not described in 'read_poll_timeout_atomic' Fixes: 9df8043a546d ("iopoll: Generalize read_poll_timeout() into poll_timeout_us()") Signed-off-by: Randy Dunlap Reviewed-by: Jani Nikula Link: https://patch.msgid.link/20260306221033.2357305-1-rdunlap@infradead.org Signed-off-by: Jani Nikula --- include/linux/iopoll.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index bdd2e0652bc3..53edd69acb9b 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -159,7 +159,7 @@ * * This macro does not rely on timekeeping. Hence it is safe to call even when * timekeeping is suspended, at the expense of an underestimation of wall clock - * time, which is rather minimal with a non-zero delay_us. + * time, which is rather minimal with a non-zero @delay_us. * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. @@ -167,9 +167,9 @@ * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. */ -#define read_poll_timeout_atomic(op, val, cond, sleep_us, timeout_us, \ - sleep_before_read, args...) \ - poll_timeout_us_atomic((val) = op(args), cond, sleep_us, timeout_us, sleep_before_read) +#define read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, \ + delay_before_read, args...) 
\ + poll_timeout_us_atomic((val) = op(args), cond, delay_us, timeout_us, delay_before_read) /** * readx_poll_timeout - Periodically poll an address until a condition is met or a timeout occurs -- cgit v1.2.3 From 309a7e514da7d53e05b5d053594f6aabb0d382b5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 18 Feb 2026 13:34:47 -0800 Subject: lib/crypto: aes: Add support for CBC-based MACs Add support for CBC-based MACs to the AES library, specifically AES-CMAC, AES-XCBC-MAC, and AES-CBC-MAC. Of these three algorithms, AES-CMAC is the most modern and the most commonly used. Use cases for the AES-CMAC library include the kernel's SMB client and server, and the bluetooth and mac80211 drivers. Support for AES-XCBC-MAC and AES-CBC-MAC is included so that there will be no performance regression in the "xcbc(aes)" and "ccm(aes)" support in the traditional crypto API once the arm64-optimized code is migrated into the library. AES-XCBC-MAC is given its own key preparation function but is otherwise identical to AES-CMAC and just reuses the AES-CMAC structs and functions. The implementation automatically uses the optimized AES key expansion and single block en/decryption functions. It also allows architectures to provide an optimized implementation of aes_cbcmac_blocks(), which allows the existing arm64-optimized code for these modes to be used. Just put the code for these modes directly in the libaes module rather than in a separate module. This is simpler, it makes it easier to share code between AES modes, and it increases the amount of inlining that is possible. (Indeed, for these reasons, most of the architecture-optimized AES code already provides multiple modes per module. x86 for example has only a single aesni-intel module. So to a large extent, this design choice just reflects the status quo.) However, since there are a lot of AES modes, there's still some value in omitting modes that are not needed at all in a given kernel. 
Therefore, make these modes an optional feature of libaes, controlled by CONFIG_CRYPTO_LIB_AES_CBC_MACS. This seems like a good middle ground. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260218213501.136844-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/aes-cbc-macs.h | 154 ++++++++++++++++++++++++++++++++ lib/crypto/Kconfig | 10 +++ lib/crypto/aes.c | 198 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 362 insertions(+) create mode 100644 include/crypto/aes-cbc-macs.h (limited to 'include') diff --git a/include/crypto/aes-cbc-macs.h b/include/crypto/aes-cbc-macs.h new file mode 100644 index 000000000000..e61df108b926 --- /dev/null +++ b/include/crypto/aes-cbc-macs.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Support for AES-CMAC, AES-XCBC-MAC, and AES-CBC-MAC + * + * Copyright 2026 Google LLC + */ +#ifndef _CRYPTO_AES_CBC_MACS_H +#define _CRYPTO_AES_CBC_MACS_H + +#include + +/** + * struct aes_cmac_key - Prepared key for AES-CMAC or AES-XCBC-MAC + * @aes: The AES key for cipher block chaining + * @k_final: Finalization subkeys for the final block. + * k_final[0] (CMAC K1, XCBC-MAC K2) is used if it's a full block. + * k_final[1] (CMAC K2, XCBC-MAC K3) is used if it's a partial block. + */ +struct aes_cmac_key { + struct aes_enckey aes; + union { + u8 b[AES_BLOCK_SIZE]; + __be64 w[2]; + } k_final[2]; +}; + +/** + * struct aes_cmac_ctx - Context for computing an AES-CMAC or AES-XCBC-MAC value + * @key: Pointer to the key struct. A pointer is used rather than a copy of the + * struct, since the key struct size may be large. It is assumed that the + * key lives at least as long as the context. + * @partial_len: Number of bytes that have been XOR'ed into @h since the last + * AES encryption. This is 0 if no data has been processed yet, + * or between 1 and AES_BLOCK_SIZE inclusive otherwise. 
+ * @h: The current chaining value + */ +struct aes_cmac_ctx { + const struct aes_cmac_key *key; + size_t partial_len; + u8 h[AES_BLOCK_SIZE]; +}; + +/** + * aes_cmac_preparekey() - Prepare a key for AES-CMAC + * @key: (output) The key struct to initialize + * @in_key: The raw AES key + * @key_len: Length of the raw key in bytes. The supported values are + * AES_KEYSIZE_128, AES_KEYSIZE_192, and AES_KEYSIZE_256. + * + * Context: Any context. + * Return: 0 on success or -EINVAL if the given key length is invalid. No other + * errors are possible, so callers that always pass a valid key length + * don't need to check for errors. + */ +int aes_cmac_preparekey(struct aes_cmac_key *key, const u8 *in_key, + size_t key_len); + +/** + * aes_xcbcmac_preparekey() - Prepare a key for AES-XCBC-MAC + * @key: (output) The key struct to initialize + * @in_key: The raw key. As per the AES-XCBC-MAC specification (RFC 3566), this + * is 128 bits, matching the internal use of AES-128. + * + * AES-XCBC-MAC and AES-CMAC are the same except for the key preparation. After + * that step, AES-XCBC-MAC is supported via the aes_cmac_* functions. + * + * New users should use AES-CMAC instead of AES-XCBC-MAC. + * + * Context: Any context. + */ +void aes_xcbcmac_preparekey(struct aes_cmac_key *key, + const u8 in_key[at_least AES_KEYSIZE_128]); + +/** + * aes_cmac_init() - Start computing an AES-CMAC or AES-XCBC-MAC value + * @ctx: (output) The context to initialize + * @key: The key to use. Note that a pointer to the key is saved in the + * context, so the key must live at least as long as the context. + * + * This supports both AES-CMAC and AES-XCBC-MAC. Which one is done depends on + * whether aes_cmac_preparekey() or aes_xcbcmac_preparekey() was called. 
+ */ +static inline void aes_cmac_init(struct aes_cmac_ctx *ctx, + const struct aes_cmac_key *key) +{ + *ctx = (struct aes_cmac_ctx){ .key = key }; +} + +/** + * aes_cmac_update() - Update an AES-CMAC or AES-XCBC-MAC context with more data + * @ctx: The context to update; must have been initialized + * @data: The message data + * @data_len: The data length in bytes. Doesn't need to be block-aligned. + * + * This can be called any number of times. + * + * Context: Any context. + */ +void aes_cmac_update(struct aes_cmac_ctx *ctx, const u8 *data, size_t data_len); + +/** + * aes_cmac_final() - Finish computing an AES-CMAC or AES-XCBC-MAC value + * @ctx: The context to finalize; must have been initialized + * @out: (output) The resulting MAC + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void aes_cmac_final(struct aes_cmac_ctx *ctx, u8 out[at_least AES_BLOCK_SIZE]); + +/** + * aes_cmac() - Compute AES-CMAC or AES-XCBC-MAC in one shot + * @key: The key to use + * @data: The message data + * @data_len: The data length in bytes + * @out: (output) The resulting AES-CMAC or AES-XCBC-MAC value + * + * This supports both AES-CMAC and AES-XCBC-MAC. Which one is done depends on + * whether aes_cmac_preparekey() or aes_xcbcmac_preparekey() was called. + * + * Context: Any context. + */ +static inline void aes_cmac(const struct aes_cmac_key *key, const u8 *data, + size_t data_len, u8 out[at_least AES_BLOCK_SIZE]) +{ + struct aes_cmac_ctx ctx; + + aes_cmac_init(&ctx, key); + aes_cmac_update(&ctx, data, data_len); + aes_cmac_final(&ctx, out); +} + +/* + * AES-CBC-MAC support. This is provided only for use by the implementation of + * AES-CCM. It should have no other users. Warning: unlike AES-CMAC and + * AES-XCBC-MAC, AES-CBC-MAC isn't a secure MAC for variable-length messages. 
+ */ +struct aes_cbcmac_ctx { + const struct aes_enckey *key; + size_t partial_len; + u8 h[AES_BLOCK_SIZE]; +}; +static inline void aes_cbcmac_init(struct aes_cbcmac_ctx *ctx, + const struct aes_enckey *key) +{ + *ctx = (struct aes_cbcmac_ctx){ .key = key }; +} +void aes_cbcmac_update(struct aes_cbcmac_ctx *ctx, const u8 *data, + size_t data_len); +void aes_cbcmac_final(struct aes_cbcmac_ctx *ctx, + u8 out[at_least AES_BLOCK_SIZE]); + +#endif /* _CRYPTO_AES_CBC_MACS_H */ diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 032f9755f999..42ec51645915 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -10,6 +10,8 @@ config CRYPTO_LIB_UTILS config CRYPTO_LIB_AES tristate + # Select dependencies of modes that are part of libaes. + select CRYPTO_LIB_UTILS if CRYPTO_LIB_AES_CBC_MACS config CRYPTO_LIB_AES_ARCH bool @@ -28,6 +30,14 @@ config CRYPTO_LIB_AESCFB select CRYPTO_LIB_AES select CRYPTO_LIB_UTILS +config CRYPTO_LIB_AES_CBC_MACS + tristate + select CRYPTO_LIB_AES + help + The AES-CMAC, AES-XCBC-MAC, and AES-CBC-MAC library functions. Select + this if your module uses any of the functions from + . + config CRYPTO_LIB_AESGCM tristate select CRYPTO_LIB_AES diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c index b73e19f1bb95..39deae6105c0 100644 --- a/lib/crypto/aes.c +++ b/lib/crypto/aes.c @@ -4,7 +4,9 @@ * Copyright 2026 Google LLC */ +#include #include +#include #include #include #include @@ -512,6 +514,202 @@ void aes_decrypt(const struct aes_key *key, u8 out[AES_BLOCK_SIZE], } EXPORT_SYMBOL(aes_decrypt); +#if IS_ENABLED(CONFIG_CRYPTO_LIB_AES_CBC_MACS) + +#ifndef aes_cbcmac_blocks_arch +static bool aes_cbcmac_blocks_arch(u8 h[AES_BLOCK_SIZE], + const struct aes_enckey *key, const u8 *data, + size_t nblocks, bool enc_before, + bool enc_after) +{ + return false; +} +#endif + +/* This assumes nblocks >= 1. 
*/ +static void aes_cbcmac_blocks(u8 h[AES_BLOCK_SIZE], + const struct aes_enckey *key, const u8 *data, + size_t nblocks, bool enc_before, bool enc_after) +{ + if (aes_cbcmac_blocks_arch(h, key, data, nblocks, enc_before, + enc_after)) + return; + + if (enc_before) + aes_encrypt(key, h, h); + for (; nblocks > 1; nblocks--) { + crypto_xor(h, data, AES_BLOCK_SIZE); + data += AES_BLOCK_SIZE; + aes_encrypt(key, h, h); + } + crypto_xor(h, data, AES_BLOCK_SIZE); + if (enc_after) + aes_encrypt(key, h, h); +} + +int aes_cmac_preparekey(struct aes_cmac_key *key, const u8 *in_key, + size_t key_len) +{ + u64 hi, lo, mask; + int err; + + /* Prepare the AES key. */ + err = aes_prepareenckey(&key->aes, in_key, key_len); + if (err) + return err; + + /* + * Prepare the subkeys K1 and K2 by encrypting the all-zeroes block, + * then multiplying by 'x' and 'x^2' (respectively) in GF(2^128). + * Reference: NIST SP 800-38B, Section 6.1 "Subkey Generation". + */ + memset(key->k_final[0].b, 0, AES_BLOCK_SIZE); + aes_encrypt(&key->aes, key->k_final[0].b, key->k_final[0].b); + hi = be64_to_cpu(key->k_final[0].w[0]); + lo = be64_to_cpu(key->k_final[0].w[1]); + for (int i = 0; i < 2; i++) { + mask = ((s64)hi >> 63) & 0x87; + hi = (hi << 1) ^ (lo >> 63); + lo = (lo << 1) ^ mask; + key->k_final[i].w[0] = cpu_to_be64(hi); + key->k_final[i].w[1] = cpu_to_be64(lo); + } + return 0; +} +EXPORT_SYMBOL_GPL(aes_cmac_preparekey); + +void aes_xcbcmac_preparekey(struct aes_cmac_key *key, + const u8 in_key[AES_KEYSIZE_128]) +{ + static const u8 constants[3][AES_BLOCK_SIZE] = { + { [0 ... AES_BLOCK_SIZE - 1] = 0x1 }, + { [0 ... AES_BLOCK_SIZE - 1] = 0x2 }, + { [0 ... 
AES_BLOCK_SIZE - 1] = 0x3 }, + }; + u8 new_aes_key[AES_BLOCK_SIZE]; + + static_assert(AES_BLOCK_SIZE == AES_KEYSIZE_128); + aes_prepareenckey(&key->aes, in_key, AES_BLOCK_SIZE); + aes_encrypt(&key->aes, new_aes_key, constants[0]); + aes_encrypt(&key->aes, key->k_final[0].b, constants[1]); + aes_encrypt(&key->aes, key->k_final[1].b, constants[2]); + aes_prepareenckey(&key->aes, new_aes_key, AES_BLOCK_SIZE); + memzero_explicit(new_aes_key, AES_BLOCK_SIZE); +} +EXPORT_SYMBOL_GPL(aes_xcbcmac_preparekey); + +void aes_cmac_update(struct aes_cmac_ctx *ctx, const u8 *data, size_t data_len) +{ + bool enc_before = false; + size_t nblocks; + + if (ctx->partial_len) { + /* XOR data into a pending block. */ + size_t l = min(data_len, AES_BLOCK_SIZE - ctx->partial_len); + + crypto_xor(&ctx->h[ctx->partial_len], data, l); + data += l; + data_len -= l; + ctx->partial_len += l; + if (data_len == 0) { + /* + * Either the pending block hasn't been filled yet, or + * no more data was given so it's not yet known whether + * the block is the final block. + */ + return; + } + /* Pending block has been filled and isn't the final block. */ + enc_before = true; + } + + nblocks = data_len / AES_BLOCK_SIZE; + data_len %= AES_BLOCK_SIZE; + if (nblocks == 0) { + /* 0 additional full blocks, then optionally a partial block */ + if (enc_before) + aes_encrypt(&ctx->key->aes, ctx->h, ctx->h); + crypto_xor(ctx->h, data, data_len); + ctx->partial_len = data_len; + } else if (data_len != 0) { + /* 1 or more additional full blocks, then a partial block */ + aes_cbcmac_blocks(ctx->h, &ctx->key->aes, data, nblocks, + enc_before, /* enc_after= */ true); + data += nblocks * AES_BLOCK_SIZE; + crypto_xor(ctx->h, data, data_len); + ctx->partial_len = data_len; + } else { + /* + * 1 or more additional full blocks only. Encryption of the + * last block is delayed until it's known whether it's the final + * block in the message or not. 
+ */ + aes_cbcmac_blocks(ctx->h, &ctx->key->aes, data, nblocks, + enc_before, /* enc_after= */ false); + ctx->partial_len = AES_BLOCK_SIZE; + } +} +EXPORT_SYMBOL_GPL(aes_cmac_update); + +void aes_cmac_final(struct aes_cmac_ctx *ctx, u8 out[AES_BLOCK_SIZE]) +{ + if (ctx->partial_len == AES_BLOCK_SIZE) { + /* Final block is a full block. Use k_final[0]. */ + crypto_xor(ctx->h, ctx->key->k_final[0].b, AES_BLOCK_SIZE); + } else { + /* Final block is a partial block. Pad, and use k_final[1]. */ + ctx->h[ctx->partial_len] ^= 0x80; + crypto_xor(ctx->h, ctx->key->k_final[1].b, AES_BLOCK_SIZE); + } + aes_encrypt(&ctx->key->aes, out, ctx->h); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(aes_cmac_final); + +void aes_cbcmac_update(struct aes_cbcmac_ctx *ctx, const u8 *data, + size_t data_len) +{ + bool enc_before = false; + size_t nblocks; + + if (ctx->partial_len) { + size_t l = min(data_len, AES_BLOCK_SIZE - ctx->partial_len); + + crypto_xor(&ctx->h[ctx->partial_len], data, l); + data += l; + data_len -= l; + ctx->partial_len += l; + if (ctx->partial_len < AES_BLOCK_SIZE) + return; + enc_before = true; + } + + nblocks = data_len / AES_BLOCK_SIZE; + data_len %= AES_BLOCK_SIZE; + if (nblocks == 0) { + if (enc_before) + aes_encrypt(ctx->key, ctx->h, ctx->h); + } else { + aes_cbcmac_blocks(ctx->h, ctx->key, data, nblocks, enc_before, + /* enc_after= */ true); + data += nblocks * AES_BLOCK_SIZE; + } + crypto_xor(ctx->h, data, data_len); + ctx->partial_len = data_len; +} +EXPORT_SYMBOL_NS_GPL(aes_cbcmac_update, "CRYPTO_INTERNAL"); + +void aes_cbcmac_final(struct aes_cbcmac_ctx *ctx, u8 out[AES_BLOCK_SIZE]) +{ + if (ctx->partial_len) + aes_encrypt(ctx->key, out, ctx->h); + else + memcpy(out, ctx->h, AES_BLOCK_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_NS_GPL(aes_cbcmac_final, "CRYPTO_INTERNAL"); +#endif /* CONFIG_CRYPTO_LIB_AES_CBC_MACS */ + #ifdef aes_mod_init_arch static int __init aes_mod_init(void) { -- cgit v1.2.3 From 
4b908403209252e59ecad4c068bf967fa3f07525 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 18 Feb 2026 13:34:50 -0800 Subject: lib/crypto: arm64/aes: Move assembly code for AES modes into libaes To migrate the support for CBC-based MACs into libaes, the corresponding arm64 assembly code needs to be moved there. However, the arm64 AES assembly code groups many AES modes together; individual modes aren't easily separable. (This isn't unique to arm64; other architectures organize their AES modes similarly.) Since the other AES modes will be migrated into the library eventually too, just move the full assembly files for the AES modes into the library. (This is similar to what I already did for PowerPC and SPARC.) Specifically: move the assembly files aes-ce.S, aes-modes.S, and aes-neon.S and their build rules; declare the assembly functions in ; and export the assembly functions from libaes. Note that the exports and public declarations of the assembly functions are temporary. They exist only to keep arch/arm64/crypto/ working until the AES modes are fully moved into the library. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260218213501.136844-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/Makefile | 4 +- arch/arm64/crypto/aes-ce-ccm-glue.c | 4 - arch/arm64/crypto/aes-ce.S | 146 ------ arch/arm64/crypto/aes-glue.c | 47 +- arch/arm64/crypto/aes-modes.S | 866 ------------------------------------ arch/arm64/crypto/aes-neon.S | 250 ----------- arch/arm64/crypto/aes-neonbs-glue.c | 15 +- include/crypto/aes.h | 69 +++ lib/crypto/Makefile | 4 +- lib/crypto/arm64/aes-ce.S | 145 ++++++ lib/crypto/arm64/aes-modes.S | 866 ++++++++++++++++++++++++++++++++++++ lib/crypto/arm64/aes-neon.S | 250 +++++++++++ lib/crypto/arm64/aes.h | 30 ++ 13 files changed, 1367 insertions(+), 1329 deletions(-) delete mode 100644 arch/arm64/crypto/aes-ce.S delete mode 100644 arch/arm64/crypto/aes-modes.S delete mode 100644 arch/arm64/crypto/aes-neon.S create mode 100644 lib/crypto/arm64/aes-ce.S create mode 100644 lib/crypto/arm64/aes-modes.S create mode 100644 lib/crypto/arm64/aes-neon.S (limited to 'include') diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 3574e917bc37..8a8e3e551ed3 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -33,10 +33,10 @@ obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o -aes-ce-blk-y := aes-glue-ce.o aes-ce.o +aes-ce-blk-y := aes-glue-ce.o obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o -aes-neon-blk-y := aes-glue-neon.o aes-neon.o +aes-neon-blk-y := aes-glue-neon.o obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index db371ac051fc..45aed0073283 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -31,10 +31,6 @@ static int num_rounds(struct 
crypto_aes_ctx *ctx) return 6 + ctx->key_length / 4; } -asmlinkage u32 ce_aes_mac_update(u8 const in[], u32 const rk[], int rounds, - int blocks, u8 dg[], int enc_before, - int enc_after); - asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, u32 const rk[], u32 rounds, u8 mac[], u8 ctr[], u8 const final_iv[]); diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S deleted file mode 100644 index b262eaa9170c..000000000000 --- a/arch/arm64/crypto/aes-ce.S +++ /dev/null @@ -1,146 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with - * Crypto Extensions - * - * Copyright (C) 2013 - 2017 Linaro Ltd - */ - -#include -#include - -#define AES_FUNC_START(func) SYM_FUNC_START(ce_ ## func) -#define AES_FUNC_END(func) SYM_FUNC_END(ce_ ## func) - - .arch armv8-a+crypto - - xtsmask .req v16 - cbciv .req v16 - vctr .req v16 - - .macro xts_reload_mask, tmp - .endm - - .macro xts_cts_skip_tw, reg, lbl - .endm - - /* preload all round keys */ - .macro load_round_keys, rk, nr, tmp - add \tmp, \rk, \nr, sxtw #4 - sub \tmp, \tmp, #160 - ld1 {v17.4s-v20.4s}, [\rk] - ld1 {v21.4s-v24.4s}, [\tmp], #64 - ld1 {v25.4s-v28.4s}, [\tmp], #64 - ld1 {v29.4s-v31.4s}, [\tmp] - .endm - - /* prepare for encryption with key in rk[] */ - .macro enc_prepare, rounds, rk, temp - load_round_keys \rk, \rounds, \temp - .endm - - /* prepare for encryption (again) but with new key in rk[] */ - .macro enc_switch_key, rounds, rk, temp - load_round_keys \rk, \rounds, \temp - .endm - - /* prepare for decryption with key in rk[] */ - .macro dec_prepare, rounds, rk, temp - load_round_keys \rk, \rounds, \temp - .endm - - .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4 - aes\de \i0\().16b, \k\().16b - aes\mc \i0\().16b, \i0\().16b - .ifnb \i1 - aes\de \i1\().16b, \k\().16b - aes\mc \i1\().16b, \i1\().16b - .ifnb \i3 - aes\de \i2\().16b, \k\().16b - aes\mc \i2\().16b, \i2\().16b - aes\de \i3\().16b, \k\().16b - 
aes\mc \i3\().16b, \i3\().16b - .ifnb \i4 - aes\de \i4\().16b, \k\().16b - aes\mc \i4\().16b, \i4\().16b - .endif - .endif - .endif - .endm - - /* up to 5 interleaved encryption rounds with the same round key */ - .macro round_Nx, enc, k, i0, i1, i2, i3, i4 - .ifc \enc, e - do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3, \i4 - .else - do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3, \i4 - .endif - .endm - - /* up to 5 interleaved final rounds */ - .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4 - aes\de \i0\().16b, \k\().16b - .ifnb \i1 - aes\de \i1\().16b, \k\().16b - .ifnb \i3 - aes\de \i2\().16b, \k\().16b - aes\de \i3\().16b, \k\().16b - .ifnb \i4 - aes\de \i4\().16b, \k\().16b - .endif - .endif - .endif - eor \i0\().16b, \i0\().16b, \k2\().16b - .ifnb \i1 - eor \i1\().16b, \i1\().16b, \k2\().16b - .ifnb \i3 - eor \i2\().16b, \i2\().16b, \k2\().16b - eor \i3\().16b, \i3\().16b, \k2\().16b - .ifnb \i4 - eor \i4\().16b, \i4\().16b, \k2\().16b - .endif - .endif - .endif - .endm - - /* up to 5 interleaved blocks */ - .macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4 - tbz \rounds, #2, .L\@ /* 128 bits */ - round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4 - round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4 - tbz \rounds, #1, .L\@ /* 192 bits */ - round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4 - round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4 -.L\@: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 - round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4 - .endr - fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4 - .endm - - .macro encrypt_block, in, rounds, t0, t1, t2 - do_block_Nx e, \rounds, \in - .endm - - .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 - do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 - .endm - - .macro encrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2 - do_block_Nx e, \rounds, \i0, \i1, \i2, \i3, \i4 - .endm - - .macro decrypt_block, in, rounds, t0, t1, t2 - do_block_Nx d, \rounds, \in - .endm - - .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 - 
do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 - .endm - - .macro decrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2 - do_block_Nx d, \rounds, \i0, \i1, \i2, \i3, \i4 - .endm - -#define MAX_STRIDE 5 - -#include "aes-modes.S" diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 92f43e1cd097..fd7c3a560a71 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -71,48 +71,9 @@ MODULE_ALIAS_CRYPTO("xcbc(aes)"); MODULE_ALIAS_CRYPTO("cbcmac(aes)"); MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_IMPORT_NS("CRYPTO_INTERNAL"); MODULE_LICENSE("GPL v2"); -/* defined in aes-modes.S */ -asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int blocks); -asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int blocks); - -asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int blocks, u8 iv[]); -asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int blocks, u8 iv[]); - -asmlinkage void aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int bytes, u8 const iv[]); -asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int bytes, u8 const iv[]); - -asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int bytes, u8 ctr[]); - -asmlinkage void aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int bytes, u8 ctr[], int byte_ctr); - -asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], - int rounds, int bytes, u32 const rk2[], u8 iv[], - int first); -asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], - int rounds, int bytes, u32 const rk2[], u8 iv[], - int first); - -asmlinkage void aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[], - int rounds, int blocks, u8 iv[], - u32 const rk2[]); -asmlinkage void aes_essiv_cbc_decrypt(u8 out[], 
u8 const in[], u32 const rk1[], - int rounds, int blocks, u8 iv[], - u32 const rk2[]); - -asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds, - int blocks, u8 dg[], int enc_before, - int enc_after); - struct crypto_aes_xts_ctx { struct crypto_aes_ctx key1; struct crypto_aes_ctx __aligned(8) key2; @@ -971,13 +932,7 @@ unregister_ciphers: #ifdef USE_V8_CRYPTO_EXTENSIONS module_cpu_feature_match(AES, aes_init); -EXPORT_SYMBOL_NS(ce_aes_mac_update, "CRYPTO_INTERNAL"); #else module_init(aes_init); -EXPORT_SYMBOL(neon_aes_ecb_encrypt); -EXPORT_SYMBOL(neon_aes_cbc_encrypt); -EXPORT_SYMBOL(neon_aes_ctr_encrypt); -EXPORT_SYMBOL(neon_aes_xts_encrypt); -EXPORT_SYMBOL(neon_aes_xts_decrypt); #endif module_exit(aes_exit); diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S deleted file mode 100644 index e793478f37c1..000000000000 --- a/arch/arm64/crypto/aes-modes.S +++ /dev/null @@ -1,866 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES - * - * Copyright (C) 2013 - 2017 Linaro Ltd - */ - -/* included by aes-ce.S and aes-neon.S */ - - .text - .align 4 - -#ifndef MAX_STRIDE -#define MAX_STRIDE 4 -#endif - -#if MAX_STRIDE == 4 -#define ST4(x...) x -#define ST5(x...) -#else -#define ST4(x...) -#define ST5(x...) 
x -#endif - -SYM_FUNC_START_LOCAL(aes_encrypt_block4x) - encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 - ret -SYM_FUNC_END(aes_encrypt_block4x) - -SYM_FUNC_START_LOCAL(aes_decrypt_block4x) - decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 - ret -SYM_FUNC_END(aes_decrypt_block4x) - -#if MAX_STRIDE == 5 -SYM_FUNC_START_LOCAL(aes_encrypt_block5x) - encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 - ret -SYM_FUNC_END(aes_encrypt_block5x) - -SYM_FUNC_START_LOCAL(aes_decrypt_block5x) - decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 - ret -SYM_FUNC_END(aes_decrypt_block5x) -#endif - - /* - * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks) - * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks) - */ - -AES_FUNC_START(aes_ecb_encrypt) - frame_push 0 - - enc_prepare w3, x2, x5 - -.LecbencloopNx: - subs w4, w4, #MAX_STRIDE - bmi .Lecbenc1x - ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ -ST4( bl aes_encrypt_block4x ) -ST5( ld1 {v4.16b}, [x1], #16 ) -ST5( bl aes_encrypt_block5x ) - st1 {v0.16b-v3.16b}, [x0], #64 -ST5( st1 {v4.16b}, [x0], #16 ) - b .LecbencloopNx -.Lecbenc1x: - adds w4, w4, #MAX_STRIDE - beq .Lecbencout -.Lecbencloop: - ld1 {v0.16b}, [x1], #16 /* get next pt block */ - encrypt_block v0, w3, x2, x5, w6 - st1 {v0.16b}, [x0], #16 - subs w4, w4, #1 - bne .Lecbencloop -.Lecbencout: - frame_pop - ret -AES_FUNC_END(aes_ecb_encrypt) - - -AES_FUNC_START(aes_ecb_decrypt) - frame_push 0 - - dec_prepare w3, x2, x5 - -.LecbdecloopNx: - subs w4, w4, #MAX_STRIDE - bmi .Lecbdec1x - ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ -ST4( bl aes_decrypt_block4x ) -ST5( ld1 {v4.16b}, [x1], #16 ) -ST5( bl aes_decrypt_block5x ) - st1 {v0.16b-v3.16b}, [x0], #64 -ST5( st1 {v4.16b}, [x0], #16 ) - b .LecbdecloopNx -.Lecbdec1x: - adds w4, w4, #MAX_STRIDE - beq .Lecbdecout -.Lecbdecloop: - ld1 {v0.16b}, [x1], #16 /* get next ct block */ - decrypt_block v0, w3, x2, x5, w6 - st1 {v0.16b}, [x0], #16 - subs 
w4, w4, #1 - bne .Lecbdecloop -.Lecbdecout: - frame_pop - ret -AES_FUNC_END(aes_ecb_decrypt) - - - /* - * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks, u8 iv[]) - * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks, u8 iv[]) - * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[], - * int rounds, int blocks, u8 iv[], - * u32 const rk2[]); - * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], - * int rounds, int blocks, u8 iv[], - * u32 const rk2[]); - */ - -AES_FUNC_START(aes_essiv_cbc_encrypt) - ld1 {v4.16b}, [x5] /* get iv */ - - mov w8, #14 /* AES-256: 14 rounds */ - enc_prepare w8, x6, x7 - encrypt_block v4, w8, x6, x7, w9 - enc_switch_key w3, x2, x6 - b .Lcbcencloop4x - -AES_FUNC_START(aes_cbc_encrypt) - ld1 {v4.16b}, [x5] /* get iv */ - enc_prepare w3, x2, x6 - -.Lcbcencloop4x: - subs w4, w4, #4 - bmi .Lcbcenc1x - ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ - eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */ - encrypt_block v0, w3, x2, x6, w7 - eor v1.16b, v1.16b, v0.16b - encrypt_block v1, w3, x2, x6, w7 - eor v2.16b, v2.16b, v1.16b - encrypt_block v2, w3, x2, x6, w7 - eor v3.16b, v3.16b, v2.16b - encrypt_block v3, w3, x2, x6, w7 - st1 {v0.16b-v3.16b}, [x0], #64 - mov v4.16b, v3.16b - b .Lcbcencloop4x -.Lcbcenc1x: - adds w4, w4, #4 - beq .Lcbcencout -.Lcbcencloop: - ld1 {v0.16b}, [x1], #16 /* get next pt block */ - eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */ - encrypt_block v4, w3, x2, x6, w7 - st1 {v4.16b}, [x0], #16 - subs w4, w4, #1 - bne .Lcbcencloop -.Lcbcencout: - st1 {v4.16b}, [x5] /* return iv */ - ret -AES_FUNC_END(aes_cbc_encrypt) -AES_FUNC_END(aes_essiv_cbc_encrypt) - -AES_FUNC_START(aes_essiv_cbc_decrypt) - ld1 {cbciv.16b}, [x5] /* get iv */ - - mov w8, #14 /* AES-256: 14 rounds */ - enc_prepare w8, x6, x7 - encrypt_block cbciv, w8, x6, x7, w9 - b .Lessivcbcdecstart - -AES_FUNC_START(aes_cbc_decrypt) - ld1 {cbciv.16b}, [x5] /* get iv 
*/ -.Lessivcbcdecstart: - frame_push 0 - dec_prepare w3, x2, x6 - -.LcbcdecloopNx: - subs w4, w4, #MAX_STRIDE - bmi .Lcbcdec1x - ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ -#if MAX_STRIDE == 5 - ld1 {v4.16b}, [x1], #16 /* get 1 ct block */ - mov v5.16b, v0.16b - mov v6.16b, v1.16b - mov v7.16b, v2.16b - bl aes_decrypt_block5x - sub x1, x1, #32 - eor v0.16b, v0.16b, cbciv.16b - eor v1.16b, v1.16b, v5.16b - ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */ - ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ - eor v2.16b, v2.16b, v6.16b - eor v3.16b, v3.16b, v7.16b - eor v4.16b, v4.16b, v5.16b -#else - mov v4.16b, v0.16b - mov v5.16b, v1.16b - mov v6.16b, v2.16b - bl aes_decrypt_block4x - sub x1, x1, #16 - eor v0.16b, v0.16b, cbciv.16b - eor v1.16b, v1.16b, v4.16b - ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ - eor v2.16b, v2.16b, v5.16b - eor v3.16b, v3.16b, v6.16b -#endif - st1 {v0.16b-v3.16b}, [x0], #64 -ST5( st1 {v4.16b}, [x0], #16 ) - b .LcbcdecloopNx -.Lcbcdec1x: - adds w4, w4, #MAX_STRIDE - beq .Lcbcdecout -.Lcbcdecloop: - ld1 {v1.16b}, [x1], #16 /* get next ct block */ - mov v0.16b, v1.16b /* ...and copy to v0 */ - decrypt_block v0, w3, x2, x6, w7 - eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */ - mov cbciv.16b, v1.16b /* ct is next iv */ - st1 {v0.16b}, [x0], #16 - subs w4, w4, #1 - bne .Lcbcdecloop -.Lcbcdecout: - st1 {cbciv.16b}, [x5] /* return iv */ - frame_pop - ret -AES_FUNC_END(aes_cbc_decrypt) -AES_FUNC_END(aes_essiv_cbc_decrypt) - - - /* - * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], - * int rounds, int bytes, u8 const iv[]) - * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], - * int rounds, int bytes, u8 const iv[]) - */ - -AES_FUNC_START(aes_cbc_cts_encrypt) - adr_l x8, .Lcts_permute_table - sub x4, x4, #16 - add x9, x8, #32 - add x8, x8, x4 - sub x9, x9, x4 - ld1 {v3.16b}, [x8] - ld1 {v4.16b}, [x9] - - ld1 {v0.16b}, [x1], x4 /* overlapping loads */ - ld1 {v1.16b}, [x1] - - ld1 {v5.16b}, [x5] /* 
get iv */ - enc_prepare w3, x2, x6 - - eor v0.16b, v0.16b, v5.16b /* xor with iv */ - tbl v1.16b, {v1.16b}, v4.16b - encrypt_block v0, w3, x2, x6, w7 - - eor v1.16b, v1.16b, v0.16b - tbl v0.16b, {v0.16b}, v3.16b - encrypt_block v1, w3, x2, x6, w7 - - add x4, x0, x4 - st1 {v0.16b}, [x4] /* overlapping stores */ - st1 {v1.16b}, [x0] - ret -AES_FUNC_END(aes_cbc_cts_encrypt) - -AES_FUNC_START(aes_cbc_cts_decrypt) - adr_l x8, .Lcts_permute_table - sub x4, x4, #16 - add x9, x8, #32 - add x8, x8, x4 - sub x9, x9, x4 - ld1 {v3.16b}, [x8] - ld1 {v4.16b}, [x9] - - ld1 {v0.16b}, [x1], x4 /* overlapping loads */ - ld1 {v1.16b}, [x1] - - ld1 {v5.16b}, [x5] /* get iv */ - dec_prepare w3, x2, x6 - - decrypt_block v0, w3, x2, x6, w7 - tbl v2.16b, {v0.16b}, v3.16b - eor v2.16b, v2.16b, v1.16b - - tbx v0.16b, {v1.16b}, v4.16b - decrypt_block v0, w3, x2, x6, w7 - eor v0.16b, v0.16b, v5.16b /* xor with iv */ - - add x4, x0, x4 - st1 {v2.16b}, [x4] /* overlapping stores */ - st1 {v0.16b}, [x0] - ret -AES_FUNC_END(aes_cbc_cts_decrypt) - - .section ".rodata", "a" - .align 6 -.Lcts_permute_table: - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 - .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .previous - - /* - * This macro generates the code for CTR and XCTR mode. - */ -.macro ctr_encrypt xctr - // Arguments - OUT .req x0 - IN .req x1 - KEY .req x2 - ROUNDS_W .req w3 - BYTES_W .req w4 - IV .req x5 - BYTE_CTR_W .req w6 // XCTR only - // Intermediate values - CTR_W .req w11 // XCTR only - CTR .req x11 // XCTR only - IV_PART .req x12 - BLOCKS .req x13 - BLOCKS_W .req w13 - - frame_push 0 - - enc_prepare ROUNDS_W, KEY, IV_PART - ld1 {vctr.16b}, [IV] - - /* - * Keep 64 bits of the IV in a register. For CTR mode this lets us - * easily increment the IV. 
For XCTR mode this lets us efficiently XOR - * the 64-bit counter with the IV. - */ - .if \xctr - umov IV_PART, vctr.d[0] - lsr CTR_W, BYTE_CTR_W, #4 - .else - umov IV_PART, vctr.d[1] - rev IV_PART, IV_PART - .endif - -.LctrloopNx\xctr: - add BLOCKS_W, BYTES_W, #15 - sub BYTES_W, BYTES_W, #MAX_STRIDE << 4 - lsr BLOCKS_W, BLOCKS_W, #4 - mov w8, #MAX_STRIDE - cmp BLOCKS_W, w8 - csel BLOCKS_W, BLOCKS_W, w8, lt - - /* - * Set up the counter values in v0-v{MAX_STRIDE-1}. - * - * If we are encrypting less than MAX_STRIDE blocks, the tail block - * handling code expects the last keystream block to be in - * v{MAX_STRIDE-1}. For example: if encrypting two blocks with - * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks. - */ - .if \xctr - add CTR, CTR, BLOCKS - .else - adds IV_PART, IV_PART, BLOCKS - .endif - mov v0.16b, vctr.16b - mov v1.16b, vctr.16b - mov v2.16b, vctr.16b - mov v3.16b, vctr.16b -ST5( mov v4.16b, vctr.16b ) - .if \xctr - sub x6, CTR, #MAX_STRIDE - 1 - sub x7, CTR, #MAX_STRIDE - 2 - sub x8, CTR, #MAX_STRIDE - 3 - sub x9, CTR, #MAX_STRIDE - 4 -ST5( sub x10, CTR, #MAX_STRIDE - 5 ) - eor x6, x6, IV_PART - eor x7, x7, IV_PART - eor x8, x8, IV_PART - eor x9, x9, IV_PART -ST5( eor x10, x10, IV_PART ) - mov v0.d[0], x6 - mov v1.d[0], x7 - mov v2.d[0], x8 - mov v3.d[0], x9 -ST5( mov v4.d[0], x10 ) - .else - bcs 0f - .subsection 1 - /* - * This subsection handles carries. - * - * Conditional branching here is allowed with respect to time - * invariance since the branches are dependent on the IV instead - * of the plaintext or key. This code is rarely executed in - * practice anyway. - */ - - /* Apply carry to outgoing counter. */ -0: umov x8, vctr.d[0] - rev x8, x8 - add x8, x8, #1 - rev x8, x8 - ins vctr.d[0], x8 - - /* - * Apply carry to counter blocks if needed. - * - * Since the carry flag was set, we know 0 <= IV_PART < - * MAX_STRIDE. Using the value of IV_PART we can determine how - * many counter blocks need to be updated. 
- */ - cbz IV_PART, 2f - adr x16, 1f - sub x16, x16, IV_PART, lsl #3 - br x16 - bti c - mov v0.d[0], vctr.d[0] - bti c - mov v1.d[0], vctr.d[0] - bti c - mov v2.d[0], vctr.d[0] - bti c - mov v3.d[0], vctr.d[0] -ST5( bti c ) -ST5( mov v4.d[0], vctr.d[0] ) -1: b 2f - .previous - -2: rev x7, IV_PART - ins vctr.d[1], x7 - sub x7, IV_PART, #MAX_STRIDE - 1 - sub x8, IV_PART, #MAX_STRIDE - 2 - sub x9, IV_PART, #MAX_STRIDE - 3 - rev x7, x7 - rev x8, x8 - mov v1.d[1], x7 - rev x9, x9 -ST5( sub x10, IV_PART, #MAX_STRIDE - 4 ) - mov v2.d[1], x8 -ST5( rev x10, x10 ) - mov v3.d[1], x9 -ST5( mov v4.d[1], x10 ) - .endif - - /* - * If there are at least MAX_STRIDE blocks left, XOR the data with - * keystream and store. Otherwise jump to tail handling. - */ - tbnz BYTES_W, #31, .Lctrtail\xctr - ld1 {v5.16b-v7.16b}, [IN], #48 -ST4( bl aes_encrypt_block4x ) -ST5( bl aes_encrypt_block5x ) - eor v0.16b, v5.16b, v0.16b -ST4( ld1 {v5.16b}, [IN], #16 ) - eor v1.16b, v6.16b, v1.16b -ST5( ld1 {v5.16b-v6.16b}, [IN], #32 ) - eor v2.16b, v7.16b, v2.16b - eor v3.16b, v5.16b, v3.16b -ST5( eor v4.16b, v6.16b, v4.16b ) - st1 {v0.16b-v3.16b}, [OUT], #64 -ST5( st1 {v4.16b}, [OUT], #16 ) - cbz BYTES_W, .Lctrout\xctr - b .LctrloopNx\xctr - -.Lctrout\xctr: - .if !\xctr - st1 {vctr.16b}, [IV] /* return next CTR value */ - .endif - frame_pop - ret - -.Lctrtail\xctr: - /* - * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext - * - * This code expects the last keystream block to be in v{MAX_STRIDE-1}. - * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and - * v4 should have the next two counter blocks. - * - * This allows us to store the ciphertext by writing to overlapping - * regions of memory. Any invalid ciphertext blocks get overwritten by - * correctly computed blocks. This approach greatly simplifies the - * logic for storing the ciphertext. 
- */ - mov x16, #16 - ands w7, BYTES_W, #0xf - csel x13, x7, x16, ne - -ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4)) -ST5( csel x14, x16, xzr, gt ) - cmp BYTES_W, #48 - (MAX_STRIDE << 4) - csel x15, x16, xzr, gt - cmp BYTES_W, #32 - (MAX_STRIDE << 4) - csel x16, x16, xzr, gt - cmp BYTES_W, #16 - (MAX_STRIDE << 4) - - adr_l x9, .Lcts_permute_table - add x9, x9, x13 - ble .Lctrtail1x\xctr - -ST5( ld1 {v5.16b}, [IN], x14 ) - ld1 {v6.16b}, [IN], x15 - ld1 {v7.16b}, [IN], x16 - -ST4( bl aes_encrypt_block4x ) -ST5( bl aes_encrypt_block5x ) - - ld1 {v8.16b}, [IN], x13 - ld1 {v9.16b}, [IN] - ld1 {v10.16b}, [x9] - -ST4( eor v6.16b, v6.16b, v0.16b ) -ST4( eor v7.16b, v7.16b, v1.16b ) -ST4( tbl v3.16b, {v3.16b}, v10.16b ) -ST4( eor v8.16b, v8.16b, v2.16b ) -ST4( eor v9.16b, v9.16b, v3.16b ) - -ST5( eor v5.16b, v5.16b, v0.16b ) -ST5( eor v6.16b, v6.16b, v1.16b ) -ST5( tbl v4.16b, {v4.16b}, v10.16b ) -ST5( eor v7.16b, v7.16b, v2.16b ) -ST5( eor v8.16b, v8.16b, v3.16b ) -ST5( eor v9.16b, v9.16b, v4.16b ) - -ST5( st1 {v5.16b}, [OUT], x14 ) - st1 {v6.16b}, [OUT], x15 - st1 {v7.16b}, [OUT], x16 - add x13, x13, OUT - st1 {v9.16b}, [x13] // overlapping stores - st1 {v8.16b}, [OUT] - b .Lctrout\xctr - -.Lctrtail1x\xctr: - /* - * Handle <= 16 bytes of plaintext - * - * This code always reads and writes 16 bytes. To avoid out of bounds - * accesses, XCTR and CTR modes must use a temporary buffer when - * encrypting/decrypting less than 16 bytes. - * - * This code is unusual in that it loads the input and stores the output - * relative to the end of the buffers rather than relative to the start. - * This causes unusual behaviour when encrypting/decrypting less than 16 - * bytes; the end of the data is expected to be at the end of the - * temporary buffer rather than the start of the data being at the start - * of the temporary buffer. 
- */ - sub x8, x7, #16 - csel x7, x7, x8, eq - add IN, IN, x7 - add OUT, OUT, x7 - ld1 {v5.16b}, [IN] - ld1 {v6.16b}, [OUT] -ST5( mov v3.16b, v4.16b ) - encrypt_block v3, ROUNDS_W, KEY, x8, w7 - ld1 {v10.16b-v11.16b}, [x9] - tbl v3.16b, {v3.16b}, v10.16b - sshr v11.16b, v11.16b, #7 - eor v5.16b, v5.16b, v3.16b - bif v5.16b, v6.16b, v11.16b - st1 {v5.16b}, [OUT] - b .Lctrout\xctr - - // Arguments - .unreq OUT - .unreq IN - .unreq KEY - .unreq ROUNDS_W - .unreq BYTES_W - .unreq IV - .unreq BYTE_CTR_W // XCTR only - // Intermediate values - .unreq CTR_W // XCTR only - .unreq CTR // XCTR only - .unreq IV_PART - .unreq BLOCKS - .unreq BLOCKS_W -.endm - - /* - * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int bytes, u8 ctr[]) - * - * The input and output buffers must always be at least 16 bytes even if - * encrypting/decrypting less than 16 bytes. Otherwise out of bounds - * accesses will occur. The data to be encrypted/decrypted is expected - * to be at the end of this 16-byte temporary buffer rather than the - * start. - */ - -AES_FUNC_START(aes_ctr_encrypt) - ctr_encrypt 0 -AES_FUNC_END(aes_ctr_encrypt) - - /* - * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int bytes, u8 const iv[], int byte_ctr) - * - * The input and output buffers must always be at least 16 bytes even if - * encrypting/decrypting less than 16 bytes. Otherwise out of bounds - * accesses will occur. The data to be encrypted/decrypted is expected - * to be at the end of this 16-byte temporary buffer rather than the - * start. 
- */ - -AES_FUNC_START(aes_xctr_encrypt) - ctr_encrypt 1 -AES_FUNC_END(aes_xctr_encrypt) - - - /* - * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, - * int bytes, u8 const rk2[], u8 iv[], int first) - * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, - * int bytes, u8 const rk2[], u8 iv[], int first) - */ - - .macro next_tweak, out, in, tmp - sshr \tmp\().2d, \in\().2d, #63 - and \tmp\().16b, \tmp\().16b, xtsmask.16b - add \out\().2d, \in\().2d, \in\().2d - ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 - eor \out\().16b, \out\().16b, \tmp\().16b - .endm - - .macro xts_load_mask, tmp - movi xtsmask.2s, #0x1 - movi \tmp\().2s, #0x87 - uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s - .endm - -AES_FUNC_START(aes_xts_encrypt) - frame_push 0 - - ld1 {v4.16b}, [x6] - xts_load_mask v8 - cbz w7, .Lxtsencnotfirst - - enc_prepare w3, x5, x8 - xts_cts_skip_tw w7, .LxtsencNx - encrypt_block v4, w3, x5, x8, w7 /* first tweak */ - enc_switch_key w3, x2, x8 - b .LxtsencNx - -.Lxtsencnotfirst: - enc_prepare w3, x2, x8 -.LxtsencloopNx: - next_tweak v4, v4, v8 -.LxtsencNx: - subs w4, w4, #64 - bmi .Lxtsenc1x - ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ - next_tweak v5, v4, v8 - eor v0.16b, v0.16b, v4.16b - next_tweak v6, v5, v8 - eor v1.16b, v1.16b, v5.16b - eor v2.16b, v2.16b, v6.16b - next_tweak v7, v6, v8 - eor v3.16b, v3.16b, v7.16b - bl aes_encrypt_block4x - eor v3.16b, v3.16b, v7.16b - eor v0.16b, v0.16b, v4.16b - eor v1.16b, v1.16b, v5.16b - eor v2.16b, v2.16b, v6.16b - st1 {v0.16b-v3.16b}, [x0], #64 - mov v4.16b, v7.16b - cbz w4, .Lxtsencret - xts_reload_mask v8 - b .LxtsencloopNx -.Lxtsenc1x: - adds w4, w4, #64 - beq .Lxtsencout - subs w4, w4, #16 - bmi .LxtsencctsNx -.Lxtsencloop: - ld1 {v0.16b}, [x1], #16 -.Lxtsencctsout: - eor v0.16b, v0.16b, v4.16b - encrypt_block v0, w3, x2, x8, w7 - eor v0.16b, v0.16b, v4.16b - cbz w4, .Lxtsencout - subs w4, w4, #16 - next_tweak v4, v4, v8 - bmi .Lxtsenccts - st1 {v0.16b}, [x0], #16 - b 
.Lxtsencloop -.Lxtsencout: - st1 {v0.16b}, [x0] -.Lxtsencret: - st1 {v4.16b}, [x6] - frame_pop - ret - -.LxtsencctsNx: - mov v0.16b, v3.16b - sub x0, x0, #16 -.Lxtsenccts: - adr_l x8, .Lcts_permute_table - - add x1, x1, w4, sxtw /* rewind input pointer */ - add w4, w4, #16 /* # bytes in final block */ - add x9, x8, #32 - add x8, x8, x4 - sub x9, x9, x4 - add x4, x0, x4 /* output address of final block */ - - ld1 {v1.16b}, [x1] /* load final block */ - ld1 {v2.16b}, [x8] - ld1 {v3.16b}, [x9] - - tbl v2.16b, {v0.16b}, v2.16b - tbx v0.16b, {v1.16b}, v3.16b - st1 {v2.16b}, [x4] /* overlapping stores */ - mov w4, wzr - b .Lxtsencctsout -AES_FUNC_END(aes_xts_encrypt) - -AES_FUNC_START(aes_xts_decrypt) - frame_push 0 - - /* subtract 16 bytes if we are doing CTS */ - sub w8, w4, #0x10 - tst w4, #0xf - csel w4, w4, w8, eq - - ld1 {v4.16b}, [x6] - xts_load_mask v8 - xts_cts_skip_tw w7, .Lxtsdecskiptw - cbz w7, .Lxtsdecnotfirst - - enc_prepare w3, x5, x8 - encrypt_block v4, w3, x5, x8, w7 /* first tweak */ -.Lxtsdecskiptw: - dec_prepare w3, x2, x8 - b .LxtsdecNx - -.Lxtsdecnotfirst: - dec_prepare w3, x2, x8 -.LxtsdecloopNx: - next_tweak v4, v4, v8 -.LxtsdecNx: - subs w4, w4, #64 - bmi .Lxtsdec1x - ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ - next_tweak v5, v4, v8 - eor v0.16b, v0.16b, v4.16b - next_tweak v6, v5, v8 - eor v1.16b, v1.16b, v5.16b - eor v2.16b, v2.16b, v6.16b - next_tweak v7, v6, v8 - eor v3.16b, v3.16b, v7.16b - bl aes_decrypt_block4x - eor v3.16b, v3.16b, v7.16b - eor v0.16b, v0.16b, v4.16b - eor v1.16b, v1.16b, v5.16b - eor v2.16b, v2.16b, v6.16b - st1 {v0.16b-v3.16b}, [x0], #64 - mov v4.16b, v7.16b - cbz w4, .Lxtsdecout - xts_reload_mask v8 - b .LxtsdecloopNx -.Lxtsdec1x: - adds w4, w4, #64 - beq .Lxtsdecout - subs w4, w4, #16 -.Lxtsdecloop: - ld1 {v0.16b}, [x1], #16 - bmi .Lxtsdeccts -.Lxtsdecctsout: - eor v0.16b, v0.16b, v4.16b - decrypt_block v0, w3, x2, x8, w7 - eor v0.16b, v0.16b, v4.16b - st1 {v0.16b}, [x0], #16 - cbz w4, .Lxtsdecout - subs 
w4, w4, #16 - next_tweak v4, v4, v8 - b .Lxtsdecloop -.Lxtsdecout: - st1 {v4.16b}, [x6] - frame_pop - ret - -.Lxtsdeccts: - adr_l x8, .Lcts_permute_table - - add x1, x1, w4, sxtw /* rewind input pointer */ - add w4, w4, #16 /* # bytes in final block */ - add x9, x8, #32 - add x8, x8, x4 - sub x9, x9, x4 - add x4, x0, x4 /* output address of final block */ - - next_tweak v5, v4, v8 - - ld1 {v1.16b}, [x1] /* load final block */ - ld1 {v2.16b}, [x8] - ld1 {v3.16b}, [x9] - - eor v0.16b, v0.16b, v5.16b - decrypt_block v0, w3, x2, x8, w7 - eor v0.16b, v0.16b, v5.16b - - tbl v2.16b, {v0.16b}, v2.16b - tbx v0.16b, {v1.16b}, v3.16b - - st1 {v2.16b}, [x4] /* overlapping stores */ - mov w4, wzr - b .Lxtsdecctsout -AES_FUNC_END(aes_xts_decrypt) - - /* - * aes_mac_update(u8 const in[], u32 const rk[], int rounds, - * int blocks, u8 dg[], int enc_before, int enc_after) - */ -AES_FUNC_START(aes_mac_update) - ld1 {v0.16b}, [x4] /* get dg */ - enc_prepare w2, x1, x7 - cbz w5, .Lmacloop4x - - encrypt_block v0, w2, x1, x7, w8 - -.Lmacloop4x: - subs w3, w3, #4 - bmi .Lmac1x - ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ - eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ - encrypt_block v0, w2, x1, x7, w8 - eor v0.16b, v0.16b, v2.16b - encrypt_block v0, w2, x1, x7, w8 - eor v0.16b, v0.16b, v3.16b - encrypt_block v0, w2, x1, x7, w8 - eor v0.16b, v0.16b, v4.16b - cmp w3, wzr - csinv w5, w6, wzr, eq - cbz w5, .Lmacout - encrypt_block v0, w2, x1, x7, w8 - st1 {v0.16b}, [x4] /* return dg */ - cond_yield .Lmacout, x7, x8 - b .Lmacloop4x -.Lmac1x: - add w3, w3, #4 -.Lmacloop: - cbz w3, .Lmacout - ld1 {v1.16b}, [x0], #16 /* get next pt block */ - eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ - - subs w3, w3, #1 - csinv w5, w6, wzr, eq - cbz w5, .Lmacout - -.Lmacenc: - encrypt_block v0, w2, x1, x7, w8 - b .Lmacloop - -.Lmacout: - st1 {v0.16b}, [x4] /* return dg */ - mov w0, w3 - ret -AES_FUNC_END(aes_mac_update) diff --git a/arch/arm64/crypto/aes-neon.S 
b/arch/arm64/crypto/aes-neon.S deleted file mode 100644 index 3a8961b6ea51..000000000000 --- a/arch/arm64/crypto/aes-neon.S +++ /dev/null @@ -1,250 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON - * - * Copyright (C) 2013 - 2017 Linaro Ltd. - */ - -#include -#include - -#define AES_FUNC_START(func) SYM_FUNC_START(neon_ ## func) -#define AES_FUNC_END(func) SYM_FUNC_END(neon_ ## func) - - xtsmask .req v7 - cbciv .req v7 - vctr .req v4 - - .macro xts_reload_mask, tmp - xts_load_mask \tmp - .endm - - /* special case for the neon-bs driver calling into this one for CTS */ - .macro xts_cts_skip_tw, reg, lbl - tbnz \reg, #1, \lbl - .endm - - /* multiply by polynomial 'x' in GF(2^8) */ - .macro mul_by_x, out, in, temp, const - sshr \temp, \in, #7 - shl \out, \in, #1 - and \temp, \temp, \const - eor \out, \out, \temp - .endm - - /* multiply by polynomial 'x^2' in GF(2^8) */ - .macro mul_by_x2, out, in, temp, const - ushr \temp, \in, #6 - shl \out, \in, #2 - pmul \temp, \temp, \const - eor \out, \out, \temp - .endm - - /* preload the entire Sbox */ - .macro prepare, sbox, shiftrows, temp - movi v12.16b, #0x1b - ldr_l q13, \shiftrows, \temp - ldr_l q14, .Lror32by8, \temp - adr_l \temp, \sbox - ld1 {v16.16b-v19.16b}, [\temp], #64 - ld1 {v20.16b-v23.16b}, [\temp], #64 - ld1 {v24.16b-v27.16b}, [\temp], #64 - ld1 {v28.16b-v31.16b}, [\temp] - .endm - - /* do preload for encryption */ - .macro enc_prepare, ignore0, ignore1, temp - prepare crypto_aes_sbox, .LForward_ShiftRows, \temp - .endm - - .macro enc_switch_key, ignore0, ignore1, temp - /* do nothing */ - .endm - - /* do preload for decryption */ - .macro dec_prepare, ignore0, ignore1, temp - prepare crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp - .endm - - /* apply SubBytes transformation using the preloaded Sbox */ - .macro sub_bytes, in - sub v9.16b, \in\().16b, v15.16b - tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b - sub v10.16b, v9.16b, v15.16b 
- tbx \in\().16b, {v20.16b-v23.16b}, v9.16b - sub v11.16b, v10.16b, v15.16b - tbx \in\().16b, {v24.16b-v27.16b}, v10.16b - tbx \in\().16b, {v28.16b-v31.16b}, v11.16b - .endm - - /* apply MixColumns transformation */ - .macro mix_columns, in, enc - .if \enc == 0 - /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ - mul_by_x2 v8.16b, \in\().16b, v9.16b, v12.16b - eor \in\().16b, \in\().16b, v8.16b - rev32 v8.8h, v8.8h - eor \in\().16b, \in\().16b, v8.16b - .endif - - mul_by_x v9.16b, \in\().16b, v8.16b, v12.16b - rev32 v8.8h, \in\().8h - eor v8.16b, v8.16b, v9.16b - eor \in\().16b, \in\().16b, v8.16b - tbl \in\().16b, {\in\().16b}, v14.16b - eor \in\().16b, \in\().16b, v8.16b - .endm - - .macro do_block, enc, in, rounds, rk, rkp, i - ld1 {v15.4s}, [\rk] - add \rkp, \rk, #16 - mov \i, \rounds -.La\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ - movi v15.16b, #0x40 - tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ - sub_bytes \in - sub \i, \i, #1 - ld1 {v15.4s}, [\rkp], #16 - cbz \i, .Lb\@ - mix_columns \in, \enc - b .La\@ -.Lb\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ - .endm - - .macro encrypt_block, in, rounds, rk, rkp, i - do_block 1, \in, \rounds, \rk, \rkp, \i - .endm - - .macro decrypt_block, in, rounds, rk, rkp, i - do_block 0, \in, \rounds, \rk, \rkp, \i - .endm - - /* - * Interleaved versions: functionally equivalent to the - * ones above, but applied to AES states in parallel. 
- */ - - .macro sub_bytes_4x, in0, in1, in2, in3 - sub v8.16b, \in0\().16b, v15.16b - tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b - sub v9.16b, \in1\().16b, v15.16b - tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b - sub v10.16b, \in2\().16b, v15.16b - tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b - sub v11.16b, \in3\().16b, v15.16b - tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b - tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b - tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b - sub v8.16b, v8.16b, v15.16b - tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b - sub v9.16b, v9.16b, v15.16b - tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b - sub v10.16b, v10.16b, v15.16b - tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b - sub v11.16b, v11.16b, v15.16b - tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b - sub v8.16b, v8.16b, v15.16b - tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b - sub v9.16b, v9.16b, v15.16b - tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b - sub v10.16b, v10.16b, v15.16b - tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b - sub v11.16b, v11.16b, v15.16b - tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b - tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b - tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b - .endm - - .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const - sshr \tmp0\().16b, \in0\().16b, #7 - shl \out0\().16b, \in0\().16b, #1 - sshr \tmp1\().16b, \in1\().16b, #7 - and \tmp0\().16b, \tmp0\().16b, \const\().16b - shl \out1\().16b, \in1\().16b, #1 - and \tmp1\().16b, \tmp1\().16b, \const\().16b - eor \out0\().16b, \out0\().16b, \tmp0\().16b - eor \out1\().16b, \out1\().16b, \tmp1\().16b - .endm - - .macro mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const - ushr \tmp0\().16b, \in0\().16b, #6 - shl \out0\().16b, \in0\().16b, #2 - ushr \tmp1\().16b, \in1\().16b, #6 - pmul \tmp0\().16b, \tmp0\().16b, \const\().16b - shl \out1\().16b, \in1\().16b, #2 - pmul \tmp1\().16b, \tmp1\().16b, \const\().16b - eor \out0\().16b, \out0\().16b, \tmp0\().16b - eor \out1\().16b, \out1\().16b, 
\tmp1\().16b - .endm - - .macro mix_columns_2x, in0, in1, enc - .if \enc == 0 - /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ - mul_by_x2_2x v8, v9, \in0, \in1, v10, v11, v12 - eor \in0\().16b, \in0\().16b, v8.16b - rev32 v8.8h, v8.8h - eor \in1\().16b, \in1\().16b, v9.16b - rev32 v9.8h, v9.8h - eor \in0\().16b, \in0\().16b, v8.16b - eor \in1\().16b, \in1\().16b, v9.16b - .endif - - mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v12 - rev32 v10.8h, \in0\().8h - rev32 v11.8h, \in1\().8h - eor v10.16b, v10.16b, v8.16b - eor v11.16b, v11.16b, v9.16b - eor \in0\().16b, \in0\().16b, v10.16b - eor \in1\().16b, \in1\().16b, v11.16b - tbl \in0\().16b, {\in0\().16b}, v14.16b - tbl \in1\().16b, {\in1\().16b}, v14.16b - eor \in0\().16b, \in0\().16b, v10.16b - eor \in1\().16b, \in1\().16b, v11.16b - .endm - - .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i - ld1 {v15.4s}, [\rk] - add \rkp, \rk, #16 - mov \i, \rounds -.La\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ - eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ - eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ - eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ - movi v15.16b, #0x40 - tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ - tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ - tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ - tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ - sub_bytes_4x \in0, \in1, \in2, \in3 - sub \i, \i, #1 - ld1 {v15.4s}, [\rkp], #16 - cbz \i, .Lb\@ - mix_columns_2x \in0, \in1, \enc - mix_columns_2x \in2, \in3, \enc - b .La\@ -.Lb\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ - eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ - eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ - eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ - .endm - - .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i - do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i - .endm - - .macro 
decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i - do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i - .endm - -#include "aes-modes.S" - - .section ".rodata", "a" - .align 4 -.LForward_ShiftRows: - .octa 0x0b06010c07020d08030e09040f0a0500 - -.LReverse_ShiftRows: - .octa 0x0306090c0f0205080b0e0104070a0d00 - -.Lror32by8: - .octa 0x0c0f0e0d080b0a090407060500030201 diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index cb87c8fc66b3..7630a7bf5da9 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -17,6 +17,7 @@ MODULE_AUTHOR("Ard Biesheuvel "); MODULE_DESCRIPTION("Bit sliced AES using NEON instructions"); +MODULE_IMPORT_NS("CRYPTO_INTERNAL"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ecb(aes)"); @@ -42,20 +43,6 @@ asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[]); -/* borrowed from aes-neon-blk.ko */ -asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int blocks); -asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int blocks, u8 iv[]); -asmlinkage void neon_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int bytes, u8 ctr[]); -asmlinkage void neon_aes_xts_encrypt(u8 out[], u8 const in[], - u32 const rk1[], int rounds, int bytes, - u32 const rk2[], u8 iv[], int first); -asmlinkage void neon_aes_xts_decrypt(u8 out[], u8 const in[], - u32 const rk1[], int rounds, int bytes, - u32 const rk2[], u8 iv[], int first); - struct aesbs_ctx { u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32]; int rounds; diff --git a/include/crypto/aes.h b/include/crypto/aes.h index cbf1cc96db52..91bf4667d3e9 100644 --- a/include/crypto/aes.h +++ b/include/crypto/aes.h @@ -167,6 +167,75 @@ int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, #ifdef CONFIG_ARM64 
int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len); +asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks); +asmlinkage void neon_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks); +asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks, u8 iv[]); +asmlinkage void neon_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks, u8 iv[]); +asmlinkage void neon_aes_cbc_cts_encrypt(u8 out[], u8 const in[], + u32 const rk[], int rounds, int bytes, + u8 const iv[]); +asmlinkage void neon_aes_cbc_cts_decrypt(u8 out[], u8 const in[], + u32 const rk[], int rounds, int bytes, + u8 const iv[]); +asmlinkage void neon_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 ctr[]); +asmlinkage void neon_aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 ctr[], + int byte_ctr); +asmlinkage void neon_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], + int rounds, int bytes, u32 const rk2[], + u8 iv[], int first); +asmlinkage void neon_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], + int rounds, int bytes, u32 const rk2[], + u8 iv[], int first); +asmlinkage void neon_aes_essiv_cbc_encrypt(u8 out[], u8 const in[], + u32 const rk1[], int rounds, + int blocks, u8 iv[], + u32 const rk2[]); +asmlinkage void neon_aes_essiv_cbc_decrypt(u8 out[], u8 const in[], + u32 const rk1[], int rounds, + int blocks, u8 iv[], + u32 const rk2[]); +asmlinkage int neon_aes_mac_update(u8 const in[], u32 const rk[], int rounds, + int blocks, u8 dg[], int enc_before, + int enc_after); + +asmlinkage void ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks); +asmlinkage void ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks); +asmlinkage void ce_aes_cbc_encrypt(u8 out[], 
u8 const in[], u32 const rk[], + int rounds, int blocks, u8 iv[]); +asmlinkage void ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int blocks, u8 iv[]); +asmlinkage void ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 const iv[]); +asmlinkage void ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 const iv[]); +asmlinkage void ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 ctr[]); +asmlinkage void ce_aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[], + int rounds, int bytes, u8 ctr[], + int byte_ctr); +asmlinkage void ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], + int rounds, int bytes, u32 const rk2[], + u8 iv[], int first); +asmlinkage void ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], + int rounds, int bytes, u32 const rk2[], + u8 iv[], int first); +asmlinkage void ce_aes_essiv_cbc_encrypt(u8 out[], u8 const in[], + u32 const rk1[], int rounds, + int blocks, u8 iv[], u32 const rk2[]); +asmlinkage void ce_aes_essiv_cbc_decrypt(u8 out[], u8 const in[], + u32 const rk1[], int rounds, + int blocks, u8 iv[], u32 const rk2[]); +asmlinkage int ce_aes_mac_update(u8 const in[], u32 const rk[], int rounds, + int blocks, u8 dg[], int enc_before, + int enc_after); #elif defined(CONFIG_PPC) void ppc_expand_key_128(u32 *key_enc, const u8 *key); void ppc_expand_key_192(u32 *key_enc, const u8 *key); diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 725eef05b758..c05d4b4e8e82 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -26,7 +26,9 @@ libaes-$(CONFIG_ARM) += arm/aes-cipher-core.o ifeq ($(CONFIG_ARM64),y) libaes-y += arm64/aes-cipher-core.o -libaes-$(CONFIG_KERNEL_MODE_NEON) += arm64/aes-ce-core.o +libaes-$(CONFIG_KERNEL_MODE_NEON) += arm64/aes-ce-core.o \ + arm64/aes-ce.o \ + arm64/aes-neon.o endif ifeq ($(CONFIG_PPC),y) diff --git a/lib/crypto/arm64/aes-ce.S 
b/lib/crypto/arm64/aes-ce.S new file mode 100644 index 000000000000..b853e02f7b1e --- /dev/null +++ b/lib/crypto/arm64/aes-ce.S @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AES cipher for ARMv8 with Crypto Extensions + * + * Copyright (C) 2013 - 2017 Linaro Ltd + */ + +#include +#include + +#define AES_FUNC_START(func) SYM_FUNC_START(ce_ ## func) +#define AES_FUNC_END(func) SYM_FUNC_END(ce_ ## func) + + .arch armv8-a+crypto + + xtsmask .req v16 + cbciv .req v16 + vctr .req v16 + + .macro xts_reload_mask, tmp + .endm + + .macro xts_cts_skip_tw, reg, lbl + .endm + + /* preload all round keys */ + .macro load_round_keys, rk, nr, tmp + add \tmp, \rk, \nr, sxtw #4 + sub \tmp, \tmp, #160 + ld1 {v17.4s-v20.4s}, [\rk] + ld1 {v21.4s-v24.4s}, [\tmp], #64 + ld1 {v25.4s-v28.4s}, [\tmp], #64 + ld1 {v29.4s-v31.4s}, [\tmp] + .endm + + /* prepare for encryption with key in rk[] */ + .macro enc_prepare, rounds, rk, temp + load_round_keys \rk, \rounds, \temp + .endm + + /* prepare for encryption (again) but with new key in rk[] */ + .macro enc_switch_key, rounds, rk, temp + load_round_keys \rk, \rounds, \temp + .endm + + /* prepare for decryption with key in rk[] */ + .macro dec_prepare, rounds, rk, temp + load_round_keys \rk, \rounds, \temp + .endm + + .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4 + aes\de \i0\().16b, \k\().16b + aes\mc \i0\().16b, \i0\().16b + .ifnb \i1 + aes\de \i1\().16b, \k\().16b + aes\mc \i1\().16b, \i1\().16b + .ifnb \i3 + aes\de \i2\().16b, \k\().16b + aes\mc \i2\().16b, \i2\().16b + aes\de \i3\().16b, \k\().16b + aes\mc \i3\().16b, \i3\().16b + .ifnb \i4 + aes\de \i4\().16b, \k\().16b + aes\mc \i4\().16b, \i4\().16b + .endif + .endif + .endif + .endm + + /* up to 5 interleaved encryption rounds with the same round key */ + .macro round_Nx, enc, k, i0, i1, i2, i3, i4 + .ifc \enc, e + do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3, \i4 + .else + do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3, \i4 + .endif + .endm + + /* up to 5 interleaved 
final rounds */ + .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4 + aes\de \i0\().16b, \k\().16b + .ifnb \i1 + aes\de \i1\().16b, \k\().16b + .ifnb \i3 + aes\de \i2\().16b, \k\().16b + aes\de \i3\().16b, \k\().16b + .ifnb \i4 + aes\de \i4\().16b, \k\().16b + .endif + .endif + .endif + eor \i0\().16b, \i0\().16b, \k2\().16b + .ifnb \i1 + eor \i1\().16b, \i1\().16b, \k2\().16b + .ifnb \i3 + eor \i2\().16b, \i2\().16b, \k2\().16b + eor \i3\().16b, \i3\().16b, \k2\().16b + .ifnb \i4 + eor \i4\().16b, \i4\().16b, \k2\().16b + .endif + .endif + .endif + .endm + + /* up to 5 interleaved blocks */ + .macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4 + tbz \rounds, #2, .L\@ /* 128 bits */ + round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4 + round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4 + tbz \rounds, #1, .L\@ /* 192 bits */ + round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4 + round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4 +.L\@: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 + round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4 + .endr + fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4 + .endm + + .macro encrypt_block, in, rounds, t0, t1, t2 + do_block_Nx e, \rounds, \in + .endm + + .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 + do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 + .endm + + .macro encrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2 + do_block_Nx e, \rounds, \i0, \i1, \i2, \i3, \i4 + .endm + + .macro decrypt_block, in, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \in + .endm + + .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 + .endm + + .macro decrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \i0, \i1, \i2, \i3, \i4 + .endm + +#define MAX_STRIDE 5 + +#include "aes-modes.S" diff --git a/lib/crypto/arm64/aes-modes.S b/lib/crypto/arm64/aes-modes.S new file mode 100644 index 000000000000..f4df6f84a3c7 --- /dev/null +++ b/lib/crypto/arm64/aes-modes.S @@ -0,0 +1,866 @@ +/* 
SPDX-License-Identifier: GPL-2.0-only */ +/* + * Chaining mode wrappers for AES + * + * Copyright (C) 2013 - 2017 Linaro Ltd + */ + +/* included by aes-ce.S and aes-neon.S */ + + .text + .align 4 + +#ifndef MAX_STRIDE +#define MAX_STRIDE 4 +#endif + +#if MAX_STRIDE == 4 +#define ST4(x...) x +#define ST5(x...) +#else +#define ST4(x...) +#define ST5(x...) x +#endif + +SYM_FUNC_START_LOCAL(aes_encrypt_block4x) + encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 + ret +SYM_FUNC_END(aes_encrypt_block4x) + +SYM_FUNC_START_LOCAL(aes_decrypt_block4x) + decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 + ret +SYM_FUNC_END(aes_decrypt_block4x) + +#if MAX_STRIDE == 5 +SYM_FUNC_START_LOCAL(aes_encrypt_block5x) + encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 + ret +SYM_FUNC_END(aes_encrypt_block5x) + +SYM_FUNC_START_LOCAL(aes_decrypt_block5x) + decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 + ret +SYM_FUNC_END(aes_decrypt_block5x) +#endif + + /* + * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + */ + +AES_FUNC_START(aes_ecb_encrypt) + frame_push 0 + + enc_prepare w3, x2, x5 + +.LecbencloopNx: + subs w4, w4, #MAX_STRIDE + bmi .Lecbenc1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ +ST4( bl aes_encrypt_block4x ) +ST5( ld1 {v4.16b}, [x1], #16 ) +ST5( bl aes_encrypt_block5x ) + st1 {v0.16b-v3.16b}, [x0], #64 +ST5( st1 {v4.16b}, [x0], #16 ) + b .LecbencloopNx +.Lecbenc1x: + adds w4, w4, #MAX_STRIDE + beq .Lecbencout +.Lecbencloop: + ld1 {v0.16b}, [x1], #16 /* get next pt block */ + encrypt_block v0, w3, x2, x5, w6 + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lecbencloop +.Lecbencout: + frame_pop + ret +AES_FUNC_END(aes_ecb_encrypt) + + +AES_FUNC_START(aes_ecb_decrypt) + frame_push 0 + + dec_prepare w3, x2, x5 + +.LecbdecloopNx: + subs w4, w4, #MAX_STRIDE + bmi .Lecbdec1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ +ST4( bl 
aes_decrypt_block4x ) +ST5( ld1 {v4.16b}, [x1], #16 ) +ST5( bl aes_decrypt_block5x ) + st1 {v0.16b-v3.16b}, [x0], #64 +ST5( st1 {v4.16b}, [x0], #16 ) + b .LecbdecloopNx +.Lecbdec1x: + adds w4, w4, #MAX_STRIDE + beq .Lecbdecout +.Lecbdecloop: + ld1 {v0.16b}, [x1], #16 /* get next ct block */ + decrypt_block v0, w3, x2, x5, w6 + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lecbdecloop +.Lecbdecout: + frame_pop + ret +AES_FUNC_END(aes_ecb_decrypt) + + + /* + * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[], + * int rounds, int blocks, u8 iv[], + * u32 const rk2[]); + * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], + * int rounds, int blocks, u8 iv[], + * u32 const rk2[]); + */ + +AES_FUNC_START(aes_essiv_cbc_encrypt) + ld1 {v4.16b}, [x5] /* get iv */ + + mov w8, #14 /* AES-256: 14 rounds */ + enc_prepare w8, x6, x7 + encrypt_block v4, w8, x6, x7, w9 + enc_switch_key w3, x2, x6 + b .Lcbcencloop4x + +AES_FUNC_START(aes_cbc_encrypt) + ld1 {v4.16b}, [x5] /* get iv */ + enc_prepare w3, x2, x6 + +.Lcbcencloop4x: + subs w4, w4, #4 + bmi .Lcbcenc1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ + eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */ + encrypt_block v0, w3, x2, x6, w7 + eor v1.16b, v1.16b, v0.16b + encrypt_block v1, w3, x2, x6, w7 + eor v2.16b, v2.16b, v1.16b + encrypt_block v2, w3, x2, x6, w7 + eor v3.16b, v3.16b, v2.16b + encrypt_block v3, w3, x2, x6, w7 + st1 {v0.16b-v3.16b}, [x0], #64 + mov v4.16b, v3.16b + b .Lcbcencloop4x +.Lcbcenc1x: + adds w4, w4, #4 + beq .Lcbcencout +.Lcbcencloop: + ld1 {v0.16b}, [x1], #16 /* get next pt block */ + eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */ + encrypt_block v4, w3, x2, x6, w7 + st1 {v4.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lcbcencloop +.Lcbcencout: + st1 {v4.16b}, [x5] /* 
return iv */ + ret +AES_FUNC_END(aes_cbc_encrypt) +AES_FUNC_END(aes_essiv_cbc_encrypt) + +AES_FUNC_START(aes_essiv_cbc_decrypt) + ld1 {cbciv.16b}, [x5] /* get iv */ + + mov w8, #14 /* AES-256: 14 rounds */ + enc_prepare w8, x6, x7 + encrypt_block cbciv, w8, x6, x7, w9 + b .Lessivcbcdecstart + +AES_FUNC_START(aes_cbc_decrypt) + ld1 {cbciv.16b}, [x5] /* get iv */ +.Lessivcbcdecstart: + frame_push 0 + dec_prepare w3, x2, x6 + +.LcbcdecloopNx: + subs w4, w4, #MAX_STRIDE + bmi .Lcbcdec1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ +#if MAX_STRIDE == 5 + ld1 {v4.16b}, [x1], #16 /* get 1 ct block */ + mov v5.16b, v0.16b + mov v6.16b, v1.16b + mov v7.16b, v2.16b + bl aes_decrypt_block5x + sub x1, x1, #32 + eor v0.16b, v0.16b, cbciv.16b + eor v1.16b, v1.16b, v5.16b + ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */ + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + eor v4.16b, v4.16b, v5.16b +#else + mov v4.16b, v0.16b + mov v5.16b, v1.16b + mov v6.16b, v2.16b + bl aes_decrypt_block4x + sub x1, x1, #16 + eor v0.16b, v0.16b, cbciv.16b + eor v1.16b, v1.16b, v4.16b + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ + eor v2.16b, v2.16b, v5.16b + eor v3.16b, v3.16b, v6.16b +#endif + st1 {v0.16b-v3.16b}, [x0], #64 +ST5( st1 {v4.16b}, [x0], #16 ) + b .LcbcdecloopNx +.Lcbcdec1x: + adds w4, w4, #MAX_STRIDE + beq .Lcbcdecout +.Lcbcdecloop: + ld1 {v1.16b}, [x1], #16 /* get next ct block */ + mov v0.16b, v1.16b /* ...and copy to v0 */ + decrypt_block v0, w3, x2, x6, w7 + eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */ + mov cbciv.16b, v1.16b /* ct is next iv */ + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lcbcdecloop +.Lcbcdecout: + st1 {cbciv.16b}, [x5] /* return iv */ + frame_pop + ret +AES_FUNC_END(aes_cbc_decrypt) +AES_FUNC_END(aes_essiv_cbc_decrypt) + + + /* + * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], + * int rounds, int bytes, u8 const iv[]) + * aes_cbc_cts_decrypt(u8 
out[], u8 const in[], u32 const rk[], + * int rounds, int bytes, u8 const iv[]) + */ + +AES_FUNC_START(aes_cbc_cts_encrypt) + adr_l x8, .Lcts_permute_table + sub x4, x4, #16 + add x9, x8, #32 + add x8, x8, x4 + sub x9, x9, x4 + ld1 {v3.16b}, [x8] + ld1 {v4.16b}, [x9] + + ld1 {v0.16b}, [x1], x4 /* overlapping loads */ + ld1 {v1.16b}, [x1] + + ld1 {v5.16b}, [x5] /* get iv */ + enc_prepare w3, x2, x6 + + eor v0.16b, v0.16b, v5.16b /* xor with iv */ + tbl v1.16b, {v1.16b}, v4.16b + encrypt_block v0, w3, x2, x6, w7 + + eor v1.16b, v1.16b, v0.16b + tbl v0.16b, {v0.16b}, v3.16b + encrypt_block v1, w3, x2, x6, w7 + + add x4, x0, x4 + st1 {v0.16b}, [x4] /* overlapping stores */ + st1 {v1.16b}, [x0] + ret +AES_FUNC_END(aes_cbc_cts_encrypt) + +AES_FUNC_START(aes_cbc_cts_decrypt) + adr_l x8, .Lcts_permute_table + sub x4, x4, #16 + add x9, x8, #32 + add x8, x8, x4 + sub x9, x9, x4 + ld1 {v3.16b}, [x8] + ld1 {v4.16b}, [x9] + + ld1 {v0.16b}, [x1], x4 /* overlapping loads */ + ld1 {v1.16b}, [x1] + + ld1 {v5.16b}, [x5] /* get iv */ + dec_prepare w3, x2, x6 + + decrypt_block v0, w3, x2, x6, w7 + tbl v2.16b, {v0.16b}, v3.16b + eor v2.16b, v2.16b, v1.16b + + tbx v0.16b, {v1.16b}, v4.16b + decrypt_block v0, w3, x2, x6, w7 + eor v0.16b, v0.16b, v5.16b /* xor with iv */ + + add x4, x0, x4 + st1 {v2.16b}, [x4] /* overlapping stores */ + st1 {v0.16b}, [x0] + ret +AES_FUNC_END(aes_cbc_cts_decrypt) + + .section ".rodata", "a" + .align 6 +.Lcts_permute_table: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .previous + + /* + * This macro generates the code for CTR and XCTR mode. 
+ */ +.macro ctr_encrypt xctr + // Arguments + OUT .req x0 + IN .req x1 + KEY .req x2 + ROUNDS_W .req w3 + BYTES_W .req w4 + IV .req x5 + BYTE_CTR_W .req w6 // XCTR only + // Intermediate values + CTR_W .req w11 // XCTR only + CTR .req x11 // XCTR only + IV_PART .req x12 + BLOCKS .req x13 + BLOCKS_W .req w13 + + frame_push 0 + + enc_prepare ROUNDS_W, KEY, IV_PART + ld1 {vctr.16b}, [IV] + + /* + * Keep 64 bits of the IV in a register. For CTR mode this lets us + * easily increment the IV. For XCTR mode this lets us efficiently XOR + * the 64-bit counter with the IV. + */ + .if \xctr + umov IV_PART, vctr.d[0] + lsr CTR_W, BYTE_CTR_W, #4 + .else + umov IV_PART, vctr.d[1] + rev IV_PART, IV_PART + .endif + +.LctrloopNx\xctr: + add BLOCKS_W, BYTES_W, #15 + sub BYTES_W, BYTES_W, #MAX_STRIDE << 4 + lsr BLOCKS_W, BLOCKS_W, #4 + mov w8, #MAX_STRIDE + cmp BLOCKS_W, w8 + csel BLOCKS_W, BLOCKS_W, w8, lt + + /* + * Set up the counter values in v0-v{MAX_STRIDE-1}. + * + * If we are encrypting less than MAX_STRIDE blocks, the tail block + * handling code expects the last keystream block to be in + * v{MAX_STRIDE-1}. For example: if encrypting two blocks with + * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks. + */ + .if \xctr + add CTR, CTR, BLOCKS + .else + adds IV_PART, IV_PART, BLOCKS + .endif + mov v0.16b, vctr.16b + mov v1.16b, vctr.16b + mov v2.16b, vctr.16b + mov v3.16b, vctr.16b +ST5( mov v4.16b, vctr.16b ) + .if \xctr + sub x6, CTR, #MAX_STRIDE - 1 + sub x7, CTR, #MAX_STRIDE - 2 + sub x8, CTR, #MAX_STRIDE - 3 + sub x9, CTR, #MAX_STRIDE - 4 +ST5( sub x10, CTR, #MAX_STRIDE - 5 ) + eor x6, x6, IV_PART + eor x7, x7, IV_PART + eor x8, x8, IV_PART + eor x9, x9, IV_PART +ST5( eor x10, x10, IV_PART ) + mov v0.d[0], x6 + mov v1.d[0], x7 + mov v2.d[0], x8 + mov v3.d[0], x9 +ST5( mov v4.d[0], x10 ) + .else + bcs 0f + .subsection 1 + /* + * This subsection handles carries. 
+ * + * Conditional branching here is allowed with respect to time + * invariance since the branches are dependent on the IV instead + * of the plaintext or key. This code is rarely executed in + * practice anyway. + */ + + /* Apply carry to outgoing counter. */ +0: umov x8, vctr.d[0] + rev x8, x8 + add x8, x8, #1 + rev x8, x8 + ins vctr.d[0], x8 + + /* + * Apply carry to counter blocks if needed. + * + * Since the carry flag was set, we know 0 <= IV_PART < + * MAX_STRIDE. Using the value of IV_PART we can determine how + * many counter blocks need to be updated. + */ + cbz IV_PART, 2f + adr x16, 1f + sub x16, x16, IV_PART, lsl #3 + br x16 + bti c + mov v0.d[0], vctr.d[0] + bti c + mov v1.d[0], vctr.d[0] + bti c + mov v2.d[0], vctr.d[0] + bti c + mov v3.d[0], vctr.d[0] +ST5( bti c ) +ST5( mov v4.d[0], vctr.d[0] ) +1: b 2f + .previous + +2: rev x7, IV_PART + ins vctr.d[1], x7 + sub x7, IV_PART, #MAX_STRIDE - 1 + sub x8, IV_PART, #MAX_STRIDE - 2 + sub x9, IV_PART, #MAX_STRIDE - 3 + rev x7, x7 + rev x8, x8 + mov v1.d[1], x7 + rev x9, x9 +ST5( sub x10, IV_PART, #MAX_STRIDE - 4 ) + mov v2.d[1], x8 +ST5( rev x10, x10 ) + mov v3.d[1], x9 +ST5( mov v4.d[1], x10 ) + .endif + + /* + * If there are at least MAX_STRIDE blocks left, XOR the data with + * keystream and store. Otherwise jump to tail handling. 
+ */ + tbnz BYTES_W, #31, .Lctrtail\xctr + ld1 {v5.16b-v7.16b}, [IN], #48 +ST4( bl aes_encrypt_block4x ) +ST5( bl aes_encrypt_block5x ) + eor v0.16b, v5.16b, v0.16b +ST4( ld1 {v5.16b}, [IN], #16 ) + eor v1.16b, v6.16b, v1.16b +ST5( ld1 {v5.16b-v6.16b}, [IN], #32 ) + eor v2.16b, v7.16b, v2.16b + eor v3.16b, v5.16b, v3.16b +ST5( eor v4.16b, v6.16b, v4.16b ) + st1 {v0.16b-v3.16b}, [OUT], #64 +ST5( st1 {v4.16b}, [OUT], #16 ) + cbz BYTES_W, .Lctrout\xctr + b .LctrloopNx\xctr + +.Lctrout\xctr: + .if !\xctr + st1 {vctr.16b}, [IV] /* return next CTR value */ + .endif + frame_pop + ret + +.Lctrtail\xctr: + /* + * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext + * + * This code expects the last keystream block to be in v{MAX_STRIDE-1}. + * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and + * v4 should have the next two counter blocks. + * + * This allows us to store the ciphertext by writing to overlapping + * regions of memory. Any invalid ciphertext blocks get overwritten by + * correctly computed blocks. This approach greatly simplifies the + * logic for storing the ciphertext. 
+ */ + mov x16, #16 + ands w7, BYTES_W, #0xf + csel x13, x7, x16, ne + +ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4)) +ST5( csel x14, x16, xzr, gt ) + cmp BYTES_W, #48 - (MAX_STRIDE << 4) + csel x15, x16, xzr, gt + cmp BYTES_W, #32 - (MAX_STRIDE << 4) + csel x16, x16, xzr, gt + cmp BYTES_W, #16 - (MAX_STRIDE << 4) + + adr_l x9, .Lcts_permute_table + add x9, x9, x13 + ble .Lctrtail1x\xctr + +ST5( ld1 {v5.16b}, [IN], x14 ) + ld1 {v6.16b}, [IN], x15 + ld1 {v7.16b}, [IN], x16 + +ST4( bl aes_encrypt_block4x ) +ST5( bl aes_encrypt_block5x ) + + ld1 {v8.16b}, [IN], x13 + ld1 {v9.16b}, [IN] + ld1 {v10.16b}, [x9] + +ST4( eor v6.16b, v6.16b, v0.16b ) +ST4( eor v7.16b, v7.16b, v1.16b ) +ST4( tbl v3.16b, {v3.16b}, v10.16b ) +ST4( eor v8.16b, v8.16b, v2.16b ) +ST4( eor v9.16b, v9.16b, v3.16b ) + +ST5( eor v5.16b, v5.16b, v0.16b ) +ST5( eor v6.16b, v6.16b, v1.16b ) +ST5( tbl v4.16b, {v4.16b}, v10.16b ) +ST5( eor v7.16b, v7.16b, v2.16b ) +ST5( eor v8.16b, v8.16b, v3.16b ) +ST5( eor v9.16b, v9.16b, v4.16b ) + +ST5( st1 {v5.16b}, [OUT], x14 ) + st1 {v6.16b}, [OUT], x15 + st1 {v7.16b}, [OUT], x16 + add x13, x13, OUT + st1 {v9.16b}, [x13] // overlapping stores + st1 {v8.16b}, [OUT] + b .Lctrout\xctr + +.Lctrtail1x\xctr: + /* + * Handle <= 16 bytes of plaintext + * + * This code always reads and writes 16 bytes. To avoid out of bounds + * accesses, XCTR and CTR modes must use a temporary buffer when + * encrypting/decrypting less than 16 bytes. + * + * This code is unusual in that it loads the input and stores the output + * relative to the end of the buffers rather than relative to the start. + * This causes unusual behaviour when encrypting/decrypting less than 16 + * bytes; the end of the data is expected to be at the end of the + * temporary buffer rather than the start of the data being at the start + * of the temporary buffer. 
+ */ + sub x8, x7, #16 + csel x7, x7, x8, eq + add IN, IN, x7 + add OUT, OUT, x7 + ld1 {v5.16b}, [IN] + ld1 {v6.16b}, [OUT] +ST5( mov v3.16b, v4.16b ) + encrypt_block v3, ROUNDS_W, KEY, x8, w7 + ld1 {v10.16b-v11.16b}, [x9] + tbl v3.16b, {v3.16b}, v10.16b + sshr v11.16b, v11.16b, #7 + eor v5.16b, v5.16b, v3.16b + bif v5.16b, v6.16b, v11.16b + st1 {v5.16b}, [OUT] + b .Lctrout\xctr + + // Arguments + .unreq OUT + .unreq IN + .unreq KEY + .unreq ROUNDS_W + .unreq BYTES_W + .unreq IV + .unreq BYTE_CTR_W // XCTR only + // Intermediate values + .unreq CTR_W // XCTR only + .unreq CTR // XCTR only + .unreq IV_PART + .unreq BLOCKS + .unreq BLOCKS_W +.endm + + /* + * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int bytes, u8 ctr[]) + * + * The input and output buffers must always be at least 16 bytes even if + * encrypting/decrypting less than 16 bytes. Otherwise out of bounds + * accesses will occur. The data to be encrypted/decrypted is expected + * to be at the end of this 16-byte temporary buffer rather than the + * start. + */ + +AES_FUNC_START(aes_ctr_encrypt) + ctr_encrypt 0 +AES_FUNC_END(aes_ctr_encrypt) + + /* + * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int bytes, u8 const iv[], int byte_ctr) + * + * The input and output buffers must always be at least 16 bytes even if + * encrypting/decrypting less than 16 bytes. Otherwise out of bounds + * accesses will occur. The data to be encrypted/decrypted is expected + * to be at the end of this 16-byte temporary buffer rather than the + * start. 
+ */ + +AES_FUNC_START(aes_xctr_encrypt) + ctr_encrypt 1 +AES_FUNC_END(aes_xctr_encrypt) + + + /* + * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, + * int bytes, u8 const rk2[], u8 iv[], int first) + * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, + * int bytes, u8 const rk2[], u8 iv[], int first) + */ + + .macro next_tweak, out, in, tmp + sshr \tmp\().2d, \in\().2d, #63 + and \tmp\().16b, \tmp\().16b, xtsmask.16b + add \out\().2d, \in\().2d, \in\().2d + ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 + eor \out\().16b, \out\().16b, \tmp\().16b + .endm + + .macro xts_load_mask, tmp + movi xtsmask.2s, #0x1 + movi \tmp\().2s, #0x87 + uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s + .endm + +AES_FUNC_START(aes_xts_encrypt) + frame_push 0 + + ld1 {v4.16b}, [x6] + xts_load_mask v8 + cbz w7, .Lxtsencnotfirst + + enc_prepare w3, x5, x8 + xts_cts_skip_tw w7, .LxtsencNx + encrypt_block v4, w3, x5, x8, w7 /* first tweak */ + enc_switch_key w3, x2, x8 + b .LxtsencNx + +.Lxtsencnotfirst: + enc_prepare w3, x2, x8 +.LxtsencloopNx: + next_tweak v4, v4, v8 +.LxtsencNx: + subs w4, w4, #64 + bmi .Lxtsenc1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ + next_tweak v5, v4, v8 + eor v0.16b, v0.16b, v4.16b + next_tweak v6, v5, v8 + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + next_tweak v7, v6, v8 + eor v3.16b, v3.16b, v7.16b + bl aes_encrypt_block4x + eor v3.16b, v3.16b, v7.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + st1 {v0.16b-v3.16b}, [x0], #64 + mov v4.16b, v7.16b + cbz w4, .Lxtsencret + xts_reload_mask v8 + b .LxtsencloopNx +.Lxtsenc1x: + adds w4, w4, #64 + beq .Lxtsencout + subs w4, w4, #16 + bmi .LxtsencctsNx +.Lxtsencloop: + ld1 {v0.16b}, [x1], #16 +.Lxtsencctsout: + eor v0.16b, v0.16b, v4.16b + encrypt_block v0, w3, x2, x8, w7 + eor v0.16b, v0.16b, v4.16b + cbz w4, .Lxtsencout + subs w4, w4, #16 + next_tweak v4, v4, v8 + bmi .Lxtsenccts + st1 {v0.16b}, [x0], #16 + b 
.Lxtsencloop +.Lxtsencout: + st1 {v0.16b}, [x0] +.Lxtsencret: + st1 {v4.16b}, [x6] + frame_pop + ret + +.LxtsencctsNx: + mov v0.16b, v3.16b + sub x0, x0, #16 +.Lxtsenccts: + adr_l x8, .Lcts_permute_table + + add x1, x1, w4, sxtw /* rewind input pointer */ + add w4, w4, #16 /* # bytes in final block */ + add x9, x8, #32 + add x8, x8, x4 + sub x9, x9, x4 + add x4, x0, x4 /* output address of final block */ + + ld1 {v1.16b}, [x1] /* load final block */ + ld1 {v2.16b}, [x8] + ld1 {v3.16b}, [x9] + + tbl v2.16b, {v0.16b}, v2.16b + tbx v0.16b, {v1.16b}, v3.16b + st1 {v2.16b}, [x4] /* overlapping stores */ + mov w4, wzr + b .Lxtsencctsout +AES_FUNC_END(aes_xts_encrypt) + +AES_FUNC_START(aes_xts_decrypt) + frame_push 0 + + /* subtract 16 bytes if we are doing CTS */ + sub w8, w4, #0x10 + tst w4, #0xf + csel w4, w4, w8, eq + + ld1 {v4.16b}, [x6] + xts_load_mask v8 + xts_cts_skip_tw w7, .Lxtsdecskiptw + cbz w7, .Lxtsdecnotfirst + + enc_prepare w3, x5, x8 + encrypt_block v4, w3, x5, x8, w7 /* first tweak */ +.Lxtsdecskiptw: + dec_prepare w3, x2, x8 + b .LxtsdecNx + +.Lxtsdecnotfirst: + dec_prepare w3, x2, x8 +.LxtsdecloopNx: + next_tweak v4, v4, v8 +.LxtsdecNx: + subs w4, w4, #64 + bmi .Lxtsdec1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ + next_tweak v5, v4, v8 + eor v0.16b, v0.16b, v4.16b + next_tweak v6, v5, v8 + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + next_tweak v7, v6, v8 + eor v3.16b, v3.16b, v7.16b + bl aes_decrypt_block4x + eor v3.16b, v3.16b, v7.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + st1 {v0.16b-v3.16b}, [x0], #64 + mov v4.16b, v7.16b + cbz w4, .Lxtsdecout + xts_reload_mask v8 + b .LxtsdecloopNx +.Lxtsdec1x: + adds w4, w4, #64 + beq .Lxtsdecout + subs w4, w4, #16 +.Lxtsdecloop: + ld1 {v0.16b}, [x1], #16 + bmi .Lxtsdeccts +.Lxtsdecctsout: + eor v0.16b, v0.16b, v4.16b + decrypt_block v0, w3, x2, x8, w7 + eor v0.16b, v0.16b, v4.16b + st1 {v0.16b}, [x0], #16 + cbz w4, .Lxtsdecout + subs 
w4, w4, #16 + next_tweak v4, v4, v8 + b .Lxtsdecloop +.Lxtsdecout: + st1 {v4.16b}, [x6] + frame_pop + ret + +.Lxtsdeccts: + adr_l x8, .Lcts_permute_table + + add x1, x1, w4, sxtw /* rewind input pointer */ + add w4, w4, #16 /* # bytes in final block */ + add x9, x8, #32 + add x8, x8, x4 + sub x9, x9, x4 + add x4, x0, x4 /* output address of final block */ + + next_tweak v5, v4, v8 + + ld1 {v1.16b}, [x1] /* load final block */ + ld1 {v2.16b}, [x8] + ld1 {v3.16b}, [x9] + + eor v0.16b, v0.16b, v5.16b + decrypt_block v0, w3, x2, x8, w7 + eor v0.16b, v0.16b, v5.16b + + tbl v2.16b, {v0.16b}, v2.16b + tbx v0.16b, {v1.16b}, v3.16b + + st1 {v2.16b}, [x4] /* overlapping stores */ + mov w4, wzr + b .Lxtsdecctsout +AES_FUNC_END(aes_xts_decrypt) + + /* + * aes_mac_update(u8 const in[], u32 const rk[], int rounds, + * int blocks, u8 dg[], int enc_before, int enc_after) + */ +AES_FUNC_START(aes_mac_update) + ld1 {v0.16b}, [x4] /* get dg */ + enc_prepare w2, x1, x7 + cbz w5, .Lmacloop4x + + encrypt_block v0, w2, x1, x7, w8 + +.Lmacloop4x: + subs w3, w3, #4 + bmi .Lmac1x + ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ + eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ + encrypt_block v0, w2, x1, x7, w8 + eor v0.16b, v0.16b, v2.16b + encrypt_block v0, w2, x1, x7, w8 + eor v0.16b, v0.16b, v3.16b + encrypt_block v0, w2, x1, x7, w8 + eor v0.16b, v0.16b, v4.16b + cmp w3, wzr + csinv w5, w6, wzr, eq + cbz w5, .Lmacout + encrypt_block v0, w2, x1, x7, w8 + st1 {v0.16b}, [x4] /* return dg */ + cond_yield .Lmacout, x7, x8 + b .Lmacloop4x +.Lmac1x: + add w3, w3, #4 +.Lmacloop: + cbz w3, .Lmacout + ld1 {v1.16b}, [x0], #16 /* get next pt block */ + eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ + + subs w3, w3, #1 + csinv w5, w6, wzr, eq + cbz w5, .Lmacout + +.Lmacenc: + encrypt_block v0, w2, x1, x7, w8 + b .Lmacloop + +.Lmacout: + st1 {v0.16b}, [x4] /* return dg */ + mov w0, w3 + ret +AES_FUNC_END(aes_mac_update) diff --git a/lib/crypto/arm64/aes-neon.S 
b/lib/crypto/arm64/aes-neon.S new file mode 100644 index 000000000000..f37b1dbd887f --- /dev/null +++ b/lib/crypto/arm64/aes-neon.S @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AES cipher for ARMv8 NEON + * + * Copyright (C) 2013 - 2017 Linaro Ltd. + */ + +#include +#include + +#define AES_FUNC_START(func) SYM_FUNC_START(neon_ ## func) +#define AES_FUNC_END(func) SYM_FUNC_END(neon_ ## func) + + xtsmask .req v7 + cbciv .req v7 + vctr .req v4 + + .macro xts_reload_mask, tmp + xts_load_mask \tmp + .endm + + /* special case for the neon-bs driver calling into this one for CTS */ + .macro xts_cts_skip_tw, reg, lbl + tbnz \reg, #1, \lbl + .endm + + /* multiply by polynomial 'x' in GF(2^8) */ + .macro mul_by_x, out, in, temp, const + sshr \temp, \in, #7 + shl \out, \in, #1 + and \temp, \temp, \const + eor \out, \out, \temp + .endm + + /* multiply by polynomial 'x^2' in GF(2^8) */ + .macro mul_by_x2, out, in, temp, const + ushr \temp, \in, #6 + shl \out, \in, #2 + pmul \temp, \temp, \const + eor \out, \out, \temp + .endm + + /* preload the entire Sbox */ + .macro prepare, sbox, shiftrows, temp + movi v12.16b, #0x1b + ldr_l q13, \shiftrows, \temp + ldr_l q14, .Lror32by8, \temp + adr_l \temp, \sbox + ld1 {v16.16b-v19.16b}, [\temp], #64 + ld1 {v20.16b-v23.16b}, [\temp], #64 + ld1 {v24.16b-v27.16b}, [\temp], #64 + ld1 {v28.16b-v31.16b}, [\temp] + .endm + + /* do preload for encryption */ + .macro enc_prepare, ignore0, ignore1, temp + prepare crypto_aes_sbox, .LForward_ShiftRows, \temp + .endm + + .macro enc_switch_key, ignore0, ignore1, temp + /* do nothing */ + .endm + + /* do preload for decryption */ + .macro dec_prepare, ignore0, ignore1, temp + prepare crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp + .endm + + /* apply SubBytes transformation using the preloaded Sbox */ + .macro sub_bytes, in + sub v9.16b, \in\().16b, v15.16b + tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b + sub v10.16b, v9.16b, v15.16b + tbx \in\().16b, {v20.16b-v23.16b}, 
v9.16b + sub v11.16b, v10.16b, v15.16b + tbx \in\().16b, {v24.16b-v27.16b}, v10.16b + tbx \in\().16b, {v28.16b-v31.16b}, v11.16b + .endm + + /* apply MixColumns transformation */ + .macro mix_columns, in, enc + .if \enc == 0 + /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ + mul_by_x2 v8.16b, \in\().16b, v9.16b, v12.16b + eor \in\().16b, \in\().16b, v8.16b + rev32 v8.8h, v8.8h + eor \in\().16b, \in\().16b, v8.16b + .endif + + mul_by_x v9.16b, \in\().16b, v8.16b, v12.16b + rev32 v8.8h, \in\().8h + eor v8.16b, v8.16b, v9.16b + eor \in\().16b, \in\().16b, v8.16b + tbl \in\().16b, {\in\().16b}, v14.16b + eor \in\().16b, \in\().16b, v8.16b + .endm + + .macro do_block, enc, in, rounds, rk, rkp, i + ld1 {v15.4s}, [\rk] + add \rkp, \rk, #16 + mov \i, \rounds +.La\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ + movi v15.16b, #0x40 + tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ + sub_bytes \in + sub \i, \i, #1 + ld1 {v15.4s}, [\rkp], #16 + cbz \i, .Lb\@ + mix_columns \in, \enc + b .La\@ +.Lb\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ + .endm + + .macro encrypt_block, in, rounds, rk, rkp, i + do_block 1, \in, \rounds, \rk, \rkp, \i + .endm + + .macro decrypt_block, in, rounds, rk, rkp, i + do_block 0, \in, \rounds, \rk, \rkp, \i + .endm + + /* + * Interleaved versions: functionally equivalent to the + * ones above, but applied to AES states in parallel. 
+ */ + + .macro sub_bytes_4x, in0, in1, in2, in3 + sub v8.16b, \in0\().16b, v15.16b + tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b + sub v9.16b, \in1\().16b, v15.16b + tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b + sub v10.16b, \in2\().16b, v15.16b + tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b + sub v11.16b, \in3\().16b, v15.16b + tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b + tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b + tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b + sub v8.16b, v8.16b, v15.16b + tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b + sub v9.16b, v9.16b, v15.16b + tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b + sub v10.16b, v10.16b, v15.16b + tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b + sub v11.16b, v11.16b, v15.16b + tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b + sub v8.16b, v8.16b, v15.16b + tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b + sub v9.16b, v9.16b, v15.16b + tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b + sub v10.16b, v10.16b, v15.16b + tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b + sub v11.16b, v11.16b, v15.16b + tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b + tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b + tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b + .endm + + .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const + sshr \tmp0\().16b, \in0\().16b, #7 + shl \out0\().16b, \in0\().16b, #1 + sshr \tmp1\().16b, \in1\().16b, #7 + and \tmp0\().16b, \tmp0\().16b, \const\().16b + shl \out1\().16b, \in1\().16b, #1 + and \tmp1\().16b, \tmp1\().16b, \const\().16b + eor \out0\().16b, \out0\().16b, \tmp0\().16b + eor \out1\().16b, \out1\().16b, \tmp1\().16b + .endm + + .macro mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const + ushr \tmp0\().16b, \in0\().16b, #6 + shl \out0\().16b, \in0\().16b, #2 + ushr \tmp1\().16b, \in1\().16b, #6 + pmul \tmp0\().16b, \tmp0\().16b, \const\().16b + shl \out1\().16b, \in1\().16b, #2 + pmul \tmp1\().16b, \tmp1\().16b, \const\().16b + eor \out0\().16b, \out0\().16b, \tmp0\().16b + eor \out1\().16b, \out1\().16b, 
\tmp1\().16b + .endm + + .macro mix_columns_2x, in0, in1, enc + .if \enc == 0 + /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ + mul_by_x2_2x v8, v9, \in0, \in1, v10, v11, v12 + eor \in0\().16b, \in0\().16b, v8.16b + rev32 v8.8h, v8.8h + eor \in1\().16b, \in1\().16b, v9.16b + rev32 v9.8h, v9.8h + eor \in0\().16b, \in0\().16b, v8.16b + eor \in1\().16b, \in1\().16b, v9.16b + .endif + + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v12 + rev32 v10.8h, \in0\().8h + rev32 v11.8h, \in1\().8h + eor v10.16b, v10.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor \in0\().16b, \in0\().16b, v10.16b + eor \in1\().16b, \in1\().16b, v11.16b + tbl \in0\().16b, {\in0\().16b}, v14.16b + tbl \in1\().16b, {\in1\().16b}, v14.16b + eor \in0\().16b, \in0\().16b, v10.16b + eor \in1\().16b, \in1\().16b, v11.16b + .endm + + .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i + ld1 {v15.4s}, [\rk] + add \rkp, \rk, #16 + mov \i, \rounds +.La\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ + eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ + eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ + movi v15.16b, #0x40 + tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ + tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ + tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ + tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ + sub_bytes_4x \in0, \in1, \in2, \in3 + sub \i, \i, #1 + ld1 {v15.4s}, [\rkp], #16 + cbz \i, .Lb\@ + mix_columns_2x \in0, \in1, \enc + mix_columns_2x \in2, \in3, \enc + b .La\@ +.Lb\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ + eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ + eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ + .endm + + .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i + do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i + .endm + + .macro 
decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i + do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i + .endm + +#include "aes-modes.S" + + .section ".rodata", "a" + .align 4 +.LForward_ShiftRows: + .octa 0x0b06010c07020d08030e09040f0a0500 + +.LReverse_ShiftRows: + .octa 0x0306090c0f0205080b0e0104070a0d00 + +.Lror32by8: + .octa 0x0c0f0e0d080b0a090407060500030201 diff --git a/lib/crypto/arm64/aes.h b/lib/crypto/arm64/aes.h index 63eea6271ef9..69f465c668f0 100644 --- a/lib/crypto/arm64/aes.h +++ b/lib/crypto/arm64/aes.h @@ -126,6 +126,36 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, } EXPORT_SYMBOL(ce_aes_expandkey); +#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) +EXPORT_SYMBOL_NS_GPL(neon_aes_ecb_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_ecb_decrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_cbc_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_cbc_decrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_cbc_cts_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_cbc_cts_decrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_ctr_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_xctr_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_xts_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_xts_decrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_essiv_cbc_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_essiv_cbc_decrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(neon_aes_mac_update, "CRYPTO_INTERNAL"); + +EXPORT_SYMBOL_NS_GPL(ce_aes_ecb_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_ecb_decrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_cbc_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_cbc_decrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_cbc_cts_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_cbc_cts_decrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_ctr_encrypt, 
"CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_xctr_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_xts_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_xts_decrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_essiv_cbc_encrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_essiv_cbc_decrypt, "CRYPTO_INTERNAL"); +EXPORT_SYMBOL_NS_GPL(ce_aes_mac_update, "CRYPTO_INTERNAL"); +#endif + static void aes_encrypt_arch(const struct aes_enckey *key, u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]) -- cgit v1.2.3 From 58286738b159ca93d41438a6ddcc2ea5333191b4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 18 Feb 2026 13:34:51 -0800 Subject: lib/crypto: arm64/aes: Migrate optimized CBC-based MACs into library Instead of exposing the arm64-optimized CMAC, XCBC-MAC, and CBC-MAC code via arm64-specific crypto_shash algorithms, instead just implement the aes_cbcmac_blocks_arch() library function. This is much simpler, it makes the corresponding library functions be arm64-optimized, and it fixes the longstanding issue where this optimized code was disabled by default. The corresponding algorithms still remain available through crypto_shash, but individual architectures no longer need to handle it. Note that to be compatible with the library using 'size_t' lengths, the type of the return value and 'blocks' parameter to the assembly functions had to be changed to 'size_t', and the assembly code had to be updated accordingly to use the corresponding 64-bit registers. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260218213501.136844-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/Kconfig | 2 +- arch/arm64/crypto/aes-glue.c | 214 +------------------------------------------ include/crypto/aes.h | 9 +- lib/crypto/arm64/aes-modes.S | 19 ++-- lib/crypto/arm64/aes.h | 48 +++++++++- 5 files changed, 61 insertions(+), 231 deletions(-) (limited to 'include') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 81ed892b3b72..82794afaffc9 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -144,7 +144,7 @@ config CRYPTO_AES_ARM64_CE_CCM select CRYPTO_ALGAPI select CRYPTO_AES_ARM64_CE_BLK select CRYPTO_AEAD - select CRYPTO_LIB_AES + select CRYPTO_LIB_AES_CBC_MACS help AEAD cipher: AES cipher algorithms (FIPS-197) with CCM (Counter with Cipher Block Chaining-Message Authentication Code) diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index fd7c3a560a71..85497cfe76d8 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -7,7 +7,6 @@ #include #include -#include #include #include #include @@ -37,7 +36,6 @@ #define aes_xctr_encrypt ce_aes_xctr_encrypt #define aes_xts_encrypt ce_aes_xts_encrypt #define aes_xts_decrypt ce_aes_xts_decrypt -#define aes_mac_update ce_aes_mac_update MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 Crypto Extensions"); #else #define MODE "neon" @@ -54,7 +52,6 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 Crypto Extensions"); #define aes_xctr_encrypt neon_aes_xctr_encrypt #define aes_xts_encrypt neon_aes_xts_encrypt #define aes_xts_decrypt neon_aes_xts_decrypt -#define aes_mac_update neon_aes_mac_update MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 NEON"); #endif #if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS) @@ -66,9 +63,6 @@ MODULE_ALIAS_CRYPTO("xctr(aes)"); #endif MODULE_ALIAS_CRYPTO("cts(cbc(aes))"); 
MODULE_ALIAS_CRYPTO("essiv(cbc(aes),sha256)"); -MODULE_ALIAS_CRYPTO("cmac(aes)"); -MODULE_ALIAS_CRYPTO("xcbc(aes)"); -MODULE_ALIAS_CRYPTO("cbcmac(aes)"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_IMPORT_NS("CRYPTO_INTERNAL"); @@ -84,15 +78,6 @@ struct crypto_aes_essiv_cbc_ctx { struct crypto_aes_ctx __aligned(8) key2; }; -struct mac_tfm_ctx { - struct crypto_aes_ctx key; - u8 __aligned(8) consts[]; -}; - -struct mac_desc_ctx { - u8 dg[AES_BLOCK_SIZE]; -}; - static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { @@ -723,211 +708,14 @@ static struct skcipher_alg aes_algs[] = { { .decrypt = essiv_cbc_decrypt, } }; -static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key, - unsigned int key_len) -{ - struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); - - return aes_expandkey(&ctx->key, in_key, key_len); -} - -static void cmac_gf128_mul_by_x(be128 *y, const be128 *x) -{ - u64 a = be64_to_cpu(x->a); - u64 b = be64_to_cpu(x->b); - - y->a = cpu_to_be64((a << 1) | (b >> 63)); - y->b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0)); -} - -static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key, - unsigned int key_len) -{ - struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); - be128 *consts = (be128 *)ctx->consts; - int rounds = 6 + key_len / 4; - int err; - - err = cbcmac_setkey(tfm, in_key, key_len); - if (err) - return err; - - /* encrypt the zero vector */ - scoped_ksimd() - aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, - ctx->key.key_enc, rounds, 1); - - cmac_gf128_mul_by_x(consts, consts); - cmac_gf128_mul_by_x(consts + 1, consts); - - return 0; -} - -static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key, - unsigned int key_len) -{ - static u8 const ks[3][AES_BLOCK_SIZE] = { - { [0 ... AES_BLOCK_SIZE - 1] = 0x1 }, - { [0 ... AES_BLOCK_SIZE - 1] = 0x2 }, - { [0 ... 
AES_BLOCK_SIZE - 1] = 0x3 }, - }; - - struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); - int rounds = 6 + key_len / 4; - u8 key[AES_BLOCK_SIZE]; - int err; - - err = cbcmac_setkey(tfm, in_key, key_len); - if (err) - return err; - - scoped_ksimd() { - aes_ecb_encrypt(key, ks[0], ctx->key.key_enc, rounds, 1); - aes_ecb_encrypt(ctx->consts, ks[1], ctx->key.key_enc, rounds, 2); - } - - return cbcmac_setkey(tfm, key, sizeof(key)); -} - -static int mac_init(struct shash_desc *desc) -{ - struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - - memset(ctx->dg, 0, AES_BLOCK_SIZE); - return 0; -} - -static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, - u8 dg[], int enc_before) -{ - int rounds = 6 + ctx->key_length / 4; - int rem; - - do { - scoped_ksimd() - rem = aes_mac_update(in, ctx->key_enc, rounds, blocks, - dg, enc_before, !enc_before); - in += (blocks - rem) * AES_BLOCK_SIZE; - blocks = rem; - } while (blocks); -} - -static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) -{ - struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - int blocks = len / AES_BLOCK_SIZE; - - len %= AES_BLOCK_SIZE; - mac_do_update(&tctx->key, p, blocks, ctx->dg, 0); - return len; -} - -static int cbcmac_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *out) -{ - struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - - if (len) { - crypto_xor(ctx->dg, src, len); - mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1); - } - memcpy(out, ctx->dg, AES_BLOCK_SIZE); - return 0; -} - -static int cmac_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out) -{ - struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - u8 *consts = tctx->consts; - - crypto_xor(ctx->dg, src, len); - if (len != AES_BLOCK_SIZE) { - ctx->dg[len] ^= 0x80; - consts += AES_BLOCK_SIZE; - } - 
mac_do_update(&tctx->key, consts, 1, ctx->dg, 0); - memcpy(out, ctx->dg, AES_BLOCK_SIZE); - return 0; -} - -static struct shash_alg mac_algs[] = { { - .base.cra_name = "cmac(aes)", - .base.cra_driver_name = "cmac-aes-" MODE, - .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINAL_NONZERO, - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + - 2 * AES_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = AES_BLOCK_SIZE, - .init = mac_init, - .update = mac_update, - .finup = cmac_finup, - .setkey = cmac_setkey, - .descsize = sizeof(struct mac_desc_ctx), -}, { - .base.cra_name = "xcbc(aes)", - .base.cra_driver_name = "xcbc-aes-" MODE, - .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINAL_NONZERO, - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + - 2 * AES_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = AES_BLOCK_SIZE, - .init = mac_init, - .update = mac_update, - .finup = cmac_finup, - .setkey = xcbc_setkey, - .descsize = sizeof(struct mac_desc_ctx), -}, { - .base.cra_name = "cbcmac(aes)", - .base.cra_driver_name = "cbcmac-aes-" MODE, - .base.cra_priority = PRIO, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = AES_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct mac_tfm_ctx), - .base.cra_module = THIS_MODULE, - - .digestsize = AES_BLOCK_SIZE, - .init = mac_init, - .update = mac_update, - .finup = cbcmac_finup, - .setkey = cbcmac_setkey, - .descsize = sizeof(struct mac_desc_ctx), -} }; - static void aes_exit(void) { - crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs)); crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } static int __init aes_init(void) { - int err; - - err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); - if (err) - return err; - - err = crypto_register_shashes(mac_algs, ARRAY_SIZE(mac_algs)); - if 
(err) - goto unregister_ciphers; - - return 0; - -unregister_ciphers: - crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); - return err; + return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } #ifdef USE_V8_CRYPTO_EXTENSIONS diff --git a/include/crypto/aes.h b/include/crypto/aes.h index 91bf4667d3e9..3feb4105c2a2 100644 --- a/include/crypto/aes.h +++ b/include/crypto/aes.h @@ -200,9 +200,6 @@ asmlinkage void neon_aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, int blocks, u8 iv[], u32 const rk2[]); -asmlinkage int neon_aes_mac_update(u8 const in[], u32 const rk[], int rounds, - int blocks, u8 dg[], int enc_before, - int enc_after); asmlinkage void ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, int blocks); @@ -233,9 +230,9 @@ asmlinkage void ce_aes_essiv_cbc_encrypt(u8 out[], u8 const in[], asmlinkage void ce_aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, int blocks, u8 iv[], u32 const rk2[]); -asmlinkage int ce_aes_mac_update(u8 const in[], u32 const rk[], int rounds, - int blocks, u8 dg[], int enc_before, - int enc_after); +asmlinkage size_t ce_aes_mac_update(u8 const in[], u32 const rk[], int rounds, + size_t blocks, u8 dg[], int enc_before, + int enc_after); #elif defined(CONFIG_PPC) void ppc_expand_key_128(u32 *key_enc, const u8 *key); void ppc_expand_key_192(u32 *key_enc, const u8 *key); diff --git a/lib/crypto/arm64/aes-modes.S b/lib/crypto/arm64/aes-modes.S index f4df6f84a3c7..fc89cd02b642 100644 --- a/lib/crypto/arm64/aes-modes.S +++ b/lib/crypto/arm64/aes-modes.S @@ -815,9 +815,11 @@ AES_FUNC_START(aes_xts_decrypt) b .Lxtsdecctsout AES_FUNC_END(aes_xts_decrypt) +#if IS_ENABLED(CONFIG_CRYPTO_LIB_AES_CBC_MACS) /* - * aes_mac_update(u8 const in[], u32 const rk[], int rounds, - * int blocks, u8 dg[], int enc_before, int enc_after) + * size_t aes_mac_update(u8 const in[], u32 const rk[], int rounds, + * size_t blocks, u8 dg[], int enc_before, + * int 
enc_after); */ AES_FUNC_START(aes_mac_update) ld1 {v0.16b}, [x4] /* get dg */ @@ -827,7 +829,7 @@ AES_FUNC_START(aes_mac_update) encrypt_block v0, w2, x1, x7, w8 .Lmacloop4x: - subs w3, w3, #4 + subs x3, x3, #4 bmi .Lmac1x ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ @@ -837,7 +839,7 @@ AES_FUNC_START(aes_mac_update) eor v0.16b, v0.16b, v3.16b encrypt_block v0, w2, x1, x7, w8 eor v0.16b, v0.16b, v4.16b - cmp w3, wzr + cmp x3, xzr csinv w5, w6, wzr, eq cbz w5, .Lmacout encrypt_block v0, w2, x1, x7, w8 @@ -845,13 +847,13 @@ AES_FUNC_START(aes_mac_update) cond_yield .Lmacout, x7, x8 b .Lmacloop4x .Lmac1x: - add w3, w3, #4 + add x3, x3, #4 .Lmacloop: - cbz w3, .Lmacout + cbz x3, .Lmacout ld1 {v1.16b}, [x0], #16 /* get next pt block */ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ - subs w3, w3, #1 + subs x3, x3, #1 csinv w5, w6, wzr, eq cbz w5, .Lmacout @@ -861,6 +863,7 @@ AES_FUNC_START(aes_mac_update) .Lmacout: st1 {v0.16b}, [x4] /* return dg */ - mov w0, w3 + mov x0, x3 ret AES_FUNC_END(aes_mac_update) +#endif /* CONFIG_CRYPTO_LIB_AES_CBC_MACS */ diff --git a/lib/crypto/arm64/aes.h b/lib/crypto/arm64/aes.h index 69f465c668f0..78e7b4e5f120 100644 --- a/lib/crypto/arm64/aes.h +++ b/lib/crypto/arm64/aes.h @@ -11,6 +11,7 @@ #include #include +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_aes); struct aes_block { @@ -28,6 +29,9 @@ asmlinkage void __aes_ce_decrypt(const u32 inv_rk[], u8 out[AES_BLOCK_SIZE], asmlinkage u32 __aes_ce_sub(u32 l); asmlinkage void __aes_ce_invert(struct aes_block *out, const struct aes_block *in); +asmlinkage size_t neon_aes_mac_update(u8 const in[], u32 const rk[], int rounds, + size_t blocks, u8 dg[], int enc_before, + int enc_after); /* * Expand an AES key using the crypto extensions if supported and usable or @@ -139,7 +143,6 @@ EXPORT_SYMBOL_NS_GPL(neon_aes_xts_encrypt, "CRYPTO_INTERNAL"); 
EXPORT_SYMBOL_NS_GPL(neon_aes_xts_decrypt, "CRYPTO_INTERNAL"); EXPORT_SYMBOL_NS_GPL(neon_aes_essiv_cbc_encrypt, "CRYPTO_INTERNAL"); EXPORT_SYMBOL_NS_GPL(neon_aes_essiv_cbc_decrypt, "CRYPTO_INTERNAL"); -EXPORT_SYMBOL_NS_GPL(neon_aes_mac_update, "CRYPTO_INTERNAL"); EXPORT_SYMBOL_NS_GPL(ce_aes_ecb_encrypt, "CRYPTO_INTERNAL"); EXPORT_SYMBOL_NS_GPL(ce_aes_ecb_decrypt, "CRYPTO_INTERNAL"); @@ -153,6 +156,8 @@ EXPORT_SYMBOL_NS_GPL(ce_aes_xts_encrypt, "CRYPTO_INTERNAL"); EXPORT_SYMBOL_NS_GPL(ce_aes_xts_decrypt, "CRYPTO_INTERNAL"); EXPORT_SYMBOL_NS_GPL(ce_aes_essiv_cbc_encrypt, "CRYPTO_INTERNAL"); EXPORT_SYMBOL_NS_GPL(ce_aes_essiv_cbc_decrypt, "CRYPTO_INTERNAL"); +#endif +#if IS_MODULE(CONFIG_CRYPTO_AES_ARM64_CE_CCM) EXPORT_SYMBOL_NS_GPL(ce_aes_mac_update, "CRYPTO_INTERNAL"); #endif @@ -184,11 +189,48 @@ static void aes_decrypt_arch(const struct aes_key *key, } } +#if IS_ENABLED(CONFIG_CRYPTO_LIB_AES_CBC_MACS) +#define aes_cbcmac_blocks_arch aes_cbcmac_blocks_arch +static bool aes_cbcmac_blocks_arch(u8 h[AES_BLOCK_SIZE], + const struct aes_enckey *key, const u8 *data, + size_t nblocks, bool enc_before, + bool enc_after) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + size_t rem; + + scoped_ksimd() { + if (static_branch_likely(&have_aes)) + rem = ce_aes_mac_update( + data, key->k.rndkeys, + key->nrounds, nblocks, h, + enc_before, enc_after); + else + rem = neon_aes_mac_update( + data, key->k.rndkeys, + key->nrounds, nblocks, h, + enc_before, enc_after); + } + data += (nblocks - rem) * AES_BLOCK_SIZE; + nblocks = rem; + enc_before = false; + } while (nblocks); + return true; + } + return false; +} +#endif /* CONFIG_CRYPTO_LIB_AES_CBC_MACS */ + #ifdef CONFIG_KERNEL_MODE_NEON #define aes_mod_init_arch aes_mod_init_arch static void aes_mod_init_arch(void) { - if (cpu_have_named_feature(AES)) - static_branch_enable(&have_aes); + if (cpu_have_named_feature(ASIMD)) { + static_branch_enable(&have_neon); + if 
(cpu_have_named_feature(AES)) + static_branch_enable(&have_aes); + } } #endif /* CONFIG_KERNEL_MODE_NEON */ -- cgit v1.2.3 From aca086ff27c3f67e81617e4b063d1126544a4f19 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:17:58 +0100 Subject: sed-opal: add IOC_OPAL_REACTIVATE_LSP. This adds the 'Reactivate' method as described in the "TCG Storage Opal SSC Feature Set: Single User Mode" document (ch. 3.1.1.1). The method enables switching an already active SED OPAL2 device, with appropriate firmware support for Single User Mode (SUM), to or from SUM. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + block/sed-opal.c | 99 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 14 ++++++ 4 files changed, 115 insertions(+) (limited to 'include') diff --git a/block/opal_proto.h b/block/opal_proto.h index 3ccee5977c10..d138785b8198 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -155,6 +155,7 @@ enum opal_method { OPAL_AUTHENTICATE, OPAL_RANDOM, OPAL_ERASE, + OPAL_REACTIVATE, }; enum opal_token { diff --git a/block/sed-opal.c b/block/sed-opal.c index 83bee47aa29f..5d06f5f433bf 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -220,6 +220,8 @@ static const u8 opalmethod[][OPAL_METHOD_LENGTH] = { { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, [OPAL_ERASE] = { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, + [OPAL_REACTIVATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x01 }, }; static int end_opal_session_error(struct opal_dev *dev); @@ -2287,6 +2289,74 @@ static int activate_lsp(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int reactivate_lsp(struct opal_dev *dev, void *data) +{ + struct opal_lr_react *opal_react = data; + u8 user_lr[OPAL_UID_LENGTH]; + int err, i; + + err = cmd_start(dev, opaluid[OPAL_THISSP_UID], + 
opalmethod[OPAL_REACTIVATE]); + + if (err) { + pr_debug("Error building Reactivate LockingSP command.\n"); + return err; + } + + /* + * If neither 'entire_table' nor 'num_lrs' is set, the device + * gets reactivated with SUM disabled. Only Admin1PIN will change + * if set. + */ + if (opal_react->entire_table) { + /* Entire Locking table (all locking ranges) will be put in SUM. */ + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + } else if (opal_react->num_lrs) { + /* Subset of Locking table (selected locking range(s)) to be put in SUM */ + err = build_locking_range(user_lr, sizeof(user_lr), + opal_react->lr[0]); + if (err) + return err; + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + for (i = 1; i < opal_react->num_lrs; i++) { + user_lr[7] = opal_react->lr[i]; + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + } + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + /* Skipping the rangle policy parameter is same as setting its value to zero */ + if (opal_react->range_policy && (opal_react->num_lrs || opal_react->entire_table)) { + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_RANGE_POLICY); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + /* + * Optional parameter. If set, it changes the Admin1 PIN even when SUM + * is being disabled. 
+ */ + if (opal_react->new_admin_key.key_len) { + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_ADMIN1_PIN); + add_token_bytestring(&err, dev, opal_react->new_admin_key.key, + opal_react->new_admin_key.key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + return finalize_and_send(dev, parse_and_check_status); +} + /* Determine if we're in the Manufactured Inactive or Active state */ static int get_lsp_lifecycle(struct opal_dev *dev, void *data) { @@ -2957,6 +3027,32 @@ static int opal_activate_lsp(struct opal_dev *dev, return ret; } +static int opal_reactivate_lsp(struct opal_dev *dev, + struct opal_lr_react *opal_lr_react) +{ + const struct opal_step active_steps[] = { + { start_admin1LSP_opal_session, &opal_lr_react->key }, + { reactivate_lsp, opal_lr_react }, + /* No end_opal_session. The controller terminates the session */ + }; + int ret; + + /* use either 'entire_table' parameter or set of locking ranges */ + if (opal_lr_react->num_lrs > OPAL_MAX_LRS || + (opal_lr_react->num_lrs && opal_lr_react->entire_table)) + return -EINVAL; + + ret = opal_get_key(dev, &opal_lr_react->key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_setup_locking_range(struct opal_dev *dev, struct opal_user_lr_setup *opal_lrs) { @@ -3315,6 +3411,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_SET_SID_PW: ret = opal_set_new_sid_pw(dev, p); break; + case IOC_OPAL_REACTIVATE_LSP: + ret = opal_reactivate_lsp(dev, p); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 80f33a93f944..2ae5e6b0ac21 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -53,6 +53,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_DISCOVERY: case IOC_OPAL_REVERT_LSP: case 
IOC_OPAL_SET_SID_PW: + case IOC_OPAL_REACTIVATE_LSP: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 9025dd5a4f0f..d03e590b6501 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -74,6 +74,19 @@ struct opal_lr_act { __u8 align[2]; /* Align to 8 byte boundary */ }; +struct opal_lr_react { + struct opal_key key; + struct opal_key new_admin_key; /* Set new Admin1 PIN if key_len is > 0 */ + __u8 num_lrs; /* + * Configure selected ranges (from lr[]) in SUM. + * If num_lrs > 0 the 'entire_table' must be 0 + */ + __u8 lr[OPAL_MAX_LRS]; + __u8 range_policy; /* Set RangeStartRangeLengthPolicy parameter */ + __u8 entire_table; /* Set all locking objects in SUM */ + __u8 align[4]; /* Align to 8 byte boundary */ +}; + struct opal_session_info { __u32 sum; __u32 who; @@ -216,5 +229,6 @@ struct opal_revert_lsp { #define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) #define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) +#define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 8e3d34a7ce7386b01947dd649bd24775544e4d3e Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:00 +0100 Subject: sed-opal: add IOC_OPAL_LR_SET_START_LEN ioctl. This ioctl is used to set up locking range start (offset) and locking range length attributes only. In Single User Mode (SUM), if the RangeStartRangeLengthPolicy parameter is set in the 'Reactivate' method, only Admin authority maintains the locking range length and start (offset) attributes of Locking objects set up for SUM. All other attributes from struct opal_user_lr_setup (RLE - read locking enabled, WLE - write locking enabled) shall remain in possession of the User authority associated with the Locking object set for SUM. 
Therefore, we need a separate function for setting up locking range start and locking range length because it may require two different authorities (and sessions) if the RangeStartRangeLengthPolicy attribute is set. With the IOC_OPAL_LR_SET_START_LEN ioctl, the opal_user_lr_setup members 'RLE' and 'WLE' of the ioctl argument are ignored. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 28 ++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 1 + 3 files changed, 30 insertions(+) (limited to 'include') diff --git a/block/sed-opal.c b/block/sed-opal.c index 7be72f621952..55c8a0953d78 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -3091,6 +3091,31 @@ static int opal_setup_locking_range(struct opal_dev *dev, return ret; } +static int opal_setup_locking_range_start_length(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_locking_range_start_length, opal_lrs }, + { end_opal_session, } + }; + int ret; + + /* we can not set global locking range offset or length */ + if (opal_lrs->session.opal_key.lr == 0) + return -EINVAL; + + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_locking_range_status(struct opal_dev *dev, struct opal_lr_status *opal_lrst, void __user *data) @@ -3431,6 +3456,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_REACTIVATE_LSP: ret = opal_reactivate_lsp(dev, p); break; + case IOC_OPAL_LR_SET_START_LEN: + ret = opal_setup_locking_range_start_length(dev, p); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 2ae5e6b0ac21..a0df6819b0a9 
100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -54,6 +54,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_REVERT_LSP: case IOC_OPAL_SET_SID_PW: case IOC_OPAL_REACTIVATE_LSP: + case IOC_OPAL_LR_SET_START_LEN: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index d03e590b6501..82de38f3fbeb 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -230,5 +230,6 @@ struct opal_revert_lsp { #define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) #define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) +#define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From a441a9d22433fea561de131e27fff41715c2d186 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:01 +0100 Subject: sed-opal: add IOC_OPAL_ENABLE_DISABLE_LR. This ioctl is used to set up RLE (read lock enabled) and WLE (write lock enabled) parameters of the Locking object. In Single User Mode (SUM), if the RangeStartRangeLengthPolicy parameter is set in the 'Reactivate' method, only Admin authority maintains the locking range length and start (offset) attributes of Locking objects set up for SUM. All other attributes from struct opal_user_lr_setup (RLE - read locking enabled, WLE - write locking enabled) shall remain in possession of the User authority associated with the Locking object set for SUM. With the IOC_OPAL_ENABLE_DISABLE_LR ioctl, the opal_user_lr_setup members 'range_start' and 'range_length' of the ioctl argument are ignored. 
Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 24 ++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 1 + 3 files changed, 26 insertions(+) (limited to 'include') diff --git a/block/sed-opal.c b/block/sed-opal.c index 55c8a0953d78..53a73422911e 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -3116,6 +3116,27 @@ static int opal_setup_locking_range_start_length(struct opal_dev *dev, return ret; } +static int opal_enable_disable_range(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_enable_range, opal_lrs }, + { end_opal_session, } + }; + int ret; + + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_locking_range_status(struct opal_dev *dev, struct opal_lr_status *opal_lrst, void __user *data) @@ -3459,6 +3480,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_LR_SET_START_LEN: ret = opal_setup_locking_range_start_length(dev, p); break; + case IOC_OPAL_ENABLE_DISABLE_LR: + ret = opal_enable_disable_range(dev, p); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index a0df6819b0a9..1d63479838cf 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -55,6 +55,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_SET_SID_PW: case IOC_OPAL_REACTIVATE_LSP: case IOC_OPAL_LR_SET_START_LEN: + case IOC_OPAL_ENABLE_DISABLE_LR: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 82de38f3fbeb..bde023ae2295 100644 --- a/include/uapi/linux/sed-opal.h +++ 
b/include/uapi/linux/sed-opal.h @@ -231,5 +231,6 @@ struct opal_revert_lsp { #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) #define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) #define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) +#define IOC_OPAL_ENABLE_DISABLE_LR _IOW('p', 244, struct opal_user_lr_setup) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 0cc9293bccb234552b81c3ebc074f5839f019e01 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:03 +0100 Subject: sed-opal: add IOC_OPAL_GET_SUM_STATUS ioctl. This adds a function for retrieving the set of Locking objects enabled for Single User Mode (SUM) and the value of the RangeStartRangeLengthPolicy parameter. It retrieves data from the LockingInfo table, specifically the columns SingleUserModeRanges and RangeStartLengthPolicy, which were added according to the TCG Opal Feature Set: Single User Mode, as described in chapters 4.4.3.1 and 4.4.3.2. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 159 ++++++++++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 13 ++++ 3 files changed, 173 insertions(+) (limited to 'include') diff --git a/block/sed-opal.c b/block/sed-opal.c index 6146a1b30421..c34d19e91201 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1757,6 +1757,12 @@ static int start_anybodyASP_opal_session(struct opal_dev *dev, void *data) OPAL_ADMINSP_UID, NULL, 0); } +static int start_anybodyLSP_opal_session(struct opal_dev *dev, void *data) +{ + return start_generic_opal_session(dev, OPAL_ANYBODY_UID, + OPAL_LOCKINGSP_UID, NULL, 0); +} + static int start_SIDASP_opal_session(struct opal_dev *dev, void *data) { int ret; @@ -3389,6 +3395,156 @@ static int opal_get_geometry(struct opal_dev *dev, void __user *data) return 0; } +static int get_sum_ranges(struct opal_dev *dev, void *data) +{ + const char *lr_uid; + size_t 
lr_uid_len; + u64 val; + const struct opal_resp_tok *tok; + int err, tok_n = 2; + struct opal_sum_ranges *sranges = data; + const __u8 lr_all[OPAL_MAX_LRS] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; + + err = generic_get_columns(dev, opaluid[OPAL_LOCKING_INFO_TABLE], OPAL_SUM_SET_LIST, + OPAL_SUM_RANGE_POLICY); + if (err) { + pr_debug("Couldn't get locking info table columns %d to %d.\n", + OPAL_SUM_SET_LIST, OPAL_SUM_RANGE_POLICY); + return err; + } + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_STARTNAME)) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + tok_n++; + + if (response_get_u64(&dev->parsed, tok_n) != OPAL_SUM_SET_LIST) { + pr_debug("Token %d does not match expected column %u.\n", + tok_n, OPAL_SUM_SET_LIST); + return OPAL_INVAL_PARAM; + } + tok_n++; + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + /* + * The OPAL_SUM_SET_LIST response contains two distinct values: + * + * - the list of individual locking ranges (UIDs) put in SUM. The list + * may also be empty signaling the SUM is disabled. + * + * - the Locking table UID if the entire Locking table is put in SUM. 
+ */ + if (response_token_matches(tok, OPAL_STARTLIST)) { + sranges->num_lrs = 0; + + tok_n++; + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + while (!response_token_matches(tok, OPAL_ENDLIST)) { + lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid); + if (lr_uid_len != OPAL_UID_LENGTH) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + + if (memcmp(lr_uid, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH)) { + if (lr_uid[5] != LOCKING_RANGE_NON_GLOBAL) { + pr_debug("Unexpected byte %d at LR UUID position 5.\n", + lr_uid[5]); + return OPAL_INVAL_PARAM; + } + sranges->lr[sranges->num_lrs++] = lr_uid[7]; + } else + sranges->lr[sranges->num_lrs++] = 0; + + tok_n++; + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + } + } else { + /* Only OPAL_LOCKING_TABLE UID is an alternative to OPAL_STARTLIST here. */ + lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid); + if (lr_uid_len != OPAL_UID_LENGTH) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + + if (memcmp(lr_uid, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH)) { + pr_debug("Unexpected response UID.\n"); + return OPAL_INVAL_PARAM; + } + + /* sed-opal kernel API already provides following limit in Activate command */ + sranges->num_lrs = OPAL_MAX_LRS; + memcpy(sranges->lr, lr_all, OPAL_MAX_LRS); + } + tok_n++; + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_ENDNAME)) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + tok_n++; + + err = response_get_column(&dev->parsed, &tok_n, OPAL_SUM_RANGE_POLICY, &val); + if (err) + return err; + + sranges->range_policy = val ? 
1 : 0; + + return 0; +} + +static int opal_get_sum_ranges(struct opal_dev *dev, struct opal_sum_ranges *opal_sum_rngs, + void __user *data) +{ + const struct opal_step admin_steps[] = { + { start_admin1LSP_opal_session, &opal_sum_rngs->key }, + { get_sum_ranges, opal_sum_rngs }, + { end_opal_session, } + }, anybody_steps[] = { + { start_anybodyLSP_opal_session, NULL }, + { get_sum_ranges, opal_sum_rngs }, + { end_opal_session, } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + if (opal_sum_rngs->key.key_len) + /* Use Admin1 session (authenticated by PIN) to retrieve LockingInfo columns */ + ret = execute_steps(dev, admin_steps, ARRAY_SIZE(admin_steps)); + else + /* Use Anybody session (no key) to retrieve LockingInfo columns */ + ret = execute_steps(dev, anybody_steps, ARRAY_SIZE(anybody_steps)); + mutex_unlock(&dev->dev_lock); + + /* skip session info when copying back to uspace */ + if (!ret && copy_to_user(data + offsetof(struct opal_sum_ranges, num_lrs), + (void *)opal_sum_rngs + offsetof(struct opal_sum_ranges, num_lrs), + sizeof(*opal_sum_rngs) - offsetof(struct opal_sum_ranges, num_lrs))) { + pr_debug("Error copying SUM ranges info to userspace\n"); + return -EFAULT; + } + + return ret; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -3483,6 +3639,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_ENABLE_DISABLE_LR: ret = opal_enable_disable_range(dev, p); break; + case IOC_OPAL_GET_SUM_STATUS: + ret = opal_get_sum_ranges(dev, p, arg); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 1d63479838cf..aa006edb612b 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -56,6 +56,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_REACTIVATE_LSP: case IOC_OPAL_LR_SET_START_LEN: case IOC_OPAL_ENABLE_DISABLE_LR: + case IOC_OPAL_GET_SUM_STATUS: return true; } return false; diff 
--git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index bde023ae2295..9830298ec51c 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -111,6 +111,18 @@ struct opal_lr_status { __u8 align[4]; }; +struct opal_sum_ranges { + /* + * Initiate Admin1 session if key_len > 0, + * use Anybody session otherwise. + */ + struct opal_key key; + __u8 num_lrs; + __u8 lr[OPAL_MAX_LRS]; + __u8 range_policy; + __u8 align[5]; /* Align to 8 byte boundary */ +}; + struct opal_lock_unlock { struct opal_session_info session; __u32 l_state; @@ -232,5 +244,6 @@ struct opal_revert_lsp { #define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) #define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) #define IOC_OPAL_ENABLE_DISABLE_LR _IOW('p', 244, struct opal_user_lr_setup) +#define IOC_OPAL_GET_SUM_STATUS _IOW('p', 245, struct opal_sum_ranges) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 0ee8ab5d4dc51704be1157470f3df8090629f9fc Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Wed, 25 Feb 2026 20:51:05 +0000 Subject: block: annotate struct request_queue with __counted_by_ptr The queue_hw_ctx field in struct request_queue is an array of pointers to struct blk_mq_hw_ctx. The number of elements in this array is tracked by the nr_hw_queues field. The array is allocated in __blk_mq_realloc_hw_ctxs() using kcalloc_node() with set->nr_hw_queues elements. q->nr_hw_queues is subsequently updated to set->nr_hw_queues. When growing the array, the new array is assigned to queue_hw_ctx before nr_hw_queues is updated. This is safe because nr_hw_queues (the old smaller count) is used for bounds checking, which is within the new larger allocation. When shrinking the array, nr_hw_queues is updated to the smaller value, while queue_hw_ctx retains the larger allocation. This is also safe as the count is within the allocation bounds. 
Annotating queue_hw_ctx with __counted_by_ptr(nr_hw_queues) allows the compiler (with kSAN) to verify that accesses to queue_hw_ctx are within the valid range defined by nr_hw_queues. This patch was generated by CodeMender and reviewed by Bill Wendling. Tested by running blktests. Reviewed-by: Daniel Wagner Signed-off-by: Bill Wendling [axboe: massage commit message] Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d463b9b5a0a5..540c2c6c9afd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -502,7 +502,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct blk_mq_hw_ctx * __rcu *queue_hw_ctx; + struct blk_mq_hw_ctx * __rcu *queue_hw_ctx __counted_by_ptr(nr_hw_queues); struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; -- cgit v1.2.3 From b7cbc30e93e3a64ea058230f6d0c764d6d80276f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:48 +0900 Subject: block: rename struct gendisk zone_wplugs_lock field Rename struct gendisk zone_wplugs_lock field to zone_wplugs_hash_lock to clearly indicate that this is the spinlock used for manipulating the hash table of zone write plugs. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-zoned.c | 23 ++++++++++++----------- include/linux/blkdev.h | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 26c2aa79faf6..78810e726222 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -514,10 +514,11 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, * are racing with other submission context, so we may already have a * zone write plug for the same zone. 
*/ - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { if (zwplg->zone_no == zwplug->zone_no) { - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, + flags); return false; } } @@ -529,7 +530,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, * necessarilly in the active condition. */ zones_cond = rcu_dereference_check(disk->zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)); + lockdep_is_held(&disk->zone_wplugs_hash_lock)); if (zones_cond) zwplug->cond = zones_cond[zwplug->zone_no]; else @@ -537,7 +538,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); atomic_inc(&disk->nr_zone_wplugs); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); return true; } @@ -590,13 +591,13 @@ static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug) WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); blk_zone_set_cond(rcu_dereference_check(disk->zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)), + lockdep_is_held(&disk->zone_wplugs_hash_lock)), zwplug->zone_no, zwplug->cond); hlist_del_init_rcu(&zwplug->node); atomic_dec(&disk->nr_zone_wplugs); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); } @@ -1739,7 +1740,7 @@ put_zwplug: void disk_init_zone_resources(struct gendisk *disk) { - spin_lock_init(&disk->zone_wplugs_lock); + spin_lock_init(&disk->zone_wplugs_hash_lock); } /* @@ -1829,10 +1830,10 @@ static void disk_set_zones_cond_array(struct 
gendisk *disk, u8 *zones_cond) { unsigned long flags; - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + lockdep_is_held(&disk->zone_wplugs_hash_lock)); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); kfree_rcu_mightsleep(zones_cond); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 540c2c6c9afd..a49a1e38c6e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -200,7 +200,7 @@ struct gendisk { u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; - spinlock_t zone_wplugs_lock; + spinlock_t zone_wplugs_hash_lock; struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; -- cgit v1.2.3 From 1365b6904fd050bf22ab9f3df375a396de5837a1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:49 +0900 Subject: block: allow submitting all zone writes from a single context In order to maintain sequential write patterns per zone with zoned block devices, zone write plugging issues only a single write BIO per zone at any time. This works well but has the side effect that when large sequential write streams are issued by the user and these streams cross zone boundaries, the device ends up receiving a discontiguous set of write commands for different zones. The same also happens when a user writes simultaneously at high queue depth multiple zones: the device does not see all sequential writes per zone and receives discontiguous writes to different zones. 
While this does not affect the performance of solid state zoned block devices, when using an SMR HDD, this pattern change from sequential writes to discontiguous writes to different zones significantly increases head seek which results in degraded write throughput. In order to reduce this seek overhead for rotational media devices, introduce a per disk zone write plugs kernel thread to issue all write BIOs to zones. This single zone write issuing context is enabled for any zoned block device that has a request queue flagged with the new QUEUE_ZONED_QD1_WRITES flag. The flag QUEUE_ZONED_QD1_WRITES is visible as the sysfs queue attribute zoned_qd1_writes for zoned devices. For regular block devices, this attribute is not visible. For zoned block devices, a user can override the default value set to force the global write maximum queue depth of 1 for a zoned block device, or clear this attribute to fall back to the default behavior of zone write plugging which limits writes to QD=1 per sequential zone. Writing to a zoned block device flagged with QUEUE_ZONED_QD1_WRITES is implemented using a list of zone write plugs that have a non-empty BIO list. Listed zone write plugs are processed by the disk zone write plugs worker kthread in FIFO order, and all BIOs of a zone write plug are all processed before switching to the next listed zone write plug. A newly submitted BIO for a non-FULL zone write plug that is not yet listed causes the addition of the zone write plug at the end of the disk list of zone write plugs. Since the write BIOs queued in a zone write plug BIO list are necessarily sequential, for rotational media, using the single zone write plugs kthread to issue all BIOs maintains a sequential write pattern and thus reduces seek overhead and improves write throughput. This processing essentially results in always writing to HDDs at QD=1, which is not an issue for HDDs operating with write caching enabled. 
Performance with write cache disabled is also not degraded thanks to the efficient write handling of modern SMR HDDs. A disk list of zone write plugs is defined using the new struct gendisk zone_wplugs_list, and accesses to this list is protected using the zone_wplugs_list_lock spinlock. The per disk kthread (zone_wplugs_worker) code is implemented by the function disk_zone_wplugs_worker(). A reference on listed zone write plugs is always held until all BIOs of the zone write plug are processed by the worker kthread. BIO issuing at QD=1 is driven using a completion structure (zone_wplugs_worker_bio_done) and calls to blk_io_wait(). With this change, performance when sequentially writing the zones of a 30 TB SMR SATA HDD connected to an AHCI adapter changes as follows (1MiB direct I/Os, results in MB/s unit): +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Sequential write | Baseline | Patched | | Queue Depth | 6.19-rc8 | | +------------------+----------+---------+ | 1 | 244 | 245 | | 2 | 244 | 245 | | 4 | 245 | 245 | | 8 | 242 | 245 | | 16 | 222 | 246 | | 32 | 211 | 245 | | 64 | 193 | 244 | | 128 | 112 | 246 | +------------------+----------+---------+ With the current code (baseline), as the sequential write stream crosses a zone boundary, higher queue depth creates a gap between the last IO to the previous zone and the first IOs to the following zones, causing head seeks and degrading performance. Using the disk zone write plugs worker thread, this pattern disappears and the maximum throughput of the drive is maintained, leading to over 100% improvements in throughput for high queue depth write. Using 16 fio jobs all writing to randomly chosen zones at QD=32 with 1 MiB direct IOs, write throughput also increases significantly. 
+--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Random write | Baseline | Patched | | Number of zones | 6.19-rc7 | | +------------------+----------+---------+ | 1 | 191 | 192 | | 2 | 101 | 128 | | 4 | 115 | 123 | | 8 | 90 | 120 | | 16 | 64 | 115 | | 32 | 58 | 105 | | 64 | 56 | 101 | | 128 | 55 | 99 | +------------------+----------+---------+ Tests using XFS shows that buffered write speed with 8 jobs writing files increases by 12% to 35% depending on the workload. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Workload | Baseline | Patched | | | 6.19-rc7 | | +------------------+----------+---------+ | 256MiB file size | 212 | 238 | +------------------+----------+---------+ | 4MiB .. 128 MiB | 213 | 243 | | random file size | | | +------------------+----------+---------+ | 2MiB .. 8 MiB | 179 | 242 | | random file size | | | +------------------+----------+---------+ Performance gains are even more significant when using an HBA that limits the maximum size of commands to a small value, e.g. HBAs controlled with the mpi3mr driver limit commands to a maximum of 1 MiB. In such case, the write throughput gains are over 40%. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Workload | Baseline | Patched | | | 6.19-rc7 | | +------------------+----------+---------+ | 256MiB file size | 175 | 245 | +------------------+----------+---------+ | 4MiB .. 128 MiB | 174 | 244 | | random file size | | | +------------------+----------+---------+ | 2MiB .. 
8 MiB | 171 | 243 | | random file size | | | +------------------+----------+---------+ Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 + block/blk-sysfs.c | 35 ++++++++- block/blk-zoned.c | 190 +++++++++++++++++++++++++++++++++++++++++++------ include/linux/blkdev.h | 8 +++ 4 files changed, 212 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 28167c9baa55..047ec887456b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -97,6 +97,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), QUEUE_FLAG_NAME(BIO_ISSUE_TIME), + QUEUE_FLAG_NAME(ZONED_QD1_WRITES), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 55a1bbfef7d4..ca8033e6d699 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -390,6 +390,36 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) return queue_var_show(disk_nr_zones(disk), page); } +static ssize_t queue_zoned_qd1_writes_show(struct gendisk *disk, char *page) +{ + return queue_var_show(!!blk_queue_zoned_qd1_writes(disk->queue), + page); +} + +static ssize_t queue_zoned_qd1_writes_store(struct gendisk *disk, + const char *page, size_t count) +{ + struct request_queue *q = disk->queue; + unsigned long qd1_writes; + unsigned int memflags; + ssize_t ret; + + ret = queue_var_store(&qd1_writes, page, count); + if (ret < 0) + return ret; + + memflags = blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + if (qd1_writes) + blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q); + else + blk_queue_flag_clear(QUEUE_FLAG_ZONED_QD1_WRITES, q); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q, memflags); + + return count; +} + static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) { return 
queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); @@ -617,6 +647,7 @@ QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_RW_ENTRY(queue_zoned_qd1_writes, "zoned_qd1_writes"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); @@ -754,6 +785,7 @@ static struct attribute *queue_attrs[] = { &queue_nomerges_entry.attr, &queue_poll_entry.attr, &queue_poll_delay_entry.attr, + &queue_zoned_qd1_writes_entry.attr, NULL, }; @@ -786,7 +818,8 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, struct request_queue *q = disk->queue; if ((attr == &queue_max_open_zones_entry.attr || - attr == &queue_max_active_zones_entry.attr) && + attr == &queue_max_active_zones_entry.attr || + attr == &queue_zoned_qd1_writes_entry.attr) && !blk_queue_is_zoned(q)) return 0; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 78810e726222..e1a23c8b676d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include @@ -40,6 +42,8 @@ static const char *const zone_cond_name[] = { /* * Per-zone write plug. * @node: hlist_node structure for managing the plug using a hash table. + * @entry: list_head structure for listing the plug in the disk list of active + * zone write plugs. * @bio_list: The list of BIOs that are currently plugged. * @bio_work: Work struct to handle issuing of plugged BIOs * @rcu_head: RCU head to free zone write plugs with an RCU grace period. 
@@ -62,6 +66,7 @@ static const char *const zone_cond_name[] = { */ struct blk_zone_wplug { struct hlist_node node; + struct list_head entry; struct bio_list bio_list; struct work_struct bio_work; struct rcu_head rcu_head; @@ -623,7 +628,19 @@ static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug) } } -static void blk_zone_wplug_bio_work(struct work_struct *work); +static bool disk_zone_wplug_submit_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug); + +static void blk_zone_wplug_bio_work(struct work_struct *work) +{ + struct blk_zone_wplug *zwplug = + container_of(work, struct blk_zone_wplug, bio_work); + + disk_zone_wplug_submit_bio(zwplug->disk, zwplug); + + /* Drop the reference we took in disk_zone_wplug_schedule_work(). */ + disk_put_zone_wplug(zwplug); +} /* * Get a zone write plug for the zone containing @sector. @@ -658,6 +675,7 @@ again: zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); bio_list_init(&zwplug->bio_list); INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); + INIT_LIST_HEAD(&zwplug->entry); zwplug->disk = disk; /* @@ -690,6 +708,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, */ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { + struct gendisk *disk = zwplug->disk; struct bio *bio; lockdep_assert_held(&zwplug->lock); @@ -703,6 +722,20 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) blk_zone_wplug_bio_io_error(zwplug, bio); zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + /* + * If we are using the per disk zone write plugs worker thread, remove + * the zone write plug from the work list and drop the reference we + * took when the zone write plug was added to that list. 
+ */ + if (blk_queue_zoned_qd1_writes(disk->queue)) { + spin_lock(&disk->zone_wplugs_list_lock); + if (!list_empty(&zwplug->entry)) { + list_del_init(&zwplug->entry); + disk_put_zone_wplug(zwplug); + } + spin_unlock(&disk->zone_wplugs_list_lock); + } } /* @@ -1137,8 +1170,8 @@ void blk_zone_mgmt_bio_endio(struct bio *bio) } } -static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, - struct blk_zone_wplug *zwplug) +static void disk_zone_wplug_schedule_work(struct gendisk *disk, + struct blk_zone_wplug *zwplug) { lockdep_assert_held(&zwplug->lock); @@ -1151,6 +1184,7 @@ static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, * and we also drop this reference if the work is already scheduled. */ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue)); refcount_inc(&zwplug->ref); if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work)) disk_put_zone_wplug(zwplug); @@ -1190,6 +1224,22 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, bio_list_add(&zwplug->bio_list, bio); trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, bio->bi_iter.bi_sector, bio_sectors(bio)); + + /* + * If we are using the disk zone write plugs worker instead of the per + * zone write plug BIO work, add the zone write plug to the work list + * if it is not already there. Make sure to also get an extra reference + * on the zone write plug so that it does not go away until it is + * removed from the work list. 
+ */ + if (blk_queue_zoned_qd1_writes(disk->queue)) { + spin_lock(&disk->zone_wplugs_list_lock); + if (list_empty(&zwplug->entry)) { + list_add_tail(&zwplug->entry, &disk->zone_wplugs_list); + refcount_inc(&zwplug->ref); + } + spin_unlock(&disk->zone_wplugs_list_lock); + } } /* @@ -1423,6 +1473,13 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) goto queue_bio; } + /* + * For rotational devices, we will use the gendisk zone write plugs + * work instead of the per zone write plug BIO work, so queue the BIO. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) + goto queue_bio; + /* If the zone is already plugged, add the BIO to the BIO plug list. */ if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) goto queue_bio; @@ -1445,7 +1502,10 @@ queue_bio: if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) { zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; - disk_zone_wplug_schedule_bio_work(disk, zwplug); + if (blk_queue_zoned_qd1_writes(disk->queue)) + wake_up_process(disk->zone_wplugs_worker); + else + disk_zone_wplug_schedule_work(disk, zwplug); } spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1586,16 +1646,22 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, spin_lock_irqsave(&zwplug->lock, flags); - /* Schedule submission of the next plugged BIO if we have one. */ - if (!bio_list_empty(&zwplug->bio_list)) { - disk_zone_wplug_schedule_bio_work(disk, zwplug); - spin_unlock_irqrestore(&zwplug->lock, flags); - return; - } + /* + * For rotational devices, signal the BIO completion to the zone write + * plug work. Otherwise, schedule submission of the next plugged BIO + * if we have one. 
+ */ + if (bio_list_empty(&zwplug->bio_list)) + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + if (blk_queue_zoned_qd1_writes(disk->queue)) + complete(&disk->zone_wplugs_worker_bio_done); + else if (!bio_list_empty(&zwplug->bio_list)) + disk_zone_wplug_schedule_work(disk, zwplug); - zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) disk_mark_zone_wplug_dead(zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); } @@ -1685,10 +1751,9 @@ void blk_zone_write_plug_finish_request(struct request *req) disk_put_zone_wplug(zwplug); } -static void blk_zone_wplug_bio_work(struct work_struct *work) +static bool disk_zone_wplug_submit_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug) { - struct blk_zone_wplug *zwplug = - container_of(work, struct blk_zone_wplug, bio_work); struct block_device *bdev; unsigned long flags; struct bio *bio; @@ -1704,7 +1769,7 @@ again: if (!bio) { zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; spin_unlock_irqrestore(&zwplug->lock, flags); - goto put_zwplug; + return false; } trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, @@ -1718,14 +1783,15 @@ again: goto again; } - bdev = bio->bi_bdev; - /* * blk-mq devices will reuse the extra reference on the request queue * usage counter we took when the BIO was plugged, but the submission * path for BIO-based devices will not do that. So drop this extra * reference here. */ + if (blk_queue_zoned_qd1_writes(disk->queue)) + reinit_completion(&disk->zone_wplugs_worker_bio_done); + bdev = bio->bi_bdev; if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { bdev->bd_disk->fops->submit_bio(bio); blk_queue_exit(bdev->bd_disk->queue); @@ -1733,14 +1799,78 @@ again: blk_mq_submit_bio(bio); } -put_zwplug: - /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). 
*/ - disk_put_zone_wplug(zwplug); + return true; +} + +static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk) +{ + struct blk_zone_wplug *zwplug; + + spin_lock_irq(&disk->zone_wplugs_list_lock); + zwplug = list_first_entry_or_null(&disk->zone_wplugs_list, + struct blk_zone_wplug, entry); + if (zwplug) + list_del_init(&zwplug->entry); + spin_unlock_irq(&disk->zone_wplugs_list_lock); + + return zwplug; +} + +static int disk_zone_wplugs_worker(void *data) +{ + struct gendisk *disk = data; + struct blk_zone_wplug *zwplug; + unsigned int noio_flag; + + noio_flag = memalloc_noio_save(); + set_user_nice(current, MIN_NICE); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + + zwplug = disk_get_zone_wplugs_work(disk); + if (zwplug) { + /* + * Process all BIOs of this zone write plug and then + * drop the reference we took when adding the zone write + * plug to the active list. + */ + set_current_state(TASK_RUNNING); + while (disk_zone_wplug_submit_bio(disk, zwplug)) + blk_wait_io(&disk->zone_wplugs_worker_bio_done); + disk_put_zone_wplug(zwplug); + continue; + } + + /* + * Only sleep if nothing sets the state to running. Else check + * for zone write plugs work again as a newly submitted BIO + * might have added a zone write plug to the work list. 
+ */ + if (get_current_state() == TASK_RUNNING) { + try_to_freeze(); + } else { + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + break; + } + schedule(); + } + } + + WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); + memalloc_noio_restore(noio_flag); + + return 0; } void disk_init_zone_resources(struct gendisk *disk) { spin_lock_init(&disk->zone_wplugs_hash_lock); + spin_lock_init(&disk->zone_wplugs_list_lock); + INIT_LIST_HEAD(&disk->zone_wplugs_list); + init_completion(&disk->zone_wplugs_worker_bio_done); } /* @@ -1756,6 +1886,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, unsigned int pool_size) { unsigned int i; + int ret = -ENOMEM; atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = @@ -1781,8 +1912,21 @@ static int disk_alloc_zone_resources(struct gendisk *disk, if (!disk->zone_wplugs_wq) goto destroy_pool; + disk->zone_wplugs_worker = + kthread_create(disk_zone_wplugs_worker, disk, + "%s_zwplugs_worker", disk->disk_name); + if (IS_ERR(disk->zone_wplugs_worker)) { + ret = PTR_ERR(disk->zone_wplugs_worker); + disk->zone_wplugs_worker = NULL; + goto destroy_wq; + } + wake_up_process(disk->zone_wplugs_worker); + return 0; +destroy_wq: + destroy_workqueue(disk->zone_wplugs_wq); + disk->zone_wplugs_wq = NULL; destroy_pool: mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; @@ -1790,7 +1934,7 @@ free_hash: kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; - return -ENOMEM; + return ret; } static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) @@ -1840,6 +1984,10 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) void disk_free_zone_resources(struct gendisk *disk) { + if (disk->zone_wplugs_worker) + kthread_stop(disk->zone_wplugs_worker); + WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); + if (disk->zone_wplugs_wq) { destroy_workqueue(disk->zone_wplugs_wq); disk->zone_wplugs_wq = NULL; diff --git 
a/include/linux/blkdev.h b/include/linux/blkdev.h index a49a1e38c6e7..ef6457487d23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -204,6 +205,10 @@ struct gendisk { struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; + spinlock_t zone_wplugs_list_lock; + struct list_head zone_wplugs_list; + struct task_struct *zone_wplugs_worker; + struct completion zone_wplugs_worker_bio_done; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -668,6 +673,7 @@ enum { QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ + QUEUE_FLAG_ZONED_QD1_WRITES, /* Limit zoned devices writes to QD=1 */ QUEUE_FLAG_MAX }; @@ -707,6 +713,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) #define blk_queue_no_elv_switch(q) \ test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags) +#define blk_queue_zoned_qd1_writes(q) \ + test_bit(QUEUE_FLAG_ZONED_QD1_WRITES, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From ecd92cfec5349876d6a80f8188ea98c5920094b6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 26 Feb 2026 16:54:48 +0900 Subject: block: remove bdev_nonrot() bdev_nonrot() is simply the negative return value of bdev_rot(). So replace all call sites of bdev_nonrot() with calls to bdev_rot() and remove bdev_nonrot(). Signed-off-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Paul Menzel Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 2 +- drivers/target/target_core_file.c | 2 +- drivers/target/target_core_iblock.c | 2 +- fs/btrfs/volumes.c | 4 ++-- fs/ext4/mballoc-test.c | 2 +- fs/ext4/mballoc.c | 2 +- include/linux/blkdev.h | 5 ----- mm/swapfile.c | 2 +- 10 files changed, 10 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 181400e147c0..cda6af0712b9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1878,7 +1878,7 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, if (info->rdev) return false; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { set_bit(Nonrot, &rdev->flags); WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0653b5d8545a..cfbd345805ca 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -806,7 +806,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) break; - nonrot = bdev_nonrot(rdev->bdev); + nonrot = !bdev_rot(rdev->bdev); has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); if (min_pending > pending && nonrot) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a8e8d431071b..ba9d6d05b089 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7541,7 +7541,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) rdev_for_each(rdev, mddev) { if (test_bit(Journal, &rdev->flags)) continue; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { conf->batch_bio_dispatch = false; break; } diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 3ae1f7137d9d..d6e3e5214652 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -173,7 +173,7 @@ static int fd_configure_device(struct se_device *dev) */ 
dev->dev_attrib.max_write_same_len = 0xFFFF; - if (bdev_nonrot(bdev)) + if (!bdev_rot(bdev)) dev->dev_attrib.is_nonrot = 1; } else { if (!(fd_dev->fbd_flags & FBDF_HAS_SIZE)) { diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 3c92f94497b4..1087d1d17c36 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -148,7 +148,7 @@ static int iblock_configure_device(struct se_device *dev) else dev->dev_attrib.max_write_same_len = 0xFFFF; - if (bdev_nonrot(bd)) + if (!bdev_rot(bd)) dev->dev_attrib.is_nonrot = 1; target_configure_write_atomic_from_bdev(&dev->dev_attrib, bd); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 648bb09fc416..353c9caa8ab9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -694,7 +694,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(file_bdev(bdev_file))) + if (bdev_rot(file_bdev(bdev_file))) fs_devices->rotating = true; if (bdev_max_discard_sectors(file_bdev(bdev_file))) @@ -2919,7 +2919,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!bdev_nonrot(device->bdev)) + if (bdev_rot(device->bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 9fbdf6a09489..b9f22e3a8d5c 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -72,7 +72,7 @@ static int mbt_mb_init(struct super_block *sb) ext4_fsblk_t block; int ret; - /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */ + /* needed by ext4_mb_init->bdev_rot(sb->s_bdev) */ sb->s_bdev = kzalloc_obj(*sb->s_bdev); if (sb->s_bdev == NULL) return -ENOMEM; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 20e9fdaf4301..8a4dfe19878c 100644 --- a/fs/ext4/mballoc.c +++ 
b/fs/ext4/mballoc.c @@ -3836,7 +3836,7 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&lg->lg_prealloc_lock); } - if (bdev_nonrot(sb->s_bdev)) + if (!bdev_rot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ef6457487d23..8d93d8e356d8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1475,11 +1475,6 @@ static inline bool bdev_rot(struct block_device *bdev) return blk_queue_rot(bdev_get_queue(bdev)); } -static inline bool bdev_nonrot(struct block_device *bdev) -{ - return !bdev_rot(bdev); -} - static inline bool bdev_synchronous(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; diff --git a/mm/swapfile.c b/mm/swapfile.c index 94af29d1de88..60e21414624b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3460,7 +3460,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (si->bdev && bdev_synchronous(si->bdev)) si->flags |= SWP_SYNCHRONOUS_IO; - if (si->bdev && bdev_nonrot(si->bdev)) { + if (si->bdev && !bdev_rot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; } else { atomic_inc(&nr_rotate_swap); -- cgit v1.2.3 From 588e7c048d7d2bfcbe7776ee0888ee248adf01d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:09 -0800 Subject: ext4, fscrypt: merge fscrypt_mergeable_bio_bh into io_submit_need_new_bio ext4 already has the inode and folio and can't have a NULL folio->mapping in this path. Open code fscrypt_mergeable_bio_bh in io_submit_need_new_bio based on these simplifying assumptions. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-5-hch@lst.de Signed-off-by: Eric Biggers --- fs/crypto/inline_crypt.c | 23 ----------------------- fs/ext4/page-io.c | 7 +++++-- include/linux/fscrypt.h | 9 --------- 3 files changed, 5 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index c0852b920dbc..0da53956a9b1 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -406,29 +406,6 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); -/** - * fscrypt_mergeable_bio_bh() - test whether data can be added to a bio - * @bio: the bio being built up - * @next_bh: the next buffer_head for which I/O will be submitted - * - * Same as fscrypt_mergeable_bio(), except this takes a buffer_head instead of - * an inode and block number directly. - * - * Return: true iff the I/O is mergeable - */ -bool fscrypt_mergeable_bio_bh(struct bio *bio, - const struct buffer_head *next_bh) -{ - const struct inode *inode; - u64 next_lblk; - - if (!bh_get_inode_and_lblk_num(next_bh, &inode, &next_lblk)) - return !bio->bi_crypt_context; - - return fscrypt_mergeable_bio(bio, inode, next_lblk); -} -EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh); - /** * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an * inode, as far as encryption is concerned diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 58cdbd836fd6..293314d7f236 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -440,11 +440,14 @@ static void io_submit_init_bio(struct ext4_io_submit *io, } static bool io_submit_need_new_bio(struct ext4_io_submit *io, + struct inode *inode, + struct folio *folio, struct buffer_head *bh) { if (bh->b_blocknr != io->io_next_block) return true; - if (!fscrypt_mergeable_bio_bh(io->io_bio, bh)) + if (!fscrypt_mergeable_bio(io->io_bio, inode, + (folio_pos(folio) + bh_offset(bh)) >> 
inode->i_blkbits)) return true; return false; } @@ -455,7 +458,7 @@ static void io_submit_add_bh(struct ext4_io_submit *io, struct folio *io_folio, struct buffer_head *bh) { - if (io->io_bio && io_submit_need_new_bio(io, bh)) { + if (io->io_bio && io_submit_need_new_bio(io, inode, folio, bh)) { submit_and_retry: ext4_io_submit(io); } diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 516aba5b858b..6af3c1907adc 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -876,9 +876,6 @@ void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); -bool fscrypt_mergeable_bio_bh(struct bio *bio, - const struct buffer_head *next_bh); - bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); @@ -906,12 +903,6 @@ static inline bool fscrypt_mergeable_bio(struct bio *bio, return true; } -static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, - const struct buffer_head *next_bh) -{ - return true; -} - static inline bool fscrypt_dio_supported(struct inode *inode) { return !fscrypt_needs_contents_encryption(inode); -- cgit v1.2.3 From a18b1ab81654b06e7ff402e5d0b85249e9504bcb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:10 -0800 Subject: fscrypt: move fscrypt_set_bio_crypt_ctx_bh to buffer.c fscrypt_set_bio_crypt_ctx_bh is only used by submit_bh_wbc now. Move it there and merge bh_get_inode_and_lblk_num into it. Note that this does not add ifdefs for fscrypt as the compiler will optimize away the dead code if it is not built in. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-6-hch@lst.de Signed-off-by: Eric Biggers --- fs/buffer.c | 21 ++++++++++++++++++++- fs/crypto/inline_crypt.c | 45 --------------------------------------------- include/linux/fscrypt.h | 9 --------- 3 files changed, 20 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index 22b43642ba57..b6504ec7fa4c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2774,6 +2774,24 @@ static void end_bio_bh_io_sync(struct bio *bio) bio_put(bio); } +static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh, + gfp_t gfp_mask) +{ + const struct address_space *mapping = folio_mapping(bh->b_folio); + const struct inode *inode; + u64 lblk; + + /* + * The ext4 journal (jbd2) can submit a buffer_head it directly created + * for a non-pagecache page. fscrypt doesn't care about these. + */ + if (!mapping) + return; + inode = mapping->host; + lblk = (folio_pos(bh->b_folio) + bh_offset(bh)) >> inode->i_blkbits; + fscrypt_set_bio_crypt_ctx(bio, inode, lblk, gfp_mask); +} + static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, enum rw_hint write_hint, struct writeback_control *wbc) @@ -2800,7 +2818,8 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO); - fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); + if (IS_ENABLED(CONFIG_FS_ENCRYPTION)) + buffer_set_crypto_ctx(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 0da53956a9b1..702d13d138aa 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -314,51 +314,6 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, } EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); -/* Extract the inode and logical block number from a buffer_head. 
*/ -static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh, - const struct inode **inode_ret, - u64 *lblk_num_ret) -{ - struct folio *folio = bh->b_folio; - const struct address_space *mapping; - const struct inode *inode; - - /* - * The ext4 journal (jbd2) can submit a buffer_head it directly created - * for a non-pagecache page. fscrypt doesn't care about these. - */ - mapping = folio_mapping(folio); - if (!mapping) - return false; - inode = mapping->host; - - *inode_ret = inode; - *lblk_num_ret = (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits; - return true; -} - -/** - * fscrypt_set_bio_crypt_ctx_bh() - prepare a file contents bio for inline - * crypto - * @bio: a bio which will eventually be submitted to the file - * @first_bh: the first buffer_head for which I/O will be submitted - * @gfp_mask: memory allocation flags - * - * Same as fscrypt_set_bio_crypt_ctx(), except this takes a buffer_head instead - * of an inode and block number directly. - */ -void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, - const struct buffer_head *first_bh, - gfp_t gfp_mask) -{ - const struct inode *inode; - u64 first_lblk; - - if (bh_get_inode_and_lblk_num(first_bh, &inode, &first_lblk)) - fscrypt_set_bio_crypt_ctx(bio, inode, first_lblk, gfp_mask); -} -EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh); - /** * fscrypt_mergeable_bio() - test whether data can be added to a bio * @bio: the bio being built up diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 6af3c1907adc..26561b7994e0 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -869,10 +869,6 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask); -void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, - const struct buffer_head *first_bh, - gfp_t gfp_mask); - bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); @@ -891,11 +887,6 @@ static inline void fscrypt_set_bio_crypt_ctx(struct 
bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { } -static inline void fscrypt_set_bio_crypt_ctx_bh( - struct bio *bio, - const struct buffer_head *first_bh, - gfp_t gfp_mask) { } - static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) -- cgit v1.2.3 From 22be86a23c5956254b752e4e98f0ef2799565a41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:12 -0800 Subject: fscrypt: pass a byte offset to fscrypt_mergeable_bio Logical offsets into an inode are usually expressed as bytes in the VFS. Switch fscrypt_mergeable_bio to that convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-8-hch@lst.de Signed-off-by: Eric Biggers --- fs/crypto/bio.c | 3 ++- fs/crypto/inline_crypt.c | 6 +++--- fs/ext4/page-io.c | 2 +- fs/ext4/readpage.c | 3 ++- fs/f2fs/data.c | 3 ++- include/linux/fscrypt.h | 4 ++-- 6 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 6da683ea69dc..0a701d4a17ef 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -100,7 +100,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, len -= blocks_this_page; lblk += blocks_this_page; sector += (bytes_this_page >> SECTOR_SHIFT); - if (!len || !fscrypt_mergeable_bio(bio, inode, lblk)) + if (!len || !fscrypt_mergeable_bio(bio, inode, + (loff_t)lblk << blockbits)) break; } diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 5279565e9846..b0954d17904b 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -316,7 +316,7 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); * fscrypt_mergeable_bio() - test whether data can be added to a bio * @bio: the bio being built up * @inode: the inode for the next part of the I/O - * @next_lblk: the next file logical block number in the I/O + * @pos: the next file position (in bytes) in the I/O * * When building a bio which 
may contain data which should undergo inline * encryption (or decryption) via fscrypt, filesystems should call this function @@ -334,7 +334,7 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); * Return: true iff the I/O is mergeable */ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, - u64 next_lblk) + loff_t pos) { const struct bio_crypt_ctx *bc = bio->bi_crypt_context; const struct fscrypt_inode_info *ci; @@ -354,7 +354,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, if (bc->bc_key != ci->ci_enc_key.blk_key) return false; - fscrypt_generate_dun(ci, next_lblk << inode->i_blkbits, next_dun); + fscrypt_generate_dun(ci, pos, next_dun); return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 293314d7f236..50f507bab82c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -447,7 +447,7 @@ static bool io_submit_need_new_bio(struct ext4_io_submit *io, if (bh->b_blocknr != io->io_next_block) return true; if (!fscrypt_mergeable_bio(io->io_bio, inode, - (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits)) + folio_pos(folio) + bh_offset(bh))) return true; return false; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 830f3b8a321f..ba7cfddd6038 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -342,7 +342,8 @@ static int ext4_mpage_readpages(struct inode *inode, struct fsverity_info *vi, * BIO off first? 
*/ if (bio && (last_block_in_bio != first_block - 1 || - !fscrypt_mergeable_bio(bio, inode, next_block))) { + !fscrypt_mergeable_bio(bio, inode, + (loff_t)next_block << blkbits))) { submit_and_realloc: blk_crypto_submit_bio(bio); bio = NULL; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 338df7a2aea6..dca273fedfde 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -541,7 +541,8 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, if (fio && fio->encrypted_page) return !bio_has_crypt_ctx(bio); - return fscrypt_mergeable_bio(bio, inode, next_idx); + return fscrypt_mergeable_bio(bio, inode, + (loff_t)next_idx << inode->i_blkbits); } void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 26561b7994e0..98fb14660d40 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -870,7 +870,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, - u64 next_lblk); + loff_t pos); bool fscrypt_dio_supported(struct inode *inode); @@ -889,7 +889,7 @@ static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, - u64 next_lblk) + loff_t pos) { return true; } -- cgit v1.2.3 From 3c7eaa775d8e008135646bd4b7aa7db7c5e40a0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:13 -0800 Subject: fscrypt: pass a byte offset to fscrypt_set_bio_crypt_ctx Logical offsets into an inode are usually expressed as bytes in the VFS. Switch fscrypt_set_bio_crypt_ctx to that convention. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-9-hch@lst.de Signed-off-by: Eric Biggers --- fs/buffer.c | 7 ++----- fs/crypto/bio.c | 8 ++++---- fs/crypto/inline_crypt.c | 6 +++--- fs/ext4/page-io.c | 5 ++--- fs/ext4/readpage.c | 4 ++-- fs/f2fs/data.c | 4 +++- fs/iomap/direct-io.c | 6 ++---- include/linux/fscrypt.h | 7 +++---- 8 files changed, 21 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index b6504ec7fa4c..1c8ee5a59f88 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2778,8 +2778,6 @@ static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh, gfp_t gfp_mask) { const struct address_space *mapping = folio_mapping(bh->b_folio); - const struct inode *inode; - u64 lblk; /* * The ext4 journal (jbd2) can submit a buffer_head it directly created @@ -2787,9 +2785,8 @@ static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh, */ if (!mapping) return; - inode = mapping->host; - lblk = (folio_pos(bh->b_folio) + bh_offset(bh)) >> inode->i_blkbits; - fscrypt_set_bio_crypt_ctx(bio, inode, lblk, gfp_mask); + fscrypt_set_bio_crypt_ctx(bio, mapping->host, + folio_pos(bh->b_folio) + bh_offset(bh), gfp_mask); } static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 0a701d4a17ef..e7fb2fdd9728 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -75,6 +75,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, { const unsigned int blockbits = inode->i_blkbits; const unsigned int blocks_per_page = 1 << (PAGE_SHIFT - blockbits); + loff_t pos = (loff_t)lblk << blockbits; struct fscrypt_zero_done done = { .pending = ATOMIC_INIT(1), .done = COMPLETION_INITIALIZER_ONSTACK(done.done), @@ -89,7 +90,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, bio->bi_iter.bi_sector = sector; bio->bi_private = &done; bio->bi_end_io = fscrypt_zeroout_range_end_io; 
- fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); + fscrypt_set_bio_crypt_ctx(bio, inode, pos, GFP_NOFS); for (n = 0; n < BIO_MAX_VECS; n++) { unsigned int blocks_this_page = @@ -98,10 +99,9 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, __bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); len -= blocks_this_page; - lblk += blocks_this_page; + pos += bytes_this_page; sector += (bytes_this_page >> SECTOR_SHIFT); - if (!len || !fscrypt_mergeable_bio(bio, inode, - (loff_t)lblk << blockbits)) + if (!len || !fscrypt_mergeable_bio(bio, inode, pos)) break; } diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index b0954d17904b..37d42d357925 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -285,7 +285,7 @@ static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci, * fscrypt_set_bio_crypt_ctx() - prepare a file contents bio for inline crypto * @bio: a bio which will eventually be submitted to the file * @inode: the file's inode - * @first_lblk: the first file logical block number in the I/O + * @pos: the first file position (in bytes) in the I/O * @gfp_mask: memory allocation flags - these must be a waiting mask so that * bio_crypt_set_ctx can't fail. * @@ -298,7 +298,7 @@ static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci, * The encryption context will be freed automatically when the bio is freed. 
*/ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, - u64 first_lblk, gfp_t gfp_mask) + loff_t pos, gfp_t gfp_mask) { const struct fscrypt_inode_info *ci; u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; @@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, return; ci = fscrypt_get_inode_info_raw(inode); - fscrypt_generate_dun(ci, first_lblk << inode->i_blkbits, dun); + fscrypt_generate_dun(ci, pos, dun); bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); } EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 50f507bab82c..181cda58d387 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -427,9 +427,8 @@ static void io_submit_init_bio(struct ext4_io_submit *io, * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). */ bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO); - fscrypt_set_bio_crypt_ctx(bio, inode, - (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits, - GFP_NOIO); + fscrypt_set_bio_crypt_ctx(bio, inode, folio_pos(folio) + bh_offset(bh), + GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index ba7cfddd6038..fbfa4d830d9a 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -355,8 +355,8 @@ static int ext4_mpage_readpages(struct inode *inode, struct fsverity_info *vi, */ bio = bio_alloc(bdev, bio_max_segs(nr_pages), REQ_OP_READ, GFP_KERNEL); - fscrypt_set_bio_crypt_ctx(bio, inode, next_block, - GFP_KERNEL); + fscrypt_set_bio_crypt_ctx(bio, inode, + (loff_t)next_block << blkbits, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, vi); bio->bi_iter.bi_sector = first_block << (blkbits - 9); bio->bi_end_io = mpage_end_io; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dca273fedfde..07b4ed6bb0cc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ 
-527,7 +527,9 @@ static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, * read/write raw data without encryption. */ if (!fio || !fio->encrypted_page) - fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask); + fscrypt_set_bio_crypt_ctx(bio, inode, + (loff_t)first_idx << inode->i_blkbits, + gfp_mask); } static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e911daedff65..9da5d862ef9e 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -311,8 +311,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); - fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, - GFP_KERNEL); + fscrypt_set_bio_crypt_ctx(bio, inode, pos, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -342,8 +341,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, nr_vecs = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, op); - fscrypt_set_bio_crypt_ctx(bio, iter->inode, - pos >> iter->inode->i_blkbits, GFP_KERNEL); + fscrypt_set_bio_crypt_ctx(bio, iter->inode, pos, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_write_hint = iter->inode->i_write_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 98fb14660d40..90f75fe0e1c9 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -865,9 +865,8 @@ static inline void fscrypt_set_ops(struct super_block *sb, bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode); -void fscrypt_set_bio_crypt_ctx(struct bio *bio, - const struct inode *inode, u64 first_lblk, - gfp_t gfp_mask); +void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode 
*inode, + loff_t pos, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, loff_t pos); @@ -885,7 +884,7 @@ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, - u64 first_lblk, gfp_t gfp_mask) { } + loff_t pos, gfp_t gfp_mask) { } static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, -- cgit v1.2.3 From cd7db2e7dfeef99c901156f58ab4a38256b0c3f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:16 -0800 Subject: fscrypt: pass a byte offset to fscrypt_zeroout_range Logical offsets into an inode are usually expressed as bytes in the VFS. Switch fscrypt_zeroout_range to that convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-12-hch@lst.de Signed-off-by: Eric Biggers --- fs/crypto/bio.c | 7 +++---- fs/ext4/inode.c | 3 ++- fs/f2fs/file.c | 4 +++- include/linux/fscrypt.h | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 303b5acc66a9..a07ac8dcf851 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -113,7 +113,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, /** * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file * @inode: the file's inode - * @lblk: the first file logical block to zero out + * @pos: the first file position (in bytes) to zero out * @pblk: the first filesystem physical block to zero out * @len: number of blocks to zero out * @@ -127,7 +127,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, * * Return: 0 on success; -errno on failure. 
*/ -int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, unsigned int len) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); @@ -135,9 +135,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, const unsigned int du_size = 1U << du_bits; const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; const unsigned int du_per_page = 1U << du_per_page_bits; - u64 du_index = (u64)lblk << (inode->i_blkbits - du_bits); + u64 du_index = pos >> du_bits; u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits); - loff_t pos = (loff_t)lblk << inode->i_blkbits; sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT); struct page *pages[16]; /* write up to 16 pages at a time */ unsigned int nr_pages; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 396dc3a5d16b..945613c95ffa 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -405,7 +405,8 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, KUNIT_STATIC_STUB_REDIRECT(ext4_issue_zeroout, inode, lblk, pblk, len); if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) - return fscrypt_zeroout_range(inode, lblk, pblk, len); + return fscrypt_zeroout_range(inode, + (loff_t)lblk << inode->i_blkbits, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c8a2f17a8f11..239c2666ceb5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4162,7 +4162,9 @@ static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { if (IS_ENCRYPTED(inode)) - ret = fscrypt_zeroout_range(inode, off, block, len); + ret = fscrypt_zeroout_range(inode, + (loff_t)off << inode->i_blkbits, block, + len); else ret = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOFS, 0); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 
90f75fe0e1c9..9fc15e1fbe57 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -450,7 +450,7 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); -int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, unsigned int len); /* hooks.c */ @@ -755,7 +755,7 @@ static inline bool fscrypt_decrypt_bio(struct bio *bio) return true; } -static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, +static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, unsigned int len) { return -EOPNOTSUPP; -- cgit v1.2.3 From fb87ab4ad3d0df2397648e5ce2384de26463c183 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:17 -0800 Subject: fscrypt: pass a byte length to fscrypt_zeroout_range Range lengths are usually expressed as bytes in the VFS, switch fscrypt_zeroout_range to this convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-13-hch@lst.de Signed-off-by: Eric Biggers --- fs/crypto/bio.c | 11 ++++++----- fs/ext4/inode.c | 3 ++- fs/f2fs/file.c | 2 +- include/linux/fscrypt.h | 6 +++--- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index a07ac8dcf851..9872408f4f52 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -115,12 +115,13 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, * @inode: the file's inode * @pos: the first file position (in bytes) to zero out * @pblk: the first filesystem physical block to zero out - * @len: number of blocks to zero out + * @len: bytes to zero out * * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write * ciphertext blocks which decrypt to the all-zeroes block. 
The blocks must be * both logically and physically contiguous. It's also assumed that the - * filesystem only uses a single block device, ->s_bdev. + * filesystem only uses a single block device, ->s_bdev. @len must be a + * multiple of the file system logical block size. * * Note that since each block uses a different IV, this involves writing a * different ciphertext to each block; we can't simply reuse the same one. @@ -128,7 +129,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, * Return: 0 on success; -errno on failure. */ int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, unsigned int len) + sector_t pblk, u64 len) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; @@ -136,7 +137,7 @@ int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; const unsigned int du_per_page = 1U << du_per_page_bits; u64 du_index = pos >> du_bits; - u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits); + u64 du_remaining = len >> du_bits; sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT); struct page *pages[16]; /* write up to 16 pages at a time */ unsigned int nr_pages; @@ -150,7 +151,7 @@ int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, if (fscrypt_inode_uses_inline_crypto(inode)) return fscrypt_zeroout_range_inline_crypt(inode, pos, sector, - (u64)len << inode->i_blkbits); + len); BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS); nr_pages = min_t(u64, ARRAY_SIZE(pages), diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 945613c95ffa..8ef61198e14c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -406,7 +406,8 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, - (loff_t)lblk << inode->i_blkbits, pblk, len); + (loff_t)lblk << 
inode->i_blkbits, pblk, + (u64)len << inode->i_blkbits); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 239c2666ceb5..8785f7c13657 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4164,7 +4164,7 @@ static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, if (IS_ENCRYPTED(inode)) ret = fscrypt_zeroout_range(inode, (loff_t)off << inode->i_blkbits, block, - len); + (u64)len << inode->i_blkbits); else ret = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOFS, 0); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 9fc15e1fbe57..90ac62fda926 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -450,8 +450,8 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); -int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, unsigned int len); +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, + u64 len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); @@ -756,7 +756,7 @@ static inline bool fscrypt_decrypt_bio(struct bio *bio) } static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, unsigned int len) + sector_t pblk, u64 len) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 5ca1a1f017ea0f0e0bcb6ec52064735f2ac1c393 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:18 -0800 Subject: fscrypt: pass a real sector_t to fscrypt_zeroout_range While the pblk argument to fscrypt_zeroout_range is declared as a sector_t, it actually is interpreted as a logical block size unit, which is highly unusual. Switch to passing the 512 byte units that sector_t is defined for. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-14-hch@lst.de Signed-off-by: Eric Biggers --- fs/crypto/bio.c | 5 ++--- fs/ext4/inode.c | 3 ++- fs/f2fs/file.c | 2 +- include/linux/fscrypt.h | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 9872408f4f52..d07740680602 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -114,7 +114,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file * @inode: the file's inode * @pos: the first file position (in bytes) to zero out - * @pblk: the first filesystem physical block to zero out + * @sector: the first sector to zero out * @len: bytes to zero out * * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write @@ -129,7 +129,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, * Return: 0 on success; -errno on failure. 
*/ int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, u64 len) + sector_t sector, u64 len) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; @@ -138,7 +138,6 @@ int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, const unsigned int du_per_page = 1U << du_per_page_bits; u64 du_index = pos >> du_bits; u64 du_remaining = len >> du_bits; - sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT); struct page *pages[16]; /* write up to 16 pages at a time */ unsigned int nr_pages; unsigned int i; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8ef61198e14c..fe258ffd4840 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -406,7 +406,8 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, - (loff_t)lblk << inode->i_blkbits, pblk, + (loff_t)lblk << inode->i_blkbits, + pblk << (inode->i_blkbits - SECTOR_SHIFT), (u64)len << inode->i_blkbits); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8785f7c13657..a264771cfbc2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4163,7 +4163,7 @@ static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { if (IS_ENCRYPTED(inode)) ret = fscrypt_zeroout_range(inode, - (loff_t)off << inode->i_blkbits, block, + (loff_t)off << inode->i_blkbits, sector, (u64)len << inode->i_blkbits); else ret = blkdev_issue_zeroout(bdev, sector, nr_sects, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 90ac62fda926..54712ec61ffb 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -450,8 +450,8 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); -int 
fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, - u64 len); +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, + sector_t sector, u64 len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); @@ -756,7 +756,7 @@ static inline bool fscrypt_decrypt_bio(struct bio *bio) } static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, u64 len) + sector_t sector, u64 len) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 102c8b26b54e363f85c4c86099ca049a0a76bb58 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 17 Feb 2026 08:08:35 -0800 Subject: PCI: Allow all bus devices to use the same slot A PCIe hotplug slot applies to the entire secondary bus. Thus, pciehp only allocates a single hotplug_slot for the bridge to that bus. The existing PCI slot, though, would only match to functions on device 0, meaning any devices beyond that, e.g., ARI functions, are not matched to any slot even though they share it. A slot reset will break all the missing devices because the handling skips them. For example, ARI devices with more than 8 functions fail because their state is not properly handled, nor is the attached driver notified of the reset. In the best case, the device will appear unresponsive to the driver, resulting in unexpected errors. 
A worse possibility may panic the kernel if in-flight transactions trigger hardware reported errors like this real observation: vfio-pci 0000:01:00.0: resetting vfio-pci 0000:01:00.0: reset done {1}[Hardware Error]: Error 1, type: fatal {1}[Hardware Error]: section_type: PCIe error {1}[Hardware Error]: port_type: 0, PCIe end point {1}[Hardware Error]: version: 0.2 {1}[Hardware Error]: command: 0x0140, status: 0x0010 {1}[Hardware Error]: device_id: 0000:01:01.0 {1}[Hardware Error]: slot: 0 {1}[Hardware Error]: secondary_bus: 0x00 {1}[Hardware Error]: vendor_id: 0x1d9b, device_id: 0x0207 {1}[Hardware Error]: class_code: 020000 {1}[Hardware Error]: bridge: secondary_status: 0x0000, control: 0x0000 {1}[Hardware Error]: aer_cor_status: 0x00008000, aer_cor_mask: 0x00002000 {1}[Hardware Error]: aer_uncor_status: 0x00010000, aer_uncor_mask: 0x00100000 {1}[Hardware Error]: aer_uncor_severity: 0x006f6030 {1}[Hardware Error]: TLP Header: 0a412800 00192080 60000004 00000004 GHES: Fatal hardware error but panic disabled Kernel panic - not syncing: GHES: Fatal hardware error Allow a slot to be created to claim all devices on a bus, not just a matching device. This is done by introducing a sentinel value, named PCI_SLOT_ALL_DEVICES, which then has the PCI slot match to any device on the bus. This fixes slot resets for pciehp. Since 0xff already has special meaning, the chosen value for this new feature is 0xfe. This will not clash with any actual slot number since they are limited to 5 bits. 
Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260217160836.2709885-3-kbusch@meta.com --- drivers/pci/hotplug/pciehp_core.c | 3 ++- drivers/pci/slot.c | 31 +++++++++++++++++++++++++++---- include/linux/pci.h | 10 +++++++++- 3 files changed, 38 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 1e9158d7bac7..2cafd3b26f34 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -79,7 +79,8 @@ static int init_slot(struct controller *ctrl) snprintf(name, SLOT_NAME_SIZE, "%u", PSN(ctrl)); retval = pci_hp_initialize(&ctrl->hotplug_slot, - ctrl->pcie->port->subordinate, 0, name); + ctrl->pcie->port->subordinate, + PCI_SLOT_ALL_DEVICES, name); if (retval) { ctrl_err(ctrl, "pci_hp_initialize failed: error %d\n", retval); kfree(ops); diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index 787311614e5b..e0b7fb43423c 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -42,6 +42,15 @@ static ssize_t address_read_file(struct pci_slot *slot, char *buf) pci_domain_nr(slot->bus), slot->bus->number); + /* + * Preserve legacy ABI expectations that hotplug drivers that manage + * multiple devices per slot emit 0 for the device number. 
+ */ + if (slot->number == PCI_SLOT_ALL_DEVICES) + return sysfs_emit(buf, "%04x:%02x:00\n", + pci_domain_nr(slot->bus), + slot->bus->number); + return sysfs_emit(buf, "%04x:%02x:%02x\n", pci_domain_nr(slot->bus), slot->bus->number, @@ -73,7 +82,8 @@ static void pci_slot_release(struct kobject *kobj) down_read(&pci_bus_sem); list_for_each_entry(dev, &slot->bus->devices, bus_list) - if (PCI_SLOT(dev->devfn) == slot->number) + if (slot->number == PCI_SLOT_ALL_DEVICES || + PCI_SLOT(dev->devfn) == slot->number) dev->slot = NULL; up_read(&pci_bus_sem); @@ -166,7 +176,8 @@ void pci_dev_assign_slot(struct pci_dev *dev) mutex_lock(&pci_slot_mutex); list_for_each_entry(slot, &dev->bus->slots, list) - if (PCI_SLOT(dev->devfn) == slot->number) + if (slot->number == PCI_SLOT_ALL_DEVICES || + PCI_SLOT(dev->devfn) == slot->number) dev->slot = slot; mutex_unlock(&pci_slot_mutex); } @@ -188,7 +199,8 @@ static struct pci_slot *get_slot(struct pci_bus *parent, int slot_nr) /** * pci_create_slot - create or increment refcount for physical PCI slot * @parent: struct pci_bus of parent bridge - * @slot_nr: PCI_SLOT(pci_dev->devfn) or -1 for placeholder + * @slot_nr: PCI_SLOT(pci_dev->devfn), -1 for placeholder, or + * PCI_SLOT_ALL_DEVICES * @name: user visible string presented in /sys/bus/pci/slots/ * @hotplug: set if caller is hotplug driver, NULL otherwise * @@ -222,6 +234,16 @@ static struct pci_slot *get_slot(struct pci_bus *parent, int slot_nr) * consist solely of a dddd:bb tuple, where dddd is the PCI domain of the * %struct pci_bus and bb is the bus number. In other words, the devfn of * the 'placeholder' slot will not be displayed. + * + * Bus-wide slots: + * For PCIe hotplug, the physical slot encompasses the entire secondary + * bus, not just a single device number. 
If the device supports ARI and ARI + * Forwarding is enabled in the upstream bridge, a multi-function device + * may include functions that appear to have several different device + * numbers, i.e., PCI_SLOT() values. Pass @slot_nr == PCI_SLOT_ALL_DEVICES + * to create a slot that matches all devices on the bus. Unlike placeholder + * slots, bus-wide slots go through normal slot lookup and reuse existing + * slots if present. */ struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr, const char *name, @@ -285,7 +307,8 @@ placeholder: down_read(&pci_bus_sem); list_for_each_entry(dev, &parent->devices, bus_list) - if (PCI_SLOT(dev->devfn) == slot_nr) + if (slot_nr == PCI_SLOT_ALL_DEVICES || + PCI_SLOT(dev->devfn) == slot_nr) dev->slot = slot; up_read(&pci_bus_sem); diff --git a/include/linux/pci.h b/include/linux/pci.h index 1c270f1d5123..5ae2dfdb2d6f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -72,12 +72,20 @@ /* return bus from PCI devid = ((u16)bus_number) << 8) | devfn */ #define PCI_BUS_NUM(x) (((x) >> 8) & 0xff) +/* + * PCI_SLOT_ALL_DEVICES indicates a slot that covers all devices on the bus. + * Used for PCIe hotplug where the physical slot is the entire secondary bus, + * and, if ARI Forwarding is enabled, functions may appear to be on multiple + * devices. + */ +#define PCI_SLOT_ALL_DEVICES 0xfe + /* pci_slot represents a physical slot */ struct pci_slot { struct pci_bus *bus; /* Bus this slot is on */ struct list_head list; /* Node in list of slots */ struct hotplug_slot *hotplug; /* Hotplug info (move here) */ - unsigned char number; /* PCI_SLOT(pci_dev->devfn) */ + unsigned char number; /* Device nr, or PCI_SLOT_ALL_DEVICES */ struct kobject kobj; }; -- cgit v1.2.3 From 50636e5ff8861c35ebb190521ba540074ab583ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Mar 2026 13:11:30 +0000 Subject: tcp: move tcp_v4_early_demux() to net/ipv4/ip_input.c tcp_v4_early_demux() has a single caller : ip_rcv_finish_core(). 
Move it to net/ipv4/ip_input.c and mark it static, for possible compiler/linker optimizations. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260306131130.654991-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 1 - net/ipv4/ip_input.c | 39 +++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 38 -------------------------------------- 3 files changed, 39 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a64641423806..f07aef7faa65 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -363,7 +363,6 @@ int tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); -int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); void tcp_remove_empty_skb(struct sock *sk); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 19d3141dad1f..9860178752b8 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -319,6 +319,45 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, ip_hdr(hint)->tos == iph->tos; } +static int tcp_v4_early_demux(struct sk_buff *skb) +{ + struct net *net = dev_net_rcu(skb->dev); + const struct iphdr *iph; + const struct tcphdr *th; + struct sock *sk; + + if (skb->pkt_type != PACKET_HOST) + return 0; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct tcphdr))) + return 0; + + iph = ip_hdr(skb); + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr) / 4) + return 0; + + sk = __inet_lookup_established(net, iph->saddr, th->source, + iph->daddr, ntohs(th->dest), + skb->skb_iif, inet_sdif(skb)); + if (sk) { + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk_fullsock(sk)) { + struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); + + if (dst) + dst = dst_check(dst, 0); + if (dst && + sk->sk_rx_dst_ifindex == skb->skb_iif) + skb_dst_set_noref(skb, dst); + } + } + return 
0; +} + static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 190e8a238876..f27995a64561 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1914,44 +1914,6 @@ err_discard: } EXPORT_SYMBOL(tcp_v4_do_rcv); -int tcp_v4_early_demux(struct sk_buff *skb) -{ - struct net *net = dev_net_rcu(skb->dev); - const struct iphdr *iph; - const struct tcphdr *th; - struct sock *sk; - - if (skb->pkt_type != PACKET_HOST) - return 0; - - if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return 0; - - iph = ip_hdr(skb); - th = tcp_hdr(skb); - - if (th->doff < sizeof(struct tcphdr) / 4) - return 0; - - sk = __inet_lookup_established(net, iph->saddr, th->source, - iph->daddr, ntohs(th->dest), - skb->skb_iif, inet_sdif(skb)); - if (sk) { - skb->sk = sk; - skb->destructor = sock_edemux; - if (sk_fullsock(sk)) { - struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); - - if (dst) - dst = dst_check(dst, 0); - if (dst && - sk->sk_rx_dst_ifindex == skb->skb_iif) - skb_dst_set_noref(skb, dst); - } - } - return 0; -} - bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { -- cgit v1.2.3 From 47e8dbb6e763e5ccfed2ab4aa55cbb163382aec1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Mar 2026 16:34:30 +0000 Subject: net/sched: do not reset queues in graft operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following typical script is extremely disruptive, because each graft operation calls dev_deactivate() which resets all the queues of the device. 
QPARAM="limit 100000 flow_limit 1000 buckets 4096" TXQS=64 for ETH in eth1 do tc qd del dev $ETH root 2>/dev/null tc qd add dev $ETH root handle 1: mq for i in `seq 1 $TXQS` do slot=$( printf %x $(( i )) ) tc qd add dev $ETH parent 1:$slot fq $QPARAM done done One can add "ip link set dev $ETH down/up" to reduce the disruption time: QPARAM="limit 100000 flow_limit 1000 buckets 4096" TXQS=64 for ETH in eth1 do ip link set dev $ETH down tc qd del dev $ETH root 2>/dev/null tc qd add dev $ETH root handle 1: mq for i in `seq 1 $TXQS` do slot=$( printf %x $(( i )) ) tc qd add dev $ETH parent 1:$slot fq $QPARAM done ip link set dev $ETH up done Or we can add a @reset_needed flag to dev_deactivate() and dev_deactivate_many(). This flag is set to true at device dismantle or linkwatch_do_dev(), and to false for graft operations. In the future, we might only stop one queue instead of the whole device, ie call dev_deactivate_queue() instead of dev_deactivate(). I think the problem (quadratic behavior) was added in commit 2fb541c862c9 ("net: sch_generic: aviod concurrent reset and enqueue op for lockless qdisc") but this does not look serious enough to deserve risky backports. 
Signed-off-by: Eric Dumazet Cc: Yunsheng Lin Reviewed-by: Jamal Hadi Salim Reviewed-by: Toke Høiland-Jørgensen Reviewed-by: Victor Nogueira Link: https://patch.msgid.link/20260307163430.470644-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 4 ++-- net/core/dev.c | 2 +- net/core/link_watch.c | 2 +- net/sched/sch_api.c | 2 +- net/sched/sch_generic.c | 20 ++++++++++++-------- net/sched/sch_htb.c | 4 ++-- net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 2 +- net/sched/sch_taprio.c | 2 +- 9 files changed, 22 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c355300893a3..16beba40914e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -710,8 +710,8 @@ void dev_qdisc_change_real_num_tx(struct net_device *dev, void dev_init_scheduler(struct net_device *dev); void dev_shutdown(struct net_device *dev); void dev_activate(struct net_device *dev); -void dev_deactivate(struct net_device *dev); -void dev_deactivate_many(struct list_head *head); +void dev_deactivate(struct net_device *dev, bool reset_needed); +void dev_deactivate_many(struct list_head *head, bool reset_needed); struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc); void qdisc_reset(struct Qdisc *qdisc); diff --git a/net/core/dev.c b/net/core/dev.c index 203dc36aaed5..6fc9350f0be8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1756,7 +1756,7 @@ static void __dev_close_many(struct list_head *head) smp_mb__after_atomic(); /* Commit netif_running(). 
*/ } - dev_deactivate_many(head); + dev_deactivate_many(head, true); list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 25c455c10a01..ff2c1d4538ef 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -181,7 +181,7 @@ static void linkwatch_do_dev(struct net_device *dev) if (netif_carrier_ok(dev)) dev_activate(dev); else - dev_deactivate(dev); + dev_deactivate(dev, true); netif_state_change(dev); } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index cc43e3f7574f..c0bab092ea80 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1120,7 +1120,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, } if (dev->flags & IFF_UP) - dev_deactivate(dev); + dev_deactivate(dev, false); qdisc_offload_graft_root(dev, new, old, extack); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 556e0d800316..d4fe907c4ad5 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1370,11 +1370,12 @@ static bool some_qdisc_is_busy(struct net_device *dev) /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate + * @reset_needed: qdisc should be reset if true. * * This function returns only when all outstanding transmissions * have completed, unless all devices are in dismantle phase. 
*/ -void dev_deactivate_many(struct list_head *head) +void dev_deactivate_many(struct list_head *head, bool reset_needed) { bool sync_needed = false; struct net_device *dev; @@ -1393,11 +1394,14 @@ void dev_deactivate_many(struct list_head *head) if (sync_needed) synchronize_net(); - list_for_each_entry(dev, head, close_list) { - netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); + if (reset_needed) { + list_for_each_entry(dev, head, close_list) { + netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); - if (dev_ingress_queue(dev)) - dev_reset_queue(dev, dev_ingress_queue(dev), NULL); + if (dev_ingress_queue(dev)) + dev_reset_queue(dev, dev_ingress_queue(dev), + NULL); + } } /* Wait for outstanding qdisc_run calls. */ @@ -1412,12 +1416,12 @@ void dev_deactivate_many(struct list_head *head) } } -void dev_deactivate(struct net_device *dev) +void dev_deactivate(struct net_device *dev, bool reset_needed) { LIST_HEAD(single); list_add(&dev->close_list, &single); - dev_deactivate_many(&single); + dev_deactivate_many(&single, reset_needed); list_del(&single); } EXPORT_SYMBOL(dev_deactivate); @@ -1473,7 +1477,7 @@ int dev_qdisc_change_tx_queue_len(struct net_device *dev) int ret = 0; if (up) - dev_deactivate(dev); + dev_deactivate(dev, false); for (i = 0; i < dev->num_tx_queues; i++) { ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index cf6cd4ccfa20..eb12381795ce 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1387,7 +1387,7 @@ htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q) struct Qdisc *old_q; if (dev->flags & IFF_UP) - dev_deactivate(dev); + dev_deactivate(dev, false); old_q = dev_graft_qdisc(dev_queue, new_q); if (new_q) new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; @@ -1421,7 +1421,7 @@ static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old, struct Qdisc *qdisc; if (dev->flags & IFF_UP) - dev_deactivate(dev); + dev_deactivate(dev, false); 
qdisc = dev_graft_qdisc(queue_old, NULL); WARN_ON(qdisc != cl_old->leaf.q); } diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 0ed199fa18f0..a0133a7b9d3b 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -201,7 +201,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct net_device *dev = qdisc_dev(sch); if (dev->flags & IFF_UP) - dev_deactivate(dev); + dev_deactivate(dev, false); *old = dev_graft_qdisc(dev_queue, new); if (new) diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index b83276409416..002add5ce9e0 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -469,7 +469,7 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, return -EINVAL; if (dev->flags & IFF_UP) - dev_deactivate(dev); + dev_deactivate(dev, false); *old = dev_graft_qdisc(dev_queue, new); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index f721c03514f6..8e3752811950 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2184,7 +2184,7 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl, return -EINVAL; if (dev->flags & IFF_UP) - dev_deactivate(dev); + dev_deactivate(dev, false); /* In offload mode, the child Qdisc is directly attached to the netdev * TX queue, and thus, we need to keep its refcount elevated in order -- cgit v1.2.3 From abb0eb0b033a0a8980eb9215e02626e4801ead3f Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Fri, 6 Mar 2026 17:36:49 +0800 Subject: ppp: simplify input error handling Currently, ppp_input_error() indicates an error by allocating a 0-length skb and calling ppp_do_recv(). It takes an error code argument, which is stored in skb->cb, but not used by ppp_receive_frame(). Simplify the error handling by removing the unused parameter and the unnecessary skb allocation. Instead, call ppp_receive_error() directly from ppp_input_error() under the recv lock, and the length check in ppp_receive_frame() can be removed. 
Signed-off-by: Qingfang Deng Signed-off-by: Jakub Kicinski --- drivers/net/ppp/ppp_async.c | 2 +- drivers/net/ppp/ppp_generic.c | 31 ++++++++++--------------------- drivers/net/ppp/ppp_synctty.c | 2 +- include/linux/ppp_channel.h | 2 +- net/atm/pppoatm.c | 2 +- 5 files changed, 14 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c index b4cf2d09c6bd..93a7b0f6c4e7 100644 --- a/drivers/net/ppp/ppp_async.c +++ b/drivers/net/ppp/ppp_async.c @@ -491,7 +491,7 @@ static void ppp_async_process(struct tasklet_struct *t) /* process received packets */ while ((skb = skb_dequeue(&ap->rqueue)) != NULL) { if (skb->cb[0]) - ppp_input_error(&ap->chan, 0); + ppp_input_error(&ap->chan); ppp_input(&ap->chan, skb); } diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 2081da6c2144..6344c5eb0f98 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2383,12 +2383,10 @@ done: rcu_read_unlock_bh(); } -/* Put a 0-length skb in the receive queue as an error indication */ void -ppp_input_error(struct ppp_channel *chan, int code) +ppp_input_error(struct ppp_channel *chan) { struct channel *pch = chan->ppp; - struct sk_buff *skb; struct ppp *ppp; if (!pch) @@ -2397,12 +2395,9 @@ ppp_input_error(struct ppp_channel *chan, int code) rcu_read_lock_bh(); ppp = rcu_dereference_bh(pch->ppp); if (ppp) { - skb = alloc_skb(0, GFP_ATOMIC); - if (skb) { - skb->len = 0; /* probably unnecessary */ - skb->cb[0] = code; - ppp_do_recv(ppp, skb, pch); - } + ppp_recv_lock(ppp); + ppp_receive_error(ppp); + ppp_recv_unlock(ppp); } rcu_read_unlock_bh(); } @@ -2414,20 +2409,14 @@ ppp_input_error(struct ppp_channel *chan, int code) static void ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch) { - /* note: a 0-length skb is used as an error indication */ - if (skb->len > 0) { - skb_checksum_complete_unset(skb); + skb_checksum_complete_unset(skb); #ifdef 
CONFIG_PPP_MULTILINK - /* XXX do channel-level decompression here */ - if (PPP_PROTO(skb) == PPP_MP) - ppp_receive_mp_frame(ppp, skb, pch); - else + /* XXX do channel-level decompression here */ + if (PPP_PROTO(skb) == PPP_MP) + ppp_receive_mp_frame(ppp, skb, pch); + else #endif /* CONFIG_PPP_MULTILINK */ - ppp_receive_nonmp_frame(ppp, skb); - } else { - kfree_skb(skb); - ppp_receive_error(ppp); - } + ppp_receive_nonmp_frame(ppp, skb); } static void diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index c2063961f395..b7f243b416f8 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -483,7 +483,7 @@ static void ppp_sync_process(struct tasklet_struct *t) while ((skb = skb_dequeue(&ap->rqueue)) != NULL) { if (skb->len == 0) { /* zero length buffers indicate error */ - ppp_input_error(&ap->chan, 0); + ppp_input_error(&ap->chan); kfree_skb(skb); } else diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index f73fbea0dbc2..ca8ad03eeef0 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -55,7 +55,7 @@ extern void ppp_input(struct ppp_channel *, struct sk_buff *); /* Called by the channel when an input error occurs, indicating that we may have missed a packet. */ -extern void ppp_input_error(struct ppp_channel *, int code); +extern void ppp_input_error(struct ppp_channel *); /* Attach a channel to a given PPP unit in specified net. 
*/ extern int ppp_register_net_channel(struct net *, struct ppp_channel *); diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 2574aae3e066..e3c422dc533a 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -228,7 +228,7 @@ static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb) error: kfree_skb(skb); - ppp_input_error(&pvcc->chan, 0); + ppp_input_error(&pvcc->chan); } static int pppoatm_may_send(struct pppoatm_vcc *pvcc, int size) -- cgit v1.2.3 From e8eb33d650cd5e60b008f9d958262e489de6e7a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Mar 2026 09:22:14 +0000 Subject: tcp: move sysctl_tcp_shrink_window to netns_ipv4_read_txrx group Commit 18fd64d25422 ("netns-ipv4: reorganize netns_ipv4 fast path variables") missed that __tcp_select_window() is reading net->ipv4.sysctl_tcp_shrink_window. Move this field to netns_ipv4_read_txrx group, as __tcp_select_window() is used both in tx and rx paths. Saves a potential cache line miss. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260307092214.2433548-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 4c249aeaf7f1..38624beff9b3 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -74,6 +74,7 @@ struct netns_ipv4 { /* TXRX readonly hotpath cache lines */ __cacheline_group_begin(netns_ipv4_read_txrx); + u8 sysctl_tcp_shrink_window; __cacheline_group_end(netns_ipv4_read_txrx); /* RX readonly hotpath cache line */ @@ -122,7 +123,6 @@ struct netns_ipv4 { #endif bool fib_has_custom_local_routes; bool fib_offload_disabled; - u8 sysctl_tcp_shrink_window; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_t fib_num_tclassid_users; #endif -- cgit v1.2.3 From f2db7b80b03f268ff65fe825a7c761a8f551aa48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Mar 2026 13:36:01 
+0000 Subject: net/sched: refine indirect call mitigation in tc_wrapper.h Some modern cpus disable X86_FEATURE_RETPOLINE feature, even if a direct call can still be beneficial. Even when IBRS is present, an indirect call is more expensive than a direct one: Direct Calls: Compilers can perform powerful optimizations like inlining, where the function body is directly inserted at the call site, eliminating call overhead entirely. Indirect Calls: Inlining is much harder, if not impossible, because the compiler doesn't know the target function at compile time. Techniques like Indirect Call Promotion can help by using profile-guided optimization to turn frequently taken indirect calls into conditional direct calls, but they still add complexity and potential overhead compared to a truly direct call. In this patch, I split tc_skip_wrapper in two different static keys, one for tc_act() (tc_skip_wrapper_act) and one for tc_classify() (tc_skip_wrapper_cls). Then I enable the tc_skip_wrapper_cls only if the count of builtin classifiers is above one. I enable tc_skip_wrapper_act only if the count of builtin actions is above one. In our production kernels, we only have CONFIG_NET_CLS_BPF=y and CONFIG_NET_ACT_BPF=y. Others are modules or are not compiled. Tested on AMD Turin cpus, cls_bpf_classify() cost went from 1% down to 0.18 %, and FDO will be able to inline it in tcf_classify() for further gains. 
Signed-off-by: Eric Dumazet Acked-by: Jamal Hadi Salim Reviewed-by: Pedro Tammela Reviewed-by: Victor Nogueira Link: https://patch.msgid.link/20260307133601.3863071-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_wrapper.h | 47 ++++++++++++++++++++++++++++++++++++++++++----- net/sched/sch_api.c | 3 ++- 2 files changed, 44 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h index ffe58a02537c..4ebb053bb0dd 100644 --- a/include/net/tc_wrapper.h +++ b/include/net/tc_wrapper.h @@ -12,7 +12,8 @@ #define TC_INDIRECT_SCOPE -extern struct static_key_false tc_skip_wrapper; +extern struct static_key_false tc_skip_wrapper_act; +extern struct static_key_false tc_skip_wrapper_cls; /* TC Actions */ #ifdef CONFIG_NET_CLS_ACT @@ -46,7 +47,7 @@ TC_INDIRECT_ACTION_DECLARE(tunnel_key_act); static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - if (static_branch_likely(&tc_skip_wrapper)) + if (static_branch_likely(&tc_skip_wrapper_act)) goto skip; #if IS_BUILTIN(CONFIG_NET_ACT_GACT) @@ -153,7 +154,7 @@ TC_INDIRECT_FILTER_DECLARE(u32_classify); static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - if (static_branch_likely(&tc_skip_wrapper)) + if (static_branch_likely(&tc_skip_wrapper_cls)) goto skip; #if IS_BUILTIN(CONFIG_NET_CLS_BPF) @@ -202,8 +203,44 @@ skip: static inline void tc_wrapper_init(void) { #ifdef CONFIG_X86 - if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) - static_branch_enable(&tc_skip_wrapper); + int cnt_cls = IS_BUILTIN(CONFIG_NET_CLS_BPF) + + IS_BUILTIN(CONFIG_NET_CLS_U32) + + IS_BUILTIN(CONFIG_NET_CLS_FLOWER) + + IS_BUILTIN(CONFIG_NET_CLS_FW) + + IS_BUILTIN(CONFIG_NET_CLS_MATCHALL) + + IS_BUILTIN(CONFIG_NET_CLS_BASIC) + + IS_BUILTIN(CONFIG_NET_CLS_CGROUP) + + IS_BUILTIN(CONFIG_NET_CLS_FLOW) + + IS_BUILTIN(CONFIG_NET_CLS_ROUTE4); + + int cnt_act = IS_BUILTIN(CONFIG_NET_ACT_GACT) + 
+ IS_BUILTIN(CONFIG_NET_ACT_MIRRED) + + IS_BUILTIN(CONFIG_NET_ACT_PEDIT) + + IS_BUILTIN(CONFIG_NET_ACT_SKBEDIT) + + IS_BUILTIN(CONFIG_NET_ACT_SKBMOD) + + IS_BUILTIN(CONFIG_NET_ACT_POLICE) + + IS_BUILTIN(CONFIG_NET_ACT_BPF) + + IS_BUILTIN(CONFIG_NET_ACT_CONNMARK) + + IS_BUILTIN(CONFIG_NET_ACT_CSUM) + + IS_BUILTIN(CONFIG_NET_ACT_CT) + + IS_BUILTIN(CONFIG_NET_ACT_CTINFO) + + IS_BUILTIN(CONFIG_NET_ACT_GATE) + + IS_BUILTIN(CONFIG_NET_ACT_MPLS) + + IS_BUILTIN(CONFIG_NET_ACT_NAT) + + IS_BUILTIN(CONFIG_NET_ACT_TUNNEL_KEY) + + IS_BUILTIN(CONFIG_NET_ACT_VLAN) + + IS_BUILTIN(CONFIG_NET_ACT_IFE) + + IS_BUILTIN(CONFIG_NET_ACT_SIMP) + + IS_BUILTIN(CONFIG_NET_ACT_SAMPLE); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + return; + + if (cnt_cls > 1) + static_branch_enable(&tc_skip_wrapper_cls); + + if (cnt_act > 1) + static_branch_enable(&tc_skip_wrapper_act); #endif } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c0bab092ea80..ed869a5ffc73 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -2479,7 +2479,8 @@ static struct pernet_operations psched_net_ops = { }; #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) -DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); +DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper_act); +DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper_cls); #endif static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = { -- cgit v1.2.3 From 4b78c9cbd8f1fbb9517aee48b372646f4cf05442 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Mar 2026 12:23:02 +0000 Subject: tcp: move tp->chrono_type next tp->chrono_stat[] chrono_type is currently in tcp_sock_read_txrx group, which is supposed to hold read-mostly fields. But chrono_type is mostly written in tx path, it should be moved to tcp_sock_write_tx group, close to other chrono fields (chrono_stat[], chrono_start). Note this adds holes, but data locality is far more important. Use a full u8 for the time being, compiler can generate more efficient code. 
Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20260308122302.2895067-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f72eef31fa23..c44cf9ae8d16 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -228,8 +228,7 @@ struct tcp_sock { u32 sacked_out; /* SACK'd packets */ u16 tcp_header_len; /* Bytes of tcp header to send */ u8 scaling_ratio; /* see tcp_win_from_space() */ - u8 chrono_type : 2, /* current chronograph type */ - repair : 1, + u8 repair : 1, tcp_usec_ts : 1, /* TSval values in usec */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ @@ -264,6 +263,7 @@ struct tcp_sock { * total number of data bytes sent. */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ + u8 chrono_type; /* current chronograph type */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ -- cgit v1.2.3 From d6d4ff335db2d9242937ca474d292010acd35c38 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Mar 2026 12:35:49 +0000 Subject: tcp: inline tcp_chrono_start() tcp_chrono_start() is small enough, and used in TCP sendmsg() fast path (from tcp_skb_entail()). Note clang is already inlining it from functions in tcp_output.c. Inlining it improves performance and reduces bloat : $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/2 grow/shrink: 1/0 up/down: 1/-84 (-83) Function old new delta tcp_skb_entail 280 281 +1 __pfx_tcp_chrono_start 16 - -16 tcp_chrono_start 68 - -68 Total: Before=25192434, After=25192351, chg -0.00% Note that tcp_chrono_stop() is too big. 
Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20260308123549.2924460-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 25 ++++++++++++++++++++++++- net/ipv4/tcp_output.c | 24 ------------------------ 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index f07aef7faa65..9f0aee9e5d76 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2159,7 +2159,30 @@ enum tcp_chrono { __TCP_CHRONO_MAX, }; -void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); +static inline void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) +{ + const u32 now = tcp_jiffies32; + enum tcp_chrono old = tp->chrono_type; + + if (old > TCP_CHRONO_UNSPEC) + tp->chrono_stat[old - 1] += now - tp->chrono_start; + tp->chrono_start = now; + tp->chrono_type = new; +} + +static inline void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* If there are multiple conditions worthy of tracking in a + * chronograph then the highest priority enum takes precedence + * over the other conditions. So that if something "more interesting" + * starts happening, stop the previous chrono and start a new one. 
+ */ + if (type > tp->chrono_type) + tcp_chrono_set(tp, type); +} + void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); /* This helper is needed, because skb->tcp_tsorted_anchor uses diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4377b3673da9..a53802f28dd1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2903,30 +2903,6 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, return false; } -static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) -{ - const u32 now = tcp_jiffies32; - enum tcp_chrono old = tp->chrono_type; - - if (old > TCP_CHRONO_UNSPEC) - tp->chrono_stat[old - 1] += now - tp->chrono_start; - tp->chrono_start = now; - tp->chrono_type = new; -} - -void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) -{ - struct tcp_sock *tp = tcp_sk(sk); - - /* If there are multiple conditions worthy of tracking in a - * chronograph then the highest priority enum takes precedence - * over the other conditions. So that if something "more interesting" - * starts happening, stop the previous chrono and start a new one. - */ - if (type > tp->chrono_type) - tcp_chrono_set(tp, type); -} - void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 1227a8f6c34e297b5fada96aa140129eced771dc Mon Sep 17 00:00:00 2001 From: Anirudh Srinivasan Date: Fri, 6 Mar 2026 11:12:17 -0600 Subject: dt-bindings: clk: tenstorrent: Add tenstorrent,atlantis-prcm-rcpu Document bindings for Tenstorrent Atlantis PRCM that manages clocks and resets. This block is instantiated multiple times in the SoC. This commit documents the clocks from the RCPU PRCM block. 
Reviewed-by: Krzysztof Kozlowski Signed-off-by: Anirudh Srinivasan Reviewed-by: Drew Fustini Signed-off-by: Drew Fustini --- .../clock/tenstorrent,atlantis-prcm-rcpu.yaml | 54 +++++++++++ MAINTAINERS | 2 + .../clock/tenstorrent,atlantis-prcm-rcpu.h | 103 +++++++++++++++++++++ 3 files changed, 159 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/tenstorrent,atlantis-prcm-rcpu.yaml create mode 100644 include/dt-bindings/clock/tenstorrent,atlantis-prcm-rcpu.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/tenstorrent,atlantis-prcm-rcpu.yaml b/Documentation/devicetree/bindings/clock/tenstorrent,atlantis-prcm-rcpu.yaml new file mode 100644 index 000000000000..7fa16526efce --- /dev/null +++ b/Documentation/devicetree/bindings/clock/tenstorrent,atlantis-prcm-rcpu.yaml @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/tenstorrent,atlantis-prcm-rcpu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Tenstorrent Atlantis PRCM (Power, Reset, Clock Management) Module + +maintainers: + - Anirudh Srinivasan + +description: + Multifunctional register block found in Tenstorrent Atlantis SoC whose main + function is to control clocks and resets. This block is instantiated multiple + times in the SoC, each block controls clock and resets for a different + subsystem. RCPU prcm serves low speed IO interfaces. + +properties: + compatible: + enum: + - tenstorrent,atlantis-prcm-rcpu + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + "#clock-cells": + const: 1 + description: + See for valid indices. 
+ + "#reset-cells": + const: 1 + +required: + - compatible + - reg + - clocks + - "#clock-cells" + - "#reset-cells" + +additionalProperties: false + +examples: + - | + clock-controller@a8000000 { + compatible = "tenstorrent,atlantis-prcm-rcpu"; + reg = <0xa8000000 0x10000>; + clocks = <&osc_24m>; + #clock-cells = <1>; + #reset-cells = <1>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index 55af015174a5..40c179c8de1e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22809,8 +22809,10 @@ M: Joel Stanley L: linux-riscv@lists.infradead.org S: Maintained T: git https://github.com/tenstorrent/linux.git +F: Documentation/devicetree/bindings/clock/tenstorrent,atlantis-prcm-rcpu.yaml F: Documentation/devicetree/bindings/riscv/tenstorrent.yaml F: arch/riscv/boot/dts/tenstorrent/ +F: include/dt-bindings/clock/tenstorrent,atlantis-prcm-rcpu.h RISC-V THEAD SoC SUPPORT M: Drew Fustini diff --git a/include/dt-bindings/clock/tenstorrent,atlantis-prcm-rcpu.h b/include/dt-bindings/clock/tenstorrent,atlantis-prcm-rcpu.h new file mode 100644 index 000000000000..c1c875e016f8 --- /dev/null +++ b/include/dt-bindings/clock/tenstorrent,atlantis-prcm-rcpu.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Tenstorrent Atlantis PRCM Clock and Reset Indices + * + * Copyright (c) 2026 Tenstorrent + */ + +#ifndef _DT_BINDINGS_ATLANTIS_PRCM_RCPU_H +#define _DT_BINDINGS_ATLANTIS_PRCM_RCPU_H + +/* + * RCPU Domain Clock IDs + */ +#define CLK_RCPU_PLL 0 +#define CLK_RCPU_ROOT 1 +#define CLK_RCPU_DIV2 2 +#define CLK_RCPU_DIV4 3 +#define CLK_RCPU_RTC 4 +#define CLK_SMNDMA0_ACLK 5 +#define CLK_SMNDMA1_ACLK 6 +#define CLK_WDT0_PCLK 7 +#define CLK_WDT1_PCLK 8 +#define CLK_TIMER_PCLK 9 +#define CLK_PVTC_PCLK 10 +#define CLK_PMU_PCLK 11 +#define CLK_MAILBOX_HCLK 12 +#define CLK_SEC_SPACC_HCLK 13 +#define CLK_SEC_OTP_HCLK 14 +#define CLK_TRNG_PCLK 15 +#define CLK_SEC_CRC_HCLK 16 +#define CLK_SMN_HCLK 17 +#define CLK_AHB0_HCLK 18 +#define CLK_SMN_PCLK 19 +#define 
CLK_SMN_CLK 20 +#define CLK_SCRATCHPAD_CLK 21 +#define CLK_RCPU_CORE_CLK 22 +#define CLK_RCPU_ROM_CLK 23 +#define CLK_OTP_LOAD_CLK 24 +#define CLK_NOC_PLL 25 +#define CLK_NOCC_CLK 26 +#define CLK_NOCC_DIV2 27 +#define CLK_NOCC_DIV4 28 +#define CLK_NOCC_RTC 29 +#define CLK_NOCC_CAN 30 +#define CLK_QSPI_SCLK 31 +#define CLK_QSPI_HCLK 32 +#define CLK_I2C0_PCLK 33 +#define CLK_I2C1_PCLK 34 +#define CLK_I2C2_PCLK 35 +#define CLK_I2C3_PCLK 36 +#define CLK_I2C4_PCLK 37 +#define CLK_UART0_PCLK 38 +#define CLK_UART1_PCLK 39 +#define CLK_UART2_PCLK 40 +#define CLK_UART3_PCLK 41 +#define CLK_UART4_PCLK 42 +#define CLK_SPI0_PCLK 43 +#define CLK_SPI1_PCLK 44 +#define CLK_SPI2_PCLK 45 +#define CLK_SPI3_PCLK 46 +#define CLK_GPIO_PCLK 47 +#define CLK_CAN0_HCLK 48 +#define CLK_CAN0_CLK 49 +#define CLK_CAN1_HCLK 50 +#define CLK_CAN1_CLK 51 +#define CLK_CAN0_TIMER_CLK 52 +#define CLK_CAN1_TIMER_CLK 53 + +/* RCPU domain reset */ +#define RST_SMNDMA0 0 +#define RST_SMNDMA1 1 +#define RST_WDT0 2 +#define RST_WDT1 3 +#define RST_TMR 4 +#define RST_PVTC 5 +#define RST_PMU 6 +#define RST_MAILBOX 7 +#define RST_SPACC 8 +#define RST_OTP 9 +#define RST_TRNG 10 +#define RST_CRC 11 +#define RST_QSPI 12 +#define RST_I2C0 13 +#define RST_I2C1 14 +#define RST_I2C2 15 +#define RST_I2C3 16 +#define RST_I2C4 17 +#define RST_UART0 18 +#define RST_UART1 19 +#define RST_UART2 20 +#define RST_UART3 21 +#define RST_UART4 22 +#define RST_SPI0 23 +#define RST_SPI1 24 +#define RST_SPI2 25 +#define RST_SPI3 26 +#define RST_GPIO 27 +#define RST_CAN0 28 +#define RST_CAN1 29 +#define RST_I2S0 30 +#define RST_I2S1 31 + +#endif /* _DT_BINDINGS_ATLANTIS_PRCM_RCPU_H */ -- cgit v1.2.3 From 4d25c7d68896b4002c4ab5cd646775392bb7fbb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:09 -0800 Subject: iomap: pass the iomap_iter to ->submit_read This provides additional context for file systems. Rename the fuse instance to match the method name while we're at it. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-10-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/fuse/file.c | 5 +++-- fs/iomap/bio.c | 3 ++- fs/iomap/buffered-io.c | 4 ++-- fs/ntfs3/inode.c | 3 ++- include/linux/iomap.h | 3 ++- 5 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b1bb7153cb78..a9c836d7f586 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -947,7 +947,8 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, return ret; } -static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) +static void fuse_iomap_submit_read(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx) { struct fuse_fill_read_data *data = ctx->read_ctx; @@ -958,7 +959,7 @@ static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) static const struct iomap_read_ops fuse_iomap_read_ops = { .read_folio_range = fuse_iomap_read_folio_range_async, - .submit_read = fuse_iomap_read_submit, + .submit_read = fuse_iomap_submit_read, }; static int fuse_read_folio(struct file *file, struct folio *folio) diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index 578b1202e037..cb60d1facb5a 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -18,7 +18,8 @@ static void iomap_read_end_io(struct bio *bio) bio_put(bio); } -static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) +static void iomap_bio_submit_read(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx) { struct bio *bio = ctx->read_ctx; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 00f0efaf12b2..f4ee2b1cb877 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -597,7 +597,7 @@ void iomap_read_folio(const struct iomap_ops *ops, &bytes_submitted); if (ctx->ops->submit_read) - ctx->ops->submit_read(ctx); + ctx->ops->submit_read(&iter, ctx); if (ctx->cur_folio) 
iomap_read_end(ctx->cur_folio, bytes_submitted); @@ -664,7 +664,7 @@ void iomap_readahead(const struct iomap_ops *ops, &cur_bytes_submitted); if (ctx->ops->submit_read) - ctx->ops->submit_read(ctx); + ctx->ops->submit_read(&iter, ctx); if (ctx->cur_folio) iomap_read_end(ctx->cur_folio, cur_bytes_submitted); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 6e65066ebcc1..511967ef7ec9 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -651,7 +651,8 @@ static int ntfs_iomap_bio_read_folio_range(const struct iomap_iter *iter, return 0; } -static void ntfs_iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) +static void ntfs_iomap_bio_submit_read(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx) { struct bio *bio = ctx->read_ctx; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 99b7209dabd7..6fbe121e2adf 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -512,7 +512,8 @@ struct iomap_read_ops { * * This is optional. */ - void (*submit_read)(struct iomap_read_folio_ctx *ctx); + void (*submit_read)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx); }; /* -- cgit v1.2.3 From 5f4fe046cb3c84eed719f7becbe822000e1a589e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:11 -0800 Subject: iomap: allow file systems to hook into buffered read bio submission File systems such as btrfs have additional operations with bios such as verifying data checksums. Allow file systems to hook into submission of the bio to allow for this processing by replacing the direct submit_bio call in iomap_read_alloc_bio with a call into ->submit_read and exporting iomap_read_alloc_bio. Also add a new field to struct iomap_read_folio_ctx to track the file logic offset of the current read context. Based on a patch from Goldwyn Rodrigues . Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-12-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Christian Brauner --- fs/iomap/bio.c | 15 +++++++++------ include/linux/iomap.h | 4 ++++ 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index 80bbd328bd3c..903cb9fe759e 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -32,10 +32,11 @@ static void iomap_read_alloc_bio(const struct iomap_iter *iter, struct folio *folio = ctx->cur_folio; gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; - struct bio *bio = ctx->read_ctx; + struct bio *bio; - if (bio) - submit_bio(bio); + /* Submit the existing range if there was one. */ + if (ctx->read_ctx) + ctx->ops->submit_read(iter, ctx); /* Same as readahead_gfp_mask: */ if (ctx->rac) @@ -56,9 +57,10 @@ static void iomap_read_alloc_bio(const struct iomap_iter *iter, bio_add_folio_nofail(bio, folio, plen, offset_in_folio(folio, iter->pos)); ctx->read_ctx = bio; + ctx->read_ctx_file_offset = iter->pos; } -static int iomap_bio_read_folio_range(const struct iomap_iter *iter, +int iomap_bio_read_folio_range(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t plen) { struct folio *folio = ctx->cur_folio; @@ -70,10 +72,11 @@ static int iomap_bio_read_folio_range(const struct iomap_iter *iter, iomap_read_alloc_bio(iter, ctx, plen); return 0; } +EXPORT_SYMBOL_GPL(iomap_bio_read_folio_range); const struct iomap_read_ops iomap_bio_read_ops = { - .read_folio_range = iomap_bio_read_folio_range, - .submit_read = iomap_bio_submit_read, + .read_folio_range = iomap_bio_read_folio_range, + .submit_read = iomap_bio_submit_read, }; EXPORT_SYMBOL_GPL(iomap_bio_read_ops); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6fbe121e2adf..b2b9e649a3b8 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -493,6 +493,7 @@ struct iomap_read_folio_ctx { struct folio *cur_folio; struct readahead_control *rac; void *read_ctx; + loff_t read_ctx_file_offset; }; struct iomap_read_ops { @@ 
-599,6 +600,9 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, extern struct bio_set iomap_ioend_bioset; #ifdef CONFIG_BLOCK +int iomap_bio_read_folio_range(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t plen); + extern const struct iomap_read_ops iomap_bio_read_ops; static inline void iomap_bio_read_folio(struct folio *folio, -- cgit v1.2.3 From 57287771fa8d77841149bf847b629f29acbad35b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:13 -0800 Subject: iomap: add a bioset pointer to iomap_read_folio_ops Optionally allocate the bio from the bioset provided in iomap_read_folio_ops. If no bioset is provided, fs_bio_set is still used, which is the standard bioset for file systems. Based on a patch from Goldwyn Rodrigues . Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-14-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/bio.c | 14 ++++++++++++-- include/linux/iomap.h | 6 ++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index 903cb9fe759e..259a2bf95a43 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -24,11 +24,19 @@ static void iomap_bio_submit_read(const struct iomap_iter *iter, submit_bio(ctx->read_ctx); } +static struct bio_set *iomap_read_bio_set(struct iomap_read_folio_ctx *ctx) +{ + if (ctx->ops && ctx->ops->bio_set) + return ctx->ops->bio_set; + return &fs_bio_set; +} + static void iomap_read_alloc_bio(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t plen) { const struct iomap *iomap = &iter->iomap; unsigned int nr_vecs = DIV_ROUND_UP(iomap_length(iter), PAGE_SIZE); + struct bio_set *bio_set = iomap_read_bio_set(ctx); struct folio *folio = ctx->cur_folio; gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; @@ -47,9 +55,11 @@ static void iomap_read_alloc_bio(const 
struct iomap_iter *iter, * having to deal with partial page reads. This emulates what * do_mpage_read_folio does. */ - bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, gfp); + bio = bio_alloc_bioset(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, + gfp, bio_set); if (!bio) - bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp); + bio = bio_alloc_bioset(iomap->bdev, 1, REQ_OP_READ, orig_gfp, + bio_set); if (ctx->rac) bio->bi_opf |= REQ_RAHEAD; bio->bi_iter.bi_sector = iomap_sector(iomap, iter->pos); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b2b9e649a3b8..387a1174522f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -515,6 +515,12 @@ struct iomap_read_ops { */ void (*submit_read)(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx); + + /* + * Optional, allows filesystem to specify own bio_set, so new bio's + * can be allocated from the provided bio_set. + */ + struct bio_set *bio_set; }; /* -- cgit v1.2.3 From 0b10a370529cbd7b918c1eef43d409e43d9e0b78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:15 -0800 Subject: iomap: support T10 protection information Add support for generating / verifying protection information in iomap. This is done by hooking into the bio submission and then using the generic PI helpers. Compared to just using the block layer auto PI this extends the protection envelope and also prepares for eventually passing through PI from userspace at least for direct I/O. To generate or verify PI, the file system needs to set the IOMAP_F_INTEGRITY flag on the iomap for the request, and ensure the ioends are used for all integrity I/O. Additionally the file system must defer read I/O completions to user context so that the guard tag validation isn't run from interrupt context. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-16-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Christian Brauner --- fs/iomap/bio.c | 24 +++++++++++++++++++++--- fs/iomap/direct-io.c | 15 ++++++++++++++- fs/iomap/internal.h | 13 +++++++++++++ fs/iomap/ioend.c | 20 ++++++++++++++++++-- include/linux/iomap.h | 7 +++++++ 5 files changed, 73 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index b4de67bdd513..f989ffcaac96 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -3,6 +3,7 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (C) 2016-2023 Christoph Hellwig. */ +#include #include #include #include "internal.h" @@ -17,6 +18,8 @@ static u32 __iomap_read_end_io(struct bio *bio, int error) iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); folio_count++; } + if (bio_integrity(bio)) + fs_bio_integrity_free(bio); bio_put(bio); return folio_count; } @@ -34,7 +37,11 @@ u32 iomap_finish_ioend_buffered_read(struct iomap_ioend *ioend) static void iomap_bio_submit_read(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx) { - submit_bio(ctx->read_ctx); + struct bio *bio = ctx->read_ctx; + + if (iter->iomap.flags & IOMAP_F_INTEGRITY) + fs_bio_integrity_alloc(bio); + submit_bio(bio); } static struct bio_set *iomap_read_bio_set(struct iomap_read_folio_ctx *ctx) @@ -91,6 +98,7 @@ int iomap_bio_read_folio_range(const struct iomap_iter *iter, if (!bio || bio_end_sector(bio) != iomap_sector(&iter->iomap, iter->pos) || + bio->bi_iter.bi_size > iomap_max_bio_size(&iter->iomap) - plen || !bio_add_folio(bio, folio, plen, offset_in_folio(folio, iter->pos))) iomap_read_alloc_bio(iter, ctx, plen); return 0; @@ -107,11 +115,21 @@ int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t len) { const struct iomap *srcmap = iomap_iter_srcmap(iter); + sector_t sector = iomap_sector(srcmap, pos); struct bio_vec bvec; struct bio bio; + int error; bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = iomap_sector(srcmap, 
pos); + bio.bi_iter.bi_sector = sector; bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); - return submit_bio_wait(&bio); + if (srcmap->flags & IOMAP_F_INTEGRITY) + fs_bio_integrity_alloc(&bio); + error = submit_bio_wait(&bio); + if (srcmap->flags & IOMAP_F_INTEGRITY) { + if (!error) + error = fs_bio_integrity_verify(&bio, sector, len); + fs_bio_integrity_free(&bio); + } + return error; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 2cb0c0f43215..c24d94349ca5 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -3,6 +3,7 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2025 Christoph Hellwig. */ +#include #include #include #include @@ -240,6 +241,9 @@ static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion) { struct iomap_dio *dio = bio->bi_private; + if (bio_integrity(bio)) + fs_bio_integrity_free(bio); + if (dio->flags & IOMAP_DIO_BOUNCE) { bio_iov_iter_unbounce(bio, !!dio->error, dio->flags & IOMAP_DIO_USER_BACKED); @@ -350,8 +354,10 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; + if (dio->flags & IOMAP_DIO_BOUNCE) - ret = bio_iov_iter_bounce(bio, dio->submit.iter, BIO_MAX_SIZE); + ret = bio_iov_iter_bounce(bio, dio->submit.iter, + iomap_max_bio_size(&iter->iomap)); else ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1); @@ -368,6 +374,13 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, goto out_put_bio; } + if (iter->iomap.flags & IOMAP_F_INTEGRITY) { + if (dio->flags & IOMAP_DIO_WRITE) + fs_bio_integrity_generate(bio); + else + fs_bio_integrity_alloc(bio); + } + if (dio->flags & IOMAP_DIO_WRITE) task_io_account_write(ret); else if ((dio->flags & IOMAP_DIO_USER_BACKED) && diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h index b39dbc17e3f0..74e898b196dc 100644 --- a/fs/iomap/internal.h +++ b/fs/iomap/internal.h @@ -4,6 +4,19 @@ #define IOEND_BATCH_SIZE 4096 +/* + * 
Normally we can build bios as big as the data structure supports. + * + * But for integrity protected I/O we need to respect the maximum size of the + * single contiguous allocation for the integrity buffer. + */ +static inline size_t iomap_max_bio_size(const struct iomap *iomap) +{ + if (iomap->flags & IOMAP_F_INTEGRITY) + return max_integrity_io_size(bdev_limits(iomap->bdev)); + return BIO_MAX_SIZE; +} + u32 iomap_finish_ioend_buffered_read(struct iomap_ioend *ioend); u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 450ab002eb91..7c034b6a583e 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2016-2025 Christoph Hellwig. */ +#include #include #include #include @@ -65,6 +66,8 @@ static u32 iomap_finish_ioend_buffered_write(struct iomap_ioend *ioend) folio_count++; } + if (bio_integrity(bio)) + fs_bio_integrity_free(bio); bio_put(bio); /* frees the ioend */ return folio_count; } @@ -144,6 +147,8 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error) return error; } + if (wpc->iomap.flags & IOMAP_F_INTEGRITY) + fs_bio_integrity_generate(&ioend->io_bio); submit_bio(&ioend->io_bio); return 0; } @@ -165,10 +170,13 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, } static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, - u16 ioend_flags) + unsigned int map_len, u16 ioend_flags) { struct iomap_ioend *ioend = wpc->wb_ctx; + if (ioend->io_bio.bi_iter.bi_size > + iomap_max_bio_size(&wpc->iomap) - map_len) + return false; if (ioend_flags & IOMAP_IOEND_BOUNDARY) return false; if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != @@ -234,7 +242,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) ioend_flags |= IOMAP_IOEND_BOUNDARY; - if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { + 
if (!ioend || !iomap_can_add_to_ioend(wpc, pos, map_len, ioend_flags)) { new_ioend: if (ioend) { error = wpc->ops->writeback_submit(wpc, 0); @@ -311,6 +319,14 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) if (!atomic_dec_and_test(&ioend->io_remaining)) return 0; + + if (!ioend->io_error && + bio_integrity(&ioend->io_bio) && + bio_op(&ioend->io_bio) == REQ_OP_READ) { + ioend->io_error = fs_bio_integrity_verify(&ioend->io_bio, + ioend->io_sector, ioend->io_size); + } + if (ioend->io_flags & IOMAP_IOEND_DIRECT) return iomap_finish_ioend_direct(ioend); if (bio_op(&ioend->io_bio) == REQ_OP_READ) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 387a1174522f..531f9ebdeeae 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -65,6 +65,8 @@ struct vm_fault; * * IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic * bio, i.e. set REQ_ATOMIC. + * + * IOMAP_F_INTEGRITY indicates that the filesystems handles integrity metadata. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -79,6 +81,11 @@ struct vm_fault; #define IOMAP_F_BOUNDARY (1U << 6) #define IOMAP_F_ANON_WRITE (1U << 7) #define IOMAP_F_ATOMIC_BIO (1U << 8) +#ifdef CONFIG_BLK_DEV_INTEGRITY +#define IOMAP_F_INTEGRITY (1U << 9) +#else +#define IOMAP_F_INTEGRITY 0 +#endif /* CONFIG_BLK_DEV_INTEGRITY */ /* * Flag reserved for file system specific usage -- cgit v1.2.3 From 1b50f42049d8270986a952e621415278e0945ce4 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Mon, 9 Mar 2026 16:45:29 -0400 Subject: RDMA/hfi1: Remove opa_vnic OPA Vnic has been abandoned and left to rot. Time to excise. 
Signed-off-by: Dennis Dalessandro Link: https://patch.msgid.link/177308912950.1280237.15051663328388849915.stgit@awdrv-04.cornelisnetworks.com Signed-off-by: Leon Romanovsky --- Documentation/driver-api/infiniband.rst | 15 - Documentation/infiniband/index.rst | 1 - Documentation/infiniband/opa_vnic.rst | 159 --- .../translations/zh_CN/infiniband/index.rst | 1 - .../translations/zh_CN/infiniband/opa_vnic.rst | 156 --- MAINTAINERS | 6 - drivers/infiniband/Kconfig | 2 - drivers/infiniband/hw/hfi1/Makefile | 4 +- drivers/infiniband/hw/hfi1/aspm.c | 2 +- drivers/infiniband/hw/hfi1/chip.c | 54 +- drivers/infiniband/hw/hfi1/chip.h | 2 - drivers/infiniband/hw/hfi1/driver.c | 13 +- drivers/infiniband/hw/hfi1/hfi.h | 20 - drivers/infiniband/hw/hfi1/init.c | 4 +- drivers/infiniband/hw/hfi1/mad.c | 1 - drivers/infiniband/hw/hfi1/msix.c | 4 +- drivers/infiniband/hw/hfi1/netdev.h | 8 +- drivers/infiniband/hw/hfi1/netdev_rx.c | 3 +- drivers/infiniband/hw/hfi1/verbs.c | 2 - drivers/infiniband/hw/hfi1/vnic.h | 126 --- drivers/infiniband/hw/hfi1/vnic_main.c | 615 ------------ drivers/infiniband/hw/hfi1/vnic_sdma.c | 282 ------ drivers/infiniband/ulp/Makefile | 1 - drivers/infiniband/ulp/opa_vnic/Kconfig | 9 - drivers/infiniband/ulp/opa_vnic/Makefile | 9 - drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c | 513 ---------- drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h | 524 ---------- drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c | 183 ---- .../infiniband/ulp/opa_vnic/opa_vnic_internal.h | 329 ------ drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c | 400 -------- drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 1056 -------------------- .../infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c | 390 -------- include/rdma/ib_verbs.h | 6 - include/rdma/opa_vnic.h | 96 -- 34 files changed, 20 insertions(+), 4976 deletions(-) delete mode 100644 Documentation/infiniband/opa_vnic.rst delete mode 100644 Documentation/translations/zh_CN/infiniband/opa_vnic.rst delete mode 100644 
drivers/infiniband/hw/hfi1/vnic.h delete mode 100644 drivers/infiniband/hw/hfi1/vnic_main.c delete mode 100644 drivers/infiniband/hw/hfi1/vnic_sdma.c delete mode 100644 drivers/infiniband/ulp/opa_vnic/Kconfig delete mode 100644 drivers/infiniband/ulp/opa_vnic/Makefile delete mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c delete mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h delete mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c delete mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h delete mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c delete mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c delete mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c delete mode 100644 include/rdma/opa_vnic.h (limited to 'include') diff --git a/Documentation/driver-api/infiniband.rst b/Documentation/driver-api/infiniband.rst index 10d8be9e74fe..d48f246774d2 100644 --- a/Documentation/driver-api/infiniband.rst +++ b/Documentation/driver-api/infiniband.rst @@ -92,21 +92,6 @@ iSCSI Extensions for RDMA (iSER) .. kernel-doc:: drivers/infiniband/ulp/iser/iser_verbs.c :internal: -Omni-Path (OPA) Virtual NIC support ------------------------------------ - -.. kernel-doc:: drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h - :internal: - -.. kernel-doc:: drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h - :internal: - -.. kernel-doc:: drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c - :internal: - -.. 
kernel-doc:: drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c - :internal: - InfiniBand SCSI RDMA protocol target support -------------------------------------------- diff --git a/Documentation/infiniband/index.rst b/Documentation/infiniband/index.rst index c11049d25703..f57387a92338 100644 --- a/Documentation/infiniband/index.rst +++ b/Documentation/infiniband/index.rst @@ -9,7 +9,6 @@ InfiniBand core_locking ipoib - opa_vnic sysfs tag_matching ucaps diff --git a/Documentation/infiniband/opa_vnic.rst b/Documentation/infiniband/opa_vnic.rst deleted file mode 100644 index 2f888d9ffec0..000000000000 --- a/Documentation/infiniband/opa_vnic.rst +++ /dev/null @@ -1,159 +0,0 @@ -================================================================= -Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) -================================================================= - -Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) feature -supports Ethernet functionality over Omni-Path fabric by encapsulating -the Ethernet packets between HFI nodes. - -Architecture -============= -The patterns of exchanges of Omni-Path encapsulated Ethernet packets -involves one or more virtual Ethernet switches overlaid on the Omni-Path -fabric topology. A subset of HFI nodes on the Omni-Path fabric are -permitted to exchange encapsulated Ethernet packets across a particular -virtual Ethernet switch. The virtual Ethernet switches are logical -abstractions achieved by configuring the HFI nodes on the fabric for -header generation and processing. In the simplest configuration all HFI -nodes across the fabric exchange encapsulated Ethernet packets over a -single virtual Ethernet switch. A virtual Ethernet switch, is effectively -an independent Ethernet network. The configuration is performed by an -Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM) -application. HFI nodes can have multiple VNICs each connected to a -different virtual Ethernet switch. 
The below diagram presents a case -of two virtual Ethernet switches with two HFI nodes:: - - +-------------------+ - | Subnet/ | - | Ethernet | - | Manager | - +-------------------+ - / / - / / - / / - / / - +-----------------------------+ +------------------------------+ - | Virtual Ethernet Switch | | Virtual Ethernet Switch | - | +---------+ +---------+ | | +---------+ +---------+ | - | | VPORT | | VPORT | | | | VPORT | | VPORT | | - +--+---------+----+---------+-+ +-+---------+----+---------+---+ - | \ / | - | \ / | - | \/ | - | / \ | - | / \ | - +-----------+------------+ +-----------+------------+ - | VNIC | VNIC | | VNIC | VNIC | - +-----------+------------+ +-----------+------------+ - | HFI | | HFI | - +------------------------+ +------------------------+ - - -The Omni-Path encapsulated Ethernet packet format is as described below. - -==================== ================================ -Bits Field -==================== ================================ -Quad Word 0: -0-19 SLID (lower 20 bits) -20-30 Length (in Quad Words) -31 BECN bit -32-51 DLID (lower 20 bits) -52-56 SC (Service Class) -57-59 RC (Routing Control) -60 FECN bit -61-62 L2 (=10, 16B format) -63 LT (=1, Link Transfer Head Flit) - -Quad Word 1: -0-7 L4 type (=0x78 ETHERNET) -8-11 SLID[23:20] -12-15 DLID[23:20] -16-31 PKEY -32-47 Entropy -48-63 Reserved - -Quad Word 2: -0-15 Reserved -16-31 L4 header -32-63 Ethernet Packet - -Quad Words 3 to N-1: -0-63 Ethernet packet (pad extended) - -Quad Word N (last): -0-23 Ethernet packet (pad extended) -24-55 ICRC -56-61 Tail -62-63 LT (=01, Link Transfer Tail Flit) -==================== ================================ - -Ethernet packet is padded on the transmit side to ensure that the VNIC OPA -packet is quad word aligned. The 'Tail' field contains the number of bytes -padded. On the receive side the 'Tail' field is read and the padding is -removed (along with ICRC, Tail and OPA header) before passing packet up -the network stack. 
- -The L4 header field contains the virtual Ethernet switch id the VNIC port -belongs to. On the receive side, this field is used to de-multiplex the -received VNIC packets to different VNIC ports. - -Driver Design -============== -Intel OPA VNIC software design is presented in the below diagram. -OPA VNIC functionality has a HW dependent component and a HW -independent component. - -The support has been added for IB device to allocate and free the RDMA -netdev devices. The RDMA netdev supports interfacing with the network -stack thus creating standard network interfaces. OPA_VNIC is an RDMA -netdev device type. - -The HW dependent VNIC functionality is part of the HFI1 driver. It -implements the verbs to allocate and free the OPA_VNIC RDMA netdev. -It involves HW resource allocation/management for VNIC functionality. -It interfaces with the network stack and implements the required -net_device_ops functions. It expects Omni-Path encapsulated Ethernet -packets in the transmit path and provides HW access to them. It strips -the Omni-Path header from the received packets before passing them up -the network stack. It also implements the RDMA netdev control operations. - -The OPA VNIC module implements the HW independent VNIC functionality. -It consists of two parts. The VNIC Ethernet Management Agent (VEMA) -registers itself with IB core as an IB client and interfaces with the -IB MAD stack. It exchanges the management information with the Ethernet -Manager (EM) and the VNIC netdev. The VNIC netdev part allocates and frees -the OPA_VNIC RDMA netdev devices. It overrides the net_device_ops functions -set by HW dependent VNIC driver where required to accommodate any control -operation. It also handles the encapsulation of Ethernet packets with an -Omni-Path header in the transmit path. For each VNIC interface, the -information required for encapsulation is configured by the EM via VEMA MAD -interface. 
It also passes any control information to the HW dependent driver -by invoking the RDMA netdev control operations:: - - +-------------------+ +----------------------+ - | | | Linux | - | IB MAD | | Network | - | | | Stack | - +-------------------+ +----------------------+ - | | | - | | | - +----------------------------+ | - | | | - | OPA VNIC Module | | - | (OPA VNIC RDMA Netdev | | - | & EMA functions) | | - | | | - +----------------------------+ | - | | - | | - +------------------+ | - | IB core | | - +------------------+ | - | | - | | - +--------------------------------------------+ - | | - | HFI1 Driver with VNIC support | - | | - +--------------------------------------------+ diff --git a/Documentation/translations/zh_CN/infiniband/index.rst b/Documentation/translations/zh_CN/infiniband/index.rst index 5634cc48379f..aeeea0b49939 100644 --- a/Documentation/translations/zh_CN/infiniband/index.rst +++ b/Documentation/translations/zh_CN/infiniband/index.rst @@ -24,7 +24,6 @@ infiniband core_locking ipoib - opa_vnic sysfs tag_matching user_mad diff --git a/Documentation/translations/zh_CN/infiniband/opa_vnic.rst b/Documentation/translations/zh_CN/infiniband/opa_vnic.rst deleted file mode 100644 index 12b147fbf792..000000000000 --- a/Documentation/translations/zh_CN/infiniband/opa_vnic.rst +++ /dev/null @@ -1,156 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/infiniband/opa_vnic.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - 王普宇 Puyu Wang - 时奎亮 Alex Shi - -.. 
_cn_infiniband_opa_vnic: - -============================================= -英特尔全路径(OPA)虚拟网络接口控制器(VNIC) -============================================= - -英特尔全路径(OPA)虚拟网络接口控制器(VNIC)功能通过封装HFI节点之间的以 -太网数据包,支持Omni-Path结构上的以太网功能。 - -体系结构 -======== - -Omni-Path封装的以太网数据包的交换模式涉及Omni-Path结构拓扑上覆盖的一个或 -多个虚拟以太网交换机。Omni-Path结构上的HFI节点的一个子集被允许在特定的虚 -拟以太网交换机上交换封装的以太网数据包。虚拟以太网交换机是通过配置结构上的 -HFI节点实现的逻辑抽象,用于生成和处理报头。在最简单的配置中,整个结构的所有 -HFI节点通过一个虚拟以太网交换机交换封装的以太网数据包。一个虚拟以太网交换机, -实际上是一个独立的以太网网络。该配置由以太网管理器(EM)执行,它是可信的结 -构管理器(FM)应用程序的一部分。HFI节点可以有多个VNIC,每个连接到不同的虚 -拟以太网交换机。下图介绍了两个虚拟以太网交换机与两个HFI节点的情况:: - - +-------------------+ - | 子网/ | - | 以太网 | - | 管理 | - +-------------------+ - / / - / / - / / - / / - +-----------------------------+ +------------------------------+ - | 虚拟以太网切换 | | 虚拟以太网切换 | - | +---------+ +---------+ | | +---------+ +---------+ | - | | VPORT | | VPORT | | | | VPORT | | VPORT | | - +--+---------+----+---------+-+ +-+---------+----+---------+---+ - | \ / | - | \ / | - | \/ | - | / \ | - | / \ | - +-----------+------------+ +-----------+------------+ - | VNIC | VNIC | | VNIC | VNIC | - +-----------+------------+ +-----------+------------+ - | HFI | | HFI | - +------------------------+ +------------------------+ - - -Omni-Path封装的以太网数据包格式如下所述。 - -==================== ================================ -位 域 -==================== ================================ -Quad Word 0: -0-19 SLID (低20位) -20-30 长度 (以四字为单位) -31 BECN 位 -32-51 DLID (低20位) -52-56 SC (服务级别) -57-59 RC (路由控制) -60 FECN 位 -61-62 L2 (=10, 16B 格式) -63 LT (=1, 链路传输头 Flit) - -Quad Word 1: -0-7 L4 type (=0x78 ETHERNET) -8-11 SLID[23:20] -12-15 DLID[23:20] -16-31 PKEY -32-47 熵 -48-63 保留 - -Quad Word 2: -0-15 保留 -16-31 L4 头 -32-63 以太网数据包 - -Quad Words 3 to N-1: -0-63 以太网数据包 (pad拓展) - -Quad Word N (last): -0-23 以太网数据包 (pad拓展) -24-55 ICRC -56-61 尾 -62-63 LT (=01, 链路传输尾 Flit) -==================== ================================ - -以太网数据包在传输端被填充,以确保VNIC OPA数据包是四字对齐的。“尾”字段 -包含填充的字节数。在接收端,“尾”字段被读取,在将数据包向上传递到网络堆 
-栈之前,填充物被移除(与ICRC、尾和OPA头一起)。 - -L4头字段包含VNIC端口所属的虚拟以太网交换机ID。在接收端,该字段用于将收 -到的VNIC数据包去多路复用到不同的VNIC端口。 - -驱动设计 -======== - -英特尔OPA VNIC的软件设计如下图所示。OPA VNIC功能有一个依赖于硬件的部分 -和一个独立于硬件的部分。 - -对IB设备分配和释放RDMA netdev设备的支持已经被加入。RDMA netdev支持与 -网络堆栈的对接,从而创建标准的网络接口。OPA_VNIC是一个RDMA netdev设备 -类型。 - -依赖于HW的VNIC功能是HFI1驱动的一部分。它实现了分配和释放OPA_VNIC RDMA -netdev的动作。它涉及VNIC功能的HW资源分配/管理。它与网络堆栈接口并实现所 -需的net_device_ops功能。它在传输路径中期待Omni-Path封装的以太网数据包, -并提供对它们的HW访问。在将数据包向上传递到网络堆栈之前,它把Omni-Path头 -从接收的数据包中剥离。它还实现了RDMA netdev控制操作。 - -OPA VNIC模块实现了独立于硬件的VNIC功能。它由两部分组成。VNIC以太网管理 -代理(VEMA)作为一个IB客户端向IB核心注册,并与IB MAD栈接口。它与以太网 -管理器(EM)和VNIC netdev交换管理信息。VNIC netdev部分分配和释放OPA_VNIC -RDMA netdev设备。它在需要时覆盖由依赖HW的VNIC驱动设置的net_device_ops函数, -以适应任何控制操作。它还处理以太网数据包的封装,在传输路径中使用Omni-Path头。 -对于每个VNIC接口,封装所需的信息是由EM通过VEMA MAD接口配置的。它还通过调用 -RDMA netdev控制操作将任何控制信息传递给依赖于HW的驱动程序:: - - +-------------------+ +----------------------+ - | | | Linux | - | IB MAD | | 网络 | - | | | 栈 | - +-------------------+ +----------------------+ - | | | - | | | - +----------------------------+ | - | | | - | OPA VNIC 模块 | | - | (OPA VNIC RDMA Netdev | | - | & EMA 函数) | | - | | | - +----------------------------+ | - | | - | | - +------------------+ | - | IB 核心 | | - +------------------+ | - | | - | | - +--------------------------------------------+ - | | - | HFI1 驱动和 VNIC 支持 | - | | - +--------------------------------------------+ diff --git a/MAINTAINERS b/MAINTAINERS index 61bf550fd37c..61e69cbf0f71 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19764,12 +19764,6 @@ L: linux-rtc@vger.kernel.org S: Maintained F: drivers/rtc/rtc-optee.c -OPA-VNIC DRIVER -M: Dennis Dalessandro -L: linux-rdma@vger.kernel.org -S: Supported -F: drivers/infiniband/ulp/opa_vnic - OPEN ALLIANCE 10BASE-T1S MACPHY SERIAL INTERFACE FRAMEWORK M: Parthiban Veerasooran L: netdev@vger.kernel.org diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 78ac2ff5befd..aa85ec57f2a7 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -112,6 
+112,4 @@ source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" source "drivers/infiniband/ulp/rtrs/Kconfig" -source "drivers/infiniband/ulp/opa_vnic/Kconfig" - endif # INFINIBAND diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 5d977f363684..b5551bd4703b 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -49,9 +49,7 @@ hfi1-y := \ user_pages.o \ user_sdma.o \ verbs.o \ - verbs_txreq.o \ - vnic_main.o \ - vnic_sdma.o + verbs_txreq.o ifdef CONFIG_DEBUG_FS hfi1-y += debugfs.o diff --git a/drivers/infiniband/hw/hfi1/aspm.c b/drivers/infiniband/hw/hfi1/aspm.c index 79990d09522b..61455d4ac6c2 100644 --- a/drivers/infiniband/hw/hfi1/aspm.c +++ b/drivers/infiniband/hw/hfi1/aspm.c @@ -179,7 +179,7 @@ static void aspm_ctx_timer_function(struct timer_list *t) } /* - * Disable interrupt processing for verbs contexts when PSM or VNIC contexts + * Disable interrupt processing for verbs contexts when PSM contexts * are open. 
*/ void aspm_disable_all(struct hfi1_devdata *dd) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 6a9db4d17c5a..44c524e45396 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -85,12 +85,12 @@ struct flag_table { /* * RSM instance allocation * 0 - User Fecn Handling - * 1 - Vnic + * 1 - Deprecated * 2 - AIP * 3 - Verbs */ #define RSM_INS_FECN 0 -#define RSM_INS_VNIC 1 +#define RSM_INS_DEPRECATED 1 #define RSM_INS_AIP 2 #define RSM_INS_VERBS 3 @@ -152,15 +152,6 @@ struct flag_table { #define DETH_AIP_SQPN_SELECT_OFFSET \ DETH_AIP_SQPN_OFFSET(DETH_AIP_SQPN_BIT_OFFSET) -/* RSM fields for Vnic */ -/* L2_TYPE: QW 0, OFFSET 61 - for match */ -#define L2_TYPE_QW 0ull -#define L2_TYPE_BIT_OFFSET 61ull -#define L2_TYPE_OFFSET(off) ((L2_TYPE_QW << QW_SHIFT) | (off)) -#define L2_TYPE_MATCH_OFFSET L2_TYPE_OFFSET(L2_TYPE_BIT_OFFSET) -#define L2_TYPE_MASK 3ull -#define L2_16B_VALUE 2ull - /* L4_TYPE QW 1, OFFSET 0 - for match */ #define L4_TYPE_QW 1ull #define L4_TYPE_BIT_OFFSET 0ull @@ -6844,9 +6835,9 @@ static void rxe_kernel_unfreeze(struct hfi1_devdata *dd) for (i = 0; i < dd->num_rcv_contexts; i++) { rcd = hfi1_rcd_get_by_index(dd, i); - /* Ensure all non-user contexts(including vnic) are enabled */ + /* Ensure all non-user contexts are enabled */ if (!rcd || - (i >= dd->first_dyn_alloc_ctxt && !rcd->is_vnic)) { + (i >= dd->first_dyn_alloc_ctxt)) { hfi1_rcd_put(rcd); continue; } @@ -8467,7 +8458,7 @@ int hfi1_netdev_rx_napi(struct napi_struct *napi, int budget) return work_done; } -/* Receive packet napi handler for netdevs VNIC and AIP */ +/* Receive packet napi handler for netdevs AIP */ irqreturn_t receive_context_interrupt_napi(int irq, void *data) { struct hfi1_ctxtdata *rcd = data; @@ -14506,7 +14497,7 @@ static bool hfi1_netdev_update_rmt(struct hfi1_devdata *dd) int ctxt_count = hfi1_netdev_ctxt_count(dd); /* We already have contexts mapped in RMT */ - if (has_rsm_rule(dd, RSM_INS_VNIC) 
|| has_rsm_rule(dd, RSM_INS_AIP)) { + if (has_rsm_rule(dd, RSM_INS_AIP)) { dd_dev_info(dd, "Contexts are already mapped in RMT\n"); return true; } @@ -14587,37 +14578,6 @@ void hfi1_init_aip_rsm(struct hfi1_devdata *dd) } } -/* Initialize RSM for VNIC */ -void hfi1_init_vnic_rsm(struct hfi1_devdata *dd) -{ - int rmt_start = hfi1_netdev_get_free_rmt_idx(dd); - struct rsm_rule_data rrd = { - /* Add rule for vnic */ - .offset = rmt_start, - .pkt_type = 4, - /* Match 16B packets */ - .field1_off = L2_TYPE_MATCH_OFFSET, - .mask1 = L2_TYPE_MASK, - .value1 = L2_16B_VALUE, - /* Match ETH L4 packets */ - .field2_off = L4_TYPE_MATCH_OFFSET, - .mask2 = L4_16B_TYPE_MASK, - .value2 = L4_16B_ETH_VALUE, - /* Calc context from veswid and entropy */ - .index1_off = L4_16B_HDR_VESWID_OFFSET, - .index1_width = ilog2(NUM_NETDEV_MAP_ENTRIES), - .index2_off = L2_16B_ENTROPY_OFFSET, - .index2_width = ilog2(NUM_NETDEV_MAP_ENTRIES) - }; - - hfi1_enable_rsm_rule(dd, RSM_INS_VNIC, &rrd); -} - -void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd) -{ - clear_rsm_rule(dd, RSM_INS_VNIC); -} - void hfi1_deinit_aip_rsm(struct hfi1_devdata *dd) { /* only actually clear the rule if it's the last user asking to do so */ @@ -15195,7 +15155,7 @@ int hfi1_init_dd(struct hfi1_devdata *dd) (dd->revision >> CCE_REVISION_SW_SHIFT) & CCE_REVISION_SW_MASK); - /* alloc VNIC/AIP rx data */ + /* alloc AIP rx data */ ret = hfi1_alloc_rx(dd); if (ret) goto bail_cleanup; diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 6992f6d40255..56e03d486ace 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -1392,8 +1392,6 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt, u16 pkey); int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt); void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality); -void hfi1_init_vnic_rsm(struct hfi1_devdata *dd); -void hfi1_deinit_vnic_rsm(struct 
hfi1_devdata *dd); irqreturn_t general_interrupt(int irq, void *data); irqreturn_t sdma_interrupt(int irq, void *data); diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 06487e20f723..c7259cc39013 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -20,7 +20,6 @@ #include "qp.h" #include "sdma.h" #include "debugfs.h" -#include "vnic.h" #include "fault.h" #include "ipoib.h" @@ -909,11 +908,11 @@ static void set_all_fastpath(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) u16 i; /* - * For dynamically allocated kernel contexts (like vnic) switch + * For dynamically allocated kernel contexts switch * interrupt handler only for that context. Otherwise, switch * interrupt handler for all statically allocated kernel contexts. */ - if (rcd->ctxt >= dd->first_dyn_alloc_ctxt && !rcd->is_vnic) { + if (rcd->ctxt >= dd->first_dyn_alloc_ctxt) { hfi1_rcd_get(rcd); hfi1_set_fast(rcd); hfi1_rcd_put(rcd); @@ -922,7 +921,7 @@ static void set_all_fastpath(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) for (i = HFI1_CTRL_CTXT + 1; i < dd->num_rcv_contexts; i++) { rcd = hfi1_rcd_get_by_index(dd, i); - if (rcd && (i < dd->first_dyn_alloc_ctxt || rcd->is_vnic)) + if (rcd && (i < dd->first_dyn_alloc_ctxt)) hfi1_set_fast(rcd); hfi1_rcd_put(rcd); } @@ -938,7 +937,7 @@ void set_all_slowpath(struct hfi1_devdata *dd) rcd = hfi1_rcd_get_by_index(dd, i); if (!rcd) continue; - if (i < dd->first_dyn_alloc_ctxt || rcd->is_vnic) + if (i < dd->first_dyn_alloc_ctxt) rcd->do_interrupt = rcd->slow_handler; hfi1_rcd_put(rcd); @@ -1400,7 +1399,7 @@ int hfi1_reset_device(int unit) goto bail; } - /* If there are any user/vnic contexts, we cannot reset */ + /* If there are any user contexts, we cannot reset */ mutex_lock(&hfi1_mutex); if (dd->rcd) if (hfi1_stats.sps_ctxts) { @@ -1899,7 +1898,7 @@ const rhf_rcv_function_ptr netdev_rhf_rcv_functions[] = { [RHF_RCV_TYPE_EAGER] = process_receive_invalid, 
[RHF_RCV_TYPE_IB] = hfi1_ipoib_ib_rcv, [RHF_RCV_TYPE_ERROR] = process_receive_error, - [RHF_RCV_TYPE_BYPASS] = hfi1_vnic_bypass_rcv, + [RHF_RCV_TYPE_BYPASS] = process_receive_invalid, [RHF_RCV_TYPE_INVALID5] = process_receive_invalid, [RHF_RCV_TYPE_INVALID6] = process_receive_invalid, [RHF_RCV_TYPE_INVALID7] = process_receive_invalid, diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index cb630551cf1a..5a0310f758dc 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -212,10 +212,6 @@ struct hfi1_ctxtdata { u8 rhf_offset; /* dynamic receive available interrupt timeout */ u8 rcvavail_timeout; - /* Indicates that this is vnic context */ - bool is_vnic; - /* vnic queue index this context is mapped to */ - u8 vnic_q_idx; /* Is ASPM interrupt supported for this context */ bool aspm_intr_supported; /* ASPM state (enabled/disabled) for this context */ @@ -402,7 +398,6 @@ struct hfi1_packet { #define OPA_16B_L4_FM 0x08 #define OPA_16B_L4_IB_LOCAL 0x09 #define OPA_16B_L4_IB_GLOBAL 0x0A -#define OPA_16B_L4_ETHR OPA_VNIC_L4_ETHR /* * OPA 16B Management @@ -997,14 +992,6 @@ struct hfi1_asic_data { #define NUM_MAP_ENTRIES 256 #define NUM_MAP_REGS 32 -/* Virtual NIC information */ -struct hfi1_vnic_data { - struct kmem_cache *txreq_cache; - u8 num_vports; -}; - -struct hfi1_vnic_vport_info; - /* device data struct now contains only "general per-device" info. * fields related to a physical IB port are in a hfi1_pportdata struct. 
*/ @@ -1298,9 +1285,6 @@ struct hfi1_devdata { send_routine process_dma_send; void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, const void *from, size_t count); - int (*process_vnic_dma_send)(struct hfi1_devdata *dd, u8 q_idx, - struct hfi1_vnic_vport_info *vinfo, - struct sk_buff *skb, u64 pbc, u8 plen); /* hfi1_pportdata, points to array of (physical) port-specific * data structs, indexed by pidx (0..n-1) */ @@ -1314,7 +1298,6 @@ struct hfi1_devdata { u16 flags; /* Number of physical ports available */ u8 num_pports; - /* Lowest context number which can be used by user processes or VNIC */ u8 first_dyn_alloc_ctxt; /* adding a new field here would make it part of this cacheline */ @@ -1353,11 +1336,8 @@ struct hfi1_devdata { bool aspm_enabled; /* ASPM state: enabled/disabled */ struct rhashtable *sdma_rht; - /* vnic data */ - struct hfi1_vnic_data vnic; /* Lock to protect IRQ SRC register access */ spinlock_t irq_src_lock; - int vnic_num_vports; struct hfi1_netdev_rx *netdev_rx; struct hfi1_affinity_node *affinity_entry; diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 07333dd37652..8b5a5b32b0fa 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -26,7 +26,6 @@ #include "verbs.h" #include "aspm.h" #include "affinity.h" -#include "vnic.h" #include "exp_rcv.h" #include "netdev.h" @@ -349,7 +348,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, * We do this here because we have to take into account all * the RcvArray entries that previous context would have * taken and we have to account for any extra groups assigned - * to the static (kernel) or dynamic (vnic/user) contexts. + * to the static (kernel) or dynamic (user) contexts. 
*/ if (ctxt < dd->first_dyn_alloc_ctxt) { if (ctxt < kctxt_ngroups) { @@ -851,7 +850,6 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) dd->process_pio_send = hfi1_verbs_send_pio; dd->process_dma_send = hfi1_verbs_send_dma; dd->pio_inline_send = pio_copy; - dd->process_vnic_dma_send = hfi1_vnic_send_dma; if (is_ax(dd)) { atomic_set(&dd->drop_packet, DROP_PACKET_ON); diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 03467e6c19a0..585f1d99b91b 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -12,7 +12,6 @@ #include "mad.h" #include "trace.h" #include "qp.h" -#include "vnic.h" /* the reset value from the FM is supposed to be 0xffff, handle both */ #define OPA_LINK_WIDTH_RESET_OLD 0x0fff diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c index 3ac50ca4afcc..c06f4741c89e 100644 --- a/drivers/infiniband/hw/hfi1/msix.c +++ b/drivers/infiniband/hw/hfi1/msix.c @@ -24,7 +24,6 @@ int msix_initialize(struct hfi1_devdata *dd) * one for the general, "slow path" interrupt * one per used SDMA engine * one per kernel receive context - * one for each VNIC context * ...any new IRQs should be added here. */ total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_netdev_contexts; @@ -127,8 +126,7 @@ static int msix_request_rcd_irq_common(struct hfi1_ctxtdata *rcd, irq_handler_t thread, const char *name) { - int nr = msix_request_irq(rcd->dd, rcd, handler, thread, - rcd->is_vnic ? IRQ_NETDEVCTXT : IRQ_RCVCTXT, + int nr = msix_request_irq(rcd->dd, rcd, handler, thread, IRQ_RCVCTXT, name); if (nr < 0) return nr; diff --git a/drivers/infiniband/hw/hfi1/netdev.h b/drivers/infiniband/hw/hfi1/netdev.h index 07c8f77c9181..c6440bd07d2e 100644 --- a/drivers/infiniband/hw/hfi1/netdev.h +++ b/drivers/infiniband/hw/hfi1/netdev.h @@ -14,7 +14,7 @@ /** * struct hfi1_netdev_rxq - Receive Queue for HFI - * Both IPoIB and VNIC netdevices will be working on the rx abstraction. 
+ * IPoIB netdevices will be working on the rx abstraction. * @napi: napi object * @rx: ptr to netdev_rx * @rcd: ptr to receive context data @@ -25,10 +25,6 @@ struct hfi1_netdev_rxq { struct hfi1_ctxtdata *rcd; }; -/* - * Number of netdev contexts used. Ensure it is less than or equal to - * max queues supported by VNIC (HFI1_VNIC_MAX_QUEUE). - */ #define HFI1_MAX_NETDEV_CTXTS 8 /* Number of NETDEV RSM entries */ @@ -42,7 +38,7 @@ struct hfi1_netdev_rxq { * @num_rx_q: number of receive queues * @rmt_index: first free index in RMT Array * @msix_start: first free MSI-X interrupt vector. - * @dev_tbl: netdev table for unique identifier VNIC and IPoIb VLANs. + * @dev_tbl: netdev table for unique identifier IPoIb VLANs. * @enabled: atomic counter of netdevs enabling receive queues. * When 0 NAPI will be disabled. * @netdevs: atomic counter of netdevs using dummy netdev. diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 8608044203bb..ca2ae52b21e3 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -78,7 +78,6 @@ static int hfi1_netdev_allocate_ctxt(struct hfi1_devdata *dd, uctxt->fast_handler = handle_receive_interrupt_napi_fp; uctxt->slow_handler = handle_receive_interrupt_napi_sp; hfi1_set_seq_cnt(uctxt, 1); - uctxt->is_vnic = true; hfi1_stats.sps_ctxts++; @@ -427,7 +426,7 @@ void hfi1_netdev_disable_queues(struct hfi1_devdata *dd) /** * hfi1_netdev_add_data - Registers data with unique identifier - * to be requested later this is needed for VNIC and IPoIB VLANs + * to be requested later this is needed for IPoIB VLANs * implementations. * This call is protected by mutex idr_lock. 
* diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 3cbbfccdd8cd..e569b647d611 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -21,7 +21,6 @@ #include "qp.h" #include "verbs_txreq.h" #include "debugfs.h" -#include "vnic.h" #include "fault.h" #include "affinity.h" #include "ipoib.h" @@ -1729,7 +1728,6 @@ static const struct ib_device_ops hfi1_dev_ops = { .alloc_hw_device_stats = hfi1_alloc_hw_device_stats, .alloc_hw_port_stats = hfi_alloc_hw_port_stats, - .alloc_rdma_netdev = hfi1_vnic_alloc_rn, .device_group = &ib_hfi1_attr_group, .get_dev_fw_str = hfi1_get_dev_fw_str, .get_hw_stats = get_hw_stats, diff --git a/drivers/infiniband/hw/hfi1/vnic.h b/drivers/infiniband/hw/hfi1/vnic.h deleted file mode 100644 index bbafeb5fc0ec..000000000000 --- a/drivers/infiniband/hw/hfi1/vnic.h +++ /dev/null @@ -1,126 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright(c) 2017 - 2020 Intel Corporation. 
- */ - -#ifndef _HFI1_VNIC_H -#define _HFI1_VNIC_H -#include -#include "hfi.h" -#include "sdma.h" - -#define HFI1_VNIC_MAX_TXQ 16 -#define HFI1_VNIC_MAX_PAD 12 - -/* L4 header definitions */ -#define HFI1_VNIC_L4_HDR_OFFSET OPA_VNIC_L2_HDR_LEN - -#define HFI1_VNIC_GET_L4_HDR(data) \ - (*((u16 *)((u8 *)(data) + HFI1_VNIC_L4_HDR_OFFSET))) - -#define HFI1_VNIC_GET_VESWID(data) \ - (HFI1_VNIC_GET_L4_HDR(data) & 0xFFF) - -/* Service class */ -#define HFI1_VNIC_SC_OFFSET_LOW 6 -#define HFI1_VNIC_SC_OFFSET_HI 7 -#define HFI1_VNIC_SC_SHIFT 4 - -#define HFI1_VNIC_MAX_QUEUE 16 -#define HFI1_NUM_VNIC_CTXT 8 - -/** - * struct hfi1_vnic_sdma - VNIC per Tx ring SDMA information - * @dd - device data pointer - * @sde - sdma engine - * @vinfo - vnic info pointer - * @wait - iowait structure - * @stx - sdma tx request - * @state - vnic Tx ring SDMA state - * @q_idx - vnic Tx queue index - */ -struct hfi1_vnic_sdma { - struct hfi1_devdata *dd; - struct sdma_engine *sde; - struct hfi1_vnic_vport_info *vinfo; - struct iowait wait; - struct sdma_txreq stx; - unsigned int state; - u8 q_idx; - bool pkts_sent; -}; - -/** - * struct hfi1_vnic_rx_queue - HFI1 VNIC receive queue - * @idx: queue index - * @vinfo: pointer to vport information - * @netdev: network device - * @napi: netdev napi structure - * @skbq: queue of received socket buffers - */ -struct hfi1_vnic_rx_queue { - u8 idx; - struct hfi1_vnic_vport_info *vinfo; - struct net_device *netdev; - struct napi_struct napi; -}; - -/** - * struct hfi1_vnic_vport_info - HFI1 VNIC virtual port information - * @dd: device data pointer - * @netdev: net device pointer - * @flags: state flags - * @lock: vport lock - * @num_tx_q: number of transmit queues - * @num_rx_q: number of receive queues - * @vesw_id: virtual switch id - * @rxq: Array of receive queues - * @stats: per queue stats - * @sdma: VNIC SDMA structure per TXQ - */ -struct hfi1_vnic_vport_info { - struct hfi1_devdata *dd; - struct net_device *netdev; - unsigned long flags; - - /* 
Lock used around state updates */ - struct mutex lock; - - u8 num_tx_q; - u8 num_rx_q; - u16 vesw_id; - struct hfi1_vnic_rx_queue rxq[HFI1_NUM_VNIC_CTXT]; - - struct opa_vnic_stats stats[HFI1_VNIC_MAX_QUEUE]; - struct hfi1_vnic_sdma sdma[HFI1_VNIC_MAX_TXQ]; -}; - -#define v_dbg(format, arg...) \ - netdev_dbg(vinfo->netdev, format, ## arg) -#define v_err(format, arg...) \ - netdev_err(vinfo->netdev, format, ## arg) -#define v_info(format, arg...) \ - netdev_info(vinfo->netdev, format, ## arg) - -/* vnic hfi1 internal functions */ -void hfi1_vnic_setup(struct hfi1_devdata *dd); -int hfi1_vnic_txreq_init(struct hfi1_devdata *dd); -void hfi1_vnic_txreq_deinit(struct hfi1_devdata *dd); - -void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet); -void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo); -bool hfi1_vnic_sdma_write_avail(struct hfi1_vnic_vport_info *vinfo, - u8 q_idx); - -/* vnic rdma netdev operations */ -struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, - u32 port_num, - enum rdma_netdev_t type, - const char *name, - unsigned char name_assign_type, - void (*setup)(struct net_device *)); -int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, - struct hfi1_vnic_vport_info *vinfo, - struct sk_buff *skb, u64 pbc, u8 plen); - -#endif /* _HFI1_VNIC_H */ diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c deleted file mode 100644 index 16a4c297a897..000000000000 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ /dev/null @@ -1,615 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -/* - * Copyright(c) 2017 - 2020 Intel Corporation. 
- */ - -/* - * This file contains HFI1 support for VNIC functionality - */ - -#include -#include - -#include "vnic.h" -#include "netdev.h" - -#define HFI_TX_TIMEOUT_MS 1000 - -#define HFI1_VNIC_RCV_Q_SIZE 1024 - -#define HFI1_VNIC_UP 0 - -static DEFINE_SPINLOCK(vport_cntr_lock); - -#define SUM_GRP_COUNTERS(stats, qstats, x_grp) do { \ - u64 *src64, *dst64; \ - for (src64 = &qstats->x_grp.unicast, \ - dst64 = &stats->x_grp.unicast; \ - dst64 <= &stats->x_grp.s_1519_max;) { \ - *dst64++ += *src64++; \ - } \ - } while (0) - -#define VNIC_MASK (0xFF) -#define VNIC_ID(val) ((1ull << 24) | ((val) & VNIC_MASK)) - -/* hfi1_vnic_update_stats - update statistics */ -static void hfi1_vnic_update_stats(struct hfi1_vnic_vport_info *vinfo, - struct opa_vnic_stats *stats) -{ - struct net_device *netdev = vinfo->netdev; - u8 i; - - /* add tx counters on different queues */ - for (i = 0; i < vinfo->num_tx_q; i++) { - struct opa_vnic_stats *qstats = &vinfo->stats[i]; - struct rtnl_link_stats64 *qnstats = &vinfo->stats[i].netstats; - - stats->netstats.tx_fifo_errors += qnstats->tx_fifo_errors; - stats->netstats.tx_carrier_errors += qnstats->tx_carrier_errors; - stats->tx_drop_state += qstats->tx_drop_state; - stats->tx_dlid_zero += qstats->tx_dlid_zero; - - SUM_GRP_COUNTERS(stats, qstats, tx_grp); - stats->netstats.tx_packets += qnstats->tx_packets; - stats->netstats.tx_bytes += qnstats->tx_bytes; - } - - /* add rx counters on different queues */ - for (i = 0; i < vinfo->num_rx_q; i++) { - struct opa_vnic_stats *qstats = &vinfo->stats[i]; - struct rtnl_link_stats64 *qnstats = &vinfo->stats[i].netstats; - - stats->netstats.rx_fifo_errors += qnstats->rx_fifo_errors; - stats->netstats.rx_nohandler += qnstats->rx_nohandler; - stats->rx_drop_state += qstats->rx_drop_state; - stats->rx_oversize += qstats->rx_oversize; - stats->rx_runt += qstats->rx_runt; - - SUM_GRP_COUNTERS(stats, qstats, rx_grp); - stats->netstats.rx_packets += qnstats->rx_packets; - stats->netstats.rx_bytes += 
qnstats->rx_bytes; - } - - stats->netstats.tx_errors = stats->netstats.tx_fifo_errors + - stats->netstats.tx_carrier_errors + - stats->tx_drop_state + stats->tx_dlid_zero; - stats->netstats.tx_dropped = stats->netstats.tx_errors; - - stats->netstats.rx_errors = stats->netstats.rx_fifo_errors + - stats->netstats.rx_nohandler + - stats->rx_drop_state + stats->rx_oversize + - stats->rx_runt; - stats->netstats.rx_dropped = stats->netstats.rx_errors; - - netdev->stats.tx_packets = stats->netstats.tx_packets; - netdev->stats.tx_bytes = stats->netstats.tx_bytes; - netdev->stats.tx_fifo_errors = stats->netstats.tx_fifo_errors; - netdev->stats.tx_carrier_errors = stats->netstats.tx_carrier_errors; - netdev->stats.tx_errors = stats->netstats.tx_errors; - netdev->stats.tx_dropped = stats->netstats.tx_dropped; - - netdev->stats.rx_packets = stats->netstats.rx_packets; - netdev->stats.rx_bytes = stats->netstats.rx_bytes; - netdev->stats.rx_fifo_errors = stats->netstats.rx_fifo_errors; - netdev->stats.multicast = stats->rx_grp.mcastbcast; - netdev->stats.rx_length_errors = stats->rx_oversize + stats->rx_runt; - netdev->stats.rx_errors = stats->netstats.rx_errors; - netdev->stats.rx_dropped = stats->netstats.rx_dropped; -} - -/* update_len_counters - update pkt's len histogram counters */ -static inline void update_len_counters(struct opa_vnic_grp_stats *grp, - int len) -{ - /* account for 4 byte FCS */ - if (len >= 1515) - grp->s_1519_max++; - else if (len >= 1020) - grp->s_1024_1518++; - else if (len >= 508) - grp->s_512_1023++; - else if (len >= 252) - grp->s_256_511++; - else if (len >= 124) - grp->s_128_255++; - else if (len >= 61) - grp->s_65_127++; - else - grp->s_64++; -} - -/* hfi1_vnic_update_tx_counters - update transmit counters */ -static void hfi1_vnic_update_tx_counters(struct hfi1_vnic_vport_info *vinfo, - u8 q_idx, struct sk_buff *skb, int err) -{ - struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); - struct opa_vnic_stats *stats = 
&vinfo->stats[q_idx]; - struct opa_vnic_grp_stats *tx_grp = &stats->tx_grp; - u16 vlan_tci; - - stats->netstats.tx_packets++; - stats->netstats.tx_bytes += skb->len + ETH_FCS_LEN; - - update_len_counters(tx_grp, skb->len); - - /* rest of the counts are for good packets only */ - if (unlikely(err)) - return; - - if (is_multicast_ether_addr(mac_hdr->h_dest)) - tx_grp->mcastbcast++; - else - tx_grp->unicast++; - - if (!__vlan_get_tag(skb, &vlan_tci)) - tx_grp->vlan++; - else - tx_grp->untagged++; -} - -/* hfi1_vnic_update_rx_counters - update receive counters */ -static void hfi1_vnic_update_rx_counters(struct hfi1_vnic_vport_info *vinfo, - u8 q_idx, struct sk_buff *skb, int err) -{ - struct ethhdr *mac_hdr = (struct ethhdr *)skb->data; - struct opa_vnic_stats *stats = &vinfo->stats[q_idx]; - struct opa_vnic_grp_stats *rx_grp = &stats->rx_grp; - u16 vlan_tci; - - stats->netstats.rx_packets++; - stats->netstats.rx_bytes += skb->len + ETH_FCS_LEN; - - update_len_counters(rx_grp, skb->len); - - /* rest of the counts are for good packets only */ - if (unlikely(err)) - return; - - if (is_multicast_ether_addr(mac_hdr->h_dest)) - rx_grp->mcastbcast++; - else - rx_grp->unicast++; - - if (!__vlan_get_tag(skb, &vlan_tci)) - rx_grp->vlan++; - else - rx_grp->untagged++; -} - -/* This function is overloaded for opa_vnic specific implementation */ -static void hfi1_vnic_get_stats64(struct net_device *netdev, - struct rtnl_link_stats64 *stats) -{ - struct opa_vnic_stats *vstats = (struct opa_vnic_stats *)stats; - struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); - - hfi1_vnic_update_stats(vinfo, vstats); -} - -static u64 create_bypass_pbc(u32 vl, u32 dw_len) -{ - u64 pbc; - - pbc = ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT) - | PBC_INSERT_BYPASS_ICRC | PBC_CREDIT_RETURN - | PBC_PACKET_BYPASS - | ((vl & PBC_VL_MASK) << PBC_VL_SHIFT) - | (dw_len & PBC_LENGTH_DWS_MASK) << PBC_LENGTH_DWS_SHIFT; - - return pbc; -} - -/* hfi1_vnic_maybe_stop_tx - stop tx queue if 
required */ -static void hfi1_vnic_maybe_stop_tx(struct hfi1_vnic_vport_info *vinfo, - u8 q_idx) -{ - netif_stop_subqueue(vinfo->netdev, q_idx); - if (!hfi1_vnic_sdma_write_avail(vinfo, q_idx)) - return; - - netif_start_subqueue(vinfo->netdev, q_idx); -} - -static netdev_tx_t hfi1_netdev_start_xmit(struct sk_buff *skb, - struct net_device *netdev) -{ - struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); - u8 pad_len, q_idx = skb->queue_mapping; - struct hfi1_devdata *dd = vinfo->dd; - struct opa_vnic_skb_mdata *mdata; - u32 pkt_len, total_len; - int err = -EINVAL; - u64 pbc; - - v_dbg("xmit: queue %d skb len %d\n", q_idx, skb->len); - if (unlikely(!netif_oper_up(netdev))) { - vinfo->stats[q_idx].tx_drop_state++; - goto tx_finish; - } - - /* take out meta data */ - mdata = (struct opa_vnic_skb_mdata *)skb->data; - skb_pull(skb, sizeof(*mdata)); - if (unlikely(mdata->flags & OPA_VNIC_SKB_MDATA_ENCAP_ERR)) { - vinfo->stats[q_idx].tx_dlid_zero++; - goto tx_finish; - } - - /* add tail padding (for 8 bytes size alignment) and icrc */ - pad_len = -(skb->len + OPA_VNIC_ICRC_TAIL_LEN) & 0x7; - pad_len += OPA_VNIC_ICRC_TAIL_LEN; - - /* - * pkt_len is how much data we have to write, includes header and data. - * total_len is length of the packet in Dwords plus the PBC should not - * include the CRC. 
- */ - pkt_len = (skb->len + pad_len) >> 2; - total_len = pkt_len + 2; /* PBC + packet */ - - pbc = create_bypass_pbc(mdata->vl, total_len); - - skb_get(skb); - v_dbg("pbc 0x%016llX len %d pad_len %d\n", pbc, skb->len, pad_len); - err = dd->process_vnic_dma_send(dd, q_idx, vinfo, skb, pbc, pad_len); - if (unlikely(err)) { - if (err == -ENOMEM) - vinfo->stats[q_idx].netstats.tx_fifo_errors++; - else if (err != -EBUSY) - vinfo->stats[q_idx].netstats.tx_carrier_errors++; - } - /* remove the header before updating tx counters */ - skb_pull(skb, OPA_VNIC_HDR_LEN); - - if (unlikely(err == -EBUSY)) { - hfi1_vnic_maybe_stop_tx(vinfo, q_idx); - dev_kfree_skb_any(skb); - return NETDEV_TX_BUSY; - } - -tx_finish: - /* update tx counters */ - hfi1_vnic_update_tx_counters(vinfo, q_idx, skb, err); - dev_kfree_skb_any(skb); - return NETDEV_TX_OK; -} - -static u16 hfi1_vnic_select_queue(struct net_device *netdev, - struct sk_buff *skb, - struct net_device *sb_dev) -{ - struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); - struct opa_vnic_skb_mdata *mdata; - struct sdma_engine *sde; - - mdata = (struct opa_vnic_skb_mdata *)skb->data; - sde = sdma_select_engine_vl(vinfo->dd, mdata->entropy, mdata->vl); - return sde->this_idx; -} - -/* hfi1_vnic_decap_skb - strip OPA header from the skb (ethernet) packet */ -static inline int hfi1_vnic_decap_skb(struct hfi1_vnic_rx_queue *rxq, - struct sk_buff *skb) -{ - struct hfi1_vnic_vport_info *vinfo = rxq->vinfo; - int max_len = vinfo->netdev->mtu + VLAN_ETH_HLEN; - int rc = -EFAULT; - - skb_pull(skb, OPA_VNIC_HDR_LEN); - - /* Validate Packet length */ - if (unlikely(skb->len > max_len)) - vinfo->stats[rxq->idx].rx_oversize++; - else if (unlikely(skb->len < ETH_ZLEN)) - vinfo->stats[rxq->idx].rx_runt++; - else - rc = 0; - return rc; -} - -static struct hfi1_vnic_vport_info *get_vnic_port(struct hfi1_devdata *dd, - int vesw_id) -{ - int vnic_id = VNIC_ID(vesw_id); - - return hfi1_netdev_get_data(dd, vnic_id); -} - -static struct 
hfi1_vnic_vport_info *get_first_vnic_port(struct hfi1_devdata *dd) -{ - struct hfi1_vnic_vport_info *vinfo; - int next_id = VNIC_ID(0); - - vinfo = hfi1_netdev_get_first_data(dd, &next_id); - - if (next_id > VNIC_ID(VNIC_MASK)) - return NULL; - - return vinfo; -} - -void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) -{ - struct hfi1_devdata *dd = packet->rcd->dd; - struct hfi1_vnic_vport_info *vinfo = NULL; - struct hfi1_vnic_rx_queue *rxq; - struct sk_buff *skb; - int l4_type, vesw_id = -1, rc; - u8 q_idx; - unsigned char *pad_info; - - l4_type = hfi1_16B_get_l4(packet->ebuf); - if (likely(l4_type == OPA_16B_L4_ETHR)) { - vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf); - vinfo = get_vnic_port(dd, vesw_id); - - /* - * In case of invalid vesw id, count the error on - * the first available vport. - */ - if (unlikely(!vinfo)) { - struct hfi1_vnic_vport_info *vinfo_tmp; - - vinfo_tmp = get_first_vnic_port(dd); - if (vinfo_tmp) { - spin_lock(&vport_cntr_lock); - vinfo_tmp->stats[0].netstats.rx_nohandler++; - spin_unlock(&vport_cntr_lock); - } - } - } - - if (unlikely(!vinfo)) { - dd_dev_warn(dd, "vnic rcv err: l4 %d vesw id %d ctx %d\n", - l4_type, vesw_id, packet->rcd->ctxt); - return; - } - - q_idx = packet->rcd->vnic_q_idx; - rxq = &vinfo->rxq[q_idx]; - if (unlikely(!netif_oper_up(vinfo->netdev))) { - vinfo->stats[q_idx].rx_drop_state++; - return; - } - - skb = netdev_alloc_skb(vinfo->netdev, packet->tlen); - if (unlikely(!skb)) { - vinfo->stats[q_idx].netstats.rx_fifo_errors++; - return; - } - - memcpy(skb->data, packet->ebuf, packet->tlen); - skb_put(skb, packet->tlen); - - pad_info = skb->data + skb->len - 1; - skb_trim(skb, (skb->len - OPA_VNIC_ICRC_TAIL_LEN - - ((*pad_info) & 0x7))); - - rc = hfi1_vnic_decap_skb(rxq, skb); - - /* update rx counters */ - hfi1_vnic_update_rx_counters(vinfo, rxq->idx, skb, rc); - if (unlikely(rc)) { - dev_kfree_skb_any(skb); - return; - } - - skb_checksum_none_assert(skb); - skb->protocol = eth_type_trans(skb, rxq->netdev); - - 
napi_gro_receive(&rxq->napi, skb); -} - -static int hfi1_vnic_up(struct hfi1_vnic_vport_info *vinfo) -{ - struct hfi1_devdata *dd = vinfo->dd; - struct net_device *netdev = vinfo->netdev; - int rc; - - /* ensure virtual eth switch id is valid */ - if (!vinfo->vesw_id) - return -EINVAL; - - rc = hfi1_netdev_add_data(dd, VNIC_ID(vinfo->vesw_id), vinfo); - if (rc < 0) - return rc; - - rc = hfi1_netdev_rx_init(dd); - if (rc) - goto err_remove; - - netif_carrier_on(netdev); - netif_tx_start_all_queues(netdev); - set_bit(HFI1_VNIC_UP, &vinfo->flags); - - return 0; - -err_remove: - hfi1_netdev_remove_data(dd, VNIC_ID(vinfo->vesw_id)); - return rc; -} - -static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo) -{ - struct hfi1_devdata *dd = vinfo->dd; - - clear_bit(HFI1_VNIC_UP, &vinfo->flags); - netif_carrier_off(vinfo->netdev); - netif_tx_disable(vinfo->netdev); - hfi1_netdev_remove_data(dd, VNIC_ID(vinfo->vesw_id)); - - hfi1_netdev_rx_destroy(dd); -} - -static int hfi1_netdev_open(struct net_device *netdev) -{ - struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); - int rc; - - mutex_lock(&vinfo->lock); - rc = hfi1_vnic_up(vinfo); - mutex_unlock(&vinfo->lock); - return rc; -} - -static int hfi1_netdev_close(struct net_device *netdev) -{ - struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); - - mutex_lock(&vinfo->lock); - if (test_bit(HFI1_VNIC_UP, &vinfo->flags)) - hfi1_vnic_down(vinfo); - mutex_unlock(&vinfo->lock); - return 0; -} - -static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo) -{ - struct hfi1_devdata *dd = vinfo->dd; - int rc = 0; - - mutex_lock(&hfi1_mutex); - if (!dd->vnic_num_vports) { - rc = hfi1_vnic_txreq_init(dd); - if (rc) - goto txreq_fail; - } - - rc = hfi1_netdev_rx_init(dd); - if (rc) { - dd_dev_err(dd, "Unable to initialize netdev contexts\n"); - goto alloc_fail; - } - - hfi1_init_vnic_rsm(dd); - - dd->vnic_num_vports++; - hfi1_vnic_sdma_init(vinfo); - -alloc_fail: - if (!dd->vnic_num_vports) - 
hfi1_vnic_txreq_deinit(dd); -txreq_fail: - mutex_unlock(&hfi1_mutex); - return rc; -} - -static void hfi1_vnic_deinit(struct hfi1_vnic_vport_info *vinfo) -{ - struct hfi1_devdata *dd = vinfo->dd; - - mutex_lock(&hfi1_mutex); - if (--dd->vnic_num_vports == 0) { - hfi1_deinit_vnic_rsm(dd); - hfi1_vnic_txreq_deinit(dd); - } - mutex_unlock(&hfi1_mutex); - hfi1_netdev_rx_destroy(dd); -} - -static void hfi1_vnic_set_vesw_id(struct net_device *netdev, int id) -{ - struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); - bool reopen = false; - - /* - * If vesw_id is being changed, and if the vnic port is up, - * reset the vnic port to ensure new vesw_id gets picked up - */ - if (id != vinfo->vesw_id) { - mutex_lock(&vinfo->lock); - if (test_bit(HFI1_VNIC_UP, &vinfo->flags)) { - hfi1_vnic_down(vinfo); - reopen = true; - } - - vinfo->vesw_id = id; - if (reopen) - hfi1_vnic_up(vinfo); - - mutex_unlock(&vinfo->lock); - } -} - -/* netdev ops */ -static const struct net_device_ops hfi1_netdev_ops = { - .ndo_open = hfi1_netdev_open, - .ndo_stop = hfi1_netdev_close, - .ndo_start_xmit = hfi1_netdev_start_xmit, - .ndo_select_queue = hfi1_vnic_select_queue, - .ndo_get_stats64 = hfi1_vnic_get_stats64, -}; - -static void hfi1_vnic_free_rn(struct net_device *netdev) -{ - struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); - - hfi1_vnic_deinit(vinfo); - mutex_destroy(&vinfo->lock); - free_netdev(netdev); -} - -struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, - u32 port_num, - enum rdma_netdev_t type, - const char *name, - unsigned char name_assign_type, - void (*setup)(struct net_device *)) -{ - struct hfi1_devdata *dd = dd_from_ibdev(device); - struct hfi1_vnic_vport_info *vinfo; - struct net_device *netdev; - struct rdma_netdev *rn; - int i, size, rc; - - if (!dd->num_netdev_contexts) - return ERR_PTR(-ENOMEM); - - if (!port_num || (port_num > dd->num_pports)) - return ERR_PTR(-EINVAL); - - if (type != RDMA_NETDEV_OPA_VNIC) - return 
ERR_PTR(-EOPNOTSUPP); - - size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo); - netdev = alloc_netdev_mqs(size, name, name_assign_type, setup, - chip_sdma_engines(dd), - dd->num_netdev_contexts); - if (!netdev) - return ERR_PTR(-ENOMEM); - - rn = netdev_priv(netdev); - vinfo = opa_vnic_dev_priv(netdev); - vinfo->dd = dd; - vinfo->num_tx_q = chip_sdma_engines(dd); - vinfo->num_rx_q = dd->num_netdev_contexts; - vinfo->netdev = netdev; - rn->free_rdma_netdev = hfi1_vnic_free_rn; - rn->set_id = hfi1_vnic_set_vesw_id; - - netdev->features = NETIF_F_HIGHDMA | NETIF_F_SG; - netdev->hw_features = netdev->features; - netdev->vlan_features = netdev->features; - netdev->watchdog_timeo = msecs_to_jiffies(HFI_TX_TIMEOUT_MS); - netdev->netdev_ops = &hfi1_netdev_ops; - mutex_init(&vinfo->lock); - - for (i = 0; i < vinfo->num_rx_q; i++) { - struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i]; - - rxq->idx = i; - rxq->vinfo = vinfo; - rxq->netdev = netdev; - } - - rc = hfi1_vnic_init(vinfo); - if (rc) - goto init_fail; - - return netdev; -init_fail: - mutex_destroy(&vinfo->lock); - free_netdev(netdev); - return ERR_PTR(rc); -} diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c deleted file mode 100644 index 6caf01ba0bca..000000000000 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ /dev/null @@ -1,282 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -/* - * Copyright(c) 2017 - 2018 Intel Corporation. 
- */ - -/* - * This file contains HFI1 support for VNIC SDMA functionality - */ - -#include "sdma.h" -#include "vnic.h" - -#define HFI1_VNIC_SDMA_Q_ACTIVE BIT(0) -#define HFI1_VNIC_SDMA_Q_DEFERRED BIT(1) - -#define HFI1_VNIC_TXREQ_NAME_LEN 32 -#define HFI1_VNIC_SDMA_DESC_WTRMRK 64 - -/* - * struct vnic_txreq - VNIC transmit descriptor - * @txreq: sdma transmit request - * @sdma: vnic sdma pointer - * @skb: skb to send - * @pad: pad buffer - * @plen: pad length - * @pbc_val: pbc value - */ -struct vnic_txreq { - struct sdma_txreq txreq; - struct hfi1_vnic_sdma *sdma; - - struct sk_buff *skb; - unsigned char pad[HFI1_VNIC_MAX_PAD]; - u16 plen; - __le64 pbc_val; -}; - -static void vnic_sdma_complete(struct sdma_txreq *txreq, - int status) -{ - struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq); - struct hfi1_vnic_sdma *vnic_sdma = tx->sdma; - - sdma_txclean(vnic_sdma->dd, txreq); - dev_kfree_skb_any(tx->skb); - kmem_cache_free(vnic_sdma->dd->vnic.txreq_cache, tx); -} - -static noinline int build_vnic_ulp_payload(struct sdma_engine *sde, - struct vnic_txreq *tx) -{ - int i, ret = 0; - - ret = sdma_txadd_kvaddr( - sde->dd, - &tx->txreq, - tx->skb->data, - skb_headlen(tx->skb)); - if (unlikely(ret)) - goto bail_txadd; - - for (i = 0; i < skb_shinfo(tx->skb)->nr_frags; i++) { - skb_frag_t *frag = &skb_shinfo(tx->skb)->frags[i]; - - /* combine physically continuous fragments later? 
*/ - ret = sdma_txadd_page(sde->dd, - &tx->txreq, - skb_frag_page(frag), - skb_frag_off(frag), - skb_frag_size(frag), - NULL, NULL, NULL); - if (unlikely(ret)) - goto bail_txadd; - } - - if (tx->plen) - ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq, - tx->pad + HFI1_VNIC_MAX_PAD - tx->plen, - tx->plen); - -bail_txadd: - return ret; -} - -static int build_vnic_tx_desc(struct sdma_engine *sde, - struct vnic_txreq *tx, - u64 pbc) -{ - int ret = 0; - u16 hdrbytes = 2 << 2; /* PBC */ - - ret = sdma_txinit_ahg( - &tx->txreq, - 0, - hdrbytes + tx->skb->len + tx->plen, - 0, - 0, - NULL, - 0, - vnic_sdma_complete); - if (unlikely(ret)) - goto bail_txadd; - - /* add pbc */ - tx->pbc_val = cpu_to_le64(pbc); - ret = sdma_txadd_kvaddr( - sde->dd, - &tx->txreq, - &tx->pbc_val, - hdrbytes); - if (unlikely(ret)) - goto bail_txadd; - - /* add the ulp payload */ - ret = build_vnic_ulp_payload(sde, tx); -bail_txadd: - return ret; -} - -/* setup the last plen bypes of pad */ -static inline void hfi1_vnic_update_pad(unsigned char *pad, u8 plen) -{ - pad[HFI1_VNIC_MAX_PAD - 1] = plen - OPA_VNIC_ICRC_TAIL_LEN; -} - -int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, - struct hfi1_vnic_vport_info *vinfo, - struct sk_buff *skb, u64 pbc, u8 plen) -{ - struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[q_idx]; - struct sdma_engine *sde = vnic_sdma->sde; - struct vnic_txreq *tx; - int ret = -ECOMM; - - if (unlikely(READ_ONCE(vnic_sdma->state) != HFI1_VNIC_SDMA_Q_ACTIVE)) - goto tx_err; - - if (unlikely(!sde || !sdma_running(sde))) - goto tx_err; - - tx = kmem_cache_alloc(dd->vnic.txreq_cache, GFP_ATOMIC); - if (unlikely(!tx)) { - ret = -ENOMEM; - goto tx_err; - } - - tx->sdma = vnic_sdma; - tx->skb = skb; - hfi1_vnic_update_pad(tx->pad, plen); - tx->plen = plen; - ret = build_vnic_tx_desc(sde, tx, pbc); - if (unlikely(ret)) - goto free_desc; - - ret = sdma_send_txreq(sde, iowait_get_ib_work(&vnic_sdma->wait), - &tx->txreq, vnic_sdma->pkts_sent); - /* When -ECOMM, sdma callback will be 
called with ABORT status */ - if (unlikely(ret && unlikely(ret != -ECOMM))) - goto free_desc; - - if (!ret) { - vnic_sdma->pkts_sent = true; - iowait_starve_clear(vnic_sdma->pkts_sent, &vnic_sdma->wait); - } - return ret; - -free_desc: - sdma_txclean(dd, &tx->txreq); - kmem_cache_free(dd->vnic.txreq_cache, tx); -tx_err: - if (ret != -EBUSY) - dev_kfree_skb_any(skb); - else - vnic_sdma->pkts_sent = false; - return ret; -} - -/* - * hfi1_vnic_sdma_sleep - vnic sdma sleep function - * - * This function gets called from sdma_send_txreq() when there are not enough - * sdma descriptors available to send the packet. It adds Tx queue's wait - * structure to sdma engine's dmawait list to be woken up when descriptors - * become available. - */ -static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, - struct iowait_work *wait, - struct sdma_txreq *txreq, - uint seq, - bool pkts_sent) -{ - struct hfi1_vnic_sdma *vnic_sdma = - container_of(wait->iow, struct hfi1_vnic_sdma, wait); - - write_seqlock(&sde->waitlock); - if (sdma_progress(sde, seq, txreq)) { - write_sequnlock(&sde->waitlock); - return -EAGAIN; - } - - vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; - if (list_empty(&vnic_sdma->wait.list)) { - iowait_get_priority(wait->iow); - iowait_queue(pkts_sent, wait->iow, &sde->dmawait); - } - write_sequnlock(&sde->waitlock); - return -EBUSY; -} - -/* - * hfi1_vnic_sdma_wakeup - vnic sdma wakeup function - * - * This function gets called when SDMA descriptors becomes available and Tx - * queue's wait structure was previously added to sdma engine's dmawait list. - * It notifies the upper driver about Tx queue wakeup. 
- */ -static void hfi1_vnic_sdma_wakeup(struct iowait *wait, int reason) -{ - struct hfi1_vnic_sdma *vnic_sdma = - container_of(wait, struct hfi1_vnic_sdma, wait); - struct hfi1_vnic_vport_info *vinfo = vnic_sdma->vinfo; - - vnic_sdma->state = HFI1_VNIC_SDMA_Q_ACTIVE; - if (__netif_subqueue_stopped(vinfo->netdev, vnic_sdma->q_idx)) - netif_wake_subqueue(vinfo->netdev, vnic_sdma->q_idx); -}; - -inline bool hfi1_vnic_sdma_write_avail(struct hfi1_vnic_vport_info *vinfo, - u8 q_idx) -{ - struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[q_idx]; - - return (READ_ONCE(vnic_sdma->state) == HFI1_VNIC_SDMA_Q_ACTIVE); -} - -void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) -{ - int i; - - for (i = 0; i < vinfo->num_tx_q; i++) { - struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i]; - - iowait_init(&vnic_sdma->wait, 0, NULL, NULL, - hfi1_vnic_sdma_sleep, - hfi1_vnic_sdma_wakeup, NULL, NULL); - vnic_sdma->sde = &vinfo->dd->per_sdma[i]; - vnic_sdma->dd = vinfo->dd; - vnic_sdma->vinfo = vinfo; - vnic_sdma->q_idx = i; - vnic_sdma->state = HFI1_VNIC_SDMA_Q_ACTIVE; - - /* Add a free descriptor watermark for wakeups */ - if (vnic_sdma->sde->descq_cnt > HFI1_VNIC_SDMA_DESC_WTRMRK) { - struct iowait_work *work; - - INIT_LIST_HEAD(&vnic_sdma->stx.list); - vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK; - work = iowait_get_ib_work(&vnic_sdma->wait); - list_add_tail(&vnic_sdma->stx.list, &work->tx_head); - } - } -} - -int hfi1_vnic_txreq_init(struct hfi1_devdata *dd) -{ - char buf[HFI1_VNIC_TXREQ_NAME_LEN]; - - snprintf(buf, sizeof(buf), "hfi1_%u_vnic_txreq_cache", dd->unit); - dd->vnic.txreq_cache = kmem_cache_create(buf, - sizeof(struct vnic_txreq), - 0, SLAB_HWCACHE_ALIGN, - NULL); - if (!dd->vnic.txreq_cache) - return -ENOMEM; - return 0; -} - -void hfi1_vnic_txreq_deinit(struct hfi1_devdata *dd) -{ - kmem_cache_destroy(dd->vnic.txreq_cache); - dd->vnic.txreq_cache = NULL; -} diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile index 
4d0004b58377..51b0d41699b8 100644 --- a/drivers/infiniband/ulp/Makefile +++ b/drivers/infiniband/ulp/Makefile @@ -4,5 +4,4 @@ obj-$(CONFIG_INFINIBAND_SRP) += srp/ obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ obj-$(CONFIG_INFINIBAND_ISER) += iser/ obj-$(CONFIG_INFINIBAND_ISERT) += isert/ -obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic/ obj-$(CONFIG_INFINIBAND_RTRS) += rtrs/ diff --git a/drivers/infiniband/ulp/opa_vnic/Kconfig b/drivers/infiniband/ulp/opa_vnic/Kconfig deleted file mode 100644 index 4d43d055fa8e..000000000000 --- a/drivers/infiniband/ulp/opa_vnic/Kconfig +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -config INFINIBAND_OPA_VNIC - tristate "Cornelis OPX VNIC support" - depends on X86_64 && INFINIBAND - help - This is Omni-Path Express (OPX) Virtual Network Interface Controller (VNIC) - driver for Ethernet over Omni-Path feature. It implements the HW - independent VNIC functionality. It interfaces with Linux stack for - data path and IB MAD for the control path. diff --git a/drivers/infiniband/ulp/opa_vnic/Makefile b/drivers/infiniband/ulp/opa_vnic/Makefile deleted file mode 100644 index 196183817cdc..000000000000 --- a/drivers/infiniband/ulp/opa_vnic/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# Makefile - Cornelis Omni-Path Express Virtual Network Controller driver -# Copyright(c) 2017, Intel Corporation. -# Copyright(c) 2021, Cornelis Networks. -# -obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic.o - -opa_vnic-y := opa_vnic_netdev.o opa_vnic_encap.o opa_vnic_ethtool.o \ - opa_vnic_vema.o opa_vnic_vema_iface.o diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c deleted file mode 100644 index 53dcf06fbee0..000000000000 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Copyright(c) 2017 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. 
When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -/* - * This file contains OPA VNIC encapsulation/decapsulation function. - */ - -#include -#include - -#include "opa_vnic_internal.h" - -/* OPA 16B Header fields */ -#define OPA_16B_LID_MASK 0xFFFFFull -#define OPA_16B_SLID_HIGH_SHFT 8 -#define OPA_16B_SLID_MASK 0xF00ull -#define OPA_16B_DLID_MASK 0xF000ull -#define OPA_16B_DLID_HIGH_SHFT 12 -#define OPA_16B_LEN_SHFT 20 -#define OPA_16B_SC_SHFT 20 -#define OPA_16B_RC_SHFT 25 -#define OPA_16B_PKEY_SHFT 16 - -#define OPA_VNIC_L4_HDR_SHFT 16 - -/* L2+L4 hdr len is 20 bytes (5 quad words) */ -#define OPA_VNIC_HDR_QW_LEN 5 - -static inline void opa_vnic_make_header(u8 *hdr, u32 slid, u32 dlid, u16 len, - u16 pkey, u16 entropy, u8 sc, u8 rc, - u8 l4_type, u16 l4_hdr) -{ - /* h[1]: LT=1, 16B L2=10 */ - u32 h[OPA_VNIC_HDR_QW_LEN] = {0, 0xc0000000, 0, 0, 0}; - - h[2] = l4_type; - h[3] = entropy; - h[4] = l4_hdr << OPA_VNIC_L4_HDR_SHFT; - - /* Extract and set 4 upper bits and 20 lower bits of the lids */ - h[0] |= (slid & OPA_16B_LID_MASK); - h[2] |= ((slid >> (20 - OPA_16B_SLID_HIGH_SHFT)) & OPA_16B_SLID_MASK); - - h[1] |= (dlid & OPA_16B_LID_MASK); - h[2] |= ((dlid >> (20 - OPA_16B_DLID_HIGH_SHFT)) & OPA_16B_DLID_MASK); - - h[0] |= (len << OPA_16B_LEN_SHFT); - h[1] |= (rc << OPA_16B_RC_SHFT); - h[1] |= (sc << OPA_16B_SC_SHFT); - h[2] |= ((u32)pkey << OPA_16B_PKEY_SHFT); - - memcpy(hdr, h, OPA_VNIC_HDR_LEN); -} - -/* - * Using a simple hash table for mac table implementation with 
the last octet - * of mac address as a key. - */ -static void opa_vnic_free_mac_tbl(struct hlist_head *mactbl) -{ - struct opa_vnic_mac_tbl_node *node; - struct hlist_node *tmp; - int bkt; - - if (!mactbl) - return; - - vnic_hash_for_each_safe(mactbl, bkt, tmp, node, hlist) { - hash_del(&node->hlist); - kfree(node); - } - kfree(mactbl); -} - -static struct hlist_head *opa_vnic_alloc_mac_tbl(void) -{ - u32 size = sizeof(struct hlist_head) * OPA_VNIC_MAC_TBL_SIZE; - struct hlist_head *mactbl; - - mactbl = kzalloc(size, GFP_KERNEL); - if (!mactbl) - return ERR_PTR(-ENOMEM); - - vnic_hash_init(mactbl); - return mactbl; -} - -/* opa_vnic_release_mac_tbl - empty and free the mac table */ -void opa_vnic_release_mac_tbl(struct opa_vnic_adapter *adapter) -{ - struct hlist_head *mactbl; - - mutex_lock(&adapter->mactbl_lock); - mactbl = rcu_access_pointer(adapter->mactbl); - rcu_assign_pointer(adapter->mactbl, NULL); - synchronize_rcu(); - opa_vnic_free_mac_tbl(mactbl); - adapter->info.vport.mac_tbl_digest = 0; - mutex_unlock(&adapter->mactbl_lock); -} - -/* - * opa_vnic_query_mac_tbl - query the mac table for a section - * - * This function implements query of specific function of the mac table. - * The function also expects the requested range to be valid. 
- */ -void opa_vnic_query_mac_tbl(struct opa_vnic_adapter *adapter, - struct opa_veswport_mactable *tbl) -{ - struct opa_vnic_mac_tbl_node *node; - struct hlist_head *mactbl; - int bkt; - u16 loffset, lnum_entries; - - rcu_read_lock(); - mactbl = rcu_dereference(adapter->mactbl); - if (!mactbl) - goto get_mac_done; - - loffset = be16_to_cpu(tbl->offset); - lnum_entries = be16_to_cpu(tbl->num_entries); - - vnic_hash_for_each(mactbl, bkt, node, hlist) { - struct __opa_vnic_mactable_entry *nentry = &node->entry; - struct opa_veswport_mactable_entry *entry; - - if ((node->index < loffset) || - (node->index >= (loffset + lnum_entries))) - continue; - - /* populate entry in the tbl corresponding to the index */ - entry = &tbl->tbl_entries[node->index - loffset]; - memcpy(entry->mac_addr, nentry->mac_addr, - ARRAY_SIZE(entry->mac_addr)); - memcpy(entry->mac_addr_mask, nentry->mac_addr_mask, - ARRAY_SIZE(entry->mac_addr_mask)); - entry->dlid_sd = cpu_to_be32(nentry->dlid_sd); - } - tbl->mac_tbl_digest = cpu_to_be32(adapter->info.vport.mac_tbl_digest); -get_mac_done: - rcu_read_unlock(); -} - -/* - * opa_vnic_update_mac_tbl - update mac table section - * - * This function updates the specified section of the mac table. - * The procedure includes following steps. - * - Allocate a new mac (hash) table. - * - Add the specified entries to the new table. - * (except the ones that are requested to be deleted). - * - Add all the other entries from the old mac table. - * - If there is a failure, free the new table and return. - * - Switch to the new table. - * - Free the old table and return. - * - * The function also expects the requested range to be valid. 
- */ -int opa_vnic_update_mac_tbl(struct opa_vnic_adapter *adapter, - struct opa_veswport_mactable *tbl) -{ - struct opa_vnic_mac_tbl_node *node, *new_node; - struct hlist_head *new_mactbl, *old_mactbl; - int i, bkt, rc = 0; - u8 key; - u16 loffset, lnum_entries; - - mutex_lock(&adapter->mactbl_lock); - /* allocate new mac table */ - new_mactbl = opa_vnic_alloc_mac_tbl(); - if (IS_ERR(new_mactbl)) { - mutex_unlock(&adapter->mactbl_lock); - return PTR_ERR(new_mactbl); - } - - loffset = be16_to_cpu(tbl->offset); - lnum_entries = be16_to_cpu(tbl->num_entries); - - /* add updated entries to the new mac table */ - for (i = 0; i < lnum_entries; i++) { - struct __opa_vnic_mactable_entry *nentry; - struct opa_veswport_mactable_entry *entry = - &tbl->tbl_entries[i]; - u8 *mac_addr = entry->mac_addr; - u8 empty_mac[ETH_ALEN] = { 0 }; - - v_dbg("new mac entry %4d: %02x:%02x:%02x:%02x:%02x:%02x %x\n", - loffset + i, mac_addr[0], mac_addr[1], mac_addr[2], - mac_addr[3], mac_addr[4], mac_addr[5], - entry->dlid_sd); - - /* if the entry is being removed, do not add it */ - if (!memcmp(mac_addr, empty_mac, ARRAY_SIZE(empty_mac))) - continue; - - node = kzalloc_obj(*node); - if (!node) { - rc = -ENOMEM; - goto updt_done; - } - - node->index = loffset + i; - nentry = &node->entry; - memcpy(nentry->mac_addr, entry->mac_addr, - ARRAY_SIZE(nentry->mac_addr)); - memcpy(nentry->mac_addr_mask, entry->mac_addr_mask, - ARRAY_SIZE(nentry->mac_addr_mask)); - nentry->dlid_sd = be32_to_cpu(entry->dlid_sd); - key = node->entry.mac_addr[OPA_VNIC_MAC_HASH_IDX]; - vnic_hash_add(new_mactbl, &node->hlist, key); - } - - /* add other entries from current mac table to new mac table */ - old_mactbl = rcu_access_pointer(adapter->mactbl); - if (!old_mactbl) - goto switch_tbl; - - vnic_hash_for_each(old_mactbl, bkt, node, hlist) { - if ((node->index >= loffset) && - (node->index < (loffset + lnum_entries))) - continue; - - new_node = kzalloc_obj(*new_node); - if (!new_node) { - rc = -ENOMEM; - goto 
updt_done; - } - - new_node->index = node->index; - memcpy(&new_node->entry, &node->entry, sizeof(node->entry)); - key = new_node->entry.mac_addr[OPA_VNIC_MAC_HASH_IDX]; - vnic_hash_add(new_mactbl, &new_node->hlist, key); - } - -switch_tbl: - /* switch to new table */ - rcu_assign_pointer(adapter->mactbl, new_mactbl); - synchronize_rcu(); - - adapter->info.vport.mac_tbl_digest = be32_to_cpu(tbl->mac_tbl_digest); -updt_done: - /* upon failure, free the new table; otherwise, free the old table */ - if (rc) - opa_vnic_free_mac_tbl(new_mactbl); - else - opa_vnic_free_mac_tbl(old_mactbl); - - mutex_unlock(&adapter->mactbl_lock); - return rc; -} - -/* opa_vnic_chk_mac_tbl - check mac table for dlid */ -static uint32_t opa_vnic_chk_mac_tbl(struct opa_vnic_adapter *adapter, - struct ethhdr *mac_hdr) -{ - struct opa_vnic_mac_tbl_node *node; - struct hlist_head *mactbl; - u32 dlid = 0; - u8 key; - - rcu_read_lock(); - mactbl = rcu_dereference(adapter->mactbl); - if (unlikely(!mactbl)) - goto chk_done; - - key = mac_hdr->h_dest[OPA_VNIC_MAC_HASH_IDX]; - vnic_hash_for_each_possible(mactbl, node, hlist, key) { - struct __opa_vnic_mactable_entry *entry = &node->entry; - - /* if related to source mac, skip */ - if (unlikely(OPA_VNIC_DLID_SD_IS_SRC_MAC(entry->dlid_sd))) - continue; - - if (!memcmp(node->entry.mac_addr, mac_hdr->h_dest, - ARRAY_SIZE(node->entry.mac_addr))) { - /* mac address found */ - dlid = OPA_VNIC_DLID_SD_GET_DLID(node->entry.dlid_sd); - break; - } - } - -chk_done: - rcu_read_unlock(); - return dlid; -} - -/* opa_vnic_get_dlid - find and return the DLID */ -static uint32_t opa_vnic_get_dlid(struct opa_vnic_adapter *adapter, - struct sk_buff *skb, u8 def_port) -{ - struct __opa_veswport_info *info = &adapter->info; - struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); - u32 dlid; - - dlid = opa_vnic_chk_mac_tbl(adapter, mac_hdr); - if (dlid) - return dlid; - - if (is_multicast_ether_addr(mac_hdr->h_dest)) { - dlid = info->vesw.u_mcast_dlid; - } else { 
- if (is_local_ether_addr(mac_hdr->h_dest)) { - dlid = ((uint32_t)mac_hdr->h_dest[5] << 16) | - ((uint32_t)mac_hdr->h_dest[4] << 8) | - mac_hdr->h_dest[3]; - if (unlikely(!dlid)) - v_warn("Null dlid in MAC address\n"); - } else if (def_port != OPA_VNIC_INVALID_PORT) { - if (def_port < OPA_VESW_MAX_NUM_DEF_PORT) - dlid = info->vesw.u_ucast_dlid[def_port]; - } - } - - return dlid; -} - -/* opa_vnic_get_sc - return the service class */ -static u8 opa_vnic_get_sc(struct __opa_veswport_info *info, - struct sk_buff *skb) -{ - struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); - u16 vlan_tci; - u8 sc; - - if (!__vlan_get_tag(skb, &vlan_tci)) { - u8 pcp = OPA_VNIC_VLAN_PCP(vlan_tci); - - if (is_multicast_ether_addr(mac_hdr->h_dest)) - sc = info->vport.pcp_to_sc_mc[pcp]; - else - sc = info->vport.pcp_to_sc_uc[pcp]; - } else { - if (is_multicast_ether_addr(mac_hdr->h_dest)) - sc = info->vport.non_vlan_sc_mc; - else - sc = info->vport.non_vlan_sc_uc; - } - - return sc; -} - -u8 opa_vnic_get_vl(struct opa_vnic_adapter *adapter, struct sk_buff *skb) -{ - struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); - struct __opa_veswport_info *info = &adapter->info; - u8 vl; - - if (skb_vlan_tag_present(skb)) { - u8 pcp = skb_vlan_tag_get(skb) >> VLAN_PRIO_SHIFT; - - if (is_multicast_ether_addr(mac_hdr->h_dest)) - vl = info->vport.pcp_to_vl_mc[pcp]; - else - vl = info->vport.pcp_to_vl_uc[pcp]; - } else { - if (is_multicast_ether_addr(mac_hdr->h_dest)) - vl = info->vport.non_vlan_vl_mc; - else - vl = info->vport.non_vlan_vl_uc; - } - - return vl; -} - -/* opa_vnic_get_rc - return the routing control */ -static u8 opa_vnic_get_rc(struct __opa_veswport_info *info, - struct sk_buff *skb) -{ - u8 proto, rout_ctrl; - - switch (vlan_get_protocol(skb)) { - case htons(ETH_P_IPV6): - proto = ipv6_hdr(skb)->nexthdr; - if (proto == IPPROTO_TCP) - rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, - IPV6_TCP); - else if (proto == IPPROTO_UDP) - rout_ctrl = 
OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, - IPV6_UDP); - else - rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, IPV6); - break; - case htons(ETH_P_IP): - proto = ip_hdr(skb)->protocol; - if (proto == IPPROTO_TCP) - rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, - IPV4_TCP); - else if (proto == IPPROTO_UDP) - rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, - IPV4_UDP); - else - rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, IPV4); - break; - default: - rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, DEFAULT); - } - - return rout_ctrl; -} - -/* opa_vnic_calc_entropy - calculate the packet entropy */ -u8 opa_vnic_calc_entropy(struct sk_buff *skb) -{ - u32 hash = skb_get_hash(skb); - - /* store XOR of all bytes in lower 8 bits */ - hash ^= hash >> 8; - hash ^= hash >> 16; - - /* return lower 8 bits as entropy */ - return (u8)(hash & 0xFF); -} - -/* opa_vnic_get_def_port - get default port based on entropy */ -static inline u8 opa_vnic_get_def_port(struct opa_vnic_adapter *adapter, - u8 entropy) -{ - u8 flow_id; - - /* Add the upper and lower 4-bits of entropy to get the flow id */ - flow_id = ((entropy & 0xf) + (entropy >> 4)); - return adapter->flow_tbl[flow_id & (OPA_VNIC_FLOW_TBL_SIZE - 1)]; -} - -/* Calculate packet length including OPA header, crc and padding */ -static inline int opa_vnic_wire_length(struct sk_buff *skb) -{ - u32 pad_len; - - /* padding for 8 bytes size alignment */ - pad_len = -(skb->len + OPA_VNIC_ICRC_TAIL_LEN) & 0x7; - pad_len += OPA_VNIC_ICRC_TAIL_LEN; - - return (skb->len + pad_len) >> 3; -} - -/* opa_vnic_encap_skb - encapsulate skb packet with OPA header and meta data */ -void opa_vnic_encap_skb(struct opa_vnic_adapter *adapter, struct sk_buff *skb) -{ - struct __opa_veswport_info *info = &adapter->info; - struct opa_vnic_skb_mdata *mdata; - u8 def_port, sc, rc, entropy, *hdr; - u16 len, l4_hdr; - u32 dlid; - - hdr = skb_push(skb, OPA_VNIC_HDR_LEN); - - entropy = opa_vnic_calc_entropy(skb); - def_port = opa_vnic_get_def_port(adapter, 
entropy); - len = opa_vnic_wire_length(skb); - dlid = opa_vnic_get_dlid(adapter, skb, def_port); - sc = opa_vnic_get_sc(info, skb); - rc = opa_vnic_get_rc(info, skb); - l4_hdr = info->vesw.vesw_id; - - mdata = skb_push(skb, sizeof(*mdata)); - mdata->vl = opa_vnic_get_vl(adapter, skb); - mdata->entropy = entropy; - mdata->flags = 0; - if (unlikely(!dlid)) { - mdata->flags = OPA_VNIC_SKB_MDATA_ENCAP_ERR; - return; - } - - opa_vnic_make_header(hdr, info->vport.encap_slid, dlid, len, - info->vesw.pkey, entropy, sc, rc, - OPA_VNIC_L4_ETHR, l4_hdr); -} diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h deleted file mode 100644 index 012fc27c5c93..000000000000 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h +++ /dev/null @@ -1,524 +0,0 @@ -#ifndef _OPA_VNIC_ENCAP_H -#define _OPA_VNIC_ENCAP_H -/* - * Copyright(c) 2017 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - */ - -/* - * This file contains all OPA VNIC declaration required for encapsulation - * and decapsulation of Ethernet packets - */ - -#include -#include - -/* EMA class version */ -#define OPA_EMA_CLASS_VERSION 0x80 - -/* - * Define the Intel vendor management class for OPA - * ETHERNET MANAGEMENT - */ -#define OPA_MGMT_CLASS_INTEL_EMA 0x34 - -/* EM attribute IDs */ -#define OPA_EM_ATTR_CLASS_PORT_INFO 0x0001 -#define OPA_EM_ATTR_VESWPORT_INFO 0x0011 -#define OPA_EM_ATTR_VESWPORT_MAC_ENTRIES 0x0012 -#define OPA_EM_ATTR_IFACE_UCAST_MACS 0x0013 -#define OPA_EM_ATTR_IFACE_MCAST_MACS 0x0014 -#define OPA_EM_ATTR_DELETE_VESW 0x0015 -#define OPA_EM_ATTR_VESWPORT_SUMMARY_COUNTERS 0x0020 -#define OPA_EM_ATTR_VESWPORT_ERROR_COUNTERS 0x0022 - -/* VNIC configured and operational state values */ -#define OPA_VNIC_STATE_DROP_ALL 0x1 -#define OPA_VNIC_STATE_FORWARDING 0x3 - -#define OPA_VESW_MAX_NUM_DEF_PORT 16 -#define OPA_VNIC_MAX_NUM_PCP 8 - -#define OPA_VNIC_EMA_DATA (OPA_MGMT_MAD_SIZE - IB_MGMT_VENDOR_HDR) - -/* Defines for vendor specific notice(trap) attributes */ -#define OPA_INTEL_EMA_NOTICE_TYPE_INFO 0x04 - -/* INTEL OUI */ -#define INTEL_OUI_1 0x00 -#define INTEL_OUI_2 0x06 -#define INTEL_OUI_3 0x6a - -/* Trap opcodes sent from VNIC */ -#define OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE 0x1 -#define OPA_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE 0x2 -#define OPA_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE 0x3 - -#define OPA_VNIC_DLID_SD_IS_SRC_MAC(dlid_sd) (!!((dlid_sd) & 0x20)) -#define OPA_VNIC_DLID_SD_GET_DLID(dlid_sd) ((dlid_sd) >> 8) - -/* VNIC Ethernet link status */ -#define OPA_VNIC_ETH_LINK_UP 1 -#define OPA_VNIC_ETH_LINK_DOWN 2 - -/* routing control */ -#define OPA_VNIC_ENCAP_RC_DEFAULT 0 -#define OPA_VNIC_ENCAP_RC_IPV4 4 -#define OPA_VNIC_ENCAP_RC_IPV4_UDP 8 -#define OPA_VNIC_ENCAP_RC_IPV4_TCP 12 -#define OPA_VNIC_ENCAP_RC_IPV6 16 -#define OPA_VNIC_ENCAP_RC_IPV6_TCP 20 -#define OPA_VNIC_ENCAP_RC_IPV6_UDP 24 - -#define OPA_VNIC_ENCAP_RC_EXT(w, b) (((w) >> 
OPA_VNIC_ENCAP_RC_ ## b) & 0x7) - -/** - * struct opa_vesw_info - OPA vnic switch information - * @fabric_id: 10-bit fabric id - * @vesw_id: 12-bit virtual ethernet switch id - * @rsvd0: reserved bytes - * @def_port_mask: bitmask of default ports - * @rsvd1: reserved bytes - * @pkey: partition key - * @rsvd2: reserved bytes - * @u_mcast_dlid: unknown multicast dlid - * @u_ucast_dlid: array of unknown unicast dlids - * @rsvd3: reserved bytes - * @rc: routing control - * @eth_mtu: Ethernet MTU - * @rsvd4: reserved bytes - */ -struct opa_vesw_info { - __be16 fabric_id; - __be16 vesw_id; - - u8 rsvd0[6]; - __be16 def_port_mask; - - u8 rsvd1[2]; - __be16 pkey; - - u8 rsvd2[4]; - __be32 u_mcast_dlid; - __be32 u_ucast_dlid[OPA_VESW_MAX_NUM_DEF_PORT]; - - __be32 rc; - - u8 rsvd3[56]; - __be16 eth_mtu; - u8 rsvd4[2]; -} __packed; - -/** - * struct opa_per_veswport_info - OPA vnic per port information - * @port_num: port number - * @eth_link_status: current ethernet link state - * @rsvd0: reserved bytes - * @base_mac_addr: base mac address - * @config_state: configured port state - * @oper_state: operational port state - * @max_mac_tbl_ent: max number of mac table entries - * @max_smac_ent: max smac entries in mac table - * @mac_tbl_digest: mac table digest - * @rsvd1: reserved bytes - * @encap_slid: base slid for the port - * @pcp_to_sc_uc: sc by pcp index for unicast ethernet packets - * @pcp_to_vl_uc: vl by pcp index for unicast ethernet packets - * @pcp_to_sc_mc: sc by pcp index for multicast ethernet packets - * @pcp_to_vl_mc: vl by pcp index for multicast ethernet packets - * @non_vlan_sc_uc: sc for non-vlan unicast ethernet packets - * @non_vlan_vl_uc: vl for non-vlan unicast ethernet packets - * @non_vlan_sc_mc: sc for non-vlan multicast ethernet packets - * @non_vlan_vl_mc: vl for non-vlan multicast ethernet packets - * @rsvd2: reserved bytes - * @uc_macs_gen_count: generation count for unicast macs list - * @mc_macs_gen_count: generation count for multicast macs 
list - * @rsvd3: reserved bytes - */ -struct opa_per_veswport_info { - __be32 port_num; - - u8 eth_link_status; - u8 rsvd0[3]; - - u8 base_mac_addr[ETH_ALEN]; - u8 config_state; - u8 oper_state; - - __be16 max_mac_tbl_ent; - __be16 max_smac_ent; - __be32 mac_tbl_digest; - u8 rsvd1[4]; - - __be32 encap_slid; - - u8 pcp_to_sc_uc[OPA_VNIC_MAX_NUM_PCP]; - u8 pcp_to_vl_uc[OPA_VNIC_MAX_NUM_PCP]; - u8 pcp_to_sc_mc[OPA_VNIC_MAX_NUM_PCP]; - u8 pcp_to_vl_mc[OPA_VNIC_MAX_NUM_PCP]; - - u8 non_vlan_sc_uc; - u8 non_vlan_vl_uc; - u8 non_vlan_sc_mc; - u8 non_vlan_vl_mc; - - u8 rsvd2[48]; - - __be16 uc_macs_gen_count; - __be16 mc_macs_gen_count; - - u8 rsvd3[8]; -} __packed; - -/** - * struct opa_veswport_info - OPA vnic port information - * @vesw: OPA vnic switch information - * @vport: OPA vnic per port information - * - * On host, each of the virtual ethernet ports belongs - * to a different virtual ethernet switches. - */ -struct opa_veswport_info { - struct opa_vesw_info vesw; - struct opa_per_veswport_info vport; -}; - -/** - * struct opa_veswport_mactable_entry - single entry in the forwarding table - * @mac_addr: MAC address - * @mac_addr_mask: MAC address bit mask - * @dlid_sd: Matching DLID and side data - * - * On the host each virtual ethernet port will have - * a forwarding table. These tables are used to - * map a MAC to a LID and other data. For more - * details see struct opa_veswport_mactable_entries. 
- * This is the structure of a single mactable entry - */ -struct opa_veswport_mactable_entry { - u8 mac_addr[ETH_ALEN]; - u8 mac_addr_mask[ETH_ALEN]; - __be32 dlid_sd; -} __packed; - -/** - * struct opa_veswport_mactable - Forwarding table array - * @offset: mac table starting offset - * @num_entries: Number of entries to get or set - * @mac_tbl_digest: mac table digest - * @tbl_entries: Array of table entries - * - * The EM sends down this structure in a MAD indicating - * the starting offset in the forwarding table that this - * entry is to be loaded into and the number of entries - * that that this MAD instance contains - * The mac_tbl_digest has been added to this MAD structure. It will be set by - * the EM and it will be used by the EM to check if there are any - * discrepancies with this value and the value - * maintained by the EM in the case of VNIC port being deleted or unloaded - * A new instantiation of a VNIC will always have a value of zero. - * This value is stored as part of the vnic adapter structure and will be - * accessed by the GET and SET routines for both the mactable entries and the - * veswport info. 
- */ -struct opa_veswport_mactable { - __be16 offset; - __be16 num_entries; - __be32 mac_tbl_digest; - struct opa_veswport_mactable_entry tbl_entries[]; -} __packed; - -/** - * struct opa_veswport_summary_counters - summary counters - * @vp_instance: vport instance on the OPA port - * @vesw_id: virtual ethernet switch id - * @veswport_num: virtual ethernet switch port number - * @tx_errors: transmit errors - * @rx_errors: receive errors - * @tx_packets: transmit packets - * @rx_packets: receive packets - * @tx_bytes: transmit bytes - * @rx_bytes: receive bytes - * @tx_unicast: unicast packets transmitted - * @tx_mcastbcast: multicast/broadcast packets transmitted - * @tx_untagged: non-vlan packets transmitted - * @tx_vlan: vlan packets transmitted - * @tx_64_size: transmit packet length is 64 bytes - * @tx_65_127: transmit packet length is >=65 and < 127 bytes - * @tx_128_255: transmit packet length is >=128 and < 255 bytes - * @tx_256_511: transmit packet length is >=256 and < 511 bytes - * @tx_512_1023: transmit packet length is >=512 and < 1023 bytes - * @tx_1024_1518: transmit packet length is >=1024 and < 1518 bytes - * @tx_1519_max: transmit packet length >= 1519 bytes - * @rx_unicast: unicast packets received - * @rx_mcastbcast: multicast/broadcast packets received - * @rx_untagged: non-vlan packets received - * @rx_vlan: vlan packets received - * @rx_64_size: received packet length is 64 bytes - * @rx_65_127: received packet length is >=65 and < 127 bytes - * @rx_128_255: received packet length is >=128 and < 255 bytes - * @rx_256_511: received packet length is >=256 and < 511 bytes - * @rx_512_1023: received packet length is >=512 and < 1023 bytes - * @rx_1024_1518: received packet length is >=1024 and < 1518 bytes - * @rx_1519_max: received packet length >= 1519 bytes - * @reserved: reserved bytes - * - * All the above are counters of corresponding conditions. 
- */ -struct opa_veswport_summary_counters { - __be16 vp_instance; - __be16 vesw_id; - __be32 veswport_num; - - __be64 tx_errors; - __be64 rx_errors; - __be64 tx_packets; - __be64 rx_packets; - __be64 tx_bytes; - __be64 rx_bytes; - - __be64 tx_unicast; - __be64 tx_mcastbcast; - - __be64 tx_untagged; - __be64 tx_vlan; - - __be64 tx_64_size; - __be64 tx_65_127; - __be64 tx_128_255; - __be64 tx_256_511; - __be64 tx_512_1023; - __be64 tx_1024_1518; - __be64 tx_1519_max; - - __be64 rx_unicast; - __be64 rx_mcastbcast; - - __be64 rx_untagged; - __be64 rx_vlan; - - __be64 rx_64_size; - __be64 rx_65_127; - __be64 rx_128_255; - __be64 rx_256_511; - __be64 rx_512_1023; - __be64 rx_1024_1518; - __be64 rx_1519_max; - - __be64 reserved[16]; -} __packed; - -/** - * struct opa_veswport_error_counters - error counters - * @vp_instance: vport instance on the OPA port - * @vesw_id: virtual ethernet switch id - * @veswport_num: virtual ethernet switch port number - * @tx_errors: transmit errors - * @rx_errors: receive errors - * @rsvd0: reserved bytes - * @tx_smac_filt: smac filter errors - * @rsvd1: reserved bytes - * @rsvd2: reserved bytes - * @rsvd3: reserved bytes - * @tx_dlid_zero: transmit packets with invalid dlid - * @rsvd4: reserved bytes - * @tx_logic: other transmit errors - * @rsvd5: reserved bytes - * @tx_drop_state: packet tansmission in non-forward port state - * @rx_bad_veswid: received packet with invalid vesw id - * @rsvd6: reserved bytes - * @rx_runt: received ethernet packet with length < 64 bytes - * @rx_oversize: received ethernet packet with length > MTU size - * @rsvd7: reserved bytes - * @rx_eth_down: received packets when interface is down - * @rx_drop_state: received packets in non-forwarding port state - * @rx_logic: other receive errors - * @rsvd8: reserved bytes - * @rsvd9: reserved bytes - * - * All the above are counters of corresponding error conditions. 
- */ -struct opa_veswport_error_counters { - __be16 vp_instance; - __be16 vesw_id; - __be32 veswport_num; - - __be64 tx_errors; - __be64 rx_errors; - - __be64 rsvd0; - __be64 tx_smac_filt; - __be64 rsvd1; - __be64 rsvd2; - __be64 rsvd3; - __be64 tx_dlid_zero; - __be64 rsvd4; - __be64 tx_logic; - __be64 rsvd5; - __be64 tx_drop_state; - - __be64 rx_bad_veswid; - __be64 rsvd6; - __be64 rx_runt; - __be64 rx_oversize; - __be64 rsvd7; - __be64 rx_eth_down; - __be64 rx_drop_state; - __be64 rx_logic; - __be64 rsvd8; - - __be64 rsvd9[16]; -} __packed; - -/** - * struct opa_veswport_trap - Trap message sent to EM by VNIC - * @fabric_id: 10 bit fabric id - * @veswid: 12 bit virtual ethernet switch id - * @veswportnum: logical port number on the Virtual switch - * @opaportnum: physical port num (redundant on host) - * @veswportindex: switch port index on opa port 0 based - * @opcode: operation - * @reserved: 32 bit for alignment - * - * The VNIC will send trap messages to the Ethernet manager to - * inform it about changes to the VNIC config, behaviour etc. - * This is the format of the trap payload. 
- */ -struct opa_veswport_trap { - __be16 fabric_id; - __be16 veswid; - __be32 veswportnum; - __be16 opaportnum; - u8 veswportindex; - u8 opcode; - __be32 reserved; -} __packed; - -/** - * struct opa_vnic_iface_mac_entry - single entry in the mac list - * @mac_addr: MAC address - */ -struct opa_vnic_iface_mac_entry { - u8 mac_addr[ETH_ALEN]; -}; - -/** - * struct opa_veswport_iface_macs - Msg to set globally administered MAC - * @start_idx: position of first entry (0 based) - * @num_macs_in_msg: number of MACs in this message - * @tot_macs_in_lst: The total number of MACs the agent has - * @gen_count: gen_count to indicate change - * @entry: The mac list entry - * - * Same attribute IDS and attribute modifiers as in locally administered - * addresses used to set globally administered addresses - */ -struct opa_veswport_iface_macs { - __be16 start_idx; - __be16 num_macs_in_msg; - __be16 tot_macs_in_lst; - __be16 gen_count; - struct opa_vnic_iface_mac_entry entry[]; -} __packed; - -/** - * struct opa_vnic_vema_mad - Generic VEMA MAD - * @mad_hdr: Generic MAD header - * @rmpp_hdr: RMPP header for vendor specific MADs - * @reserved: reserved bytes - * @oui: Unique org identifier - * @data: MAD data - */ -struct opa_vnic_vema_mad { - struct ib_mad_hdr mad_hdr; - struct ib_rmpp_hdr rmpp_hdr; - u8 reserved; - u8 oui[3]; - u8 data[OPA_VNIC_EMA_DATA]; -}; - -/** - * struct opa_vnic_notice_attr - Generic Notice MAD - * @gen_type: Generic/Specific bit and type of notice - * @oui_1: Vendor ID byte 1 - * @oui_2: Vendor ID byte 2 - * @oui_3: Vendor ID byte 3 - * @trap_num: Trap number - * @toggle_count: Notice toggle bit and count value - * @issuer_lid: Trap issuer's lid - * @reserved: reserved bytes - * @issuer_gid: Issuer GID (only if Report method) - * @raw_data: Trap message body - */ -struct opa_vnic_notice_attr { - u8 gen_type; - u8 oui_1; - u8 oui_2; - u8 oui_3; - __be16 trap_num; - __be16 toggle_count; - __be32 issuer_lid; - __be32 reserved; - u8 issuer_gid[16]; - u8 
raw_data[64]; -} __packed; - -/** - * struct opa_vnic_vema_mad_trap - Generic VEMA MAD Trap - * @mad_hdr: Generic MAD header - * @rmpp_hdr: RMPP header for vendor specific MADs - * @reserved: reserved bytes - * @oui: Unique org identifier - * @notice: Notice structure - */ -struct opa_vnic_vema_mad_trap { - struct ib_mad_hdr mad_hdr; - struct ib_rmpp_hdr rmpp_hdr; - u8 reserved; - u8 oui[3]; - struct opa_vnic_notice_attr notice; -}; - -#endif /* _OPA_VNIC_ENCAP_H */ diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c deleted file mode 100644 index 316959940d2f..000000000000 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright(c) 2017 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - */ - -/* - * This file contains OPA VNIC ethtool functions - */ - -#include - -#include "opa_vnic_internal.h" - -enum {NETDEV_STATS, VNIC_STATS}; - -struct vnic_stats { - char stat_string[ETH_GSTRING_LEN]; - struct { - int sizeof_stat; - int stat_offset; - }; -}; - -#define VNIC_STAT(m) { sizeof_field(struct opa_vnic_stats, m), \ - offsetof(struct opa_vnic_stats, m) } - -static struct vnic_stats vnic_gstrings_stats[] = { - /* NETDEV stats */ - {"rx_packets", VNIC_STAT(netstats.rx_packets)}, - {"tx_packets", VNIC_STAT(netstats.tx_packets)}, - {"rx_bytes", VNIC_STAT(netstats.rx_bytes)}, - {"tx_bytes", VNIC_STAT(netstats.tx_bytes)}, - {"rx_errors", VNIC_STAT(netstats.rx_errors)}, - {"tx_errors", VNIC_STAT(netstats.tx_errors)}, - {"rx_dropped", VNIC_STAT(netstats.rx_dropped)}, - {"tx_dropped", VNIC_STAT(netstats.tx_dropped)}, - - /* SUMMARY counters */ - {"tx_unicast", VNIC_STAT(tx_grp.unicast)}, - {"tx_mcastbcast", VNIC_STAT(tx_grp.mcastbcast)}, - {"tx_untagged", VNIC_STAT(tx_grp.untagged)}, - {"tx_vlan", VNIC_STAT(tx_grp.vlan)}, - - {"tx_64_size", VNIC_STAT(tx_grp.s_64)}, - {"tx_65_127", VNIC_STAT(tx_grp.s_65_127)}, - {"tx_128_255", VNIC_STAT(tx_grp.s_128_255)}, - {"tx_256_511", VNIC_STAT(tx_grp.s_256_511)}, - {"tx_512_1023", VNIC_STAT(tx_grp.s_512_1023)}, - {"tx_1024_1518", VNIC_STAT(tx_grp.s_1024_1518)}, - {"tx_1519_max", VNIC_STAT(tx_grp.s_1519_max)}, - - {"rx_unicast", VNIC_STAT(rx_grp.unicast)}, - {"rx_mcastbcast", VNIC_STAT(rx_grp.mcastbcast)}, - {"rx_untagged", VNIC_STAT(rx_grp.untagged)}, - {"rx_vlan", VNIC_STAT(rx_grp.vlan)}, - - {"rx_64_size", VNIC_STAT(rx_grp.s_64)}, - {"rx_65_127", VNIC_STAT(rx_grp.s_65_127)}, - {"rx_128_255", VNIC_STAT(rx_grp.s_128_255)}, - {"rx_256_511", VNIC_STAT(rx_grp.s_256_511)}, - {"rx_512_1023", VNIC_STAT(rx_grp.s_512_1023)}, - {"rx_1024_1518", VNIC_STAT(rx_grp.s_1024_1518)}, - {"rx_1519_max", VNIC_STAT(rx_grp.s_1519_max)}, - - /* ERROR counters */ - {"rx_fifo_errors", VNIC_STAT(netstats.rx_fifo_errors)}, - 
{"rx_length_errors", VNIC_STAT(netstats.rx_length_errors)}, - - {"tx_fifo_errors", VNIC_STAT(netstats.tx_fifo_errors)}, - {"tx_carrier_errors", VNIC_STAT(netstats.tx_carrier_errors)}, - - {"tx_dlid_zero", VNIC_STAT(tx_dlid_zero)}, - {"tx_drop_state", VNIC_STAT(tx_drop_state)}, - {"rx_drop_state", VNIC_STAT(rx_drop_state)}, - {"rx_oversize", VNIC_STAT(rx_oversize)}, - {"rx_runt", VNIC_STAT(rx_runt)}, -}; - -#define VNIC_STATS_LEN ARRAY_SIZE(vnic_gstrings_stats) - -/* vnic_get_drvinfo - get driver info */ -static void vnic_get_drvinfo(struct net_device *netdev, - struct ethtool_drvinfo *drvinfo) -{ - strscpy(drvinfo->driver, opa_vnic_driver_name, sizeof(drvinfo->driver)); - strscpy(drvinfo->bus_info, dev_name(netdev->dev.parent), - sizeof(drvinfo->bus_info)); -} - -/* vnic_get_sset_count - get string set count */ -static int vnic_get_sset_count(struct net_device *netdev, int sset) -{ - return (sset == ETH_SS_STATS) ? VNIC_STATS_LEN : -EOPNOTSUPP; -} - -/* vnic_get_ethtool_stats - get statistics */ -static void vnic_get_ethtool_stats(struct net_device *netdev, - struct ethtool_stats *stats, u64 *data) -{ - struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); - struct opa_vnic_stats vstats; - int i; - - memset(&vstats, 0, sizeof(vstats)); - spin_lock(&adapter->stats_lock); - adapter->rn_ops->ndo_get_stats64(netdev, &vstats.netstats); - spin_unlock(&adapter->stats_lock); - for (i = 0; i < VNIC_STATS_LEN; i++) { - char *p = (char *)&vstats + vnic_gstrings_stats[i].stat_offset; - - data[i] = (vnic_gstrings_stats[i].sizeof_stat == - sizeof(u64)) ? 
*(u64 *)p : *(u32 *)p; - } -} - -/* vnic_get_strings - get strings */ -static void vnic_get_strings(struct net_device *netdev, u32 stringset, u8 *data) -{ - int i; - - if (stringset != ETH_SS_STATS) - return; - - for (i = 0; i < VNIC_STATS_LEN; i++) - ethtool_puts(&data, vnic_gstrings_stats[i].stat_string); -} - -/* ethtool ops */ -static const struct ethtool_ops opa_vnic_ethtool_ops = { - .get_drvinfo = vnic_get_drvinfo, - .get_link = ethtool_op_get_link, - .get_strings = vnic_get_strings, - .get_sset_count = vnic_get_sset_count, - .get_ethtool_stats = vnic_get_ethtool_stats, -}; - -/* opa_vnic_set_ethtool_ops - set ethtool ops */ -void opa_vnic_set_ethtool_ops(struct net_device *netdev) -{ - netdev->ethtool_ops = &opa_vnic_ethtool_ops; -} diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h deleted file mode 100644 index dd942dd642bd..000000000000 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h +++ /dev/null @@ -1,329 +0,0 @@ -#ifndef _OPA_VNIC_INTERNAL_H -#define _OPA_VNIC_INTERNAL_H -/* - * Copyright(c) 2017 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - */ - -/* - * This file contains OPA VNIC driver internal declarations - */ - -#include -#include -#include -#include -#include - -#include "opa_vnic_encap.h" - -#define OPA_VNIC_VLAN_PCP(vlan_tci) \ - (((vlan_tci) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) - -/* Flow to default port redirection table size */ -#define OPA_VNIC_FLOW_TBL_SIZE 32 - -/* Invalid port number */ -#define OPA_VNIC_INVALID_PORT 0xff - -struct opa_vnic_adapter; - -/* - * struct __opa_vesw_info - OPA vnic virtual switch info - * - * Same as opa_vesw_info without bitwise attribute. - */ -struct __opa_vesw_info { - u16 fabric_id; - u16 vesw_id; - - u8 rsvd0[6]; - u16 def_port_mask; - - u8 rsvd1[2]; - u16 pkey; - - u8 rsvd2[4]; - u32 u_mcast_dlid; - u32 u_ucast_dlid[OPA_VESW_MAX_NUM_DEF_PORT]; - - u32 rc; - - u8 rsvd3[56]; - u16 eth_mtu; - u8 rsvd4[2]; -} __packed; - -/* - * struct __opa_per_veswport_info - OPA vnic per port info - * - * Same as opa_per_veswport_info without bitwise attribute. - */ -struct __opa_per_veswport_info { - u32 port_num; - - u8 eth_link_status; - u8 rsvd0[3]; - - u8 base_mac_addr[ETH_ALEN]; - u8 config_state; - u8 oper_state; - - u16 max_mac_tbl_ent; - u16 max_smac_ent; - u32 mac_tbl_digest; - u8 rsvd1[4]; - - u32 encap_slid; - - u8 pcp_to_sc_uc[OPA_VNIC_MAX_NUM_PCP]; - u8 pcp_to_vl_uc[OPA_VNIC_MAX_NUM_PCP]; - u8 pcp_to_sc_mc[OPA_VNIC_MAX_NUM_PCP]; - u8 pcp_to_vl_mc[OPA_VNIC_MAX_NUM_PCP]; - - u8 non_vlan_sc_uc; - u8 non_vlan_vl_uc; - u8 non_vlan_sc_mc; - u8 non_vlan_vl_mc; - - u8 rsvd2[48]; - - u16 uc_macs_gen_count; - u16 mc_macs_gen_count; - - u8 rsvd3[8]; -} __packed; - -/* - * struct __opa_veswport_info - OPA vnic port info - * - * Same as opa_veswport_info without bitwise attribute. - */ -struct __opa_veswport_info { - struct __opa_vesw_info vesw; - struct __opa_per_veswport_info vport; -}; - -/* - * struct __opa_veswport_trap - OPA vnic trap info - * - * Same as opa_veswport_trap without bitwise attribute. 
- */ -struct __opa_veswport_trap { - u16 fabric_id; - u16 veswid; - u32 veswportnum; - u16 opaportnum; - u8 veswportindex; - u8 opcode; - u32 reserved; -} __packed; - -/** - * struct opa_vnic_ctrl_port - OPA virtual NIC control port - * @ibdev: pointer to ib device - * @ops: opa vnic control operations - * @num_ports: number of opa ports - */ -struct opa_vnic_ctrl_port { - struct ib_device *ibdev; - struct opa_vnic_ctrl_ops *ops; - u8 num_ports; -}; - -/** - * struct opa_vnic_adapter - OPA VNIC netdev private data structure - * @netdev: pointer to associated netdev - * @ibdev: ib device - * @cport: pointer to opa vnic control port - * @rn_ops: rdma netdev's net_device_ops - * @port_num: OPA port number - * @vport_num: vesw port number - * @lock: adapter lock - * @info: virtual ethernet switch port information - * @vema_mac_addr: mac address configured by vema - * @umac_hash: unicast maclist hash - * @mmac_hash: multicast maclist hash - * @mactbl: hash table of MAC entries - * @mactbl_lock: mac table lock - * @stats_lock: statistics lock - * @flow_tbl: flow to default port redirection table - * @trap_timeout: trap timeout - * @trap_count: no. 
of traps allowed within timeout period - */ -struct opa_vnic_adapter { - struct net_device *netdev; - struct ib_device *ibdev; - struct opa_vnic_ctrl_port *cport; - const struct net_device_ops *rn_ops; - - u8 port_num; - u8 vport_num; - - /* Lock used around concurrent updates to netdev */ - struct mutex lock; - - struct __opa_veswport_info info; - u8 vema_mac_addr[ETH_ALEN]; - u32 umac_hash; - u32 mmac_hash; - struct hlist_head __rcu *mactbl; - - /* Lock used to protect updates to mac table */ - struct mutex mactbl_lock; - - /* Lock used to protect access to vnic counters */ - spinlock_t stats_lock; - - u8 flow_tbl[OPA_VNIC_FLOW_TBL_SIZE]; - - unsigned long trap_timeout; - u8 trap_count; -}; - -/* Same as opa_veswport_mactable_entry, but without bitwise attribute */ -struct __opa_vnic_mactable_entry { - u8 mac_addr[ETH_ALEN]; - u8 mac_addr_mask[ETH_ALEN]; - u32 dlid_sd; -} __packed; - -/** - * struct opa_vnic_mac_tbl_node - OPA VNIC mac table node - * @hlist: hash list handle - * @index: index of entry in the mac table - * @entry: entry in the table - */ -struct opa_vnic_mac_tbl_node { - struct hlist_node hlist; - u16 index; - struct __opa_vnic_mactable_entry entry; -}; - -#define v_dbg(format, arg...) \ - netdev_dbg(adapter->netdev, format, ## arg) -#define v_err(format, arg...) \ - netdev_err(adapter->netdev, format, ## arg) -#define v_info(format, arg...) \ - netdev_info(adapter->netdev, format, ## arg) -#define v_warn(format, arg...) \ - netdev_warn(adapter->netdev, format, ## arg) - -#define c_err(format, arg...) \ - dev_err(&cport->ibdev->dev, format, ## arg) -#define c_info(format, arg...) \ - dev_info(&cport->ibdev->dev, format, ## arg) -#define c_dbg(format, arg...) 
\ - dev_dbg(&cport->ibdev->dev, format, ## arg) - -/* The maximum allowed entries in the mac table */ -#define OPA_VNIC_MAC_TBL_MAX_ENTRIES 2048 -/* Limit of smac entries in mac table */ -#define OPA_VNIC_MAX_SMAC_LIMIT 256 - -/* The last octet of the MAC address is used as the key to the hash table */ -#define OPA_VNIC_MAC_HASH_IDX 5 - -/* The VNIC MAC hash table is of size 2^8 */ -#define OPA_VNIC_MAC_TBL_HASH_BITS 8 -#define OPA_VNIC_MAC_TBL_SIZE BIT(OPA_VNIC_MAC_TBL_HASH_BITS) - -/* VNIC HASH MACROS */ -#define vnic_hash_init(hashtable) __hash_init(hashtable, OPA_VNIC_MAC_TBL_SIZE) - -#define vnic_hash_add(hashtable, node, key) \ - hlist_add_head(node, \ - &hashtable[hash_min(key, ilog2(OPA_VNIC_MAC_TBL_SIZE))]) - -#define vnic_hash_for_each_safe(name, bkt, tmp, obj, member) \ - for ((bkt) = 0, obj = NULL; \ - !obj && (bkt) < OPA_VNIC_MAC_TBL_SIZE; (bkt)++) \ - hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) - -#define vnic_hash_for_each_possible(name, obj, member, key) \ - hlist_for_each_entry(obj, \ - &name[hash_min(key, ilog2(OPA_VNIC_MAC_TBL_SIZE))], member) - -#define vnic_hash_for_each(name, bkt, obj, member) \ - for ((bkt) = 0, obj = NULL; \ - !obj && (bkt) < OPA_VNIC_MAC_TBL_SIZE; (bkt)++) \ - hlist_for_each_entry(obj, &name[bkt], member) - -extern char opa_vnic_driver_name[]; - -struct opa_vnic_adapter *opa_vnic_add_netdev(struct ib_device *ibdev, - u8 port_num, u8 vport_num); -void opa_vnic_rem_netdev(struct opa_vnic_adapter *adapter); -void opa_vnic_encap_skb(struct opa_vnic_adapter *adapter, struct sk_buff *skb); -u8 opa_vnic_get_vl(struct opa_vnic_adapter *adapter, struct sk_buff *skb); -u8 opa_vnic_calc_entropy(struct sk_buff *skb); -void opa_vnic_process_vema_config(struct opa_vnic_adapter *adapter); -void opa_vnic_release_mac_tbl(struct opa_vnic_adapter *adapter); -void opa_vnic_query_mac_tbl(struct opa_vnic_adapter *adapter, - struct opa_veswport_mactable *tbl); -int opa_vnic_update_mac_tbl(struct opa_vnic_adapter *adapter, - struct 
opa_veswport_mactable *tbl); -void opa_vnic_query_ucast_macs(struct opa_vnic_adapter *adapter, - struct opa_veswport_iface_macs *macs); -void opa_vnic_query_mcast_macs(struct opa_vnic_adapter *adapter, - struct opa_veswport_iface_macs *macs); -void opa_vnic_get_summary_counters(struct opa_vnic_adapter *adapter, - struct opa_veswport_summary_counters *cntrs); -void opa_vnic_get_error_counters(struct opa_vnic_adapter *adapter, - struct opa_veswport_error_counters *cntrs); -void opa_vnic_get_vesw_info(struct opa_vnic_adapter *adapter, - struct opa_vesw_info *info); -void opa_vnic_set_vesw_info(struct opa_vnic_adapter *adapter, - struct opa_vesw_info *info); -void opa_vnic_get_per_veswport_info(struct opa_vnic_adapter *adapter, - struct opa_per_veswport_info *info); -void opa_vnic_set_per_veswport_info(struct opa_vnic_adapter *adapter, - struct opa_per_veswport_info *info); -void opa_vnic_vema_report_event(struct opa_vnic_adapter *adapter, u8 event); -void opa_vnic_set_ethtool_ops(struct net_device *netdev); -void opa_vnic_vema_send_trap(struct opa_vnic_adapter *adapter, - struct __opa_veswport_trap *data, u32 lid); - -#endif /* _OPA_VNIC_INTERNAL_H */ diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c deleted file mode 100644 index 1c3e7251f0f4..000000000000 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c +++ /dev/null @@ -1,400 +0,0 @@ -/* - * Copyright(c) 2017 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -/* - * This file contains OPA Virtual Network Interface Controller (VNIC) driver - * netdev functionality. 
- */ - -#include -#include - -#include "opa_vnic_internal.h" - -#define OPA_TX_TIMEOUT_MS 1000 - -#define OPA_VNIC_SKB_HEADROOM \ - ALIGN((OPA_VNIC_HDR_LEN + OPA_VNIC_SKB_MDATA_LEN), 8) - -/* This function is overloaded for opa_vnic specific implementation */ -static void opa_vnic_get_stats64(struct net_device *netdev, - struct rtnl_link_stats64 *stats) -{ - struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); - struct opa_vnic_stats vstats; - - memset(&vstats, 0, sizeof(vstats)); - spin_lock(&adapter->stats_lock); - adapter->rn_ops->ndo_get_stats64(netdev, &vstats.netstats); - spin_unlock(&adapter->stats_lock); - memcpy(stats, &vstats.netstats, sizeof(*stats)); -} - -/* opa_netdev_start_xmit - transmit function */ -static netdev_tx_t opa_netdev_start_xmit(struct sk_buff *skb, - struct net_device *netdev) -{ - struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); - - v_dbg("xmit: queue %d skb len %d\n", skb->queue_mapping, skb->len); - /* pad to ensure mininum ethernet packet length */ - if (unlikely(skb->len < ETH_ZLEN)) { - if (skb_padto(skb, ETH_ZLEN)) - return NETDEV_TX_OK; - - skb_put(skb, ETH_ZLEN - skb->len); - } - - opa_vnic_encap_skb(adapter, skb); - return adapter->rn_ops->ndo_start_xmit(skb, netdev); -} - -static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb, - struct net_device *sb_dev) -{ - struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); - struct opa_vnic_skb_mdata *mdata; - int rc; - - /* pass entropy and vl as metadata in skb */ - mdata = skb_push(skb, sizeof(*mdata)); - mdata->entropy = opa_vnic_calc_entropy(skb); - mdata->vl = opa_vnic_get_vl(adapter, skb); - rc = adapter->rn_ops->ndo_select_queue(netdev, skb, sb_dev); - skb_pull(skb, sizeof(*mdata)); - return rc; -} - -static void opa_vnic_update_state(struct opa_vnic_adapter *adapter, bool up) -{ - struct __opa_veswport_info *info = &adapter->info; - - mutex_lock(&adapter->lock); - /* Operational state can only be DROP_ALL or FORWARDING */ - if 
((info->vport.config_state == OPA_VNIC_STATE_FORWARDING) && up) { - info->vport.oper_state = OPA_VNIC_STATE_FORWARDING; - info->vport.eth_link_status = OPA_VNIC_ETH_LINK_UP; - } else { - info->vport.oper_state = OPA_VNIC_STATE_DROP_ALL; - info->vport.eth_link_status = OPA_VNIC_ETH_LINK_DOWN; - } - - if (info->vport.config_state == OPA_VNIC_STATE_FORWARDING) - netif_dormant_off(adapter->netdev); - else - netif_dormant_on(adapter->netdev); - mutex_unlock(&adapter->lock); -} - -/* opa_vnic_process_vema_config - process vema configuration updates */ -void opa_vnic_process_vema_config(struct opa_vnic_adapter *adapter) -{ - struct __opa_veswport_info *info = &adapter->info; - struct rdma_netdev *rn = netdev_priv(adapter->netdev); - u8 port_num[OPA_VESW_MAX_NUM_DEF_PORT] = { 0 }; - struct net_device *netdev = adapter->netdev; - u8 i, port_count = 0; - u16 port_mask; - - /* If the base_mac_addr is changed, update the interface mac address */ - if (memcmp(info->vport.base_mac_addr, adapter->vema_mac_addr, - ARRAY_SIZE(info->vport.base_mac_addr))) { - struct sockaddr saddr; - - memcpy(saddr.sa_data, info->vport.base_mac_addr, - ARRAY_SIZE(info->vport.base_mac_addr)); - mutex_lock(&adapter->lock); - eth_commit_mac_addr_change(netdev, &saddr); - memcpy(adapter->vema_mac_addr, - info->vport.base_mac_addr, ETH_ALEN); - mutex_unlock(&adapter->lock); - } - - rn->set_id(netdev, info->vesw.vesw_id); - - /* Handle MTU limit change */ - rtnl_lock(); - netdev->max_mtu = max_t(unsigned int, info->vesw.eth_mtu, - netdev->min_mtu); - if (netdev->mtu > netdev->max_mtu) - dev_set_mtu(netdev, netdev->max_mtu); - rtnl_unlock(); - - /* Update flow to default port redirection table */ - port_mask = info->vesw.def_port_mask; - for (i = 0; i < OPA_VESW_MAX_NUM_DEF_PORT; i++) { - if (port_mask & 1) - port_num[port_count++] = i; - port_mask >>= 1; - } - - /* - * Build the flow table. Flow table is required when destination LID - * is not available. Up to OPA_VNIC_FLOW_TBL_SIZE flows supported. 
- * Each flow need a default port number to get its dlid from the - * u_ucast_dlid array. - */ - for (i = 0; i < OPA_VNIC_FLOW_TBL_SIZE; i++) - adapter->flow_tbl[i] = port_count ? port_num[i % port_count] : - OPA_VNIC_INVALID_PORT; - - /* update state */ - opa_vnic_update_state(adapter, !!(netdev->flags & IFF_UP)); -} - -/* - * Set the power on default values in adapter's vema interface structure. - */ -static inline void opa_vnic_set_pod_values(struct opa_vnic_adapter *adapter) -{ - adapter->info.vport.max_mac_tbl_ent = OPA_VNIC_MAC_TBL_MAX_ENTRIES; - adapter->info.vport.max_smac_ent = OPA_VNIC_MAX_SMAC_LIMIT; - adapter->info.vport.config_state = OPA_VNIC_STATE_DROP_ALL; - adapter->info.vport.eth_link_status = OPA_VNIC_ETH_LINK_DOWN; - adapter->info.vesw.eth_mtu = ETH_DATA_LEN; -} - -/* opa_vnic_set_mac_addr - change mac address */ -static int opa_vnic_set_mac_addr(struct net_device *netdev, void *addr) -{ - struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); - struct sockaddr *sa = addr; - int rc; - - if (!memcmp(netdev->dev_addr, sa->sa_data, ETH_ALEN)) - return 0; - - mutex_lock(&adapter->lock); - rc = eth_mac_addr(netdev, addr); - mutex_unlock(&adapter->lock); - if (rc) - return rc; - - adapter->info.vport.uc_macs_gen_count++; - opa_vnic_vema_report_event(adapter, - OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE); - return 0; -} - -/* - * opa_vnic_mac_send_event - post event on possible mac list exchange - * Send trap when digest from uc/mc mac list differs from previous run. - * Digest is evaluated similar to how cksum does. 
- */ -static void opa_vnic_mac_send_event(struct net_device *netdev, u8 event) -{ - struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); - struct netdev_hw_addr *ha; - struct netdev_hw_addr_list *hw_list; - u32 *ref_crc; - u32 l, crc = 0; - - switch (event) { - case OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE: - hw_list = &netdev->uc; - adapter->info.vport.uc_macs_gen_count++; - ref_crc = &adapter->umac_hash; - break; - case OPA_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE: - hw_list = &netdev->mc; - adapter->info.vport.mc_macs_gen_count++; - ref_crc = &adapter->mmac_hash; - break; - default: - return; - } - netdev_hw_addr_list_for_each(ha, hw_list) { - crc = crc32_le(crc, ha->addr, ETH_ALEN); - } - l = netdev_hw_addr_list_count(hw_list) * ETH_ALEN; - crc = ~crc32_le(crc, (void *)&l, sizeof(l)); - - if (crc != *ref_crc) { - *ref_crc = crc; - opa_vnic_vema_report_event(adapter, event); - } -} - -/* opa_vnic_set_rx_mode - handle uc/mc mac list change */ -static void opa_vnic_set_rx_mode(struct net_device *netdev) -{ - opa_vnic_mac_send_event(netdev, - OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE); - - opa_vnic_mac_send_event(netdev, - OPA_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE); -} - -/* opa_netdev_open - activate network interface */ -static int opa_netdev_open(struct net_device *netdev) -{ - struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); - int rc; - - rc = adapter->rn_ops->ndo_open(adapter->netdev); - if (rc) { - v_dbg("open failed %d\n", rc); - return rc; - } - - /* Update status and send trap */ - opa_vnic_update_state(adapter, true); - opa_vnic_vema_report_event(adapter, - OPA_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE); - return 0; -} - -/* opa_netdev_close - disable network interface */ -static int opa_netdev_close(struct net_device *netdev) -{ - struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); - int rc; - - rc = adapter->rn_ops->ndo_stop(adapter->netdev); - if (rc) { - v_dbg("close failed %d\n", rc); - return rc; - } - - /* Update status and send trap */ - 
opa_vnic_update_state(adapter, false); - opa_vnic_vema_report_event(adapter, - OPA_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE); - return 0; -} - -/* netdev ops */ -static const struct net_device_ops opa_netdev_ops = { - .ndo_open = opa_netdev_open, - .ndo_stop = opa_netdev_close, - .ndo_start_xmit = opa_netdev_start_xmit, - .ndo_get_stats64 = opa_vnic_get_stats64, - .ndo_set_rx_mode = opa_vnic_set_rx_mode, - .ndo_select_queue = opa_vnic_select_queue, - .ndo_set_mac_address = opa_vnic_set_mac_addr, -}; - -/* opa_vnic_add_netdev - create vnic netdev interface */ -struct opa_vnic_adapter *opa_vnic_add_netdev(struct ib_device *ibdev, - u8 port_num, u8 vport_num) -{ - struct opa_vnic_adapter *adapter; - struct net_device *netdev; - struct rdma_netdev *rn; - int rc; - - netdev = ibdev->ops.alloc_rdma_netdev(ibdev, port_num, - RDMA_NETDEV_OPA_VNIC, - "veth%d", NET_NAME_UNKNOWN, - ether_setup); - if (!netdev) - return ERR_PTR(-ENOMEM); - else if (IS_ERR(netdev)) - return ERR_CAST(netdev); - - rn = netdev_priv(netdev); - adapter = kzalloc_obj(*adapter); - if (!adapter) { - rc = -ENOMEM; - goto adapter_err; - } - - rn->clnt_priv = adapter; - rn->hca = ibdev; - rn->port_num = port_num; - adapter->netdev = netdev; - adapter->ibdev = ibdev; - adapter->port_num = port_num; - adapter->vport_num = vport_num; - adapter->rn_ops = netdev->netdev_ops; - - netdev->netdev_ops = &opa_netdev_ops; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - netdev->hard_header_len += OPA_VNIC_SKB_HEADROOM; - mutex_init(&adapter->lock); - mutex_init(&adapter->mactbl_lock); - spin_lock_init(&adapter->stats_lock); - - SET_NETDEV_DEV(netdev, ibdev->dev.parent); - - opa_vnic_set_ethtool_ops(netdev); - - opa_vnic_set_pod_values(adapter); - - rc = register_netdev(netdev); - if (rc) - goto netdev_err; - - netif_carrier_off(netdev); - netif_dormant_on(netdev); - v_info("initialized\n"); - - return adapter; -netdev_err: - mutex_destroy(&adapter->lock); - mutex_destroy(&adapter->mactbl_lock); - kfree(adapter); 
-adapter_err: - rn->free_rdma_netdev(netdev); - - return ERR_PTR(rc); -} - -/* opa_vnic_rem_netdev - remove vnic netdev interface */ -void opa_vnic_rem_netdev(struct opa_vnic_adapter *adapter) -{ - struct net_device *netdev = adapter->netdev; - struct rdma_netdev *rn = netdev_priv(netdev); - - v_info("removing\n"); - unregister_netdev(netdev); - opa_vnic_release_mac_tbl(adapter); - mutex_destroy(&adapter->lock); - mutex_destroy(&adapter->mactbl_lock); - kfree(adapter); - rn->free_rdma_netdev(netdev); -} diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c deleted file mode 100644 index 21c6cea8b1db..000000000000 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ /dev/null @@ -1,1056 +0,0 @@ -/* - * Copyright(c) 2017 Intel Corporation. - * Copyright(c) 2021 Cornelis Networks. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -/* - * This file contains OPX Virtual Network Interface Controller (VNIC) - * Ethernet Management Agent (EMA) driver - */ - -#include -#include -#include -#include -#include -#include - -#include "opa_vnic_internal.h" - -char opa_vnic_driver_name[] = "opa_vnic"; - -/* - * The trap service level is kept in bits 3 to 7 in the trap_sl_rsvd - * field in the class port info MAD. - */ -#define GET_TRAP_SL_FROM_CLASS_PORT_INFO(x) (((x) >> 3) & 0x1f) - -/* Cap trap bursts to a reasonable limit good for normal cases */ -#define OPA_VNIC_TRAP_BURST_LIMIT 4 - -/* - * VNIC trap limit timeout. - * Inverse of cap2_mask response time out (1.0737 secs) = 0.9 - * secs approx IB spec 13.4.6.2.1 PortInfoSubnetTimeout and - * 13.4.9 Traps. 
- */ -#define OPA_VNIC_TRAP_TIMEOUT ((4096 * (1UL << 18)) / 1000) - -#define OPA_VNIC_UNSUP_ATTR \ - cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB) - -#define OPA_VNIC_INVAL_ATTR \ - cpu_to_be16(IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE) - -#define OPA_VNIC_CLASS_CAP_TRAP 0x1 - -/* Maximum number of VNIC ports supported */ -#define OPA_VNIC_MAX_NUM_VPORT 255 - -/** - * struct opa_vnic_vema_port -- VNIC VEMA port details - * @cport: pointer to port - * @mad_agent: pointer to mad agent for port - * @class_port_info: Class port info information. - * @tid: Transaction id - * @port_num: OPA port number - * @vports: vnic ports - * @event_handler: ib event handler - * @lock: adapter interface lock - */ -struct opa_vnic_vema_port { - struct opa_vnic_ctrl_port *cport; - struct ib_mad_agent *mad_agent; - struct opa_class_port_info class_port_info; - u64 tid; - u8 port_num; - struct xarray vports; - struct ib_event_handler event_handler; - - /* Lock to query/update network adapter */ - struct mutex lock; -}; - -static int opa_vnic_vema_add_one(struct ib_device *device); -static void opa_vnic_vema_rem_one(struct ib_device *device, - void *client_data); - -static struct ib_client opa_vnic_client = { - .name = opa_vnic_driver_name, - .add = opa_vnic_vema_add_one, - .remove = opa_vnic_vema_rem_one, -}; - -/** - * vema_get_vport_num -- Get the vnic from the mad - * @recvd_mad: Received mad - * - * Return: returns value of the vnic port number - */ -static inline u8 vema_get_vport_num(struct opa_vnic_vema_mad *recvd_mad) -{ - return be32_to_cpu(recvd_mad->mad_hdr.attr_mod) & 0xff; -} - -/** - * vema_get_vport_adapter -- Get vnic port adapter from recvd mad - * @recvd_mad: received mad - * @port: ptr to port struct on which MAD was recvd - * - * Return: vnic adapter - */ -static inline struct opa_vnic_adapter * -vema_get_vport_adapter(struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_port *port) -{ - u8 vport_num = vema_get_vport_num(recvd_mad); - - return 
xa_load(&port->vports, vport_num); -} - -/** - * vema_mac_tbl_req_ok -- Check if mac request has correct values - * @mac_tbl: mac table - * - * This function checks for the validity of the offset and number of - * entries required. - * - * Return: true if offset and num_entries are valid - */ -static inline bool vema_mac_tbl_req_ok(struct opa_veswport_mactable *mac_tbl) -{ - u16 offset, num_entries; - u16 req_entries = ((OPA_VNIC_EMA_DATA - sizeof(*mac_tbl)) / - sizeof(mac_tbl->tbl_entries[0])); - - offset = be16_to_cpu(mac_tbl->offset); - num_entries = be16_to_cpu(mac_tbl->num_entries); - - return ((num_entries <= req_entries) && - (offset + num_entries <= OPA_VNIC_MAC_TBL_MAX_ENTRIES)); -} - -/* - * Return the power on default values in the port info structure - * in big endian format as required by MAD. - */ -static inline void vema_get_pod_values(struct opa_veswport_info *port_info) -{ - memset(port_info, 0, sizeof(*port_info)); - port_info->vport.max_mac_tbl_ent = - cpu_to_be16(OPA_VNIC_MAC_TBL_MAX_ENTRIES); - port_info->vport.max_smac_ent = - cpu_to_be16(OPA_VNIC_MAX_SMAC_LIMIT); - port_info->vport.oper_state = OPA_VNIC_STATE_DROP_ALL; - port_info->vport.config_state = OPA_VNIC_STATE_DROP_ALL; - port_info->vesw.eth_mtu = cpu_to_be16(ETH_DATA_LEN); -} - -/** - * vema_add_vport -- Add a new vnic port - * @port: ptr to opa_vnic_vema_port struct - * @vport_num: vnic port number (to be added) - * - * Return a pointer to the vnic adapter structure - */ -static struct opa_vnic_adapter *vema_add_vport(struct opa_vnic_vema_port *port, - u8 vport_num) -{ - struct opa_vnic_ctrl_port *cport = port->cport; - struct opa_vnic_adapter *adapter; - - adapter = opa_vnic_add_netdev(cport->ibdev, port->port_num, vport_num); - if (!IS_ERR(adapter)) { - int rc; - - adapter->cport = cport; - rc = xa_insert(&port->vports, vport_num, adapter, GFP_KERNEL); - if (rc < 0) { - opa_vnic_rem_netdev(adapter); - adapter = ERR_PTR(rc); - } - } - - return adapter; -} - -/** - * 
vema_get_class_port_info -- Get class info for port - * @port: Port on whic MAD was received - * @recvd_mad: pointer to the received mad - * @rsp_mad: pointer to respose mad - * - * This function copies the latest class port info value set for the - * port and stores it for generating traps - */ -static void vema_get_class_port_info(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - struct opa_class_port_info *port_info; - - port_info = (struct opa_class_port_info *)rsp_mad->data; - memcpy(port_info, &port->class_port_info, sizeof(*port_info)); - port_info->base_version = OPA_MGMT_BASE_VERSION; - port_info->class_version = OPA_EMA_CLASS_VERSION; - - /* - * Set capability mask bit indicating agent generates traps, - * and set the maximum number of VNIC ports supported. - */ - port_info->cap_mask = cpu_to_be16((OPA_VNIC_CLASS_CAP_TRAP | - (OPA_VNIC_MAX_NUM_VPORT << 8))); - - /* - * Since a get routine is always sent by the EM first we - * set the expected response time to - * 4.096 usec * 2^18 == 1.0737 sec here. 
- */ - port_info->cap_mask2_resp_time = cpu_to_be32(18); -} - -/** - * vema_set_class_port_info -- Get class info for port - * @port: Port on whic MAD was received - * @recvd_mad: pointer to the received mad - * @rsp_mad: pointer to respose mad - * - * This function updates the port class info for the specific vnic - * and sets up the response mad data - */ -static void vema_set_class_port_info(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - memcpy(&port->class_port_info, recvd_mad->data, - sizeof(port->class_port_info)); - - vema_get_class_port_info(port, recvd_mad, rsp_mad); -} - -/** - * vema_get_veswport_info -- Get veswport info - * @port: source port on which MAD was received - * @recvd_mad: pointer to the received mad - * @rsp_mad: pointer to respose mad - */ -static void vema_get_veswport_info(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - struct opa_veswport_info *port_info = - (struct opa_veswport_info *)rsp_mad->data; - struct opa_vnic_adapter *adapter; - - adapter = vema_get_vport_adapter(recvd_mad, port); - if (adapter) { - memset(port_info, 0, sizeof(*port_info)); - opa_vnic_get_vesw_info(adapter, &port_info->vesw); - opa_vnic_get_per_veswport_info(adapter, - &port_info->vport); - } else { - vema_get_pod_values(port_info); - } -} - -/** - * vema_set_veswport_info -- Set veswport info - * @port: source port on which MAD was received - * @recvd_mad: pointer to the received mad - * @rsp_mad: pointer to respose mad - * - * This function gets the port class infor for vnic - */ -static void vema_set_veswport_info(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - struct opa_vnic_ctrl_port *cport = port->cport; - struct opa_veswport_info *port_info; - struct opa_vnic_adapter *adapter; - u8 vport_num; - - vport_num = vema_get_vport_num(recvd_mad); - - adapter = 
vema_get_vport_adapter(recvd_mad, port); - if (!adapter) { - adapter = vema_add_vport(port, vport_num); - if (IS_ERR(adapter)) { - c_err("failed to add vport %d: %ld\n", - vport_num, PTR_ERR(adapter)); - goto err_exit; - } - } - - port_info = (struct opa_veswport_info *)recvd_mad->data; - opa_vnic_set_vesw_info(adapter, &port_info->vesw); - opa_vnic_set_per_veswport_info(adapter, &port_info->vport); - - /* Process the new config settings */ - opa_vnic_process_vema_config(adapter); - - vema_get_veswport_info(port, recvd_mad, rsp_mad); - return; - -err_exit: - rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; -} - -/** - * vema_get_mac_entries -- Get MAC entries in VNIC MAC table - * @port: source port on which MAD was received - * @recvd_mad: pointer to the received mad - * @rsp_mad: pointer to respose mad - * - * This function gets the MAC entries that are programmed into - * the VNIC MAC forwarding table. It checks for the validity of - * the index into the MAC table and the number of entries that - * are to be retrieved. 
- */ -static void vema_get_mac_entries(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - struct opa_veswport_mactable *mac_tbl_in, *mac_tbl_out; - struct opa_vnic_adapter *adapter; - - adapter = vema_get_vport_adapter(recvd_mad, port); - if (!adapter) { - rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; - return; - } - - mac_tbl_in = (struct opa_veswport_mactable *)recvd_mad->data; - mac_tbl_out = (struct opa_veswport_mactable *)rsp_mad->data; - - if (vema_mac_tbl_req_ok(mac_tbl_in)) { - mac_tbl_out->offset = mac_tbl_in->offset; - mac_tbl_out->num_entries = mac_tbl_in->num_entries; - opa_vnic_query_mac_tbl(adapter, mac_tbl_out); - } else { - rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; - } -} - -/** - * vema_set_mac_entries -- Set MAC entries in VNIC MAC table - * @port: source port on which MAD was received - * @recvd_mad: pointer to the received mad - * @rsp_mad: pointer to respose mad - * - * This function sets the MAC entries in the VNIC forwarding table - * It checks for the validity of the index and the number of forwarding - * table entries to be programmed. 
- */ -static void vema_set_mac_entries(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - struct opa_veswport_mactable *mac_tbl; - struct opa_vnic_adapter *adapter; - - adapter = vema_get_vport_adapter(recvd_mad, port); - if (!adapter) { - rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; - return; - } - - mac_tbl = (struct opa_veswport_mactable *)recvd_mad->data; - if (vema_mac_tbl_req_ok(mac_tbl)) { - if (opa_vnic_update_mac_tbl(adapter, mac_tbl)) - rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; - } else { - rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; - } - vema_get_mac_entries(port, recvd_mad, rsp_mad); -} - -/** - * vema_set_delete_vesw -- Reset VESW info to POD values - * @port: source port on which MAD was received - * @recvd_mad: pointer to the received mad - * @rsp_mad: pointer to respose mad - * - * This function clears all the fields of veswport info for the requested vesw - * and sets them back to the power-on default values. It does not delete the - * vesw. - */ -static void vema_set_delete_vesw(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - struct opa_veswport_info *port_info = - (struct opa_veswport_info *)rsp_mad->data; - struct opa_vnic_adapter *adapter; - - adapter = vema_get_vport_adapter(recvd_mad, port); - if (!adapter) { - rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; - return; - } - - vema_get_pod_values(port_info); - opa_vnic_set_vesw_info(adapter, &port_info->vesw); - opa_vnic_set_per_veswport_info(adapter, &port_info->vport); - - /* Process the new config settings */ - opa_vnic_process_vema_config(adapter); - - opa_vnic_release_mac_tbl(adapter); - - vema_get_veswport_info(port, recvd_mad, rsp_mad); -} - -/** - * vema_get_mac_list -- Get the unicast/multicast macs. 
- * @port: source port on which MAD was received - * @recvd_mad: Received mad contains fields to set vnic parameters - * @rsp_mad: Response mad to be built - * @attr_id: Attribute ID indicating multicast or unicast mac list - */ -static void vema_get_mac_list(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad, - u16 attr_id) -{ - struct opa_veswport_iface_macs *macs_in, *macs_out; - int max_entries = (OPA_VNIC_EMA_DATA - sizeof(*macs_out)) / ETH_ALEN; - struct opa_vnic_adapter *adapter; - - adapter = vema_get_vport_adapter(recvd_mad, port); - if (!adapter) { - rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; - return; - } - - macs_in = (struct opa_veswport_iface_macs *)recvd_mad->data; - macs_out = (struct opa_veswport_iface_macs *)rsp_mad->data; - - macs_out->start_idx = macs_in->start_idx; - if (macs_in->num_macs_in_msg) - macs_out->num_macs_in_msg = macs_in->num_macs_in_msg; - else - macs_out->num_macs_in_msg = cpu_to_be16(max_entries); - - if (attr_id == OPA_EM_ATTR_IFACE_MCAST_MACS) - opa_vnic_query_mcast_macs(adapter, macs_out); - else - opa_vnic_query_ucast_macs(adapter, macs_out); -} - -/** - * vema_get_summary_counters -- Gets summary counters. - * @port: source port on which MAD was received - * @recvd_mad: Received mad contains fields to set vnic parameters - * @rsp_mad: Response mad to be built - */ -static void vema_get_summary_counters(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - struct opa_veswport_summary_counters *cntrs; - struct opa_vnic_adapter *adapter; - - adapter = vema_get_vport_adapter(recvd_mad, port); - if (adapter) { - cntrs = (struct opa_veswport_summary_counters *)rsp_mad->data; - opa_vnic_get_summary_counters(adapter, cntrs); - } else { - rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; - } -} - -/** - * vema_get_error_counters -- Gets summary counters. 
- * @port: source port on which MAD was received - * @recvd_mad: Received mad contains fields to set vnic parameters - * @rsp_mad: Response mad to be built - */ -static void vema_get_error_counters(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - struct opa_veswport_error_counters *cntrs; - struct opa_vnic_adapter *adapter; - - adapter = vema_get_vport_adapter(recvd_mad, port); - if (adapter) { - cntrs = (struct opa_veswport_error_counters *)rsp_mad->data; - opa_vnic_get_error_counters(adapter, cntrs); - } else { - rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; - } -} - -/** - * vema_get -- Process received get MAD - * @port: source port on which MAD was received - * @recvd_mad: Received mad - * @rsp_mad: Response mad to be built - */ -static void vema_get(struct opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - u16 attr_id = be16_to_cpu(recvd_mad->mad_hdr.attr_id); - - switch (attr_id) { - case OPA_EM_ATTR_CLASS_PORT_INFO: - vema_get_class_port_info(port, recvd_mad, rsp_mad); - break; - case OPA_EM_ATTR_VESWPORT_INFO: - vema_get_veswport_info(port, recvd_mad, rsp_mad); - break; - case OPA_EM_ATTR_VESWPORT_MAC_ENTRIES: - vema_get_mac_entries(port, recvd_mad, rsp_mad); - break; - case OPA_EM_ATTR_IFACE_UCAST_MACS: - case OPA_EM_ATTR_IFACE_MCAST_MACS: - vema_get_mac_list(port, recvd_mad, rsp_mad, attr_id); - break; - case OPA_EM_ATTR_VESWPORT_SUMMARY_COUNTERS: - vema_get_summary_counters(port, recvd_mad, rsp_mad); - break; - case OPA_EM_ATTR_VESWPORT_ERROR_COUNTERS: - vema_get_error_counters(port, recvd_mad, rsp_mad); - break; - default: - rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; - break; - } -} - -/** - * vema_set -- Process received set MAD - * @port: source port on which MAD was received - * @recvd_mad: Received mad contains fields to set vnic parameters - * @rsp_mad: Response mad to be built - */ -static void vema_set(struct 
opa_vnic_vema_port *port, - struct opa_vnic_vema_mad *recvd_mad, - struct opa_vnic_vema_mad *rsp_mad) -{ - u16 attr_id = be16_to_cpu(recvd_mad->mad_hdr.attr_id); - - switch (attr_id) { - case OPA_EM_ATTR_CLASS_PORT_INFO: - vema_set_class_port_info(port, recvd_mad, rsp_mad); - break; - case OPA_EM_ATTR_VESWPORT_INFO: - vema_set_veswport_info(port, recvd_mad, rsp_mad); - break; - case OPA_EM_ATTR_VESWPORT_MAC_ENTRIES: - vema_set_mac_entries(port, recvd_mad, rsp_mad); - break; - case OPA_EM_ATTR_DELETE_VESW: - vema_set_delete_vesw(port, recvd_mad, rsp_mad); - break; - default: - rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; - break; - } -} - -/** - * vema_send -- Send handler for VEMA MAD agent - * @mad_agent: pointer to the mad agent - * @mad_wc: pointer to mad send work completion information - * - * Free all the data structures associated with the sent MAD - */ -static void vema_send(struct ib_mad_agent *mad_agent, - struct ib_mad_send_wc *mad_wc) -{ - rdma_destroy_ah(mad_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE); - ib_free_send_mad(mad_wc->send_buf); -} - -/** - * vema_recv -- Recv handler for VEMA MAD agent - * @mad_agent: pointer to the mad agent - * @send_buf: Send buffer if found, else NULL - * @mad_wc: pointer to mad send work completion information - * - * Handle only set and get methods and respond to other methods - * as unsupported. Allocate response buffer and address handle - * for the response MAD. 
- */ -static void vema_recv(struct ib_mad_agent *mad_agent, - struct ib_mad_send_buf *send_buf, - struct ib_mad_recv_wc *mad_wc) -{ - struct opa_vnic_vema_port *port; - struct ib_ah *ah; - struct ib_mad_send_buf *rsp; - struct opa_vnic_vema_mad *vema_mad; - - if (!mad_wc || !mad_wc->recv_buf.mad) - return; - - port = mad_agent->context; - ah = ib_create_ah_from_wc(mad_agent->qp->pd, mad_wc->wc, - mad_wc->recv_buf.grh, mad_agent->port_num); - if (IS_ERR(ah)) - goto free_recv_mad; - - rsp = ib_create_send_mad(mad_agent, mad_wc->wc->src_qp, - mad_wc->wc->pkey_index, 0, - IB_MGMT_VENDOR_HDR, OPA_VNIC_EMA_DATA, - GFP_KERNEL, OPA_MGMT_BASE_VERSION); - if (IS_ERR(rsp)) - goto err_rsp; - - rsp->ah = ah; - vema_mad = rsp->mad; - memcpy(vema_mad, mad_wc->recv_buf.mad, IB_MGMT_VENDOR_HDR); - vema_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; - vema_mad->mad_hdr.status = 0; - - /* Lock ensures network adapter is not removed */ - mutex_lock(&port->lock); - - switch (mad_wc->recv_buf.mad->mad_hdr.method) { - case IB_MGMT_METHOD_GET: - vema_get(port, (struct opa_vnic_vema_mad *)mad_wc->recv_buf.mad, - vema_mad); - break; - case IB_MGMT_METHOD_SET: - vema_set(port, (struct opa_vnic_vema_mad *)mad_wc->recv_buf.mad, - vema_mad); - break; - default: - vema_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; - break; - } - mutex_unlock(&port->lock); - - if (!ib_post_send_mad(rsp, NULL)) { - /* - * with post send successful ah and send mad - * will be destroyed in send handler - */ - goto free_recv_mad; - } - - ib_free_send_mad(rsp); - -err_rsp: - rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); -free_recv_mad: - ib_free_recv_mad(mad_wc); -} - -/** - * vema_get_port -- Gets the opa_vnic_vema_port - * @cport: pointer to control dev - * @port_num: Port number - * - * This function loops through the ports and returns - * the opa_vnic_vema port structure that is associated - * with the OPA port number - * - * Return: ptr to requested opa_vnic_vema_port strucure - * if success, NULL if not - */ 
-static struct opa_vnic_vema_port * -vema_get_port(struct opa_vnic_ctrl_port *cport, u8 port_num) -{ - struct opa_vnic_vema_port *port = (void *)cport + sizeof(*cport); - - if (port_num > cport->num_ports) - return NULL; - - return port + (port_num - 1); -} - -/** - * opa_vnic_vema_send_trap -- This function sends a trap to the EM - * @adapter: pointer to vnic adapter - * @data: pointer to trap data filled by calling function - * @lid: issuers lid (encap_slid from vesw_port_info) - * - * This function is called from the VNIC driver to send a trap if there - * is somethng the EM should be notified about. These events currently - * are - * 1) UNICAST INTERFACE MACADDRESS changes - * 2) MULTICAST INTERFACE MACADDRESS changes - * 3) ETHERNET LINK STATUS changes - * While allocating the send mad the remote site qpn used is 1 - * as this is the well known QP. - * - */ -void opa_vnic_vema_send_trap(struct opa_vnic_adapter *adapter, - struct __opa_veswport_trap *data, u32 lid) -{ - struct opa_vnic_ctrl_port *cport = adapter->cport; - struct ib_mad_send_buf *send_buf; - struct opa_vnic_vema_port *port; - struct ib_device *ibp; - struct opa_vnic_vema_mad_trap *trap_mad; - struct opa_class_port_info *class; - struct rdma_ah_attr ah_attr; - struct ib_ah *ah; - struct opa_veswport_trap *trap; - u32 trap_lid; - u16 pkey_idx; - - if (!cport) - goto err_exit; - ibp = cport->ibdev; - port = vema_get_port(cport, data->opaportnum); - if (!port || !port->mad_agent) - goto err_exit; - - if (time_before(jiffies, adapter->trap_timeout)) { - if (adapter->trap_count == OPA_VNIC_TRAP_BURST_LIMIT) { - v_warn("Trap rate exceeded\n"); - goto err_exit; - } else { - adapter->trap_count++; - } - } else { - adapter->trap_count = 0; - } - - class = &port->class_port_info; - /* Set up address handle */ - memset(&ah_attr, 0, sizeof(ah_attr)); - ah_attr.type = rdma_ah_find_type(ibp, port->port_num); - rdma_ah_set_sl(&ah_attr, - GET_TRAP_SL_FROM_CLASS_PORT_INFO(class->trap_sl_rsvd)); - 
rdma_ah_set_port_num(&ah_attr, port->port_num); - trap_lid = be32_to_cpu(class->trap_lid); - /* - * check for trap lid validity, must not be zero - * The trap sink could change after we fashion the MAD but since traps - * are not guaranteed we won't use a lock as anyway the change will take - * place even with locking. - */ - if (!trap_lid) { - c_err("%s: Invalid dlid\n", __func__); - goto err_exit; - } - - rdma_ah_set_dlid(&ah_attr, trap_lid); - ah = rdma_create_ah(port->mad_agent->qp->pd, &ah_attr, 0); - if (IS_ERR(ah)) { - c_err("%s:Couldn't create new AH = %p\n", __func__, ah); - c_err("%s:dlid = %d, sl = %d, port = %d\n", __func__, - rdma_ah_get_dlid(&ah_attr), rdma_ah_get_sl(&ah_attr), - rdma_ah_get_port_num(&ah_attr)); - goto err_exit; - } - - if (ib_find_pkey(ibp, data->opaportnum, IB_DEFAULT_PKEY_FULL, - &pkey_idx) < 0) { - c_err("%s:full key not found, defaulting to partial\n", - __func__); - if (ib_find_pkey(ibp, data->opaportnum, IB_DEFAULT_PKEY_PARTIAL, - &pkey_idx) < 0) - pkey_idx = 1; - } - - send_buf = ib_create_send_mad(port->mad_agent, 1, pkey_idx, 0, - IB_MGMT_VENDOR_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC, OPA_MGMT_BASE_VERSION); - if (IS_ERR(send_buf)) { - c_err("%s:Couldn't allocate send buf\n", __func__); - goto err_sndbuf; - } - - send_buf->ah = ah; - - /* Set up common MAD hdr */ - trap_mad = send_buf->mad; - trap_mad->mad_hdr.base_version = OPA_MGMT_BASE_VERSION; - trap_mad->mad_hdr.mgmt_class = OPA_MGMT_CLASS_INTEL_EMA; - trap_mad->mad_hdr.class_version = OPA_EMA_CLASS_VERSION; - trap_mad->mad_hdr.method = IB_MGMT_METHOD_TRAP; - port->tid++; - trap_mad->mad_hdr.tid = cpu_to_be64(port->tid); - trap_mad->mad_hdr.attr_id = IB_SMP_ATTR_NOTICE; - - /* Set up vendor OUI */ - trap_mad->oui[0] = INTEL_OUI_1; - trap_mad->oui[1] = INTEL_OUI_2; - trap_mad->oui[2] = INTEL_OUI_3; - - /* Setup notice attribute portion */ - trap_mad->notice.gen_type = OPA_INTEL_EMA_NOTICE_TYPE_INFO << 1; - trap_mad->notice.oui_1 = INTEL_OUI_1; - trap_mad->notice.oui_2 = 
INTEL_OUI_2; - trap_mad->notice.oui_3 = INTEL_OUI_3; - trap_mad->notice.issuer_lid = cpu_to_be32(lid); - - /* copy the actual trap data */ - trap = (struct opa_veswport_trap *)trap_mad->notice.raw_data; - trap->fabric_id = cpu_to_be16(data->fabric_id); - trap->veswid = cpu_to_be16(data->veswid); - trap->veswportnum = cpu_to_be32(data->veswportnum); - trap->opaportnum = cpu_to_be16(data->opaportnum); - trap->veswportindex = data->veswportindex; - trap->opcode = data->opcode; - - /* If successful send set up rate limit timeout else bail */ - if (ib_post_send_mad(send_buf, NULL)) { - ib_free_send_mad(send_buf); - } else { - if (adapter->trap_count) - return; - adapter->trap_timeout = jiffies + - usecs_to_jiffies(OPA_VNIC_TRAP_TIMEOUT); - return; - } - -err_sndbuf: - rdma_destroy_ah(ah, 0); -err_exit: - v_err("Aborting trap\n"); -} - -static void opa_vnic_event(struct ib_event_handler *handler, - struct ib_event *record) -{ - struct opa_vnic_vema_port *port = - container_of(handler, struct opa_vnic_vema_port, event_handler); - struct opa_vnic_ctrl_port *cport = port->cport; - struct opa_vnic_adapter *adapter; - unsigned long index; - - if (record->element.port_num != port->port_num) - return; - - c_dbg("OPA_VNIC received event %d on device %s port %d\n", - record->event, dev_name(&record->device->dev), - record->element.port_num); - - if (record->event != IB_EVENT_PORT_ERR && - record->event != IB_EVENT_PORT_ACTIVE) - return; - - xa_for_each(&port->vports, index, adapter) { - if (record->event == IB_EVENT_PORT_ACTIVE) - netif_carrier_on(adapter->netdev); - else - netif_carrier_off(adapter->netdev); - } -} - -/** - * vema_unregister -- Unregisters agent - * @cport: pointer to control port - * - * This deletes the registration by VEMA for MADs - */ -static void vema_unregister(struct opa_vnic_ctrl_port *cport) -{ - struct opa_vnic_adapter *adapter; - unsigned long index; - int i; - - for (i = 1; i <= cport->num_ports; i++) { - struct opa_vnic_vema_port *port = 
vema_get_port(cport, i); - - if (!port->mad_agent) - continue; - - /* Lock ensures no MAD is being processed */ - mutex_lock(&port->lock); - xa_for_each(&port->vports, index, adapter) - opa_vnic_rem_netdev(adapter); - mutex_unlock(&port->lock); - - ib_unregister_mad_agent(port->mad_agent); - port->mad_agent = NULL; - mutex_destroy(&port->lock); - xa_destroy(&port->vports); - ib_unregister_event_handler(&port->event_handler); - } -} - -/** - * vema_register -- Registers agent - * @cport: pointer to control port - * - * This function registers the handlers for the VEMA MADs - * - * Return: returns 0 on success. non zero otherwise - */ -static int vema_register(struct opa_vnic_ctrl_port *cport) -{ - struct ib_mad_reg_req reg_req = { - .mgmt_class = OPA_MGMT_CLASS_INTEL_EMA, - .mgmt_class_version = OPA_MGMT_BASE_VERSION, - .oui = { INTEL_OUI_1, INTEL_OUI_2, INTEL_OUI_3 } - }; - int i; - - set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask); - set_bit(IB_MGMT_METHOD_SET, reg_req.method_mask); - - /* register ib event handler and mad agent for each port on dev */ - for (i = 1; i <= cport->num_ports; i++) { - struct opa_vnic_vema_port *port = vema_get_port(cport, i); - int ret; - - port->cport = cport; - port->port_num = i; - - INIT_IB_EVENT_HANDLER(&port->event_handler, - cport->ibdev, opa_vnic_event); - ib_register_event_handler(&port->event_handler); - - xa_init(&port->vports); - mutex_init(&port->lock); - port->mad_agent = ib_register_mad_agent(cport->ibdev, i, - IB_QPT_GSI, ®_req, - IB_MGMT_RMPP_VERSION, - vema_send, vema_recv, - port, 0); - if (IS_ERR(port->mad_agent)) { - ret = PTR_ERR(port->mad_agent); - port->mad_agent = NULL; - mutex_destroy(&port->lock); - vema_unregister(cport); - return ret; - } - } - - return 0; -} - -/** - * opa_vnic_ctrl_config_dev -- This function sends a trap to the EM - * by way of ib_modify_port to indicate support for ethernet on the - * fabric. 
- * @cport: pointer to control port - * @en: enable or disable ethernet on fabric support - */ -static void opa_vnic_ctrl_config_dev(struct opa_vnic_ctrl_port *cport, bool en) -{ - struct ib_port_modify pm = { 0 }; - int i; - - if (en) - pm.set_port_cap_mask = OPA_CAP_MASK3_IsEthOnFabricSupported; - else - pm.clr_port_cap_mask = OPA_CAP_MASK3_IsEthOnFabricSupported; - - for (i = 1; i <= cport->num_ports; i++) - ib_modify_port(cport->ibdev, i, IB_PORT_OPA_MASK_CHG, &pm); -} - -/** - * opa_vnic_vema_add_one -- Handle new ib device - * @device: ib device pointer - * - * Allocate the vnic control port and initialize it. - */ -static int opa_vnic_vema_add_one(struct ib_device *device) -{ - struct opa_vnic_ctrl_port *cport; - int rc, size = sizeof(*cport); - - if (!rdma_cap_opa_vnic(device)) - return -EOPNOTSUPP; - - size += device->phys_port_cnt * sizeof(struct opa_vnic_vema_port); - cport = kzalloc(size, GFP_KERNEL); - if (!cport) - return -ENOMEM; - - cport->num_ports = device->phys_port_cnt; - cport->ibdev = device; - - /* Initialize opa vnic management agent (vema) */ - rc = vema_register(cport); - if (!rc) - c_info("VNIC client initialized\n"); - - ib_set_client_data(device, &opa_vnic_client, cport); - opa_vnic_ctrl_config_dev(cport, true); - return 0; -} - -/** - * opa_vnic_vema_rem_one -- Handle ib device removal - * @device: ib device pointer - * @client_data: ib client data - * - * Uninitialize and free the vnic control port. 
- */ -static void opa_vnic_vema_rem_one(struct ib_device *device, - void *client_data) -{ - struct opa_vnic_ctrl_port *cport = client_data; - - c_info("removing VNIC client\n"); - opa_vnic_ctrl_config_dev(cport, false); - vema_unregister(cport); - kfree(cport); -} - -static int __init opa_vnic_init(void) -{ - int rc; - - rc = ib_register_client(&opa_vnic_client); - if (rc) - pr_err("VNIC driver register failed %d\n", rc); - - return rc; -} -module_init(opa_vnic_init); - -static void opa_vnic_deinit(void) -{ - ib_unregister_client(&opa_vnic_client); -} -module_exit(opa_vnic_deinit); - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("Cornelis Networks"); -MODULE_DESCRIPTION("Cornelis OPX Virtual Network driver"); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c deleted file mode 100644 index 292c037aa239..000000000000 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright(c) 2017 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -/* - * This file contains OPA VNIC EMA Interface functions. - */ - -#include "opa_vnic_internal.h" - -/** - * opa_vnic_vema_report_event - sent trap to report the specified event - * @adapter: vnic port adapter - * @event: event to be reported - * - * This function calls vema api to sent a trap for the given event. 
- */ -void opa_vnic_vema_report_event(struct opa_vnic_adapter *adapter, u8 event) -{ - struct __opa_veswport_info *info = &adapter->info; - struct __opa_veswport_trap trap_data; - - trap_data.fabric_id = info->vesw.fabric_id; - trap_data.veswid = info->vesw.vesw_id; - trap_data.veswportnum = info->vport.port_num; - trap_data.opaportnum = adapter->port_num; - trap_data.veswportindex = adapter->vport_num; - trap_data.opcode = event; - - opa_vnic_vema_send_trap(adapter, &trap_data, info->vport.encap_slid); -} - -/** - * opa_vnic_get_summary_counters - get summary counters - * @adapter: vnic port adapter - * @cntrs: pointer to destination summary counters structure - * - * This function populates the summary counters that is maintained by the - * given adapter to destination address provided. - */ -void opa_vnic_get_summary_counters(struct opa_vnic_adapter *adapter, - struct opa_veswport_summary_counters *cntrs) -{ - struct opa_vnic_stats vstats; - __be64 *dst; - u64 *src; - - memset(&vstats, 0, sizeof(vstats)); - spin_lock(&adapter->stats_lock); - adapter->rn_ops->ndo_get_stats64(adapter->netdev, &vstats.netstats); - spin_unlock(&adapter->stats_lock); - - cntrs->vp_instance = cpu_to_be16(adapter->vport_num); - cntrs->vesw_id = cpu_to_be16(adapter->info.vesw.vesw_id); - cntrs->veswport_num = cpu_to_be32(adapter->port_num); - - cntrs->tx_errors = cpu_to_be64(vstats.netstats.tx_errors); - cntrs->rx_errors = cpu_to_be64(vstats.netstats.rx_errors); - cntrs->tx_packets = cpu_to_be64(vstats.netstats.tx_packets); - cntrs->rx_packets = cpu_to_be64(vstats.netstats.rx_packets); - cntrs->tx_bytes = cpu_to_be64(vstats.netstats.tx_bytes); - cntrs->rx_bytes = cpu_to_be64(vstats.netstats.rx_bytes); - - /* - * This loop depends on layout of - * opa_veswport_summary_counters opa_vnic_stats structures. 
- */ - for (dst = &cntrs->tx_unicast, src = &vstats.tx_grp.unicast; - dst < &cntrs->reserved[0]; dst++, src++) { - *dst = cpu_to_be64(*src); - } -} - -/** - * opa_vnic_get_error_counters - get error counters - * @adapter: vnic port adapter - * @cntrs: pointer to destination error counters structure - * - * This function populates the error counters that is maintained by the - * given adapter to destination address provided. - */ -void opa_vnic_get_error_counters(struct opa_vnic_adapter *adapter, - struct opa_veswport_error_counters *cntrs) -{ - struct opa_vnic_stats vstats; - - memset(&vstats, 0, sizeof(vstats)); - spin_lock(&adapter->stats_lock); - adapter->rn_ops->ndo_get_stats64(adapter->netdev, &vstats.netstats); - spin_unlock(&adapter->stats_lock); - - cntrs->vp_instance = cpu_to_be16(adapter->vport_num); - cntrs->vesw_id = cpu_to_be16(adapter->info.vesw.vesw_id); - cntrs->veswport_num = cpu_to_be32(adapter->port_num); - - cntrs->tx_errors = cpu_to_be64(vstats.netstats.tx_errors); - cntrs->rx_errors = cpu_to_be64(vstats.netstats.rx_errors); - cntrs->tx_dlid_zero = cpu_to_be64(vstats.tx_dlid_zero); - cntrs->tx_drop_state = cpu_to_be64(vstats.tx_drop_state); - cntrs->tx_logic = cpu_to_be64(vstats.netstats.tx_fifo_errors + - vstats.netstats.tx_carrier_errors); - - cntrs->rx_bad_veswid = cpu_to_be64(vstats.netstats.rx_nohandler); - cntrs->rx_runt = cpu_to_be64(vstats.rx_runt); - cntrs->rx_oversize = cpu_to_be64(vstats.rx_oversize); - cntrs->rx_drop_state = cpu_to_be64(vstats.rx_drop_state); - cntrs->rx_logic = cpu_to_be64(vstats.netstats.rx_fifo_errors); -} - -/** - * opa_vnic_get_vesw_info -- Get the vesw information - * @adapter: vnic port adapter - * @info: pointer to destination vesw info structure - * - * This function copies the vesw info that is maintained by the - * given adapter to destination address provided. 
- */ -void opa_vnic_get_vesw_info(struct opa_vnic_adapter *adapter, - struct opa_vesw_info *info) -{ - struct __opa_vesw_info *src = &adapter->info.vesw; - int i; - - info->fabric_id = cpu_to_be16(src->fabric_id); - info->vesw_id = cpu_to_be16(src->vesw_id); - memcpy(info->rsvd0, src->rsvd0, ARRAY_SIZE(src->rsvd0)); - info->def_port_mask = cpu_to_be16(src->def_port_mask); - memcpy(info->rsvd1, src->rsvd1, ARRAY_SIZE(src->rsvd1)); - info->pkey = cpu_to_be16(src->pkey); - - memcpy(info->rsvd2, src->rsvd2, ARRAY_SIZE(src->rsvd2)); - info->u_mcast_dlid = cpu_to_be32(src->u_mcast_dlid); - for (i = 0; i < OPA_VESW_MAX_NUM_DEF_PORT; i++) - info->u_ucast_dlid[i] = cpu_to_be32(src->u_ucast_dlid[i]); - - info->rc = cpu_to_be32(src->rc); - - memcpy(info->rsvd3, src->rsvd3, ARRAY_SIZE(src->rsvd3)); - info->eth_mtu = cpu_to_be16(src->eth_mtu); - memcpy(info->rsvd4, src->rsvd4, ARRAY_SIZE(src->rsvd4)); -} - -/** - * opa_vnic_set_vesw_info -- Set the vesw information - * @adapter: vnic port adapter - * @info: pointer to vesw info structure - * - * This function updates the vesw info that is maintained by the - * given adapter with vesw info provided. Reserved fields are stored - * and returned back to EM as is. 
- */ -void opa_vnic_set_vesw_info(struct opa_vnic_adapter *adapter, - struct opa_vesw_info *info) -{ - struct __opa_vesw_info *dst = &adapter->info.vesw; - int i; - - dst->fabric_id = be16_to_cpu(info->fabric_id); - dst->vesw_id = be16_to_cpu(info->vesw_id); - memcpy(dst->rsvd0, info->rsvd0, ARRAY_SIZE(info->rsvd0)); - dst->def_port_mask = be16_to_cpu(info->def_port_mask); - memcpy(dst->rsvd1, info->rsvd1, ARRAY_SIZE(info->rsvd1)); - dst->pkey = be16_to_cpu(info->pkey); - - memcpy(dst->rsvd2, info->rsvd2, ARRAY_SIZE(info->rsvd2)); - dst->u_mcast_dlid = be32_to_cpu(info->u_mcast_dlid); - for (i = 0; i < OPA_VESW_MAX_NUM_DEF_PORT; i++) - dst->u_ucast_dlid[i] = be32_to_cpu(info->u_ucast_dlid[i]); - - dst->rc = be32_to_cpu(info->rc); - - memcpy(dst->rsvd3, info->rsvd3, ARRAY_SIZE(info->rsvd3)); - dst->eth_mtu = be16_to_cpu(info->eth_mtu); - memcpy(dst->rsvd4, info->rsvd4, ARRAY_SIZE(info->rsvd4)); -} - -/** - * opa_vnic_get_per_veswport_info -- Get the vesw per port information - * @adapter: vnic port adapter - * @info: pointer to destination vport info structure - * - * This function copies the vesw per port info that is maintained by the - * given adapter to destination address provided. - * Note that the read only fields are not copied. 
- */ -void opa_vnic_get_per_veswport_info(struct opa_vnic_adapter *adapter, - struct opa_per_veswport_info *info) -{ - struct __opa_per_veswport_info *src = &adapter->info.vport; - - info->port_num = cpu_to_be32(src->port_num); - info->eth_link_status = src->eth_link_status; - memcpy(info->rsvd0, src->rsvd0, ARRAY_SIZE(src->rsvd0)); - - memcpy(info->base_mac_addr, src->base_mac_addr, - ARRAY_SIZE(info->base_mac_addr)); - info->config_state = src->config_state; - info->oper_state = src->oper_state; - info->max_mac_tbl_ent = cpu_to_be16(src->max_mac_tbl_ent); - info->max_smac_ent = cpu_to_be16(src->max_smac_ent); - info->mac_tbl_digest = cpu_to_be32(src->mac_tbl_digest); - memcpy(info->rsvd1, src->rsvd1, ARRAY_SIZE(src->rsvd1)); - - info->encap_slid = cpu_to_be32(src->encap_slid); - memcpy(info->pcp_to_sc_uc, src->pcp_to_sc_uc, - ARRAY_SIZE(info->pcp_to_sc_uc)); - memcpy(info->pcp_to_vl_uc, src->pcp_to_vl_uc, - ARRAY_SIZE(info->pcp_to_vl_uc)); - memcpy(info->pcp_to_sc_mc, src->pcp_to_sc_mc, - ARRAY_SIZE(info->pcp_to_sc_mc)); - memcpy(info->pcp_to_vl_mc, src->pcp_to_vl_mc, - ARRAY_SIZE(info->pcp_to_vl_mc)); - info->non_vlan_sc_uc = src->non_vlan_sc_uc; - info->non_vlan_vl_uc = src->non_vlan_vl_uc; - info->non_vlan_sc_mc = src->non_vlan_sc_mc; - info->non_vlan_vl_mc = src->non_vlan_vl_mc; - memcpy(info->rsvd2, src->rsvd2, ARRAY_SIZE(src->rsvd2)); - - info->uc_macs_gen_count = cpu_to_be16(src->uc_macs_gen_count); - info->mc_macs_gen_count = cpu_to_be16(src->mc_macs_gen_count); - memcpy(info->rsvd3, src->rsvd3, ARRAY_SIZE(src->rsvd3)); -} - -/** - * opa_vnic_set_per_veswport_info -- Set vesw per port information - * @adapter: vnic port adapter - * @info: pointer to vport info structure - * - * This function updates the vesw per port info that is maintained by the - * given adapter with vesw per port info provided. Reserved fields are - * stored and returned back to EM as is. 
- */ -void opa_vnic_set_per_veswport_info(struct opa_vnic_adapter *adapter, - struct opa_per_veswport_info *info) -{ - struct __opa_per_veswport_info *dst = &adapter->info.vport; - - dst->port_num = be32_to_cpu(info->port_num); - memcpy(dst->rsvd0, info->rsvd0, ARRAY_SIZE(info->rsvd0)); - - memcpy(dst->base_mac_addr, info->base_mac_addr, - ARRAY_SIZE(dst->base_mac_addr)); - dst->config_state = info->config_state; - memcpy(dst->rsvd1, info->rsvd1, ARRAY_SIZE(info->rsvd1)); - - dst->encap_slid = be32_to_cpu(info->encap_slid); - memcpy(dst->pcp_to_sc_uc, info->pcp_to_sc_uc, - ARRAY_SIZE(dst->pcp_to_sc_uc)); - memcpy(dst->pcp_to_vl_uc, info->pcp_to_vl_uc, - ARRAY_SIZE(dst->pcp_to_vl_uc)); - memcpy(dst->pcp_to_sc_mc, info->pcp_to_sc_mc, - ARRAY_SIZE(dst->pcp_to_sc_mc)); - memcpy(dst->pcp_to_vl_mc, info->pcp_to_vl_mc, - ARRAY_SIZE(dst->pcp_to_vl_mc)); - dst->non_vlan_sc_uc = info->non_vlan_sc_uc; - dst->non_vlan_vl_uc = info->non_vlan_vl_uc; - dst->non_vlan_sc_mc = info->non_vlan_sc_mc; - dst->non_vlan_vl_mc = info->non_vlan_vl_mc; - memcpy(dst->rsvd2, info->rsvd2, ARRAY_SIZE(info->rsvd2)); - memcpy(dst->rsvd3, info->rsvd3, ARRAY_SIZE(info->rsvd3)); -} - -/** - * opa_vnic_query_mcast_macs - query multicast mac list - * @adapter: vnic port adapter - * @macs: pointer mac list - * - * This function populates the provided mac list with the configured - * multicast addresses in the adapter. 
- */ -void opa_vnic_query_mcast_macs(struct opa_vnic_adapter *adapter, - struct opa_veswport_iface_macs *macs) -{ - u16 start_idx, num_macs, idx = 0, count = 0; - struct netdev_hw_addr *ha; - - start_idx = be16_to_cpu(macs->start_idx); - num_macs = be16_to_cpu(macs->num_macs_in_msg); - netdev_for_each_mc_addr(ha, adapter->netdev) { - struct opa_vnic_iface_mac_entry *entry = &macs->entry[count]; - - if (start_idx > idx++) - continue; - else if (num_macs == count) - break; - memcpy(entry, ha->addr, sizeof(*entry)); - count++; - } - - macs->tot_macs_in_lst = cpu_to_be16(netdev_mc_count(adapter->netdev)); - macs->num_macs_in_msg = cpu_to_be16(count); - macs->gen_count = cpu_to_be16(adapter->info.vport.mc_macs_gen_count); -} - -/** - * opa_vnic_query_ucast_macs - query unicast mac list - * @adapter: vnic port adapter - * @macs: pointer mac list - * - * This function populates the provided mac list with the configured - * unicast addresses in the adapter. - */ -void opa_vnic_query_ucast_macs(struct opa_vnic_adapter *adapter, - struct opa_veswport_iface_macs *macs) -{ - u16 start_idx, tot_macs, num_macs, idx = 0, count = 0, em_macs = 0; - struct netdev_hw_addr *ha; - - start_idx = be16_to_cpu(macs->start_idx); - num_macs = be16_to_cpu(macs->num_macs_in_msg); - /* loop through dev_addrs list first */ - for_each_dev_addr(adapter->netdev, ha) { - struct opa_vnic_iface_mac_entry *entry = &macs->entry[count]; - - /* Do not include EM specified MAC address */ - if (!memcmp(adapter->info.vport.base_mac_addr, ha->addr, - ARRAY_SIZE(adapter->info.vport.base_mac_addr))) { - em_macs++; - continue; - } - - if (start_idx > idx++) - continue; - else if (num_macs == count) - break; - memcpy(entry, ha->addr, sizeof(*entry)); - count++; - } - - /* loop through uc list */ - netdev_for_each_uc_addr(ha, adapter->netdev) { - struct opa_vnic_iface_mac_entry *entry = &macs->entry[count]; - - if (start_idx > idx++) - continue; - else if (num_macs == count) - break; - memcpy(entry, ha->addr, 
sizeof(*entry)); - count++; - } - - tot_macs = netdev_hw_addr_list_count(&adapter->netdev->dev_addrs) + - netdev_uc_count(adapter->netdev) - em_macs; - macs->tot_macs_in_lst = cpu_to_be16(tot_macs); - macs->num_macs_in_msg = cpu_to_be16(count); - macs->gen_count = cpu_to_be16(adapter->info.vport.uc_macs_gen_count); -} diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6354c613e9a8..57b81ca0fabd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2361,7 +2361,6 @@ struct ib_port_data { /* rdma netdev type - specifies protocol type */ enum rdma_netdev_t { - RDMA_NETDEV_OPA_VNIC, RDMA_NETDEV_IPOIB, }; @@ -2375,11 +2374,6 @@ struct rdma_netdev { u32 port_num; int mtu; - /* - * cleanup function must be specified. - * FIXME: This is only used for OPA_VNIC and that usage should be - * removed too. - */ void (*free_rdma_netdev)(struct net_device *netdev); /* control functions */ diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h deleted file mode 100644 index d297f084001a..000000000000 --- a/include/rdma/opa_vnic.h +++ /dev/null @@ -1,96 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright(c) 2017 - 2020 Intel Corporation. - */ - -#ifndef _OPA_VNIC_H -#define _OPA_VNIC_H - -/* - * This file contains Intel Omni-Path (OPA) Virtual Network Interface - * Controller (VNIC) specific declarations. 
- */ - -#include - -/* 16 header bytes + 2 reserved bytes */ -#define OPA_VNIC_L2_HDR_LEN (16 + 2) - -#define OPA_VNIC_L4_HDR_LEN 2 - -#define OPA_VNIC_HDR_LEN (OPA_VNIC_L2_HDR_LEN + \ - OPA_VNIC_L4_HDR_LEN) - -#define OPA_VNIC_L4_ETHR 0x78 - -#define OPA_VNIC_ICRC_LEN 4 -#define OPA_VNIC_TAIL_LEN 1 -#define OPA_VNIC_ICRC_TAIL_LEN (OPA_VNIC_ICRC_LEN + OPA_VNIC_TAIL_LEN) - -#define OPA_VNIC_SKB_MDATA_LEN 4 -#define OPA_VNIC_SKB_MDATA_ENCAP_ERR 0x1 - -/* opa vnic rdma netdev's private data structure */ -struct opa_vnic_rdma_netdev { - struct rdma_netdev rn; /* keep this first */ - /* followed by device private data */ - char *dev_priv[]; -}; - -static inline void *opa_vnic_priv(const struct net_device *dev) -{ - struct rdma_netdev *rn = netdev_priv(dev); - - return rn->clnt_priv; -} - -static inline void *opa_vnic_dev_priv(const struct net_device *dev) -{ - struct opa_vnic_rdma_netdev *oparn = netdev_priv(dev); - - return oparn->dev_priv; -} - -/* opa_vnic skb meta data structure */ -struct opa_vnic_skb_mdata { - u8 vl; - u8 entropy; - u8 flags; - u8 rsvd; -} __packed; - -/* OPA VNIC group statistics */ -struct opa_vnic_grp_stats { - u64 unicast; - u64 mcastbcast; - u64 untagged; - u64 vlan; - u64 s_64; - u64 s_65_127; - u64 s_128_255; - u64 s_256_511; - u64 s_512_1023; - u64 s_1024_1518; - u64 s_1519_max; -}; - -struct opa_vnic_stats { - /* standard netdev statistics */ - struct rtnl_link_stats64 netstats; - - /* OPA VNIC statistics */ - struct opa_vnic_grp_stats tx_grp; - struct opa_vnic_grp_stats rx_grp; - u64 tx_dlid_zero; - u64 tx_drop_state; - u64 rx_drop_state; - u64 rx_runt; - u64 rx_oversize; -}; - -static inline bool rdma_cap_opa_vnic(struct ib_device *device) -{ - return !!(device->attrs.kernel_cap_flags & IBK_RDMA_NETDEV_OPA); -} - -#endif /* _OPA_VNIC_H */ -- cgit v1.2.3 From 89fe91c65992a37863241e35aec151210efc53ce Mon Sep 17 00:00:00 2001 From: Erni Sri Satya Vennela Date: Fri, 6 Mar 2026 13:12:06 -0800 Subject: net: mana: hardening: Validate doorbell 
ID from GDMA_REGISTER_DEVICE response As a part of MANA hardening for CVM, add validation for the doorbell ID (db_id) received from hardware in the GDMA_REGISTER_DEVICE response to prevent out-of-bounds memory access when calculating the doorbell page address. In mana_gd_ring_doorbell(), the doorbell page address is calculated as: addr = db_page_base + db_page_size * db_index = (bar0_va + db_page_off) + db_page_size * db_index A hardware could return values that cause this address to fall outside the BAR0 MMIO region. In Confidential VM environments, hardware responses cannot be fully trusted. Add the following validations: - Store the BAR0 size (bar0_size) in gdma_context during probe. - Validate the doorbell page offset (db_page_off) read from device registers does not exceed bar0_size during initialization, converting mana_gd_init_registers() to return an error code. - Validate db_id from GDMA_REGISTER_DEVICE response against the maximum number of doorbell pages that fit within BAR0. 
Signed-off-by: Erni Sri Satya Vennela Link: https://patch.msgid.link/20260306211212.543376-1-ernis@linux.microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 60 +++++++++++++++++++------ include/net/mana/gdma.h | 4 +- 2 files changed, 49 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index aef8612b73cb..ef0dbfaac8f4 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -39,49 +39,66 @@ static u64 mana_gd_r64(struct gdma_context *g, u64 offset) return readq(g->bar0_va + offset); } -static void mana_gd_init_pf_regs(struct pci_dev *pdev) +static int mana_gd_init_pf_regs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); void __iomem *sriov_base_va; u64 sriov_base_off; gc->db_page_size = mana_gd_r32(gc, GDMA_PF_REG_DB_PAGE_SIZE) & 0xFFFF; - gc->db_page_base = gc->bar0_va + - mana_gd_r64(gc, GDMA_PF_REG_DB_PAGE_OFF); + gc->db_page_off = mana_gd_r64(gc, GDMA_PF_REG_DB_PAGE_OFF); - gc->phys_db_page_base = gc->bar0_pa + - mana_gd_r64(gc, GDMA_PF_REG_DB_PAGE_OFF); + /* Validate doorbell offset is within BAR0 */ + if (gc->db_page_off >= gc->bar0_size) { + dev_err(gc->dev, + "Doorbell offset 0x%llx exceeds BAR0 size 0x%llx\n", + gc->db_page_off, (u64)gc->bar0_size); + return -EPROTO; + } + + gc->db_page_base = gc->bar0_va + gc->db_page_off; + gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off; sriov_base_off = mana_gd_r64(gc, GDMA_SRIOV_REG_CFG_BASE_OFF); sriov_base_va = gc->bar0_va + sriov_base_off; gc->shm_base = sriov_base_va + mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF); + + return 0; } -static void mana_gd_init_vf_regs(struct pci_dev *pdev) +static int mana_gd_init_vf_regs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); gc->db_page_size = mana_gd_r32(gc, GDMA_REG_DB_PAGE_SIZE) & 0xFFFF; + 
gc->db_page_off = mana_gd_r64(gc, GDMA_REG_DB_PAGE_OFFSET); - gc->db_page_base = gc->bar0_va + - mana_gd_r64(gc, GDMA_REG_DB_PAGE_OFFSET); + /* Validate doorbell offset is within BAR0 */ + if (gc->db_page_off >= gc->bar0_size) { + dev_err(gc->dev, + "Doorbell offset 0x%llx exceeds BAR0 size 0x%llx\n", + gc->db_page_off, (u64)gc->bar0_size); + return -EPROTO; + } - gc->phys_db_page_base = gc->bar0_pa + - mana_gd_r64(gc, GDMA_REG_DB_PAGE_OFFSET); + gc->db_page_base = gc->bar0_va + gc->db_page_off; + gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off; gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET); + + return 0; } -static void mana_gd_init_registers(struct pci_dev *pdev) +static int mana_gd_init_registers(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); if (gc->is_pf) - mana_gd_init_pf_regs(pdev); + return mana_gd_init_pf_regs(pdev); else - mana_gd_init_vf_regs(pdev); + return mana_gd_init_vf_regs(pdev); } /* Suppress logging when we set timeout to zero */ @@ -1256,6 +1273,17 @@ int mana_gd_register_device(struct gdma_dev *gd) return err ? err : -EPROTO; } + /* Validate that doorbell page for db_id is within the BAR0 region. 
+ * In mana_gd_ring_doorbell(), the address is calculated as: + * addr = db_page_base + db_page_size * db_id + * = (bar0_va + db_page_off) + (db_page_size * db_id) + * So we need: db_page_off + db_page_size * (db_id + 1) <= bar0_size + */ + if (gc->db_page_off + gc->db_page_size * ((u64)resp.db_id + 1) > gc->bar0_size) { + dev_err(gc->dev, "Doorbell ID %u out of range\n", resp.db_id); + return -EPROTO; + } + gd->pdid = resp.pdid; gd->gpa_mkey = resp.gpa_mkey; gd->doorbell = resp.db_id; @@ -1890,7 +1918,10 @@ static int mana_gd_setup(struct pci_dev *pdev) struct gdma_context *gc = pci_get_drvdata(pdev); int err; - mana_gd_init_registers(pdev); + err = mana_gd_init_registers(pdev); + if (err) + return err; + mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base); gc->service_wq = alloc_ordered_workqueue("gdma_service_wq", 0); @@ -1996,6 +2027,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) mutex_init(&gc->eq_test_event_mutex); pci_set_drvdata(pdev, gc); gc->bar0_pa = pci_resource_start(pdev, 0); + gc->bar0_size = pci_resource_len(pdev, 0); bar0_va = pci_iomap(pdev, bar, 0); if (!bar0_va) diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index ec17004b10c0..7fe3a1b61b2d 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -421,10 +421,12 @@ struct gdma_context { phys_addr_t bar0_pa; void __iomem *bar0_va; + resource_size_t bar0_size; void __iomem *shm_base; void __iomem *db_page_base; phys_addr_t phys_db_page_base; - u32 db_page_size; + u64 db_page_off; + u64 db_page_size; int numa_node; /* Shared memory chanenl (used to bootstrap HWC) */ -- cgit v1.2.3 From 0e07b16371b6eef9b5a4a1fd3e7942938811072e Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Fri, 6 Mar 2026 09:55:03 +0200 Subject: drm/xe: Allow per queue programming of COMMON_SLICE_CHICKEN3 bit13 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to i915's commit 
cebc13de7e704b1355bea208a9f9cdb042c74588 ("drm/i915: Whitelist COMMON_SLICE_CHICKEN3 for UMD access"), except that instead of putting the register on the allowlist for UMD to program, the KMD is doing the programming at context initialization based on a queue creation flag. This is a recommended tuning setting for both gen12 and Xe_HP platforms. If a render queue is created with DRM_XE_EXEC_QUEUE_SET_STATE_CACHE_PERF_FIX, COMMON_SLICE_CHICKEN3 will be programmed at initialization to enable the render color cache to key with BTP+BTI (binding table pool + binding table entry) instead of just BTI (binding table entry). This enables the UMD to avoid emitting render-target-cache-flush + stall-at-pixel-scoreboard every time a binding table entry pointing to a render target is changed. v2: Use xe_lrc_write_ring() v3: Update xe_query.c to report availability v4: Rename defines to add DISABLE_ v5: update commit message v6: rebase Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39982 Bspec: 73993, 73994, 72161, 31870, 68331 Acked-by: Rodrigo Vivi Reviewed-by: José Roberto de Souza Signed-off-by: Lionel Landwerlin Signed-off-by: José Roberto de Souza Link: https://patch.msgid.link/20260306075504.1288676-1-lionel.g.landwerlin@intel.com --- drivers/gpu/drm/xe/regs/xe_gt_regs.h | 1 + drivers/gpu/drm/xe/xe_exec_queue.c | 19 ++++++++++++++++++- drivers/gpu/drm/xe/xe_exec_queue_types.h | 2 ++ drivers/gpu/drm/xe/xe_lrc.c | 9 +++++++++ drivers/gpu/drm/xe/xe_lrc.h | 1 + drivers/gpu/drm/xe/xe_query.c | 2 ++ include/uapi/drm/xe_drm.h | 8 ++++++++ 7 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h index 8e6df9dcd137..4ee88f629c02 100644 --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h @@ -183,6 +183,7 @@ #define COMMON_SLICE_CHICKEN3 XE_REG(0x7304, XE_REG_OPTION_MASKED) #define XEHP_COMMON_SLICE_CHICKEN3 XE_REG_MCR(0x7304, 
XE_REG_OPTION_MASKED) +#define DISABLE_STATE_CACHE_PERF_FIX REG_BIT(13) #define DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN REG_BIT(12) #define XEHP_DUAL_SIMD8_SEQ_MERGE_DISABLE REG_BIT(12) #define BLEND_EMB_FIX_DISABLE_IN_RCC REG_BIT(11) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 6166b1a81433..a11021d36f87 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -353,6 +353,9 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q, u32 exec_queue_flags) if (!(exec_queue_flags & EXEC_QUEUE_FLAG_KERNEL)) flags |= XE_LRC_CREATE_USER_CTX; + if (q->flags & EXEC_QUEUE_FLAG_DISABLE_STATE_CACHE_PERF_FIX) + flags |= XE_LRC_DISABLE_STATE_CACHE_PERF_FIX; + err = q->ops->init(q); if (err) return err; @@ -978,6 +981,17 @@ static int exec_queue_set_multi_queue_priority(struct xe_device *xe, struct xe_e return q->ops->set_multi_queue_priority(q, value); } +static int exec_queue_set_state_cache_perf_fix(struct xe_device *xe, struct xe_exec_queue *q, + u64 value) +{ + if (XE_IOCTL_DBG(xe, q->class != XE_ENGINE_CLASS_RENDER)) + return -EOPNOTSUPP; + + q->flags |= value != 0 ? 
EXEC_QUEUE_FLAG_DISABLE_STATE_CACHE_PERF_FIX : 0; + + return 0; +} + typedef int (*xe_exec_queue_set_property_fn)(struct xe_device *xe, struct xe_exec_queue *q, u64 value); @@ -990,6 +1004,8 @@ static const xe_exec_queue_set_property_fn exec_queue_set_property_funcs[] = { [DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP] = exec_queue_set_multi_group, [DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY] = exec_queue_set_multi_queue_priority, + [DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX] = + exec_queue_set_state_cache_perf_fix, }; /** @@ -1085,7 +1101,8 @@ static int exec_queue_user_ext_set_property(struct xe_device *xe, ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_PXP_TYPE && ext.property != DRM_XE_EXEC_QUEUE_SET_HANG_REPLAY_STATE && ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP && - ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY)) + ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY && + ext.property != DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX)) return -EINVAL; idx = array_index_nospec(ext.property, ARRAY_SIZE(exec_queue_set_property_funcs)); diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index a1f3938f4173..8ce78e0b1d50 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -134,6 +134,8 @@ struct xe_exec_queue { #define EXEC_QUEUE_FLAG_LOW_LATENCY BIT(5) /* for migration (kernel copy, clear, bind) jobs */ #define EXEC_QUEUE_FLAG_MIGRATE BIT(6) +/* for programming COMMON_SLICE_CHICKEN3 on first submission */ +#define EXEC_QUEUE_FLAG_DISABLE_STATE_CACHE_PERF_FIX BIT(7) /** * @flags: flags for this exec queue, should statically setup aside from ban diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index ebab5d78f7cc..a1f856eff4ee 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -14,6 +14,7 @@ #include "instructions/xe_gfxpipe_commands.h" #include 
"instructions/xe_gfx_state_commands.h" #include "regs/xe_engine_regs.h" +#include "regs/xe_gt_regs.h" #include "regs/xe_lrc_layout.h" #include "xe_bb.h" #include "xe_bo.h" @@ -1446,6 +1447,7 @@ static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct struct xe_device *xe = gt_to_xe(gt); struct iosys_map map; u32 arb_enable; + u32 state_cache_perf_fix[3]; int err; /* @@ -1546,6 +1548,13 @@ static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE; xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable)); + if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) { + state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1); + state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr; + state_cache_perf_fix[2] = _MASKED_BIT_ENABLE(DISABLE_STATE_CACHE_PERF_FIX); + xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix)); + } + map = __xe_lrc_seqno_map(lrc); xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h index 48f7c26cf129..e7c975f9e2d9 100644 --- a/drivers/gpu/drm/xe/xe_lrc.h +++ b/drivers/gpu/drm/xe/xe_lrc.h @@ -49,6 +49,7 @@ struct xe_lrc_snapshot { #define XE_LRC_CREATE_RUNALONE BIT(0) #define XE_LRC_CREATE_PXP BIT(1) #define XE_LRC_CREATE_USER_CTX BIT(2) +#define XE_LRC_DISABLE_STATE_CACHE_PERF_FIX BIT(3) struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm, void *replay_state, u32 ring_size, u16 msix_vec, u32 flags); diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c index 34db266b723f..4852fdcb4b95 100644 --- a/drivers/gpu/drm/xe/xe_query.c +++ b/drivers/gpu/drm/xe/xe_query.c @@ -340,6 +340,8 @@ static int query_config(struct xe_device *xe, struct drm_xe_device_query *query) DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT; config->info[DRM_XE_QUERY_CONFIG_FLAGS] |= DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY; + 
config->info[DRM_XE_QUERY_CONFIG_FLAGS] |= + DRM_XE_QUERY_CONFIG_FLAG_HAS_DISABLE_STATE_CACHE_PERF_FIX; config->info[DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT] = xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K; config->info[DRM_XE_QUERY_CONFIG_VA_BITS] = xe->info.va_bits; diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index b0264c32ceb2..f074871b4d96 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -406,6 +406,9 @@ struct drm_xe_query_mem_regions { * - %DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT - Flag is set if the * device supports the userspace hint %DRM_XE_GEM_CREATE_FLAG_NO_COMPRESSION. * This is exposed only on Xe2+. + * - %DRM_XE_QUERY_CONFIG_FLAG_HAS_DISABLE_STATE_CACHE_PERF_FIX - Flag is set + * if a queue can be created with + * %DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX * - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment * required by this device, typically SZ_4K or SZ_64K * - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address @@ -425,6 +428,7 @@ struct drm_xe_query_config { #define DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY (1 << 1) #define DRM_XE_QUERY_CONFIG_FLAG_HAS_CPU_ADDR_MIRROR (1 << 2) #define DRM_XE_QUERY_CONFIG_FLAG_HAS_NO_COMPRESSION_HINT (1 << 3) + #define DRM_XE_QUERY_CONFIG_FLAG_HAS_DISABLE_STATE_CACHE_PERF_FIX (1 << 4) #define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT 2 #define DRM_XE_QUERY_CONFIG_VA_BITS 3 #define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY 4 @@ -1285,6 +1289,9 @@ struct drm_xe_vm_bind { * - %DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY - Set the queue * priority within the multi-queue group. Current valid priority values are 0–2 * (default is 1), with higher values indicating higher priority. + * - %DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX - Set the queue to + * enable render color cache keying on BTP+BTI instead of just BTI + * (only valid for render queues). 
* * The example below shows how to use @drm_xe_exec_queue_create to create * a simple exec_queue (no parallel submission) of class @@ -1329,6 +1336,7 @@ struct drm_xe_exec_queue_create { #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_GROUP 4 #define DRM_XE_MULTI_GROUP_CREATE (1ull << 63) #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_MULTI_QUEUE_PRIORITY 5 +#define DRM_XE_EXEC_QUEUE_SET_DISABLE_STATE_CACHE_PERF_FIX 6 /** @extensions: Pointer to the first extension struct, if any */ __u64 extensions; -- cgit v1.2.3 From 1b891f4c852817b3afd231712ab7e171932e1eb1 Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Tue, 10 Mar 2026 07:29:19 +0000 Subject: include: device.h: Add named device attributes Adds DEVICE_ATTR_[RW|RO|WO]_NAMED macros for adding attributes that reuse the same sysfs name in a driver under separate subdirectories. When dealing with some devices it can be useful to be able to reuse the same name for similar attributes under a different subdirectory. For example, a single logical HID endpoint may provide a configuration interface for multiple physical devices. In such a case it is useful to provide symmetrical attribute names under different subdirectories on the configuration device. The Lenovo Legion Go is one such device, providing configuration to a detachable left controller, detachable right controller, the wireless transmission dongle, and the MCU. It is therefore beneficial to treat each of these as individual devices in the driver, providing a subdirectory for each physical device in the sysfs. As some attributes are reused by each physical device, it provides a much cleaner interface if the same driver can reuse the same attribute name in sysfs while uniquely distinguishing the store/show functions in the driver, rather than repeat string portions. 
Example new WO attrs: ATTRS{left_handle/reset}=="(not readable)" ATTRS{right_handle/reset}=="(not readable)" ATTRS{tx_dongle/reset}=="(not readable)" vs old WO attrs in a subdir: ATTRS{left_handle/left_handle_reset}=="(not readable)" ATTRS{right_handle/right_handle_reset}=="(not readable)" ATTRS{tx_dongle/tx_dongle_reset}=="(not readable)" or old WO attrs with no subdir: ATTRS{left_handle_reset}=="(not readable)" ATTRS{right_handle_reset}=="(not readable)" ATTRS{tx_dongle_reset}=="(not readable)" While the third option is usable, it doesn't logically break up the physical devices and creates a device directory with over 80 attributes once all attrs are defined. Reviewed-by: Mark Pearson Signed-off-by: Derek J. Clark Acked-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- include/linux/device.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..381463baed6d 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -189,6 +189,22 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_ADMIN_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600) +/** + * DEVICE_ATTR_RW_NAMED - Define a read-write device attribute with a sysfs name + * that differs from the function name. + * @_name: Attribute function preface + * @_attrname: Attribute name as it will be exposed in the sysfs. + * + * Like DEVICE_ATTR_RW(), but allows for reusing names under separate paths in + * the same driver. + */ +#define DEVICE_ATTR_RW_NAMED(_name, _attrname) \ + struct device_attribute dev_attr_##_name = { \ + .attr = { .name = _attrname, .mode = 0644 }, \ + .show = _name##_show, \ + .store = _name##_store, \ + } + /** * DEVICE_ATTR_RO - Define a readable device attribute. * @_name: Attribute name. 
@@ -207,6 +223,21 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_ADMIN_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400) +/** + * DEVICE_ATTR_RO_NAMED - Define a read-only device attribute with a sysfs name + * that differs from the function name. + * @_name: Attribute function preface + * @_attrname: Attribute name as it will be exposed in the sysfs. + * + * Like DEVICE_ATTR_RO(), but allows for reusing names under separate paths in + * the same driver. + */ +#define DEVICE_ATTR_RO_NAMED(_name, _attrname) \ + struct device_attribute dev_attr_##_name = { \ + .attr = { .name = _attrname, .mode = 0444 }, \ + .show = _name##_show, \ + } + /** * DEVICE_ATTR_WO - Define an admin-only writable device attribute. * @_name: Attribute name. @@ -216,6 +247,21 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_WO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_WO(_name) +/** + * DEVICE_ATTR_WO_NAMED - Define a write-only device attribute with a sysfs name + * that differs from the function name. + * @_name: Attribute function preface + * @_attrname: Attribute name as it will be exposed in the sysfs. + * + * Like DEVICE_ATTR_WO(), but allows for reusing names under separate paths in + * the same driver. + */ +#define DEVICE_ATTR_WO_NAMED(_name, _attrname) \ + struct device_attribute dev_attr_##_name = { \ + .attr = { .name = _attrname, .mode = 0200 }, \ + .store = _name##_store, \ + } + /** * DEVICE_ULONG_ATTR - Define a device attribute backed by an unsigned long. * @_name: Attribute name. 
-- cgit v1.2.3 From 6ca9029c823b7853e980585e757343e0e84227cd Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 10 Mar 2026 07:29:27 +0000 Subject: HID: Include firmware version in the uevent Userspace software fwupd probes some HID devices when the daemon starts up to determine the current firmware version in order to be able to offer updated firmware if the manufacturer has made it available. In order to do this fwupd will detach the existing kernel driver if one is present, send a HID command and then reattach the kernel driver. This can be problematic if the user is using the HID device at the time that fwupd probes the hardware and can cause a few frames of input to be dropped. In some cases HID drivers already have a command to look up the firmware version, and so if that is exported to userspace fwupd can discover it and avoid needing to detach the kernel driver until it's time to update the device. Introduce a new member in the struct hid_device for the version and export a new uevent variable HID_FIRMWARE_VERSION that will display the version that HID drivers obtained. Reviewed-by: Derek J. 
Clark Reviewed-by: Mark Pearson Cc: Richard Hughes Signed-off-by: Mario Limonciello Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 5 +++++ include/linux/hid.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 840a60113868..da57cbf0af26 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2887,6 +2887,11 @@ static int hid_uevent(const struct device *dev, struct kobj_uevent_env *env) if (add_uevent_var(env, "MODALIAS=hid:b%04Xg%04Xv%08Xp%08X", hdev->bus, hdev->group, hdev->vendor, hdev->product)) return -ENOMEM; + if (hdev->firmware_version) { + if (add_uevent_var(env, "HID_FIRMWARE_VERSION=0x%04llX", + hdev->firmware_version)) + return -ENOMEM; + } return 0; } diff --git a/include/linux/hid.h b/include/linux/hid.h index 2990b9f94cb5..b0b70c05049d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -698,6 +698,7 @@ struct hid_device { char name[128]; /* Device name */ char phys[64]; /* Device physical location */ char uniq[64]; /* Device unique identifier (serial #) */ + u64 firmware_version; /* Firmware version */ void *driver_data; -- cgit v1.2.3 From 1dfc9d60a69ec148e1cb709256617d86e5f0e8f8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 5 Mar 2026 22:45:40 +0100 Subject: workqueue: devres: Add device-managed allocate workqueue Add a Resource-managed version of alloc_workqueue() to fix common problem of drivers mixing devm() calls with destroy_workqueue. Such naive and discouraged driver approach leads to difficult to debug bugs when the driver: 1. Allocates workqueue in standard way and destroys it in driver remove() callback, 2. Sets work struct with devm_work_autocancel(), 3. Registers interrupt handler with devm_request_threaded_irq(). Which leads to following unbind/removal path: 1. destroy_workqueue() via driver remove(), Any interrupt coming now would still execute the interrupt handler, which queues work on destroyed workqueue. 2. 
devm_irq_release(), 3. devm_work_drop() -> cancel_work_sync() on destroyed workqueue. devm_alloc_workqueue() has two benefits: 1. Solves above problem of mix-and-match devres and non-devres code in driver, 2. Simplify any sane drivers which were correctly using alloc_workqueue() + devm_add_action_or_reset(). Signed-off-by: Krzysztof Kozlowski Acked-by: Tejun Heo Reviewed-by: Andy Shevchenko Signed-off-by: Tejun Heo --- Documentation/driver-api/driver-model/devres.rst | 4 ++++ include/linux/workqueue.h | 22 +++++++++++++++++++ kernel/workqueue.c | 28 ++++++++++++++++++++++++ 3 files changed, 54 insertions(+) (limited to 'include') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 7d2b897d66fa..017fb155a5bc 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -464,3 +464,7 @@ SPI WATCHDOG devm_watchdog_register_device() + +WORKQUEUE + devm_alloc_workqueue() + devm_alloc_ordered_workqueue() diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a4749f56398f..f8d235aef10d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -512,6 +512,26 @@ __printf(1, 4) struct workqueue_struct * alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...); #define alloc_workqueue(...) alloc_hooks(alloc_workqueue_noprof(__VA_ARGS__)) +/** + * devm_alloc_workqueue - Resource-managed allocate a workqueue + * @dev: Device to allocate workqueue for + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @...: args for @fmt + * + * Resource managed workqueue, see alloc_workqueue() for details. + * + * The workqueue will be automatically destroyed on driver detach. Typically + * this should be used in drivers already relying on devm interfaces. 
+ * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ +__printf(2, 5) struct workqueue_struct * +devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, + int max_active, ...); + #ifdef CONFIG_LOCKDEP /** * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map @@ -568,6 +588,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) +#define devm_alloc_ordered_workqueue(dev, fmt, flags, args...) \ + devm_alloc_workqueue(dev, fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aeaec79bc09c..19d20f3039d9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -5891,6 +5892,33 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, } EXPORT_SYMBOL_GPL(alloc_workqueue_noprof); +static void devm_workqueue_release(void *res) +{ + destroy_workqueue(res); +} + +__printf(2, 5) struct workqueue_struct * +devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, + int max_active, ...) 
+{ + struct workqueue_struct *wq; + va_list args; + int ret; + + va_start(args, max_active); + wq = alloc_workqueue(fmt, flags, max_active, args); + va_end(args); + if (!wq) + return NULL; + + ret = devm_add_action_or_reset(dev, devm_workqueue_release, wq); + if (ret) + return NULL; + + return wq; +} +EXPORT_SYMBOL_GPL(devm_alloc_workqueue); + #ifdef CONFIG_LOCKDEP __printf(1, 5) struct workqueue_struct * -- cgit v1.2.3 From 5a8103a6fb0ae9cf99c0271b17474468d6bae2b2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 4 Mar 2026 18:21:57 +0100 Subject: genirq: Document interaction between and DT binding defines Document that the DT binding definitions in shadow the first six IRQ_TYPE_* definitions in . The values must be the same anyway, so this is harmless (as long as the latter is included first when both are included), but it is good to document this explicitly. Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/fbcc65dcee6c5437fab5ef18d21766bb4effb7cb.1772644406.git.geert+renesas@glider.be --- include/linux/irq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 951acbdb9f84..efa514ee562f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -35,6 +35,10 @@ enum irqchip_irq_state; * * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h * + * Note that the first 6 definitions are shadowed by C preprocessor definitions + * in include/dt-bindings/interrupt-controller/irq.h. This is not an issue, as + * the actual values must be the same, due to being part of the stable DT ABI. 
+ * * IRQ_TYPE_NONE - default, unspecified type * IRQ_TYPE_EDGE_RISING - rising edge triggered * IRQ_TYPE_EDGE_FALLING - falling edge triggered -- cgit v1.2.3 From 360160f75592bdc85edba8fe78fb20d90924c7e8 Mon Sep 17 00:00:00 2001 From: Ricardo Robaina Date: Mon, 9 Mar 2026 10:05:33 -0300 Subject: audit: handle unknown status requests in audit_receive_msg() Currently, audit_receive_msg() ignores unknown status bits in AUDIT_SET requests, incorrectly returning success to newer user space tools querying unsupported features. This breaks forward compatibility. Fix this by defining AUDIT_STATUS_ALL and returning -EINVAL if any unrecognized bits are set (s.mask & ~AUDIT_STATUS_ALL). This ensures invalid requests are safely rejected, allowing user space to reliably test for and gracefully handle feature detection on older kernels. Suggested-by: Steve Grubb Signed-off-by: Ricardo Robaina [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/audit.h | 9 +++++++++ kernel/audit.c | 2 ++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index b642b5faca65..d79218bf075a 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -15,6 +15,15 @@ #include #include +#define AUDIT_STATUS_ALL (AUDIT_STATUS_ENABLED | \ + AUDIT_STATUS_FAILURE | \ + AUDIT_STATUS_PID | \ + AUDIT_STATUS_RATE_LIMIT | \ + AUDIT_STATUS_BACKLOG_LIMIT | \ + AUDIT_STATUS_BACKLOG_WAIT_TIME | \ + AUDIT_STATUS_LOST | \ + AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) + #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) diff --git a/kernel/audit.c b/kernel/audit.c index 08793e71b975..e1d489bc2dff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1295,6 +1295,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); + if (s.mask & ~AUDIT_STATUS_ALL) + return 
-EINVAL; if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) -- cgit v1.2.3 From a564839e630c11b089470d2e010b0019b7cf61bc Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Mon, 24 Mar 2025 10:04:44 +0200 Subject: media: subdev: Add v4l2_subdev_get_frame_desc_passthrough helper Add a helper for v4l2_subdev_pad_ops.v4l2_get_frame_desc operation. The helper can be used when the subdevice directly passes through the streams. Signed-off-by: Tomi Valkeinen Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-subdev.c | 113 ++++++++++++++++++++++++++++++++++ include/media/v4l2-subdev.h | 22 +++++++ 2 files changed, 135 insertions(+) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c index 32e6f60e26c7..9efd14d4026f 100644 --- a/drivers/media/v4l2-core/v4l2-subdev.c +++ b/drivers/media/v4l2-core/v4l2-subdev.c @@ -2545,6 +2545,119 @@ int v4l2_subdev_s_stream_helper(struct v4l2_subdev *sd, int enable) } EXPORT_SYMBOL_GPL(v4l2_subdev_s_stream_helper); +int v4l2_subdev_get_frame_desc_passthrough(struct v4l2_subdev *sd, + unsigned int pad, + struct v4l2_mbus_frame_desc *fd) +{ + const struct media_pad *pads = sd->entity.pads; + struct media_pad *local_sink_pad; + struct v4l2_subdev_route *route; + struct v4l2_subdev_state *state; + struct device *dev = sd->dev; + int ret = 0; + + if (WARN_ON(!(pads[pad].flags & MEDIA_PAD_FL_SOURCE))) + return -EINVAL; + + state = v4l2_subdev_lock_and_get_active_state(sd); + + /* Iterate over sink pads */ + media_entity_for_each_pad(&sd->entity, local_sink_pad) { + struct v4l2_mbus_frame_desc source_fd; + bool have_source_fd = false; + + if (!(local_sink_pad->flags & MEDIA_PAD_FL_SINK)) + continue; + + /* + * Copy frame desc entries for the streams going from the sink + * pad to the requested pad + */ + for_each_active_route(&state->routing, route) { + struct v4l2_mbus_frame_desc_entry *source_entry = 
NULL; + struct media_pad *remote_source_pad; + struct v4l2_subdev *remote_sd; + unsigned int i; + + if (route->source_pad != pad || + route->sink_pad != local_sink_pad->index) + continue; + + if (!have_source_fd) { + remote_source_pad = media_pad_remote_pad_unique(local_sink_pad); + if (!remote_source_pad) { + dev_dbg(dev, "Failed to find remote pad for sink pad %u\n", + local_sink_pad->index); + ret = -EINVAL; + goto out_unlock; + } + + remote_sd = media_entity_to_v4l2_subdev(remote_source_pad->entity); + if (!remote_sd) { + ret = -EINVAL; + goto out_unlock; + } + + ret = v4l2_subdev_call(remote_sd, pad, + get_frame_desc, + remote_source_pad->index, + &source_fd); + if (ret) { + dev_err(dev, + "Failed to get frame desc from remote subdev %s\n", + remote_sd->name); + goto out_unlock; + } + + have_source_fd = true; + + if (fd->num_entries == 0) { + fd->type = source_fd.type; + } else if (fd->type != source_fd.type) { + dev_err(dev, + "Frame desc type mismatch: %u != %u\n", + fd->type, source_fd.type); + ret = -EPIPE; + goto out_unlock; + } + } + + for (i = 0; i < source_fd.num_entries; i++) { + if (source_fd.entry[i].stream == route->sink_stream) { + source_entry = &source_fd.entry[i]; + break; + } + } + + if (!source_entry) { + dev_dbg(sd->dev, + "Failed to find stream %u from source frame desc\n", + route->sink_stream); + ret = -EPIPE; + goto out_unlock; + } + + if (fd->num_entries >= V4L2_FRAME_DESC_ENTRY_MAX) { + dev_dbg(sd->dev, "Frame desc entry limit reached\n"); + ret = -ENOSPC; + goto out_unlock; + } + + fd->entry[fd->num_entries] = *source_entry; + + fd->entry[fd->num_entries].stream = route->source_stream; + + fd->num_entries++; + } + } + +out_unlock: + v4l2_subdev_unlock_state(state); + + return ret; +} +EXPORT_SYMBOL_GPL(v4l2_subdev_get_frame_desc_passthrough); + #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ #endif /* CONFIG_MEDIA_CONTROLLER */ diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index a37d9a847196..e754ed3421c5 100644 --- 
a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -1722,6 +1722,28 @@ int v4l2_subdev_disable_streams(struct v4l2_subdev *sd, u32 pad, */ int v4l2_subdev_s_stream_helper(struct v4l2_subdev *sd, int enable); +/** + * v4l2_subdev_get_frame_desc_passthrough() - Helper to implement the subdev + * v4l2_get_frame_desc operation in simple passthrough cases + * @sd: The subdevice + * @pad: The source pad index + * @fd: The mbus frame desc + * + * Subdevice drivers that only pass through the streams can use this helper + * to implement the &v4l2_subdev_pad_ops.v4l2_get_frame_desc operation. + * + * The helper will call get_frame_desc on the subdevice's sources, create a new + * frame desc which contains only the streams on the given source pad. The data + * for each frame desc entry is copied directly from the data provided from the + * calls to the subdevice's sources, with the exception of the 'stream' field + * which is set according to the subdevice's routing table. + * + * Return: 0 on success, or a negative error code otherwise. + */ +int v4l2_subdev_get_frame_desc_passthrough(struct v4l2_subdev *sd, + unsigned int pad, + struct v4l2_mbus_frame_desc *fd); + #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ #endif /* CONFIG_MEDIA_CONTROLLER */ -- cgit v1.2.3 From 6ab94d0194ddca662da69cf42b98dcf74690ed92 Mon Sep 17 00:00:00 2001 From: Ed Tsai Date: Tue, 10 Mar 2026 08:52:28 +0800 Subject: scsi: ufs: core: Add quirks for VCC ramp-up delay On some platforms, the VCC regulator has a slow ramp-up time. Add a delay after enabling VCC to ensure voltage has fully stabilized before we enable the clocks. Reviewed-by: Bart Van Assche Signed-off-by: Ed Tsai Link: https://patch.msgid.link/20260310005230.4001904-4-ed.tsai@mediatek.com Signed-off-by: Martin K. 
Petersen --- drivers/ufs/core/ufshcd.c | 12 ++++++++++++ include/ufs/ufshcd.h | 6 ++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 0eb4f4af231e..cf7f0ae46f75 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -9952,11 +9952,13 @@ static void ufshcd_vreg_set_lpm(struct ufs_hba *hba) #ifdef CONFIG_PM static int ufshcd_vreg_set_hpm(struct ufs_hba *hba) { + bool vcc_on = false; int ret = 0; if (ufshcd_is_ufs_dev_poweroff(hba) && ufshcd_is_link_off(hba) && !hba->dev_info.is_lu_power_on_wp) { ret = ufshcd_setup_vreg(hba, true); + vcc_on = true; } else if (!ufshcd_is_ufs_dev_active(hba)) { if (!ufshcd_is_link_active(hba)) { ret = ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq); @@ -9967,6 +9969,7 @@ static int ufshcd_vreg_set_hpm(struct ufs_hba *hba) goto vccq_lpm; } ret = ufshcd_toggle_vreg(hba->dev, hba->vreg_info.vcc, true); + vcc_on = true; } goto out; @@ -9975,6 +9978,15 @@ vccq_lpm: vcc_disable: ufshcd_toggle_vreg(hba->dev, hba->vreg_info.vcc, false); out: + /* + * On platforms with a slow VCC ramp-up, a delay is needed after + * turning on VCC to ensure the voltage is stable before the + * reference clock is enabled. + */ + if (hba->quirks & UFSHCD_QUIRK_VCC_ON_DELAY && !ret && vcc_on && + hba->vreg_info.vcc && !hba->vreg_info.vcc->always_on) + usleep_range(1000, 1100); + return ret; } #endif /* CONFIG_PM */ diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 182f301c11e7..cb6f1537a3f3 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -690,6 +690,12 @@ enum ufshcd_quirks { * because it causes link startup to become unreliable. */ UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE = 1 << 26, + + /* + * On some platforms, the VCC regulator has a slow ramp-up time. Add a + * delay after enabling VCC to ensure it's stable. 
+ */ + UFSHCD_QUIRK_VCC_ON_DELAY = 1 << 27, }; enum ufshcd_caps { -- cgit v1.2.3 From 7da62262ec96a4b345d207b6bcd2ddf5231b7f7d Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Mon, 9 Mar 2026 03:39:45 +0100 Subject: inet: add ip_local_port_step_width sysctl to improve port usage distribution With the current port selection algorithm, ports after a reserved port range or long time used port are used more often than others [1]. This causes an uneven port usage distribution. This combines with cloud environments blocking connections between the application server and the database server if there was a previous connection with the same source port, leading to connectivity problems between applications on cloud environments. The real issue here is that these firewalls cannot cope with standards-compliant port reuse. This is a workaround for such situations and an improvement on the distribution of ports selected. The proposed solution is to implement a variant of RFC 6056 Algorithm 5. The step size is selected randomly on every connect() call ensuring it is a coprime with respect to the size of the range of ports we want to scan. This way, we can ensure that all ports within the range are scanned before returning an error. To enable this algorithm, the user must configure the new sysctl option "net.ipv4.ip_local_port_step_width". In addition, on graphs generated we can observe that the distribution of source ports is more even with the proposed approach. 
[2] [1] https://0xffsoftware.com/port_graph_current_alg.html [2] https://0xffsoftware.com/port_graph_random_step_alg.html Reviewed-by: Eric Dumazet Signed-off-by: Fernando Fernandez Mancera Link: https://patch.msgid.link/20260309023946.5473-2-fmancera@suse.de Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 16 +++++++++++++ .../net_cachelines/netns_ipv4_sysctl.rst | 1 + include/net/netns/ipv4.h | 1 + net/ipv4/inet_hashtables.c | 28 +++++++++++++++++++--- net/ipv4/sysctl_net_ipv4.c | 7 ++++++ 5 files changed, 50 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 265158534cda..2e3a746fcc6d 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1630,6 +1630,22 @@ ip_local_reserved_ports - list of comma separated ranges Default: Empty +ip_local_port_step_width - INTEGER + Defines the numerical maximum increment between successive port + allocations within the ephemeral port range when an unavailable port is + reached. This can be used to mitigate accumulated nodes in port + distribution when reserved ports have been configured. Please note that + port collisions may be more frequent in a system with a very high load. + + It is recommended to set this value strictly larger than the largest + contiguous block of ports configured in ip_local_reserved_ports. For + large reserved port ranges, setting this to 3x or 4x the size of the + largest block is advised. Using a value equal or greater than the local + port range size completely solves the uneven port distribution problem, + but it can degrade performance under port exhaustion situations. + + Default: 0 (disabled) + ip_unprivileged_port_start - INTEGER This is a per-namespace sysctl. It defines the first unprivileged port in the network namespace. 
Privileged ports diff --git a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst index beaf1880a19b..cf284263e69b 100644 --- a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst +++ b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst @@ -52,6 +52,7 @@ u8 sysctl_ip_fwd_update_priority u8 sysctl_ip_nonlocal_bind u8 sysctl_ip_autobind_reuse u8 sysctl_ip_dynaddr +u32 sysctl_ip_local_port_step_width u8 sysctl_ip_early_demux read_mostly ip(6)_rcv_finish_core u8 sysctl_raw_l3mdev_accept u8 sysctl_tcp_early_demux read_mostly ip(6)_rcv_finish_core diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 38624beff9b3..80ccd4dda8e0 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -166,6 +166,7 @@ struct netns_ipv4 { u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? */ u8 sysctl_ip_dynaddr; + u32 sysctl_ip_local_port_step_width; #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_raw_l3mdev_accept; #endif diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ac7b67c603b5..13310c72b0bf 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -1057,12 +1058,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; + int step, scan_step, l3mdev; + u32 index, max_rand_step; bool tb_created = false; u32 remaining, offset; int ret, i, low, high; bool local_ports; - int step, l3mdev; - u32 index; if (port) { local_bh_disable(); @@ -1076,6 +1077,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_ports = inet_sk_get_local_port_range(sk, &low, &high); step = local_ports ? 
1 : 2; + scan_step = step; + max_rand_step = READ_ONCE(net->ipv4.sysctl_ip_local_port_step_width); high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; @@ -1094,9 +1097,28 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, */ if (!local_ports) offset &= ~1U; + + if (max_rand_step && remaining > 1) { + u32 range = remaining / step; + u32 upper_bound; + + upper_bound = min(range, max_rand_step); + scan_step = get_random_u32_inclusive(1, upper_bound); + while (gcd(scan_step, range) != 1) { + scan_step++; + /* if both scan_step and range are even gcd won't be 1 */ + if (!(scan_step & 1) && !(range & 1)) + scan_step++; + if (unlikely(scan_step > upper_bound)) { + scan_step = 1; + break; + } + } + scan_step *= step; + } other_parity_scan: port = low + offset; - for (i = 0; i < remaining; i += step, port += step) { + for (i = 0; i < remaining; i += step, port += scan_step) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5654cc9c8a0b..d8bdb1bdbff1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -823,6 +823,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_local_port_range, }, + { + .procname = "ip_local_port_step_width", + .maxlen = sizeof(u32), + .data = &init_net.ipv4.sysctl_ip_local_port_step_width, + .mode = 0644, + .proc_handler = proc_douintvec, + }, { .procname = "ip_local_reserved_ports", .data = &init_net.ipv4.sysctl_local_reserved_ports, -- cgit v1.2.3 From 7a6387dec8cee5a237dc5092269e97028f5a983b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:18 +0000 Subject: net: stmmac: provide plat_dat->dma_cfg in stmmac_plat_dat_alloc() plat_dat->dma_cfg is unconditionally required for the operation of the driver, so it would make sense to allocate it along with the plat_dat. 
On Arm64, sizeof(*plat_dat) has recently shrunk from 880 to 816 bytes and sizeof(*plat_dat->dma_cfg) has shrunk from 32 to 20 bytes. Given that dma_cfg is required, and it is now less than a cache line, it doesn't make sense to allocate this separately, so place it at the end of struct plat_stmmacenet_data, and set plat_dat->dma_cfg to point at that to avoid mass changes. Signed-off-by: Russell King (Oracle) Reviewed-by: Mohd Ayaan Anwar Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX54-0000000CVrw-2jfu@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 5 ----- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 4 ---- drivers/net/ethernet/stmicro/stmmac/dwmac-motorcomm.c | 4 ---- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 5 ----- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 8 +------- include/linux/stmmac.h | 1 + 7 files changed, 4 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index fc13bfb47783..0b32560cd059 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -1251,11 +1251,6 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, if (!plat->mdio_bus_data) return -ENOMEM; - plat->dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->dma_cfg), - GFP_KERNEL); - if (!plat->dma_cfg) - return -ENOMEM; - plat->safety_feat_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->safety_feat_cfg), GFP_KERNEL); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index ada6c6ef1f5c..51b1562f84d1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -513,10 +513,6 @@ static int loongson_dwmac_probe(struct 
pci_dev *pdev, const struct pci_device_id if (!plat->mdio_bus_data) return -ENOMEM; - plat->dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->dma_cfg), GFP_KERNEL); - if (!plat->dma_cfg) - return -ENOMEM; - ld = devm_kzalloc(&pdev->dev, sizeof(*ld), GFP_KERNEL); if (!ld) return -ENOMEM; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-motorcomm.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-motorcomm.c index 8b45b9cf7202..d245546b90db 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-motorcomm.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-motorcomm.c @@ -218,10 +218,6 @@ motorcomm_default_plat_data(struct pci_dev *pdev) if (!plat->mdio_bus_data) return NULL; - plat->dma_cfg = devm_kzalloc(dev, sizeof(*plat->dma_cfg), GFP_KERNEL); - if (!plat->dma_cfg) - return NULL; - plat->axi = devm_kzalloc(dev, sizeof(*plat->axi), GFP_KERNEL); if (!plat->axi) return NULL; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f0160ff54a59..87f43811faa0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7730,6 +7730,8 @@ struct plat_stmmacenet_data *stmmac_plat_dat_alloc(struct device *dev) if (!plat_dat) return NULL; + plat_dat->dma_cfg = &plat_dat->__dma_cfg; + /* Set the defaults: * - phy autodetection * - determine GMII_Address CR field from CSR clock diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 270ad066ced3..836fed7d60ab 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -134,11 +134,6 @@ static int stmmac_pci_probe(struct pci_dev *pdev, if (!plat->mdio_bus_data) return -ENOMEM; - plat->dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->dma_cfg), - GFP_KERNEL); - if (!plat->dma_cfg) - return -ENOMEM; - plat->safety_feat_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->safety_feat_cfg), 
GFP_KERNEL); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index c34998486293..1aed48fe0db6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -548,13 +548,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) &plat->multicast_filter_bins); } - dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), - GFP_KERNEL); - if (!dma_cfg) { - ret = ERR_PTR(-ENOMEM); - goto error_put_mdio; - } - plat->dma_cfg = dma_cfg; + dma_cfg = plat->dma_cfg; of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl); if (!dma_cfg->pbl) diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 965ada809fdf..919196713c05 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -306,5 +306,6 @@ struct plat_stmmacenet_data { int msi_tx_base_vec; const struct dwmac4_addrs *dwmac4_addrs; unsigned int flags; + struct stmmac_dma_cfg __dma_cfg; }; #endif -- cgit v1.2.3 From c3d08424e025aaac8fb54134f76e611ef919cd08 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:23 +0000 Subject: net: stmmac: convert plat_stmmacenet_data booleans to type bool Convert members of struct plat_stmmacenet_data that are booleans to type 'bool' and ensure their initialisers are true/false. Move the has_xxx for the GMAC cores together, and move the COE members to the end of the list of bool to avoid unused holes in the struct. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Mohd Ayaan Anwar Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX59-0000000CVs2-3MHc@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 6 +++--- .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 20 ++++++++++---------- include/linux/stmmac.h | 14 +++++++------- 14 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index 0495437d3a6e..b0c5d1ecabce 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -88,7 +88,7 @@ static int dwc_eth_dwmac_config_dt(struct platform_device *pdev, plat_dat->core_type = DWMAC_CORE_GMAC4; plat_dat->dma_cfg->aal = 1; plat_dat->flags |= STMMAC_FLAG_TSO_EN; - plat_dat->pmt = 1; + plat_dat->pmt = true; return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 0b32560cd059..421c6c81ca5e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -566,7 +566,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat) /* 
clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */ plat->clk_csr = STMMAC_CSR_20_35M; plat->core_type = DWMAC_CORE_GMAC; - plat->force_sf_dma_mode = 1; + plat->force_sf_dma_mode = true; plat->mdio_bus_data->needs_reset = true; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 51b1562f84d1..eb14c197d6ae 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -94,7 +94,7 @@ static void loongson_default_data(struct pci_dev *pdev, /* clk_csr_i = 100-150MHz & MDC = clk_csr_i/62 */ plat->clk_csr = STMMAC_CSR_100_150M; plat->core_type = DWMAC_CORE_GMAC; - plat->force_sf_dma_mode = 1; + plat->force_sf_dma_mode = true; /* Increase the default value for multicast hash bins */ plat->multicast_filter_bins = 256; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index 1f2d7d19ca56..a139db6a8cbb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -564,7 +564,7 @@ static int mediatek_dwmac_common_data(struct platform_device *pdev, plat->flags &= ~STMMAC_FLAG_USE_PHY_WOL; else plat->flags |= STMMAC_FLAG_USE_PHY_WOL; - plat->riwt_off = 1; + plat->riwt_off = true; plat->maxmtu = ETH_DATA_LEN; plat->host_dma_width = priv_plat->variant->dma_bit_mask; plat->bsp_priv = priv_plat; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index cb1c074c2053..388e9fdeb86c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -817,7 +817,7 @@ static int qcom_ethqos_probe(struct platform_device *pdev) plat_dat->core_type = DWMAC_CORE_GMAC4; if (ethqos->has_emac_ge_3) plat_dat->dwmac4_addrs = &data->dwmac4_addrs; - plat_dat->pmt = 1; + plat_dat->pmt = true; if 
(of_property_read_bool(np, "snps,tso")) plat_dat->flags |= STMMAC_FLAG_TSO_EN; if (of_device_is_compatible(np, "qcom,qcs404-ethqos")) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c index af594a096676..48fceadc55b1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c @@ -163,7 +163,7 @@ static int s32_dwmac_probe(struct platform_device *pdev) /* S32CC core feature set */ plat->core_type = DWMAC_CORE_GMAC4; - plat->pmt = 1; + plat->pmt = true; plat->flags |= STMMAC_FLAG_SPH_DISABLE; plat->rx_fifo_size = 20480; plat->tx_fifo_size = 20480; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index c6b99814d391..5f89fd968ae9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -565,7 +565,7 @@ static void socfpga_gen5_setup_plat_dat(struct socfpga_dwmac *dwmac) plat_dat->core_type = DWMAC_CORE_GMAC; /* Rx watchdog timer in dwmac is buggy in this hw */ - plat_dat->riwt_off = 1; + plat_dat->riwt_off = true; } static void socfpga_agilex5_setup_plat_dat(struct socfpga_dwmac *dwmac) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 3ce03b059277..6dbe5d5a3224 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1179,7 +1179,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) * hardware features were copied from Allwinner drivers. 
*/ plat_dat->rx_coe = STMMAC_RX_COE_TYPE2; - plat_dat->tx_coe = 1; + plat_dat->tx_coe = true; plat_dat->flags |= STMMAC_FLAG_HAS_SUN8I; plat_dat->bsp_priv = gmac; plat_dat->init = sun8i_dwmac_init; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c index 52593ba3a3a3..74bd996d93c9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c @@ -135,7 +135,7 @@ static int sun7i_gmac_probe(struct platform_device *pdev) /* platform data specifying hardware features and callbacks. * hardware features were copied from Allwinner drivers. */ - plat_dat->tx_coe = 1; + plat_dat->tx_coe = true; plat_dat->core_type = DWMAC_CORE_GMAC; plat_dat->bsp_priv = gmac; plat_dat->init = sun7i_gmac_init; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c index d765acbe3754..b4b39e6a169e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c @@ -310,7 +310,7 @@ static int tegra_mgbe_probe(struct platform_device *pdev) plat->core_type = DWMAC_CORE_XGMAC; plat->flags |= STMMAC_FLAG_TSO_EN; - plat->pmt = 1; + plat->pmt = true; plat->bsp_priv = mgbe; if (!plat->mdio_node) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 87f43811faa0..939431255fa5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7401,7 +7401,7 @@ static int stmmac_hw_init(struct stmmac_priv *priv) /* TXCOE doesn't work in thresh DMA mode */ if (priv->plat->force_thresh_dma_mode) - priv->plat->tx_coe = 0; + priv->plat->tx_coe = false; else priv->plat->tx_coe = priv->dma_cap.tx_coe; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 836fed7d60ab..d584fd2daa6f 100644 --- 
a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -25,7 +25,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat) /* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */ plat->clk_csr = STMMAC_CSR_20_35M; plat->core_type = DWMAC_CORE_GMAC; - plat->force_sf_dma_mode = 1; + plat->force_sf_dma_mode = true; plat->mdio_bus_data->needs_reset = true; } @@ -58,9 +58,9 @@ static int snps_gmac5_default_data(struct pci_dev *pdev, plat->clk_csr = STMMAC_CSR_250_300M; plat->core_type = DWMAC_CORE_GMAC4; - plat->force_sf_dma_mode = 1; + plat->force_sf_dma_mode = true; plat->flags |= STMMAC_FLAG_TSO_EN; - plat->pmt = 1; + plat->pmt = true; /* Set default number of RX and TX queues to use */ plat->tx_queues_to_use = 4; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 1aed48fe0db6..0d3bad0f8915 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -514,34 +514,34 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) plat->multicast_filter_bins = dwmac1000_validate_mcast_bins( &pdev->dev, plat->multicast_filter_bins); plat->core_type = DWMAC_CORE_GMAC; - plat->pmt = 1; + plat->pmt = true; } if (of_device_is_compatible(np, "snps,dwmac-3.40a")) { plat->core_type = DWMAC_CORE_GMAC; - plat->enh_desc = 1; - plat->tx_coe = 1; - plat->bugged_jumbo = 1; - plat->pmt = 1; + plat->enh_desc = true; + plat->tx_coe = true; + plat->bugged_jumbo = true; + plat->pmt = true; } if (of_device_compatible_match(np, stmmac_gmac4_compats)) { plat->core_type = DWMAC_CORE_GMAC4; - plat->pmt = 1; + plat->pmt = true; if (of_property_read_bool(np, "snps,tso")) plat->flags |= STMMAC_FLAG_TSO_EN; } if (of_device_is_compatible(np, "snps,dwmac-3.610") || of_device_is_compatible(np, "snps,dwmac-3.710")) { - plat->enh_desc = 1; - plat->bugged_jumbo = 1; - plat->force_sf_dma_mode = 1; + 
plat->enh_desc = true; + plat->bugged_jumbo = true; + plat->force_sf_dma_mode = true; } if (of_device_is_compatible(np, "snps,dwxgmac")) { plat->core_type = DWMAC_CORE_XGMAC; - plat->pmt = 1; + plat->pmt = true; if (of_property_read_bool(np, "snps,tso")) plat->flags |= STMMAC_FLAG_TSO_EN; of_property_read_u32(np, "snps,multicast-filter-bins", diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 919196713c05..9420da96a4ff 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -229,14 +229,14 @@ struct plat_stmmacenet_data { struct stmmac_dma_cfg *dma_cfg; struct stmmac_safety_feature_cfg *safety_feat_cfg; int clk_csr; - int enh_desc; - int tx_coe; + bool enh_desc; + bool tx_coe; + bool bugged_jumbo; + bool pmt; + bool force_sf_dma_mode; + bool force_thresh_dma_mode; + bool riwt_off; int rx_coe; - int bugged_jumbo; - int pmt; - int force_sf_dma_mode; - int force_thresh_dma_mode; - int riwt_off; int max_speed; int maxmtu; int multicast_filter_bins; -- cgit v1.2.3 From 3357642e65e9454c3da64b62c0ed987ee4010008 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:28 +0000 Subject: net: stmmac: reorder structs to reduce memory consumption Reorder some of the stmmac structures to allow them to pack better, thereby using less memory. On aarch64, sizeof(struct stmmac_priv) was 880, and with this change becomes 816, saving 64 bytes, which is an 8% saving. 
Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5E-0000000CVs8-40w4@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 9420da96a4ff..411cdd3ea034 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -108,37 +108,37 @@ struct stmmac_dma_cfg { #define AXI_BLEN 7 struct stmmac_axi { - bool axi_lpi_en; - bool axi_xit_frm; u32 axi_wr_osr_lmt; u32 axi_rd_osr_lmt; - bool axi_kbbe; u32 axi_blen_regval; + bool axi_lpi_en; + bool axi_xit_frm; + bool axi_kbbe; bool axi_fb; bool axi_mb; bool axi_rb; }; struct stmmac_rxq_cfg { - u8 mode_to_use; u32 chan; + u32 prio; + u8 mode_to_use; u8 pkt_route; bool use_prio; - u32 prio; }; struct stmmac_txq_cfg { u32 weight; - bool coe_unsupported; - u8 mode_to_use; /* Credit Base Shaper parameters */ u32 send_slope; u32 idle_slope; u32 high_credit; u32 low_credit; - bool use_prio; u32 prio; int tbs_en; + bool use_prio; + bool coe_unsupported; + u8 mode_to_use; }; struct stmmac_safety_feature_cfg { -- cgit v1.2.3 From 94808793fed71ee47741df0923d353024b6904ff Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:34 +0000 Subject: net: stmmac: use u8 for ?x_queues_to_use and number_?x_queues The maximum number of queues is a compile time constant of only eight. This makes using a 32-bit quantity wasteful. Instead, use u8 for these and their associated variables. When reading the DT properties, saturate at U8_MAX. Provided the core provides DMA capabilities to describe the number of queues, this will be capped by stmmac_hw_init() with a warning. 
Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5K-0000000CVsE-0J0Y@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/common.h | 4 +- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac100_core.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 4 +- .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 4 +- drivers/net/ethernet/stmicro/stmmac/hwif.h | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 223 +++++++++++---------- .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 15 +- include/linux/stmmac.h | 4 +- 11 files changed, 136 insertions(+), 128 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 46454e2886ce..f1628de8ed18 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -446,8 +446,8 @@ struct dma_features { unsigned int number_rx_channel; unsigned int number_tx_channel; /* TX and RX number of queues */ - unsigned int number_rx_queues; - unsigned int number_tx_queues; + u8 number_rx_queues; + u8 number_tx_queues; /* PPS output */ unsigned int pps_out_num; /* Number of Traffic Classes */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 6dbe5d5a3224..48c52eb96233 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -718,7 +718,7 @@ static void sun8i_dwmac_set_filter(struct mac_device_info *hw, static void sun8i_dwmac_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, - unsigned int pause_time, u32 tx_cnt) + unsigned int pause_time, u8 tx_cnt) { void __iomem *ioaddr = hw->pcsr; u32 v; 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index c7cb30672604..01f8353eb6ef 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -222,7 +222,7 @@ static void dwmac1000_set_filter(struct mac_device_info *hw, static void dwmac1000_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, unsigned int pause_time, - u32 tx_cnt) + u8 tx_cnt) { void __iomem *ioaddr = hw->pcsr; /* Set flow such that DZPQ in Mac Register 6 is 0, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c index 6b5cf3a0866a..94d24d355d95 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c @@ -126,7 +126,7 @@ static void dwmac100_set_filter(struct mac_device_info *hw, static void dwmac100_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, unsigned int pause_time, - u32 tx_cnt) + u8 tx_cnt) { void __iomem *ioaddr = hw->pcsr; unsigned int flow = MAC_FLOW_CTRL_ENABLE; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index e6bcb77b22a2..4c6fed3ecbcf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -547,11 +547,11 @@ static void dwmac4_set_filter(struct mac_device_info *hw, static void dwmac4_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, unsigned int pause_time, - u32 tx_cnt) + u8 tx_cnt) { void __iomem *ioaddr = hw->pcsr; unsigned int flow = 0; - u32 queue = 0; + u8 queue; pr_debug("GMAC Flow-Control:\n"); if (fc & FLOW_RX) { diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index efa76b147f9e..f02b434bbd50 100644 --- 
a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -355,10 +355,10 @@ static int dwxgmac2_host_mtl_irq_status(struct stmmac_priv *priv, static void dwxgmac2_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, unsigned int pause_time, - u32 tx_cnt) + u8 tx_cnt) { void __iomem *ioaddr = hw->pcsr; - u32 i; + u8 i; if (fc & FLOW_RX) writel(XGMAC_RFE, ioaddr + XGMAC_RX_FLOW_CTRL); diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 374f326efa01..010b4d32484a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -352,7 +352,7 @@ struct stmmac_ops { void (*set_filter)(struct mac_device_info *hw, struct net_device *dev); /* Flow control setting */ void (*flow_ctrl)(struct mac_device_info *hw, unsigned int duplex, - unsigned int fc, unsigned int pause_time, u32 tx_cnt); + unsigned int fc, unsigned int pause_time, u8 tx_cnt); /* Set power management mode (e.g. 
magic frame) */ void (*pmt)(struct mac_device_info *hw, unsigned long mode); /* Set/Get Unicast MAC addresses */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 335e60439b42..bba9bb9c95bf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -407,7 +407,7 @@ void stmmac_dvr_remove(struct device *dev); int stmmac_dvr_probe(struct device *device, struct plat_stmmacenet_data *plat_dat, struct stmmac_resources *res); -int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt); +int stmmac_reinit_queues(struct net_device *dev, u8 rx_cnt, u8 tx_cnt); int stmmac_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size); int stmmac_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 939431255fa5..11150bddd872 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -264,10 +264,10 @@ static void stmmac_verify_args(void) static void __stmmac_disable_all_queues(struct stmmac_priv *priv) { - u32 rx_queues_cnt = priv->plat->rx_queues_to_use; - u32 tx_queues_cnt = priv->plat->tx_queues_to_use; - u32 maxq = max(rx_queues_cnt, tx_queues_cnt); - u32 queue; + u8 rx_queues_cnt = priv->plat->rx_queues_to_use; + u8 tx_queues_cnt = priv->plat->tx_queues_to_use; + u8 maxq = max(rx_queues_cnt, tx_queues_cnt); + u8 queue; for (queue = 0; queue < maxq; queue++) { struct stmmac_channel *ch = &priv->channel[queue]; @@ -291,9 +291,9 @@ static void __stmmac_disable_all_queues(struct stmmac_priv *priv) */ static void stmmac_disable_all_queues(struct stmmac_priv *priv) { - u32 rx_queues_cnt = priv->plat->rx_queues_to_use; + u8 rx_queues_cnt = priv->plat->rx_queues_to_use; struct stmmac_rx_queue *rx_q; - u32 queue; + u8 queue; /* 
synchronize_rcu() needed for pending XDP buffers to drain */ for (queue = 0; queue < rx_queues_cnt; queue++) { @@ -313,10 +313,10 @@ static void stmmac_disable_all_queues(struct stmmac_priv *priv) */ static void stmmac_enable_all_queues(struct stmmac_priv *priv) { - u32 rx_queues_cnt = priv->plat->rx_queues_to_use; - u32 tx_queues_cnt = priv->plat->tx_queues_to_use; - u32 maxq = max(rx_queues_cnt, tx_queues_cnt); - u32 queue; + u8 rx_queues_cnt = priv->plat->rx_queues_to_use; + u8 tx_queues_cnt = priv->plat->tx_queues_to_use; + u8 maxq = max(rx_queues_cnt, tx_queues_cnt); + u8 queue; for (queue = 0; queue < maxq; queue++) { struct stmmac_channel *ch = &priv->channel[queue]; @@ -377,8 +377,8 @@ static inline u32 stmmac_rx_dirty(struct stmmac_priv *priv, u32 queue) static bool stmmac_eee_tx_busy(struct stmmac_priv *priv) { - u32 tx_cnt = priv->plat->tx_queues_to_use; - u32 queue; + u8 tx_cnt = priv->plat->tx_queues_to_use; + u8 queue; /* check if all TX queues have the work finished */ for (queue = 0; queue < tx_cnt; queue++) { @@ -909,7 +909,7 @@ static int stmmac_legacy_serdes_power_up(struct stmmac_priv *priv) static void stmmac_mac_flow_ctrl(struct stmmac_priv *priv, u32 duplex, unsigned int flow_ctrl) { - u32 tx_cnt = priv->plat->tx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; stmmac_flow_ctrl(priv, priv->hw, duplex, flow_ctrl, priv->pause_time, tx_cnt); @@ -1410,10 +1410,10 @@ static int stmmac_phylink_setup(struct stmmac_priv *priv) static void stmmac_display_rx_rings(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 rx_cnt = priv->plat->rx_queues_to_use; + u8 rx_cnt = priv->plat->rx_queues_to_use; unsigned int desc_size; void *head_rx; - u32 queue; + u8 queue; /* Display RX rings */ for (queue = 0; queue < rx_cnt; queue++) { @@ -1438,10 +1438,10 @@ static void stmmac_display_rx_rings(struct stmmac_priv *priv, static void stmmac_display_tx_rings(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 tx_cnt = 
priv->plat->tx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; unsigned int desc_size; void *head_tx; - u32 queue; + u8 queue; /* Display TX rings */ for (queue = 0; queue < tx_cnt; queue++) { @@ -1571,9 +1571,9 @@ static void stmmac_clear_tx_descriptors(struct stmmac_priv *priv, static void stmmac_clear_descriptors(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 rx_queue_cnt = priv->plat->rx_queues_to_use; - u32 tx_queue_cnt = priv->plat->tx_queues_to_use; - u32 queue; + u8 rx_queue_cnt = priv->plat->rx_queues_to_use; + u8 tx_queue_cnt = priv->plat->tx_queues_to_use; + u8 queue; /* Clear the RX descriptors */ for (queue = 0; queue < rx_queue_cnt; queue++) @@ -1891,7 +1891,7 @@ static int init_dma_rx_desc_rings(struct net_device *dev, gfp_t flags) { struct stmmac_priv *priv = netdev_priv(dev); - u32 rx_count = priv->plat->rx_queues_to_use; + u8 rx_count = priv->plat->rx_queues_to_use; int queue; int ret; @@ -1985,8 +1985,8 @@ static int init_dma_tx_desc_rings(struct net_device *dev, struct stmmac_dma_conf *dma_conf) { struct stmmac_priv *priv = netdev_priv(dev); - u32 tx_queue_cnt; - u32 queue; + u8 tx_queue_cnt; + u8 queue; tx_queue_cnt = priv->plat->tx_queues_to_use; @@ -2057,8 +2057,8 @@ static void dma_free_tx_skbufs(struct stmmac_priv *priv, */ static void stmmac_free_tx_skbufs(struct stmmac_priv *priv) { - u32 tx_queue_cnt = priv->plat->tx_queues_to_use; - u32 queue; + u8 tx_queue_cnt = priv->plat->tx_queues_to_use; + u8 queue; for (queue = 0; queue < tx_queue_cnt; queue++) dma_free_tx_skbufs(priv, &priv->dma_conf, queue); @@ -2106,8 +2106,8 @@ static void __free_dma_rx_desc_resources(struct stmmac_priv *priv, static void free_dma_rx_desc_resources(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 rx_count = priv->plat->rx_queues_to_use; - u32 queue; + u8 rx_count = priv->plat->rx_queues_to_use; + u8 queue; /* Free RX queue resources */ for (queue = 0; queue < rx_count; queue++) @@ -2153,8 +2153,8 @@ static 
void __free_dma_tx_desc_resources(struct stmmac_priv *priv, static void free_dma_tx_desc_resources(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 tx_count = priv->plat->tx_queues_to_use; - u32 queue; + u8 tx_count = priv->plat->tx_queues_to_use; + u8 queue; /* Free TX queue resources */ for (queue = 0; queue < tx_count; queue++) @@ -2255,8 +2255,8 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv, static int alloc_dma_rx_desc_resources(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 rx_count = priv->plat->rx_queues_to_use; - u32 queue; + u8 rx_count = priv->plat->rx_queues_to_use; + u8 queue; int ret; /* RX queues buffers and DMA */ @@ -2331,8 +2331,8 @@ static int __alloc_dma_tx_desc_resources(struct stmmac_priv *priv, static int alloc_dma_tx_desc_resources(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 tx_count = priv->plat->tx_queues_to_use; - u32 queue; + u8 tx_count = priv->plat->tx_queues_to_use; + u8 queue; int ret; /* TX queues buffers and DMA */ @@ -2396,8 +2396,8 @@ static void free_dma_desc_resources(struct stmmac_priv *priv, */ static void stmmac_mac_enable_rx_queues(struct stmmac_priv *priv) { - u32 rx_queues_count = priv->plat->rx_queues_to_use; - int queue; + u8 rx_queues_count = priv->plat->rx_queues_to_use; + u8 queue; u8 mode; for (queue = 0; queue < rx_queues_count; queue++) { @@ -2460,10 +2460,10 @@ static void stmmac_stop_tx_dma(struct stmmac_priv *priv, u32 chan) static void stmmac_enable_all_dma_irq(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; - u32 dma_csr_ch = max(rx_channels_count, tx_channels_count); - u32 chan; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; + u8 dma_csr_ch = max(rx_channels_count, tx_channels_count); + u8 chan; for (chan = 0; chan < dma_csr_ch; chan++) { struct stmmac_channel *ch = 
&priv->channel[chan]; @@ -2483,9 +2483,9 @@ static void stmmac_enable_all_dma_irq(struct stmmac_priv *priv) */ static void stmmac_start_all_dma(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; - u32 chan = 0; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; + u8 chan; for (chan = 0; chan < rx_channels_count; chan++) stmmac_start_rx_dma(priv, chan); @@ -2502,9 +2502,9 @@ static void stmmac_start_all_dma(struct stmmac_priv *priv) */ static void stmmac_stop_all_dma(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; - u32 chan = 0; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; + u8 chan; for (chan = 0; chan < rx_channels_count; chan++) stmmac_stop_rx_dma(priv, chan); @@ -2521,14 +2521,14 @@ static void stmmac_stop_all_dma(struct stmmac_priv *priv) */ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; int rxfifosz = priv->plat->rx_fifo_size; int txfifosz = priv->plat->tx_fifo_size; u32 txmode = 0; u32 rxmode = 0; - u32 chan = 0; u8 qmode = 0; + u8 chan; if (rxfifosz == 0) rxfifosz = priv->dma_cap.rx_fifo_size; @@ -3012,8 +3012,8 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode, { u8 rxqmode = priv->plat->rx_queues_cfg[chan].mode_to_use; u8 txqmode = priv->plat->tx_queues_cfg[chan].mode_to_use; - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = 
priv->plat->tx_queues_to_use; int rxfifosz = priv->plat->rx_fifo_size; int txfifosz = priv->plat->tx_fifo_size; @@ -3088,12 +3088,12 @@ static int stmmac_napi_check(struct stmmac_priv *priv, u32 chan, u32 dir) */ static void stmmac_dma_interrupt(struct stmmac_priv *priv) { - u32 tx_channel_count = priv->plat->tx_queues_to_use; - u32 rx_channel_count = priv->plat->rx_queues_to_use; - u32 channels_to_check = tx_channel_count > rx_channel_count ? - tx_channel_count : rx_channel_count; - u32 chan; + u8 tx_channel_count = priv->plat->tx_queues_to_use; + u8 rx_channel_count = priv->plat->rx_queues_to_use; + u8 channels_to_check = tx_channel_count > rx_channel_count ? + tx_channel_count : rx_channel_count; int status[MAX_T(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)]; + u8 chan; /* Make sure we never check beyond our status buffer. */ if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status))) @@ -3237,13 +3237,13 @@ static int stmmac_prereset_configure(struct stmmac_priv *priv) */ static int stmmac_init_dma_engine(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; - u32 dma_csr_ch = max(rx_channels_count, tx_channels_count); + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; + u8 dma_csr_ch = max(rx_channels_count, tx_channels_count); struct stmmac_rx_queue *rx_q; struct stmmac_tx_queue *tx_q; - u32 chan = 0; int ret = 0; + u8 chan; ret = stmmac_prereset_configure(priv); if (ret) @@ -3359,9 +3359,9 @@ static enum hrtimer_restart stmmac_tx_timer(struct hrtimer *t) */ static void stmmac_init_coalesce(struct stmmac_priv *priv) { - u32 tx_channel_count = priv->plat->tx_queues_to_use; - u32 rx_channel_count = priv->plat->rx_queues_to_use; - u32 chan; + u8 tx_channel_count = priv->plat->tx_queues_to_use; + u8 rx_channel_count = priv->plat->rx_queues_to_use; + u8 chan; for (chan = 0; chan < tx_channel_count; chan++) { struct 
stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[chan]; @@ -3378,9 +3378,9 @@ static void stmmac_init_coalesce(struct stmmac_priv *priv) static void stmmac_set_rings_length(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; - u32 chan; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; + u8 chan; /* set TX ring length */ for (chan = 0; chan < tx_channels_count; chan++) @@ -3400,9 +3400,9 @@ static void stmmac_set_rings_length(struct stmmac_priv *priv) */ static void stmmac_set_tx_queue_weight(struct stmmac_priv *priv) { - u32 tx_queues_count = priv->plat->tx_queues_to_use; + u8 tx_queues_count = priv->plat->tx_queues_to_use; u32 weight; - u32 queue; + u8 queue; for (queue = 0; queue < tx_queues_count; queue++) { weight = priv->plat->tx_queues_cfg[queue].weight; @@ -3417,9 +3417,9 @@ static void stmmac_set_tx_queue_weight(struct stmmac_priv *priv) */ static void stmmac_configure_cbs(struct stmmac_priv *priv) { - u32 tx_queues_count = priv->plat->tx_queues_to_use; + u8 tx_queues_count = priv->plat->tx_queues_to_use; u32 mode_to_use; - u32 queue; + u8 queue; /* queue 0 is reserved for legacy traffic */ for (queue = 1; queue < tx_queues_count; queue++) { @@ -3443,8 +3443,8 @@ static void stmmac_configure_cbs(struct stmmac_priv *priv) */ static void stmmac_rx_queue_dma_chan_map(struct stmmac_priv *priv) { - u32 rx_queues_count = priv->plat->rx_queues_to_use; - u32 queue; + u8 rx_queues_count = priv->plat->rx_queues_to_use; + u8 queue; u32 chan; for (queue = 0; queue < rx_queues_count; queue++) { @@ -3460,8 +3460,8 @@ static void stmmac_rx_queue_dma_chan_map(struct stmmac_priv *priv) */ static void stmmac_mac_config_rx_queues_prio(struct stmmac_priv *priv) { - u32 rx_queues_count = priv->plat->rx_queues_to_use; - u32 queue; + u8 rx_queues_count = priv->plat->rx_queues_to_use; + u8 queue; u32 prio; for (queue = 0; queue < 
rx_queues_count; queue++) { @@ -3480,8 +3480,8 @@ static void stmmac_mac_config_rx_queues_prio(struct stmmac_priv *priv) */ static void stmmac_mac_config_tx_queues_prio(struct stmmac_priv *priv) { - u32 tx_queues_count = priv->plat->tx_queues_to_use; - u32 queue; + u8 tx_queues_count = priv->plat->tx_queues_to_use; + u8 queue; u32 prio; for (queue = 0; queue < tx_queues_count; queue++) { @@ -3500,9 +3500,9 @@ static void stmmac_mac_config_tx_queues_prio(struct stmmac_priv *priv) */ static void stmmac_mac_config_rx_queues_routing(struct stmmac_priv *priv) { - u32 rx_queues_count = priv->plat->rx_queues_to_use; - u32 queue; + u8 rx_queues_count = priv->plat->rx_queues_to_use; u8 packet; + u8 queue; for (queue = 0; queue < rx_queues_count; queue++) { /* no specific packet type routing specified for the queue */ @@ -3537,8 +3537,8 @@ static void stmmac_mac_config_rss(struct stmmac_priv *priv) */ static void stmmac_mtl_configuration(struct stmmac_priv *priv) { - u32 rx_queues_count = priv->plat->rx_queues_to_use; - u32 tx_queues_count = priv->plat->tx_queues_to_use; + u8 rx_queues_count = priv->plat->rx_queues_to_use; + u8 tx_queues_count = priv->plat->tx_queues_to_use; if (tx_queues_count > 1) stmmac_set_tx_queue_weight(priv); @@ -3606,10 +3606,10 @@ static void stmmac_safety_feat_configuration(struct stmmac_priv *priv) static int stmmac_hw_setup(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 rx_cnt = priv->plat->rx_queues_to_use; - u32 tx_cnt = priv->plat->tx_queues_to_use; + u8 rx_cnt = priv->plat->rx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; bool sph_en; - u32 chan; + u8 chan; int ret; /* Make sure RX clock is enabled */ @@ -4001,7 +4001,8 @@ static struct stmmac_dma_conf * stmmac_setup_dma_desc(struct stmmac_priv *priv, unsigned int mtu) { struct stmmac_dma_conf *dma_conf; - int chan, bfsize, ret; + int bfsize, ret; + u8 chan; dma_conf = kzalloc_obj(*dma_conf); if (!dma_conf) { @@ -4076,7 +4077,7 @@ static int 
__stmmac_open(struct net_device *dev, struct stmmac_dma_conf *dma_conf) { struct stmmac_priv *priv = netdev_priv(dev); - u32 chan; + u8 chan; int ret; for (int i = 0; i < MTL_MAX_TX_QUEUES; i++) @@ -4175,7 +4176,7 @@ err_dma_resources: static void __stmmac_release(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 chan; + u8 chan; /* Stop and disconnect the PHY */ phylink_stop(priv->phylink); @@ -6123,7 +6124,7 @@ static int stmmac_set_features(struct net_device *netdev, if (priv->sph_capable) { bool sph_en = (priv->hw->rx_csum > 0) && priv->sph_active; - u32 chan; + u8 chan; for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++) stmmac_enable_sph(priv, priv->ioaddr, sph_en, chan); @@ -6143,11 +6144,11 @@ static int stmmac_set_features(struct net_device *netdev, static void stmmac_common_interrupt(struct stmmac_priv *priv) { - u32 rx_cnt = priv->plat->rx_queues_to_use; - u32 tx_cnt = priv->plat->tx_queues_to_use; - u32 queues_count; - u32 queue; + u8 rx_cnt = priv->plat->rx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; + u8 queues_count; bool xmac; + u8 queue; xmac = dwmac_is_xmac(priv->plat->core_type); queues_count = (rx_cnt > tx_cnt) ? 
rx_cnt : tx_cnt; @@ -6445,9 +6446,9 @@ static int stmmac_rings_status_show(struct seq_file *seq, void *v) { struct net_device *dev = seq->private; struct stmmac_priv *priv = netdev_priv(dev); - u32 rx_count = priv->plat->rx_queues_to_use; - u32 tx_count = priv->plat->tx_queues_to_use; - u32 queue; + u8 rx_count = priv->plat->rx_queues_to_use; + u8 tx_count = priv->plat->tx_queues_to_use; + u8 queue; if ((dev->flags & IFF_UP) == 0) return 0; @@ -6572,9 +6573,9 @@ static int stmmac_dma_cap_show(struct seq_file *seq, void *v) priv->dma_cap.number_rx_channel); seq_printf(seq, "\tNumber of Additional TX channel: %d\n", priv->dma_cap.number_tx_channel); - seq_printf(seq, "\tNumber of Additional RX queues: %d\n", + seq_printf(seq, "\tNumber of Additional RX queues: %u\n", priv->dma_cap.number_rx_queues); - seq_printf(seq, "\tNumber of Additional TX queues: %d\n", + seq_printf(seq, "\tNumber of Additional TX queues: %u\n", priv->dma_cap.number_tx_queues); seq_printf(seq, "\tEnhanced descriptors: %s\n", (priv->dma_cap.enh_desc) ? 
"Y" : "N"); @@ -7043,7 +7044,7 @@ void stmmac_enable_tx_queue(struct stmmac_priv *priv, u32 queue) void stmmac_xdp_release(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 chan; + u8 chan; /* Ensure tx function is not running */ netif_tx_disable(dev); @@ -7076,14 +7077,14 @@ void stmmac_xdp_release(struct net_device *dev) int stmmac_xdp_open(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 rx_cnt = priv->plat->rx_queues_to_use; - u32 tx_cnt = priv->plat->tx_queues_to_use; - u32 dma_csr_ch = max(rx_cnt, tx_cnt); + u8 rx_cnt = priv->plat->rx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; + u8 dma_csr_ch = max(rx_cnt, tx_cnt); struct stmmac_rx_queue *rx_q; struct stmmac_tx_queue *tx_q; u32 buf_size; bool sph_en; - u32 chan; + u8 chan; int ret; ret = alloc_dma_desc_resources(priv, &priv->dma_conf); @@ -7219,10 +7220,10 @@ int stmmac_xsk_wakeup(struct net_device *dev, u32 queue, u32 flags) static void stmmac_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct stmmac_priv *priv = netdev_priv(dev); - u32 tx_cnt = priv->plat->tx_queues_to_use; - u32 rx_cnt = priv->plat->rx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; + u8 rx_cnt = priv->plat->rx_queues_to_use; unsigned int start; - int q; + u8 q; for (q = 0; q < tx_cnt; q++) { struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[q]; @@ -7511,7 +7512,7 @@ static int stmmac_hw_init(struct stmmac_priv *priv) static void stmmac_napi_add(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 queue, maxq; + u8 queue, maxq; maxq = max(priv->plat->rx_queues_to_use, priv->plat->tx_queues_to_use); @@ -7540,7 +7541,7 @@ static void stmmac_napi_add(struct net_device *dev) static void stmmac_napi_del(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 queue, maxq; + u8 queue, maxq; maxq = max(priv->plat->rx_queues_to_use, priv->plat->tx_queues_to_use); @@ -7558,7 +7559,7 @@ 
static void stmmac_napi_del(struct net_device *dev) } } -int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) +int stmmac_reinit_queues(struct net_device *dev, u8 rx_cnt, u8 tx_cnt) { struct stmmac_priv *priv = netdev_priv(dev); int ret = 0, i; @@ -7763,8 +7764,8 @@ static int __stmmac_dvr_probe(struct device *device, { struct net_device *ndev = NULL; struct stmmac_priv *priv; - u32 rxq; int i, ret = 0; + u8 rxq; if (!plat_dat->dma_cfg || !plat_dat->dma_cfg->pbl) { dev_err(device, "invalid DMA configuration\n"); @@ -8147,7 +8148,7 @@ int stmmac_suspend(struct device *dev) { struct net_device *ndev = dev_get_drvdata(dev); struct stmmac_priv *priv = netdev_priv(ndev); - u32 chan; + u8 chan; if (!ndev || !netif_running(ndev)) goto suspend_bsp; @@ -8222,9 +8223,9 @@ static void stmmac_reset_tx_queue(struct stmmac_priv *priv, u32 queue) */ static void stmmac_reset_queues_param(struct stmmac_priv *priv) { - u32 rx_cnt = priv->plat->rx_queues_to_use; - u32 tx_cnt = priv->plat->tx_queues_to_use; - u32 queue; + u8 rx_cnt = priv->plat->rx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; + u8 queue; for (queue = 0; queue < rx_cnt; queue++) stmmac_reset_rx_queue(priv, queue); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 0d3bad0f8915..3b514a702612 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -138,6 +138,7 @@ static int stmmac_mtl_setup(struct platform_device *pdev, struct device_node *tx_node; u8 queue = 0; int ret = 0; + u32 value; /* First Queue must always be in DCB mode. 
As MTL_QUEUE_DCB = 1 we need * to always set this, otherwise Queue will be classified as AVB @@ -157,8 +158,11 @@ static int stmmac_mtl_setup(struct platform_device *pdev, } /* Processing RX queues common config */ - of_property_read_u32(rx_node, "snps,rx-queues-to-use", - &plat->rx_queues_to_use); + if (!of_property_read_u32(rx_node, "snps,rx-queues-to-use", &value)) { + if (value > U8_MAX) + value = U8_MAX; + plat->rx_queues_to_use = value; + } if (of_property_read_bool(rx_node, "snps,rx-sched-sp")) plat->rx_sched_algorithm = MTL_RX_ALGORITHM_SP; @@ -208,8 +212,11 @@ static int stmmac_mtl_setup(struct platform_device *pdev, } /* Processing TX queues common config */ - of_property_read_u32(tx_node, "snps,tx-queues-to-use", - &plat->tx_queues_to_use); + if (!of_property_read_u32(tx_node, "snps,tx-queues-to-use", &value)) { + if (value > U8_MAX) + value = U8_MAX; + plat->tx_queues_to_use = value; + } if (of_property_read_bool(tx_node, "snps,tx-sched-wrr")) plat->tx_sched_algorithm = MTL_TX_ALGORITHM_WRR; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 411cdd3ea034..03fd85060a73 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -244,8 +244,8 @@ struct plat_stmmacenet_data { int tx_fifo_size; int rx_fifo_size; u32 host_dma_width; - u32 rx_queues_to_use; - u32 tx_queues_to_use; + u8 rx_queues_to_use; + u8 tx_queues_to_use; u8 rx_sched_algorithm; u8 tx_sched_algorithm; struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; -- cgit v1.2.3 From 758ed85aadd0668c66cb359c63f384992b10938c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:39 +0000 Subject: net: stmmac: use u8 for host_dma_width and similar struct members We aren't going to see >= 256-bit address busses soon, so reduce host_dma_width and associated other struct members that initialise this from u32 to u8. 
Signed-off-by: Russell King (Oracle) Acked-by: Mohd Ayaan Anwar # qcom-ethqos Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5P-0000000CVsK-0iwX@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c | 6 +++--- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 2 +- include/linux/stmmac.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index 9f5a15b81f8a..9d1bd72ffb73 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -42,8 +42,8 @@ struct imx_priv_data; struct imx_dwmac_ops { - u32 addr_width; u32 flags; + u8 addr_width; bool mac_rgmii_txclk_auto_adj; int (*fix_soc_reset)(struct stmmac_priv *priv); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index a139db6a8cbb..30ae0dba7fff 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -93,9 +93,9 @@ struct mediatek_dwmac_variant { const char * const *clk_list; int num_clks; - u32 dma_bit_mask; u32 rx_delay_max; u32 tx_delay_max; + u8 dma_bit_mask; }; /* list of clocks required for mac */ @@ -268,9 +268,9 @@ static const struct mediatek_dwmac_variant mt2712_gmac_variant = { .dwmac_set_delay = mt2712_set_delay, .clk_list = mt2712_dwmac_clk_l, .num_clks = ARRAY_SIZE(mt2712_dwmac_clk_l), - .dma_bit_mask = 33, .rx_delay_max = 17600, .tx_delay_max = 17600, + .dma_bit_mask = 33, }; static int mt8195_set_interface(struct mediatek_dwmac_plat_data *plat, @@ -418,9 +418,9 @@ static const struct mediatek_dwmac_variant mt8195_gmac_variant = { .dwmac_set_delay = mt8195_set_delay, .clk_list = mt8195_dwmac_clk_l, .num_clks = 
ARRAY_SIZE(mt8195_dwmac_clk_l), - .dma_bit_mask = 35, .rx_delay_max = 9280, .tx_delay_max = 9280, + .dma_bit_mask = 35, }; static int mediatek_dwmac_config_dt(struct mediatek_dwmac_plat_data *plat) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 388e9fdeb86c..3ccf20fdf52a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -91,8 +91,8 @@ struct ethqos_emac_driver_data { unsigned int num_rgmii_por; bool rgmii_config_loopback_en; bool has_emac_ge_3; + u8 dma_addr_width; const char *link_clk_name; - u32 dma_addr_width; struct dwmac4_addrs dwmac4_addrs; bool needs_sgmii_loopback; }; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 03fd85060a73..11886189bf51 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -243,7 +243,7 @@ struct plat_stmmacenet_data { int unicast_filter_entries; int tx_fifo_size; int rx_fifo_size; - u32 host_dma_width; + u8 host_dma_width; u8 rx_queues_to_use; u8 tx_queues_to_use; u8 rx_sched_algorithm; -- cgit v1.2.3 From 9fe167ab790b10c9eb9ef82f46a03c83f9953b61 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:44 +0000 Subject: net: stmmac: add documentation for stmmac_dma_cfg members Add documentation of each of the struct stmmac_dma_cfg members. dche remains undocumented as I don't have documentation that covers this. 
Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5U-0000000CVsQ-162V@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 11886189bf51..1af3a5e197c9 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -93,16 +93,37 @@ struct stmmac_mdio_bus_data { }; struct stmmac_dma_cfg { + /* pbl: programmable burst limit + * txpbl: transmit programmable burst limit + * rxpbl: receive programmable burst limit + * If txpbl or rxpbl are zero, the value of pbl will be substituted. + * Range 0 - 63. + */ int pbl; int txpbl; int rxpbl; + /* pblx8: multiplies pbl, txpbl, rxpbl by a factor of 8 for dwmac >= + * 3.50a, or a factor of 4 for previous versions. + */ bool pblx8; + /* fixed_burst: + * when set, AXI bursts defined by axi_blen_regval are permitted. + * AHB uses SINGLE, INCR4, INCR8 or INCR16 during burst transfers. + * when clear, AXI and AHB use SINGLE or INCR bursts. + */ bool fixed_burst; + /* mixed_burst: + * when set and fixed_burst is clear, AHB uses INCR for bursts > 16 + * and SINGLE or INCRx for bursts <= 16. + */ bool mixed_burst; + /* aal: address aligned bursts for AHB and AXI master interface */ bool aal; + bool dche; bool eame; + /* multi_msi_en: stmmac core internal */ bool multi_msi_en; - bool dche; + /* atds: stmmac core internal */ bool atds; }; -- cgit v1.2.3 From 315bab9411f3bd3465a47a64a3e44323bfab60be Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:49 +0000 Subject: net: stmmac: add documentation for clocks Add documentation covering stmmac_clk, pclk, clk_ptp_ref and clk_tx_i in the hope that this will help understand what each of these clocks are for. 
There is confusion around stmmac_clk and pclk which can't be easily resolved today as the Imagination Technologies Pistachio board that pclk was introduced for has no public documentation and is likely now obsolete. So the origins of pclk are lost to the winds of time. Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5Z-0000000CVsb-1XTm@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 1af3a5e197c9..937985276e6b 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -300,10 +300,41 @@ struct plat_stmmacenet_data { struct phylink_pcs *(*select_pcs)(struct stmmac_priv *priv, phy_interface_t interface); void *bsp_priv; + + /* stmmac clocks: + * stmmac_clk: CSR clock (which can be hclk_i, clk_csr_i, aclk_i, + * or clk_app_i depending on GMAC configuration). This clock + * generates the MDC clock. + * + * pclk: introduced for Imagination Technologies Pistachio board - + * see 5f9755d26fbf ("stmmac: Add an optional register interface + * clock"). This is probably used for cases where separate clocks + * are provided for the host interface and register interface. In + * this case, as the MDC clock is derived from stmmac_clk, pclk + * can only really be the "application clock" for the "host + * interface" and not the "register interface" aka CSR clock as + * it is never used when determining the divider for the MDC + * clock. + * + * clk_ptp_ref: optional PTP reference clock (clk_ptp_ref_i). When + * present, this clock increments the timestamp value. Otherwise, + * the rate of stmmac_clk will be used. + * + * clk_tx_i: MAC transmit clock, which will be 2.5MHz for 10M, + * 25MHz for 100M, or 125MHz for 1G irrespective of the interface + * mode. 
For the DWMAC PHY interface modes: + * + * GMII/MII PHY's transmit clock for 10M (2.5MHz) or 100M (25MHz), + * or 125MHz local clock for 1G mode + * RMII 50MHz RMII clock divided by 2 or 20. + * RGMII 125MHz local clock divided by 1, 5, or 50. + * SGMII 125MHz SerDes clock divided by 1, 5, or 50. + * TBI/RTBI 125MHz SerDes clock + */ struct clk *stmmac_clk; struct clk *pclk; struct clk *clk_ptp_ref; - struct clk *clk_tx_i; /* clk_tx_i to MAC core */ + struct clk *clk_tx_i; unsigned long clk_ptp_rate; unsigned long clk_ref_rate; struct clk_bulk_data *clks; -- cgit v1.2.3 From 7aba71dbc41641e43a79fb9f6fac91719094b4fb Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Thu, 5 Mar 2026 10:39:06 +0100 Subject: mm/mmu_notifier: Allow two-pass struct mmu_interval_notifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GPU use-cases for mmu_interval_notifiers with hmm often involve starting a gpu operation and then waiting for it to complete. These operations are typically context preemption or TLB flushing. With single-pass notifiers per GPU this doesn't scale in multi-gpu scenarios. In those scenarios we'd want to first start preemption- or TLB flushing on all GPUs and as a second pass wait for them to complete. One can do this on per-driver basis multiplexing per-driver notifiers but that would mean sharing the notifier "user" lock across all GPUs and that doesn't scale well either, so adding support for multi-pass in the core appears to be the right choice. Implement two-pass capability in the mmu_interval_notifier. Use a linked list for the final passes to minimize the impact for use-cases that don't need the multi-pass functionality by avoiding a second interval tree walk, and to be able to easily pass data between the two passes. 
v1: - Restrict to two passes (Jason Gunthorpe) - Improve on documentation (Jason Gunthorpe) - Improve on function naming (Alistair Popple) v2: - Include the invalidate_finish() callback in the struct mmu_interval_notifier_ops. - Update documentation (GitHub Copilot:claude-sonnet-4.6) - Use lockless list for list management. v3: - Update kerneldoc for the struct mmu_interval_notifier_finish::list member (Matthew Brost) - Add a WARN_ON_ONCE() checking for NULL invalidate_finish() op if if invalidate_start() is non-NULL. (Matthew Brost) v4: - Addressed documentation review comments by David Hildenbrand. Cc: Matthew Brost Cc: Christian König Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Simona Vetter Cc: Dave Airlie Cc: Alistair Popple Cc: Cc: Cc: Assisted-by: GitHub Copilot:claude-sonnet-4.6 # Documentation only. Signed-off-by: Thomas Hellström Acked-by: David Hildenbrand (Arm) Reviewed-by: Maarten Lankhorst Link: https://patch.msgid.link/20260305093909.43623-2-thomas.hellstrom@linux.intel.com --- include/linux/mmu_notifier.h | 42 ++++++++++++++++++++++++++++ mm/mmu_notifier.c | 65 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 98 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d1094c2d5fb6..b60673a8e0bb 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -233,16 +233,58 @@ struct mmu_notifier { unsigned int users; }; +/** + * struct mmu_interval_notifier_finish - mmu_interval_notifier two-pass abstraction + * @link: Lockless list link for the notifiers pending pass list + * @notifier: The mmu_interval_notifier for which the finish pass is called. + * + * Allocate, typically using GFP_NOWAIT in the interval notifier's start pass. 
+ * Note that with a large number of notifiers implementing two passes, + * allocation with GFP_NOWAIT will become increasingly likely to fail, so consider + * implementing a small pool instead of using kmalloc() allocations. + * + * If the implementation needs to pass data between the start and the finish passes, + * the recommended way is to embed struct mmu_interval_notifier_finish into a larger + * structure that also contains the data needed to be shared. Keep in mind that + * a notifier callback can be invoked in parallel, and each invocation needs its + * own struct mmu_interval_notifier_finish. + * + * If allocation fails, then the &mmu_interval_notifier_ops->invalidate_start op + * needs to implement the full notifier functionality. Please refer to its + * documentation. + */ +struct mmu_interval_notifier_finish { + struct llist_node link; + struct mmu_interval_notifier *notifier; +}; + /** * struct mmu_interval_notifier_ops * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. + * @invalidate_start: Similar to @invalidate, but intended for two-pass notifier + * callbacks where the call to @invalidate_start is the first + * pass and any struct mmu_interval_notifier_finish pointer + * returned in the @finish parameter describes the finish pass. + * If *@finish is %NULL on return, then no final pass will be + * called, and @invalidate_start needs to implement the full + * notifier, behaving like @invalidate. The value of *@finish + * is guaranteed to be %NULL at function entry. + * @invalidate_finish: Called as the second pass for any notifier that returned + * a non-NULL *@finish from @invalidate_start. The @finish + * pointer passed here is the same one returned by + * @invalidate_start. 
*/ struct mmu_interval_notifier_ops { bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); + bool (*invalidate_start)(struct mmu_interval_notifier *interval_sub, + const struct mmu_notifier_range *range, + unsigned long cur_seq, + struct mmu_interval_notifier_finish **finish); + void (*invalidate_finish)(struct mmu_interval_notifier_finish *finish); }; struct mmu_interval_notifier { diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8e0125dc0522..33023dbbd76d 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -260,6 +260,15 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub) } EXPORT_SYMBOL_GPL(mmu_interval_read_begin); +static void mn_itree_finish_pass(struct llist_head *finish_passes) +{ + struct llist_node *first = llist_reverse_order(__llist_del_all(finish_passes)); + struct mmu_interval_notifier_finish *f, *next; + + llist_for_each_entry_safe(f, next, first, link) + f->notifier->ops->invalidate_finish(f); +} + static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, struct mm_struct *mm) { @@ -271,6 +280,7 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, .end = ULONG_MAX, }; struct mmu_interval_notifier *interval_sub; + LLIST_HEAD(finish_passes); unsigned long cur_seq; bool ret; @@ -278,11 +288,27 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, mn_itree_inv_start_range(subscriptions, &range, &cur_seq); interval_sub; interval_sub = mn_itree_inv_next(interval_sub, &range)) { - ret = interval_sub->ops->invalidate(interval_sub, &range, - cur_seq); + if (interval_sub->ops->invalidate_start) { + struct mmu_interval_notifier_finish *finish = NULL; + + ret = interval_sub->ops->invalidate_start(interval_sub, + &range, + cur_seq, + &finish); + if (ret && finish) { + finish->notifier = interval_sub; + __llist_add(&finish->link, &finish_passes); + } + + } else { + ret = 
interval_sub->ops->invalidate(interval_sub, + &range, + cur_seq); + } WARN_ON(!ret); } + mn_itree_finish_pass(&finish_passes); mn_itree_inv_end(subscriptions); } @@ -430,7 +456,9 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, const struct mmu_notifier_range *range) { struct mmu_interval_notifier *interval_sub; + LLIST_HEAD(finish_passes); unsigned long cur_seq; + int err = 0; for (interval_sub = mn_itree_inv_start_range(subscriptions, range, &cur_seq); @@ -438,23 +466,41 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, interval_sub = mn_itree_inv_next(interval_sub, range)) { bool ret; - ret = interval_sub->ops->invalidate(interval_sub, range, - cur_seq); + if (interval_sub->ops->invalidate_start) { + struct mmu_interval_notifier_finish *finish = NULL; + + ret = interval_sub->ops->invalidate_start(interval_sub, + range, + cur_seq, + &finish); + if (ret && finish) { + finish->notifier = interval_sub; + __llist_add(&finish->link, &finish_passes); + } + + } else { + ret = interval_sub->ops->invalidate(interval_sub, + range, + cur_seq); + } if (!ret) { if (WARN_ON(mmu_notifier_range_blockable(range))) continue; - goto out_would_block; + err = -EAGAIN; + break; } } - return 0; -out_would_block: + mn_itree_finish_pass(&finish_passes); + /* * On -EAGAIN the non-blocking caller is not allowed to call * invalidate_range_end() */ - mn_itree_inv_end(subscriptions); - return -EAGAIN; + if (err) + mn_itree_inv_end(subscriptions); + + return err; } static int mn_hlist_invalidate_range_start( @@ -977,6 +1023,7 @@ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mmu_notifier_subscriptions *subscriptions; int ret; + WARN_ON_ONCE(ops->invalidate_start && !ops->invalidate_finish); might_lock(&mm->mmap_lock); subscriptions = smp_load_acquire(&mm->notifier_subscriptions); -- cgit v1.2.3 From a5ccec8fd10efa50a3fd4444915a2abff31f2535 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: 
Fri, 27 Feb 2026 23:17:11 -0800 Subject: vdso/datapage: Correct struct member kernel-doc Remove the "[]" array indicators from the struct member descriptions to avoid kernel-doc warnings. Warning: include/vdso/datapage.h:107 struct member 'basetime' not described in 'vdso_clock' Warning: include/vdso/datapage.h:107 struct member 'offset' not described in 'vdso_clock' Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260228071711.2663851-1-rdunlap@infradead.org --- include/vdso/datapage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 23c39b96190f..07c2e086d8f4 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -80,8 +80,8 @@ struct vdso_timestamp { * @mask: clocksource mask * @mult: clocksource multiplier * @shift: clocksource shift - * @basetime[clock_id]: basetime per clock_id - * @offset[clock_id]: time namespace offset per clock_id + * @basetime: basetime per clock_id + * @offset: time namespace offset per clock_id * * See also struct vdso_time_data for basic access and ordering information as * struct vdso_clock is used there. -- cgit v1.2.3 From 62357a5888ea6ef81f718eee20ad962a1101fb96 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 2 Mar 2026 08:58:41 +0100 Subject: asm-generic/bitsperlong.h: Add sanity checks for __BITS_PER_LONG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value of __BITS_PER_LONG from architecture-specific logic should always match the generic one if that is available. It should also match the actual C type 'long'. Mismatches can happen for example when building the compat vDSO. Either during the compilation, see commit 9a6d3ff10f7f ("arm64: uapi: Provide correct __BITS_PER_LONG for the compat vDSO"), or when running sparse when mismatched CHECKFLAGS are inherited from the kernel build. 
Add some consistency checks which detect such issues early and clearly. The kernel-internal BITS_PER_LONG is not checked as it is derived from CONFIG_64BIT and therefore breaks for the compat vDSO. See the similar, deactivated check above. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Arnd Bergmann Link: https://patch.msgid.link/20260302-vdso-compat-checkflags-v2-5-78e55baa58ba@linutronix.de --- include/asm-generic/bitsperlong.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/asm-generic/bitsperlong.h b/include/asm-generic/bitsperlong.h index 1023e2a4bd37..90e8aeebfd2f 100644 --- a/include/asm-generic/bitsperlong.h +++ b/include/asm-generic/bitsperlong.h @@ -19,6 +19,15 @@ #error Inconsistent word size. Check asm/bitsperlong.h #endif +#if __CHAR_BIT__ * __SIZEOF_LONG__ != __BITS_PER_LONG +#error Inconsistent word size. Check asm/bitsperlong.h +#endif + +#ifndef __ASSEMBLER__ +_Static_assert(sizeof(long) * 8 == __BITS_PER_LONG, + "Inconsistent word size. Check asm/bitsperlong.h"); +#endif + #ifndef BITS_PER_LONG_LONG #define BITS_PER_LONG_LONG 64 #endif -- cgit v1.2.3 From 05988dba11791ccbb458254484826b32f17f4ad2 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 4 Mar 2026 08:49:00 +0100 Subject: vdso/datastore: Allocate data pages dynamically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allocating the data pages as part of the kernel image does not work on SPARC. The MMU will raise a fault when userspace tries to access them. Allocate the data pages through the page allocator instead. Unused pages in the vDSO VMA are still allocated to keep the virtual addresses aligned. Switch the mapping from PFNs to 'struct page' as that is required for dynamically allocated pages. This also aligns the allocation of the datapages with the code pages and is a prerequisite for mlockall() support. 
VM_MIXEDMAP is necessary for the call to vmf_insert_page() in the timens prefault path to work. The data pages need to be order-0, non-compound pages so that the mapping to userspace and the different orderings work. These pages are also used by the timekeeping, random pool and architecture initialization code. Some of these are running before the page allocator is available. To keep these subsystems working without changes, introduce early, static data storage which will then be replaced by the real one as soon as that is available. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Reviewed-by: Christophe Leroy (CS GROUP) Link: https://patch.msgid.link/20260304-vdso-sparc64-generic-2-v6-3-d8eb3b0e1410@linutronix.de --- include/linux/vdso_datastore.h | 6 +++ init/main.c | 2 + lib/vdso/datastore.c | 92 +++++++++++++++++++++++++++--------------- 3 files changed, 68 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/vdso_datastore.h b/include/linux/vdso_datastore.h index a91fa24b06e0..0b530428db71 100644 --- a/include/linux/vdso_datastore.h +++ b/include/linux/vdso_datastore.h @@ -2,9 +2,15 @@ #ifndef _LINUX_VDSO_DATASTORE_H #define _LINUX_VDSO_DATASTORE_H +#ifdef CONFIG_HAVE_GENERIC_VDSO #include extern const struct vm_special_mapping vdso_vvar_mapping; struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr); +void __init vdso_setup_data_pages(void); +#else /* !CONFIG_HAVE_GENERIC_VDSO */ +static inline void vdso_setup_data_pages(void) { } +#endif /* CONFIG_HAVE_GENERIC_VDSO */ + #endif /* _LINUX_VDSO_DATASTORE_H */ diff --git a/init/main.c b/init/main.c index 1cb395dd94e4..de867b2693d2 100644 --- a/init/main.c +++ b/init/main.c @@ -105,6 +105,7 @@ #include #include #include +#include #include #include @@ -1119,6 +1120,7 @@ void start_kernel(void) srcu_init(); hrtimers_init(); softirq_init(); + vdso_setup_data_pages(); timekeeping_init(); time_init(); diff --git a/lib/vdso/datastore.c 
b/lib/vdso/datastore.c index 7377fcb6e1df..faebf5b7cd6e 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -1,52 +1,79 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include +#include #include #include #include #include #include -/* - * The vDSO data page. - */ +static u8 vdso_initdata[VDSO_NR_PAGES * PAGE_SIZE] __aligned(PAGE_SIZE) __initdata = {}; + #ifdef CONFIG_GENERIC_GETTIMEOFDAY -static union { - struct vdso_time_data data; - u8 page[PAGE_SIZE]; -} vdso_time_data_store __page_aligned_data; -struct vdso_time_data *vdso_k_time_data = &vdso_time_data_store.data; -static_assert(sizeof(vdso_time_data_store) == PAGE_SIZE); +struct vdso_time_data *vdso_k_time_data __refdata = + (void *)&vdso_initdata[VDSO_TIME_PAGE_OFFSET * PAGE_SIZE]; + +static_assert(sizeof(struct vdso_time_data) <= PAGE_SIZE); #endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #ifdef CONFIG_VDSO_GETRANDOM -static union { - struct vdso_rng_data data; - u8 page[PAGE_SIZE]; -} vdso_rng_data_store __page_aligned_data; -struct vdso_rng_data *vdso_k_rng_data = &vdso_rng_data_store.data; -static_assert(sizeof(vdso_rng_data_store) == PAGE_SIZE); +struct vdso_rng_data *vdso_k_rng_data __refdata = + (void *)&vdso_initdata[VDSO_RNG_PAGE_OFFSET * PAGE_SIZE]; + +static_assert(sizeof(struct vdso_rng_data) <= PAGE_SIZE); #endif /* CONFIG_VDSO_GETRANDOM */ #ifdef CONFIG_ARCH_HAS_VDSO_ARCH_DATA -static union { - struct vdso_arch_data data; - u8 page[VDSO_ARCH_DATA_SIZE]; -} vdso_arch_data_store __page_aligned_data; -struct vdso_arch_data *vdso_k_arch_data = &vdso_arch_data_store.data; +struct vdso_arch_data *vdso_k_arch_data __refdata = + (void *)&vdso_initdata[VDSO_ARCH_PAGES_START * PAGE_SIZE]; #endif /* CONFIG_ARCH_HAS_VDSO_ARCH_DATA */ +void __init vdso_setup_data_pages(void) +{ + unsigned int order = get_order(VDSO_NR_PAGES * PAGE_SIZE); + struct page *pages; + + /* + * Allocate the data pages dynamically. SPARC does not support mapping + * static pages to be mapped into userspace. 
+ * It is also a requirement for mlockall() support. + * + * Do not use folios. In time namespaces the pages are mapped in a different order + * to userspace, which is not handled by the folio optimizations in finish_fault(). + */ + pages = alloc_pages(GFP_KERNEL, order); + if (!pages) + panic("Unable to allocate VDSO storage pages"); + + /* The pages are mapped one-by-one into userspace and each one needs to be refcounted. */ + split_page(pages, order); + + /* Move the data already written by other subsystems to the new pages */ + memcpy(page_address(pages), vdso_initdata, VDSO_NR_PAGES * PAGE_SIZE); + + if (IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)) + vdso_k_time_data = page_address(pages + VDSO_TIME_PAGE_OFFSET); + + if (IS_ENABLED(CONFIG_VDSO_GETRANDOM)) + vdso_k_rng_data = page_address(pages + VDSO_RNG_PAGE_OFFSET); + + if (IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA)) + vdso_k_arch_data = page_address(pages + VDSO_ARCH_PAGES_START); +} + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; + struct page *page, *timens_page; + + timens_page = find_timens_vvar_page(vma); switch (vmf->pgoff) { case VDSO_TIME_PAGE_OFFSET: if (!IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)) return VM_FAULT_SIGBUS; - pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + page = virt_to_page(vdso_k_time_data); if (timens_page) { /* * Fault in VVAR page too, since it will be accessed @@ -56,10 +83,10 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, vm_fault_t err; addr = vmf->address + VDSO_TIMENS_PAGE_OFFSET * PAGE_SIZE; - err = vmf_insert_pfn(vma, addr, pfn); + err = vmf_insert_page(vma, addr, page); if (unlikely(err & VM_FAULT_ERROR)) return err; - pfn = page_to_pfn(timens_page); + page = timens_page; } break; case VDSO_TIMENS_PAGE_OFFSET: @@ -72,24 +99,25 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, */ if 
(!IS_ENABLED(CONFIG_TIME_NS) || !timens_page) return VM_FAULT_SIGBUS; - pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + page = virt_to_page(vdso_k_time_data); break; case VDSO_RNG_PAGE_OFFSET: if (!IS_ENABLED(CONFIG_VDSO_GETRANDOM)) return VM_FAULT_SIGBUS; - pfn = __phys_to_pfn(__pa_symbol(vdso_k_rng_data)); + page = virt_to_page(vdso_k_rng_data); break; case VDSO_ARCH_PAGES_START ... VDSO_ARCH_PAGES_END: if (!IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA)) return VM_FAULT_SIGBUS; - pfn = __phys_to_pfn(__pa_symbol(vdso_k_arch_data)) + - vmf->pgoff - VDSO_ARCH_PAGES_START; + page = virt_to_page(vdso_k_arch_data) + vmf->pgoff - VDSO_ARCH_PAGES_START; break; default: return VM_FAULT_SIGBUS; } - return vmf_insert_pfn(vma, vmf->address, pfn); + get_page(page); + vmf->page = page; + return 0; } const struct vm_special_mapping vdso_vvar_mapping = { @@ -101,7 +129,7 @@ struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned { return _install_special_mapping(mm, addr, VDSO_NR_PAGES * PAGE_SIZE, VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | - VM_PFNMAP | VM_SEALED_SYSMAP, + VM_MIXEDMAP | VM_SEALED_SYSMAP, &vdso_vvar_mapping); } -- cgit v1.2.3 From c453b9abb4f422461c1493ef74d63af0961a2d30 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 4 Mar 2026 08:49:11 +0100 Subject: clocksource: Remove ARCH_CLOCKSOURCE_DATA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After sparc64, there are no remaining users of ARCH_CLOCKSOURCE_DATA and it can just be removed. 
Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Tested-by: Andreas Larsson Reviewed-by: Andreas Larsson Acked-by: John Stultz Link: https://patch.msgid.link/20260304-vdso-sparc64-generic-2-v6-14-d8eb3b0e1410@linutronix.de [Thomas: drop sparc64 bits from the patch] --- include/linux/clocksource.h | 6 +----- kernel/time/Kconfig | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 65b7c41471c3..12d853b18832 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -25,8 +25,7 @@ struct clocksource_base; struct clocksource; struct module; -#if defined(CONFIG_ARCH_CLOCKSOURCE_DATA) || \ - defined(CONFIG_GENERIC_GETTIMEOFDAY) +#if defined(CONFIG_GENERIC_GETTIMEOFDAY) #include #endif @@ -106,9 +105,6 @@ struct clocksource { u64 max_idle_ns; u32 maxadj; u32 uncertainty_margin; -#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA - struct arch_clocksource_data archdata; -#endif u64 max_cycles; u64 max_raw_delta; const char *name; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 7c6a52f7836c..fe3311877097 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -9,10 +9,6 @@ config CLOCKSOURCE_WATCHDOG bool -# Architecture has extra clocksource data -config ARCH_CLOCKSOURCE_DATA - bool - # Architecture has extra clocksource init called from registration config ARCH_CLOCKSOURCE_INIT bool -- cgit v1.2.3 From ed78b7b2c5ae679960469c0f679539c427e051ab Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 27 Feb 2026 07:43:21 +0100 Subject: vdso/gettimeofday: Add a helper to read the sequence lock of a time namespace aware clock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there are three different open-coded variants of a time namespace aware variant of vdso_read_begin(). 
They make the code hard to read and introduce an inconsistency, as only the first copy uses unlikely(). Split the code into a shared helper function. Move that next to the definition of the regular vdso_read_begin(), so that any future changes can be kept in sync easily. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260227-vdso-cleanups-v1-2-c848b4bc4850@linutronix.de --- include/vdso/helpers.h | 22 +++++++++++++++++++++ lib/vdso/gettimeofday.c | 51 +++++++++---------------------------------------- 2 files changed, 31 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 1a5ee9d9052c..9ccf6b53ef50 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -18,6 +18,28 @@ static __always_inline u32 vdso_read_begin(const struct vdso_clock *vc) return seq; } +/* + * Variant of vdso_read_begin() to handle VDSO_CLOCKMODE_TIMENS. + * + * Time namespace enabled tasks have a special VVAR page installed which has + * vc->seq set to 1 and vc->clock_mode set to VDSO_CLOCKMODE_TIMENS. For non + * time namespace affected tasks this does not affect performance because if + * vc->seq is odd, i.e. a concurrent update is in progress the extra check for + * vc->clock_mode is just a few extra instructions while spin waiting for + * vc->seq to become even again. 
+ */ +static __always_inline bool vdso_read_begin_timens(const struct vdso_clock *vc, u32 *seq) +{ + while (unlikely((*seq = READ_ONCE(vc->seq)) & 1)) { + if (IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + return true; + cpu_relax(); + } + smp_rmb(); + + return false; +} + static __always_inline u32 vdso_read_retry(const struct vdso_clock *vc, u32 start) { diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 4939ee86af65..e49369676928 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -158,24 +158,8 @@ bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, return false; do { - /* - * Open coded function vdso_read_begin() to handle - * VDSO_CLOCKMODE_TIMENS. Time namespace enabled tasks have a - * special VVAR page installed which has vc->seq set to 1 and - * vc->clock_mode set to VDSO_CLOCKMODE_TIMENS. For non time - * namespace affected tasks this does not affect performance - * because if vc->seq is odd, i.e. a concurrent update is in - * progress the extra check for vc->clock_mode is just a few - * extra instructions while spin waiting for vc->seq to become - * even again. - */ - while (unlikely((seq = READ_ONCE(vc->seq)) & 1)) { - if (IS_ENABLED(CONFIG_TIME_NS) && - vc->clock_mode == VDSO_CLOCKMODE_TIMENS) - return do_hres_timens(vd, vc, clk, ts); - cpu_relax(); - } - smp_rmb(); + if (vdso_read_begin_timens(vc, &seq)) + return do_hres_timens(vd, vc, clk, ts); if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns)) return false; @@ -223,17 +207,8 @@ bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, u32 seq; do { - /* - * Open coded function vdso_read_begin() to handle - * VDSO_CLOCK_TIMENS. See comment in do_hres(). 
- */ - while ((seq = READ_ONCE(vc->seq)) & 1) { - if (IS_ENABLED(CONFIG_TIME_NS) && - vc->clock_mode == VDSO_CLOCKMODE_TIMENS) - return do_coarse_timens(vd, vc, clk, ts); - cpu_relax(); - } - smp_rmb(); + if (vdso_read_begin_timens(vc, &seq)) + return do_coarse_timens(vd, vc, clk, ts); ts->tv_sec = vdso_ts->sec; ts->tv_nsec = vdso_ts->nsec; @@ -256,20 +231,12 @@ bool do_aux(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_ti vc = &vd->aux_clock_data[idx]; do { - /* - * Open coded function vdso_read_begin() to handle - * VDSO_CLOCK_TIMENS. See comment in do_hres(). - */ - while ((seq = READ_ONCE(vc->seq)) & 1) { - if (IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS) { - vd = __arch_get_vdso_u_timens_data(vd); - vc = &vd->aux_clock_data[idx]; - /* Re-read from the real time data page */ - continue; - } - cpu_relax(); + if (vdso_read_begin_timens(vc, &seq)) { + vd = __arch_get_vdso_u_timens_data(vd); + vc = &vd->aux_clock_data[idx]; + /* Re-read from the real time data page */ + continue; } - smp_rmb(); /* Auxclock disabled? */ if (vc->clock_mode == VDSO_CLOCKMODE_NONE) -- cgit v1.2.3 From 6a3e5eb3c51dbd01ca46c2c40a67bea1dd845cdb Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 2 Mar 2026 20:17:36 +0200 Subject: drm/intel: fix @dpt kernel-doc for parent interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the copy-paste fail. 
Reviewed-by: Jouni Högander Link: https://patch.msgid.link/0209e128312520ca1c6a0c39f9dfb0184125322a.1772475391.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- include/drm/intel/display_parent_interface.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index d02ab7cc1c92..6a88c8640683 100644 --- a/include/drm/intel/display_parent_interface.h +++ b/include/drm/intel/display_parent_interface.h @@ -166,7 +166,7 @@ struct intel_display_vma_interface { * check the optional pointers. */ struct intel_display_parent_interface { - /** @dsb: DPT interface. Optional. */ + /** @dpt: DPT interface. Optional. */ const struct intel_display_dpt_interface *dpt; /** @dsb: DSB buffer interface */ -- cgit v1.2.3 From 2cca25160d159e6351e3273b088db0b4f359ef6a Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 2 Mar 2026 20:17:37 +0200 Subject: drm/{i915, xe}/frontbuffer: move frontbuffer handling to parent interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the get/put/ref/flush_for_display calls to the display parent interface. For i915, move the hooks next to the other i915 core frontbuffer code in i915_gem_object_frontbuffer.c. For xe, add new file xe_frontbuffer.c for the same. Note: The intel_frontbuffer_flush() calls from i915_gem_object_frontbuffer.c will partially route back to i915 core via the parent interface. This is less than stellar. 
Reviewed-by: Jouni Högander Link: https://patch.msgid.link/f69b967ed82bbcfd60ffa77ba197b26a1399f09f.1772475391.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_bo.c | 36 ----------- drivers/gpu/drm/i915/display/intel_bo.h | 5 -- drivers/gpu/drm/i915/display/intel_frontbuffer.c | 12 ++-- drivers/gpu/drm/i915/display/intel_parent.c | 21 +++++++ drivers/gpu/drm/i915/display/intel_parent.h | 7 +++ .../gpu/drm/i915/gem/i915_gem_object_frontbuffer.c | 45 ++++++++++++++ .../gpu/drm/i915/gem/i915_gem_object_frontbuffer.h | 2 + drivers/gpu/drm/i915/i915_driver.c | 2 + drivers/gpu/drm/xe/Makefile | 1 + drivers/gpu/drm/xe/display/intel_bo.c | 56 ----------------- drivers/gpu/drm/xe/display/xe_display.c | 2 + drivers/gpu/drm/xe/display/xe_frontbuffer.c | 71 ++++++++++++++++++++++ drivers/gpu/drm/xe/display/xe_frontbuffer.h | 9 +++ include/drm/intel/display_parent_interface.h | 11 ++++ 14 files changed, 178 insertions(+), 102 deletions(-) create mode 100644 drivers/gpu/drm/xe/display/xe_frontbuffer.c create mode 100644 drivers/gpu/drm/xe/display/xe_frontbuffer.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/display/intel_bo.c b/drivers/gpu/drm/i915/display/intel_bo.c index 8f372b33d48b..2b6eaec351d8 100644 --- a/drivers/gpu/drm/i915/display/intel_bo.c +++ b/drivers/gpu/drm/i915/display/intel_bo.c @@ -45,42 +45,6 @@ int intel_bo_read_from_page(struct drm_gem_object *obj, u64 offset, void *dst, i return i915_gem_object_read_from_page(to_intel_bo(obj), offset, dst, size); } -struct intel_frontbuffer *intel_bo_frontbuffer_get(struct drm_gem_object *_obj) -{ - struct drm_i915_gem_object *obj = to_intel_bo(_obj); - struct i915_frontbuffer *front; - - front = i915_gem_object_frontbuffer_get(obj); - if (!front) - return NULL; - - return &front->base; -} - -void intel_bo_frontbuffer_ref(struct intel_frontbuffer *_front) -{ - struct i915_frontbuffer *front = - container_of(_front, typeof(*front), base); - - 
i915_gem_object_frontbuffer_ref(front); -} - -void intel_bo_frontbuffer_put(struct intel_frontbuffer *_front) -{ - struct i915_frontbuffer *front = - container_of(_front, typeof(*front), base); - - return i915_gem_object_frontbuffer_put(front); -} - -void intel_bo_frontbuffer_flush_for_display(struct intel_frontbuffer *_front) -{ - struct i915_frontbuffer *front = - container_of(_front, typeof(*front), base); - - i915_gem_object_flush_if_display(front->obj); -} - void intel_bo_describe(struct seq_file *m, struct drm_gem_object *obj) { i915_debugfs_describe_obj(m, to_intel_bo(obj)); diff --git a/drivers/gpu/drm/i915/display/intel_bo.h b/drivers/gpu/drm/i915/display/intel_bo.h index 516a3836a6bc..40390ed92ceb 100644 --- a/drivers/gpu/drm/i915/display/intel_bo.h +++ b/drivers/gpu/drm/i915/display/intel_bo.h @@ -20,11 +20,6 @@ int intel_bo_key_check(struct drm_gem_object *obj); int intel_bo_fb_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma); int intel_bo_read_from_page(struct drm_gem_object *obj, u64 offset, void *dst, int size); -struct intel_frontbuffer *intel_bo_frontbuffer_get(struct drm_gem_object *obj); -void intel_bo_frontbuffer_ref(struct intel_frontbuffer *front); -void intel_bo_frontbuffer_put(struct intel_frontbuffer *front); -void intel_bo_frontbuffer_flush_for_display(struct intel_frontbuffer *front); - void intel_bo_describe(struct seq_file *m, struct drm_gem_object *obj); #endif /* __INTEL_BO__ */ diff --git a/drivers/gpu/drm/i915/display/intel_frontbuffer.c b/drivers/gpu/drm/i915/display/intel_frontbuffer.c index 03c4978fa5ec..a355dc064528 100644 --- a/drivers/gpu/drm/i915/display/intel_frontbuffer.c +++ b/drivers/gpu/drm/i915/display/intel_frontbuffer.c @@ -58,13 +58,13 @@ #include #include -#include "intel_bo.h" #include "intel_display_trace.h" #include "intel_display_types.h" #include "intel_dp.h" #include "intel_drrs.h" #include "intel_fbc.h" #include "intel_frontbuffer.h" +#include "intel_parent.h" #include "intel_psr.h" #include 
"intel_tdf.h" @@ -150,7 +150,7 @@ void __intel_fb_flush(struct intel_frontbuffer *front, struct intel_display *display = front->display; if (origin == ORIGIN_DIRTYFB) - intel_bo_frontbuffer_flush_for_display(front); + intel_parent_frontbuffer_flush_for_display(display, front); if (origin == ORIGIN_CS) { spin_lock(&display->fb_tracking.lock); @@ -166,7 +166,7 @@ void __intel_fb_flush(struct intel_frontbuffer *front, static void intel_frontbuffer_ref(struct intel_frontbuffer *front) { - intel_bo_frontbuffer_ref(front); + intel_parent_frontbuffer_ref(front->display, front); } static void intel_frontbuffer_flush_work(struct work_struct *work) @@ -209,12 +209,14 @@ void intel_frontbuffer_fini(struct intel_frontbuffer *front) struct intel_frontbuffer *intel_frontbuffer_get(struct drm_gem_object *obj) { - return intel_bo_frontbuffer_get(obj); + struct intel_display *display = to_intel_display(obj->dev); + + return intel_parent_frontbuffer_get(display, obj); } void intel_frontbuffer_put(struct intel_frontbuffer *front) { - intel_bo_frontbuffer_put(front); + intel_parent_frontbuffer_put(front->display, front); } /** diff --git a/drivers/gpu/drm/i915/display/intel_parent.c b/drivers/gpu/drm/i915/display/intel_parent.c index 0c5962cb2f6d..2e3bad2b3e6b 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.c +++ b/drivers/gpu/drm/i915/display/intel_parent.c @@ -51,6 +51,27 @@ void intel_parent_dpt_resume(struct intel_display *display, struct intel_dpt *dp display->parent->dpt->resume(dpt); } +/* frontbuffer */ +struct intel_frontbuffer *intel_parent_frontbuffer_get(struct intel_display *display, struct drm_gem_object *obj) +{ + return display->parent->frontbuffer->get(obj); +} + +void intel_parent_frontbuffer_ref(struct intel_display *display, struct intel_frontbuffer *front) +{ + display->parent->frontbuffer->ref(front); +} + +void intel_parent_frontbuffer_put(struct intel_display *display, struct intel_frontbuffer *front) +{ + display->parent->frontbuffer->put(front); +} + 
+void intel_parent_frontbuffer_flush_for_display(struct intel_display *display, struct intel_frontbuffer *front) +{ + display->parent->frontbuffer->flush_for_display(front); +} + /* hdcp */ ssize_t intel_parent_hdcp_gsc_msg_send(struct intel_display *display, struct intel_hdcp_gsc_context *gsc_context, diff --git a/drivers/gpu/drm/i915/display/intel_parent.h b/drivers/gpu/drm/i915/display/intel_parent.h index 6e7d09133aee..2013e5ed5aa9 100644 --- a/drivers/gpu/drm/i915/display/intel_parent.h +++ b/drivers/gpu/drm/i915/display/intel_parent.h @@ -13,6 +13,7 @@ struct drm_scanout_buffer; struct i915_vma; struct intel_display; struct intel_dpt; +struct intel_frontbuffer; struct intel_hdcp_gsc_context; struct intel_panic; struct intel_stolen_node; @@ -24,6 +25,12 @@ void intel_parent_dpt_destroy(struct intel_display *display, struct intel_dpt *d void intel_parent_dpt_suspend(struct intel_display *display, struct intel_dpt *dpt); void intel_parent_dpt_resume(struct intel_display *display, struct intel_dpt *dpt); +/* frontbuffer */ +struct intel_frontbuffer *intel_parent_frontbuffer_get(struct intel_display *display, struct drm_gem_object *obj); +void intel_parent_frontbuffer_ref(struct intel_display *display, struct intel_frontbuffer *front); +void intel_parent_frontbuffer_put(struct intel_display *display, struct intel_frontbuffer *front); +void intel_parent_frontbuffer_flush_for_display(struct intel_display *display, struct intel_frontbuffer *front); + /* hdcp */ ssize_t intel_parent_hdcp_gsc_msg_send(struct intel_display *display, struct intel_hdcp_gsc_context *gsc_context, diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_frontbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_object_frontbuffer.c index cf0b66eaf11b..f885c4fb1326 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_frontbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_frontbuffer.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: MIT /* Copyright © 2025 Intel Corporation */ +#include + #include 
"i915_drv.h" #include "i915_gem_object_frontbuffer.h" @@ -125,3 +127,46 @@ void __i915_gem_object_frontbuffer_invalidate(struct drm_i915_gem_object *obj, i915_gem_object_frontbuffer_put(front); } } + +static struct intel_frontbuffer *i915_frontbuffer_get(struct drm_gem_object *_obj) +{ + struct drm_i915_gem_object *obj = to_intel_bo(_obj); + struct i915_frontbuffer *front; + + front = i915_gem_object_frontbuffer_get(obj); + if (!front) + return NULL; + + return &front->base; +} + +static void i915_frontbuffer_ref(struct intel_frontbuffer *_front) +{ + struct i915_frontbuffer *front = + container_of(_front, typeof(*front), base); + + i915_gem_object_frontbuffer_ref(front); +} + +static void i915_frontbuffer_put(struct intel_frontbuffer *_front) +{ + struct i915_frontbuffer *front = + container_of(_front, typeof(*front), base); + + return i915_gem_object_frontbuffer_put(front); +} + +static void i915_frontbuffer_flush_for_display(struct intel_frontbuffer *_front) +{ + struct i915_frontbuffer *front = + container_of(_front, typeof(*front), base); + + i915_gem_object_flush_if_display(front->obj); +} + +const struct intel_display_frontbuffer_interface i915_display_frontbuffer_interface = { + .get = i915_frontbuffer_get, + .ref = i915_frontbuffer_ref, + .put = i915_frontbuffer_put, + .flush_for_display = i915_frontbuffer_flush_for_display, +}; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_frontbuffer.h b/drivers/gpu/drm/i915/gem/i915_gem_object_frontbuffer.h index 46124048a59f..9c6d91f21c19 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_frontbuffer.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_frontbuffer.h @@ -91,4 +91,6 @@ i915_gem_object_frontbuffer_lookup(const struct drm_i915_gem_object *obj) return front; } +extern const struct intel_display_frontbuffer_interface i915_display_frontbuffer_interface; + #endif diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index 79ded07b5db5..7a8c59a8c865 100644 --- 
a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -78,6 +78,7 @@ #include "gem/i915_gem_dmabuf.h" #include "gem/i915_gem_ioctls.h" #include "gem/i915_gem_mman.h" +#include "gem/i915_gem_object_frontbuffer.h" #include "gem/i915_gem_pm.h" #include "gt/intel_gt.h" #include "gt/intel_gt_pm.h" @@ -766,6 +767,7 @@ static bool vgpu_active(struct drm_device *drm) static const struct intel_display_parent_interface parent = { .dpt = &i915_display_dpt_interface, .dsb = &i915_display_dsb_interface, + .frontbuffer = &i915_display_frontbuffer_interface, .hdcp = &i915_display_hdcp_interface, .initial_plane = &i915_display_initial_plane_interface, .irq = &i915_display_irq_interface, diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index c4fb9f13371a..7c5bb6e8fb8d 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -218,6 +218,7 @@ xe-$(CONFIG_DRM_XE_DISPLAY) += \ display/xe_display_wa.o \ display/xe_dsb_buffer.o \ display/xe_fb_pin.o \ + display/xe_frontbuffer.o \ display/xe_hdcp_gsc.o \ display/xe_initial_plane.o \ display/xe_panic.o \ diff --git a/drivers/gpu/drm/xe/display/intel_bo.c b/drivers/gpu/drm/xe/display/intel_bo.c index 05d5e5c0a0de..fa1f2c796b81 100644 --- a/drivers/gpu/drm/xe/display/intel_bo.c +++ b/drivers/gpu/drm/xe/display/intel_bo.c @@ -47,62 +47,6 @@ int intel_bo_read_from_page(struct drm_gem_object *obj, u64 offset, void *dst, i return xe_bo_read(bo, offset, dst, size); } -struct xe_frontbuffer { - struct intel_frontbuffer base; - struct drm_gem_object *obj; - struct kref ref; -}; - -struct intel_frontbuffer *intel_bo_frontbuffer_get(struct drm_gem_object *obj) -{ - struct xe_frontbuffer *front; - - front = kmalloc_obj(*front); - if (!front) - return NULL; - - intel_frontbuffer_init(&front->base, obj->dev); - - kref_init(&front->ref); - - drm_gem_object_get(obj); - front->obj = obj; - - return &front->base; -} - -void intel_bo_frontbuffer_ref(struct intel_frontbuffer *_front) -{ - 
struct xe_frontbuffer *front = - container_of(_front, typeof(*front), base); - - kref_get(&front->ref); -} - -static void frontbuffer_release(struct kref *ref) -{ - struct xe_frontbuffer *front = - container_of(ref, typeof(*front), ref); - - intel_frontbuffer_fini(&front->base); - - drm_gem_object_put(front->obj); - - kfree(front); -} - -void intel_bo_frontbuffer_put(struct intel_frontbuffer *_front) -{ - struct xe_frontbuffer *front = - container_of(_front, typeof(*front), base); - - kref_put(&front->ref, frontbuffer_release); -} - -void intel_bo_frontbuffer_flush_for_display(struct intel_frontbuffer *front) -{ -} - void intel_bo_describe(struct seq_file *m, struct drm_gem_object *obj) { /* FIXME */ diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index c8dd3faa9b97..f1e1889a52d3 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -38,6 +38,7 @@ #include "xe_display_pcode.h" #include "xe_display_rpm.h" #include "xe_dsb_buffer.h" +#include "xe_frontbuffer.h" #include "xe_hdcp_gsc.h" #include "xe_initial_plane.h" #include "xe_module.h" @@ -541,6 +542,7 @@ static const struct intel_display_irq_interface xe_display_irq_interface = { static const struct intel_display_parent_interface parent = { .dsb = &xe_display_dsb_interface, + .frontbuffer = &xe_display_frontbuffer_interface, .hdcp = &xe_display_hdcp_interface, .initial_plane = &xe_display_initial_plane_interface, .irq = &xe_display_irq_interface, diff --git a/drivers/gpu/drm/xe/display/xe_frontbuffer.c b/drivers/gpu/drm/xe/display/xe_frontbuffer.c new file mode 100644 index 000000000000..113fc017ee94 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_frontbuffer.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: MIT +/* Copyright © 2026 Intel Corporation */ + +#include +#include + +#include "intel_frontbuffer.h" +#include "xe_frontbuffer.h" + +struct xe_frontbuffer { + struct intel_frontbuffer base; + struct drm_gem_object 
*obj; + struct kref ref; +}; + +static struct intel_frontbuffer *xe_frontbuffer_get(struct drm_gem_object *obj) +{ + struct xe_frontbuffer *front; + + front = kmalloc_obj(*front); + if (!front) + return NULL; + + intel_frontbuffer_init(&front->base, obj->dev); + + kref_init(&front->ref); + + drm_gem_object_get(obj); + front->obj = obj; + + return &front->base; +} + +static void xe_frontbuffer_ref(struct intel_frontbuffer *_front) +{ + struct xe_frontbuffer *front = + container_of(_front, typeof(*front), base); + + kref_get(&front->ref); +} + +static void frontbuffer_release(struct kref *ref) +{ + struct xe_frontbuffer *front = + container_of(ref, typeof(*front), ref); + + intel_frontbuffer_fini(&front->base); + + drm_gem_object_put(front->obj); + + kfree(front); +} + +static void xe_frontbuffer_put(struct intel_frontbuffer *_front) +{ + struct xe_frontbuffer *front = + container_of(_front, typeof(*front), base); + + kref_put(&front->ref, frontbuffer_release); +} + +static void xe_frontbuffer_flush_for_display(struct intel_frontbuffer *front) +{ +} + +const struct intel_display_frontbuffer_interface xe_display_frontbuffer_interface = { + .get = xe_frontbuffer_get, + .ref = xe_frontbuffer_ref, + .put = xe_frontbuffer_put, + .flush_for_display = xe_frontbuffer_flush_for_display, +}; diff --git a/drivers/gpu/drm/xe/display/xe_frontbuffer.h b/drivers/gpu/drm/xe/display/xe_frontbuffer.h new file mode 100644 index 000000000000..6b4f59b42ade --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_frontbuffer.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef _XE_FRONTBUFFER_H_ +#define _XE_FRONTBUFFER_H_ + +extern const struct intel_display_frontbuffer_interface xe_display_frontbuffer_interface; + +#endif diff --git a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index 6a88c8640683..c044472b9400 100644 --- a/include/drm/intel/display_parent_interface.h +++ 
b/include/drm/intel/display_parent_interface.h @@ -17,6 +17,7 @@ struct drm_scanout_buffer; struct i915_vma; struct intel_dpt; struct intel_dsb_buffer; +struct intel_frontbuffer; struct intel_hdcp_gsc_context; struct intel_initial_plane_config; struct intel_panic; @@ -42,6 +43,13 @@ struct intel_display_dsb_interface { void (*flush_map)(struct intel_dsb_buffer *dsb_buf); }; +struct intel_display_frontbuffer_interface { + struct intel_frontbuffer *(*get)(struct drm_gem_object *obj); + void (*ref)(struct intel_frontbuffer *front); + void (*put)(struct intel_frontbuffer *front); + void (*flush_for_display)(struct intel_frontbuffer *front); +}; + struct intel_display_hdcp_interface { ssize_t (*gsc_msg_send)(struct intel_hdcp_gsc_context *gsc_context, void *msg_in, size_t msg_in_len, @@ -172,6 +180,9 @@ struct intel_display_parent_interface { /** @dsb: DSB buffer interface */ const struct intel_display_dsb_interface *dsb; + /** @frontbuffer: Frontbuffer interface */ + const struct intel_display_frontbuffer_interface *frontbuffer; + /** @hdcp: HDCP GSC interface */ const struct intel_display_hdcp_interface *hdcp; -- cgit v1.2.3 From 282b8eec8a4eab9a3ff3addf6dad2ce699594fe8 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 10 Feb 2026 13:22:08 +0100 Subject: net: cdc-ncm: cleanup device descriptor Flags are boolean values, hence they should be typed as bool, not as u8. 
Signed-off-by: Oliver Neukum Link: https://patch.msgid.link/20260210122208.29244-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/cdc_ncm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 4ac082a63173..97ef37a1ff4a 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -118,8 +118,8 @@ struct cdc_ncm_ctx { u32 timer_interval; u32 max_ndp_size; - u8 is_ndp16; - u8 filtering_supported; + bool is_ndp16; + bool filtering_supported; union { struct usb_cdc_ncm_ndp16 *delayed_ndp16; struct usb_cdc_ncm_ndp32 *delayed_ndp32; -- cgit v1.2.3 From a657bebd7f02d0ec1ddb08c1d8c572fe1e187f9c Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 27 Feb 2026 07:43:22 +0100 Subject: vdso/gettimeofday: Add a helper to test if a clock is namespaced MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently this logic is duplicated multiple times. Add a helper for it to make the code more readable. 
[ bp: Add a missing clocksource.h include, see https://lore.kernel.org/r/20260311113435-f72f81d8-33a6-4a0f-bd80-4997aad068cc@linutronix.de ] Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20260227-vdso-cleanups-v1-3-c848b4bc4850@linutronix.de --- include/vdso/helpers.h | 8 +++++++- lib/vdso/gettimeofday.c | 9 +++------ 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 9ccf6b53ef50..4d35877dff5f 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -6,6 +6,12 @@ #include #include +#include + +static __always_inline bool vdso_is_timens_clock(const struct vdso_clock *vc) +{ + return IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS; +} static __always_inline u32 vdso_read_begin(const struct vdso_clock *vc) { @@ -31,7 +37,7 @@ static __always_inline u32 vdso_read_begin(const struct vdso_clock *vc) static __always_inline bool vdso_read_begin_timens(const struct vdso_clock *vc, u32 *seq) { while (unlikely((*seq = READ_ONCE(vc->seq)) & 1)) { - if (IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + if (vdso_is_timens_clock(vc)) return true; cpu_relax(); } diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index e49369676928..2faed7851635 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -343,8 +343,7 @@ __cvdso_gettimeofday_data(const struct vdso_time_data *vd, } if (unlikely(tz != NULL)) { - if (IS_ENABLED(CONFIG_TIME_NS) && - vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + if (vdso_is_timens_clock(vc)) vd = __arch_get_vdso_u_timens_data(vd); tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest; @@ -367,8 +366,7 @@ __cvdso_time_data(const struct vdso_time_data *vd, __kernel_old_time_t *time) const struct vdso_clock *vc = vd->clock_data; __kernel_old_time_t t; - if (IS_ENABLED(CONFIG_TIME_NS) && - vc->clock_mode == 
VDSO_CLOCKMODE_TIMENS) { + if (vdso_is_timens_clock(vc)) { vd = __arch_get_vdso_u_timens_data(vd); vc = vd->clock_data; } @@ -399,8 +397,7 @@ bool __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t cloc if (!vdso_clockid_valid(clock)) return false; - if (IS_ENABLED(CONFIG_TIME_NS) && - vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + if (vdso_is_timens_clock(vc)) vd = __arch_get_vdso_u_timens_data(vd); /* -- cgit v1.2.3 From 0c02d6df15d4bf7376a965b66d92ad31b0e458fd Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 27 Feb 2026 07:43:23 +0100 Subject: vdso/gettimeofday: Move the unlikely() into vdso_read_retry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All callers of vdso_read_retry() test its return value with unlikely(). Move the unlikely into the helper to make the code easier to read. This is equivalent to the retry function of non-vDSO seqlocks. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20260227-vdso-cleanups-v1-4-c848b4bc4850@linutronix.de --- include/vdso/helpers.h | 2 +- lib/vdso/gettimeofday.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 4d35877dff5f..197d233667c2 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -53,7 +53,7 @@ static __always_inline u32 vdso_read_retry(const struct vdso_clock *vc, smp_rmb(); seq = READ_ONCE(vc->seq); - return seq != start; + return unlikely(seq != start); } static __always_inline void vdso_write_seq_begin(struct vdso_clock *vc) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 2faed7851635..efd1b82af614 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -135,7 +135,7 @@ bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock * if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns)) 
return false; - } while (unlikely(vdso_read_retry(vc, seq))); + } while (vdso_read_retry(vc, seq)); /* Add the namespace offset */ sec += offs->sec; @@ -163,7 +163,7 @@ bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns)) return false; - } while (unlikely(vdso_read_retry(vc, seq))); + } while (vdso_read_retry(vc, seq)); vdso_set_timespec(ts, sec, ns); @@ -188,7 +188,7 @@ bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock seq = vdso_read_begin(vc); sec = vdso_ts->sec; nsec = vdso_ts->nsec; - } while (unlikely(vdso_read_retry(vc, seq))); + } while (vdso_read_retry(vc, seq)); /* Add the namespace offset */ sec += offs->sec; @@ -212,7 +212,7 @@ bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, ts->tv_sec = vdso_ts->sec; ts->tv_nsec = vdso_ts->nsec; - } while (unlikely(vdso_read_retry(vc, seq))); + } while (vdso_read_retry(vc, seq)); return true; } @@ -244,7 +244,7 @@ bool do_aux(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_ti if (!vdso_get_timestamp(vd, vc, VDSO_BASE_AUX, &sec, &ns)) return false; - } while (unlikely(vdso_read_retry(vc, seq))); + } while (vdso_read_retry(vc, seq)); vdso_set_timespec(ts, sec, ns); -- cgit v1.2.3 From 8bd49acb4e81d2859f66a30e8edfd984f91c6c9c Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 27 Feb 2026 07:44:37 +0100 Subject: vdso/helpers: Explicitly include vdso/processor.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The usage of cpu_relax() requires vdso/processor.h. Currently this header is included transitively, but that transitive inclusion is about to go away. Explicitly include the header. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20260227-vdso-header-cleanups-v2-11-35d60acf7410@linutronix.de --- include/vdso/helpers.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 197d233667c2..a3bf4f1c0d37 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -6,6 +6,7 @@ #include #include +#include #include static __always_inline bool vdso_is_timens_clock(const struct vdso_clock *vc) -- cgit v1.2.3 From 750d8cc84901757d9e5fe96207f5aa6b3e2acf92 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 27 Feb 2026 07:44:38 +0100 Subject: vdso/datapage: Remove inclusion of gettimeofday.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vdso/datapage.h is useful without pulling in the architecture-specific gettimeofday() helpers. Move the include to the only users which needs it. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20260227-vdso-header-cleanups-v2-12-35d60acf7410@linutronix.de --- include/vdso/datapage.h | 11 ----------- lib/vdso/gettimeofday.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 07c2e086d8f4..339a34e88c73 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -184,17 +184,6 @@ enum vdso_pages { VDSO_NR_PAGES }; -/* - * The generic vDSO implementation requires that gettimeofday.h - * provides: - * - __arch_get_hw_counter(): to get the hw counter based on the - * clock_mode. - * - gettimeofday_fallback(): fallback for gettimeofday. - * - clock_gettime_fallback(): fallback for clock_gettime. - * - clock_getres_fallback(): fallback for clock_getres. 
- */ -#include - #else /* !__ASSEMBLY__ */ #ifdef CONFIG_VDSO_GETRANDOM diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index ad79642056d5..a5798bd26d20 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -12,6 +12,17 @@ #include #include +/* + * The generic vDSO implementation requires that gettimeofday.h + * provides: + * - __arch_get_hw_counter(): to get the hw counter based on the + * clock_mode. + * - gettimeofday_fallback(): fallback for gettimeofday. + * - clock_gettime_fallback(): fallback for clock_gettime. + * - clock_getres_fallback(): fallback for clock_getres. + */ +#include + /* Bring in default accessors */ #include -- cgit v1.2.3 From f5e386fe5f1c26b24fb9ffc616f8e857f43cf88d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 27 Feb 2026 07:44:39 +0100 Subject: vdso/datapage: Trim down unnecessary includes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vdso/datapage.h includes a lot of headers which are not strictly necessary. Some of those headers include architecture-specific vDSO headers which prevent the usage of vdso/datapage.h in kernel code on architectures without an vDSO. This would be useful however to write generic code using IS_ENABLED(), for example in drivers/char/random.c. Remove the unnecessary includes. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20260227-vdso-header-cleanups-v2-13-35d60acf7410@linutronix.de --- include/vdso/datapage.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 339a34e88c73..5977723fb3b5 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -4,24 +4,16 @@ #ifndef __ASSEMBLY__ -#include +#include + #include #include -#include -#include #include #include #include -#include -#include -#include -#include #include -#include #include -#include -#include #ifdef CONFIG_ARCH_HAS_VDSO_TIME_DATA #include -- cgit v1.2.3 From ef22555fbee7c284a6ab55238fcbe4eea9dbb2a4 Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Mon, 23 Feb 2026 20:05:37 +0000 Subject: dt-bindings: connector: Add sink properties to comply with PD 3.1 spec Add additional properties for ports supporting sink mode. The properties define certain hardware and electrical properties such as sink load step, sink load characteristics, sink compliance and charging adapter Power Delivery Profile (PDP) for the connector. These properties need to be defined for a Type-C port in compliance with the PD 3.1 spec. 
Signed-off-by: Amit Sunil Dhamne Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260223-skedb-v2-1-60675765bc7e@google.com Signed-off-by: Greg Kroah-Hartman --- .../bindings/connector/usb-connector.yaml | 34 ++++++++++++++++++++++ .../devicetree/bindings/usb/maxim,max33359.yaml | 4 +++ include/dt-bindings/usb/pd.h | 18 ++++++++++++ 3 files changed, 56 insertions(+) (limited to 'include') diff --git a/Documentation/devicetree/bindings/connector/usb-connector.yaml b/Documentation/devicetree/bindings/connector/usb-connector.yaml index 11e40d225b9f..901986de3e2b 100644 --- a/Documentation/devicetree/bindings/connector/usb-connector.yaml +++ b/Documentation/devicetree/bindings/connector/usb-connector.yaml @@ -300,6 +300,40 @@ properties: $ref: /schemas/types.yaml#/definitions/uint8-array maxItems: 4 + sink-load-step: + description: Indicates the preferred load step slew rate in mA/usec for + the port (in sink mode). This property is defined in "6.5.13.7" of + "USB Power Delivery Specification Revision 3.1 Version 1.8". + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [150, 500] + default: 150 + + sink-load-characteristics: + description: Indicates the port's (in sink mode) preferred load + characteristics. Users can leverage SINK_LOAD_CHAR() defined in + dt-bindings/usb/pd.h to populate this field. This property is defined in + "6.5.13.8" of "USB Power Delivery Specification Revision 3.1 Version 1.8". + $ref: /schemas/types.yaml#/definitions/uint16 + + sink-compliance: + description: Represents the types of sources the sink device has been tested + and certified with. 
This property is defined in "6.5.13.9" of + "USB Power Delivery Specification Revision 3.1 Version 1.8" + Bit 0 when set indicates it has been tested on LPS compliant source + Bit 1 when set indicates it has been tested on PS1 compliant source + Bit 2 when set indicates it has been tested on PS2 compliant source + $ref: /schemas/types.yaml#/definitions/uint8 + maximum: 7 + + charging-adapter-pdp-milliwatt: + description: This corresponds to the Power Delivery Profile rating of the + charging adapter shipped or recommended for use with the connector port. + This property is a requirement to infer the USB PD property + "SPR Sink Operational PDP" given in "6.5.13.14" of + "USB Power Delivery Specification Revision 3.1 Version 1.8". + minimum: 0 + maximum: 100000 + dependencies: sink-vdos-v1: [ sink-vdos ] sink-vdos: [ sink-vdos-v1 ] diff --git a/Documentation/devicetree/bindings/usb/maxim,max33359.yaml b/Documentation/devicetree/bindings/usb/maxim,max33359.yaml index 3de4dc40b791..46a3748c8be4 100644 --- a/Documentation/devicetree/bindings/usb/maxim,max33359.yaml +++ b/Documentation/devicetree/bindings/usb/maxim,max33359.yaml @@ -75,6 +75,10 @@ examples: PDO_FIXED(9000, 2000, 0)>; sink-bc12-completion-time-ms = <500>; pd-revision = /bits/ 8 <0x03 0x01 0x01 0x08>; + sink-load-step = <150>; + sink-load-characteristics = /bits/ 16 ; + sink-compliance = /bits/ 8 <(COMPLIANCE_LPS | COMPLIANCE_PS1)>; + charging-adapter-pdp-milliwatt = <18000>; }; }; }; diff --git a/include/dt-bindings/usb/pd.h b/include/dt-bindings/usb/pd.h index e6526b138174..6cff2339bda3 100644 --- a/include/dt-bindings/usb/pd.h +++ b/include/dt-bindings/usb/pd.h @@ -465,4 +465,22 @@ | ((vbm) & 0x3) << 15 | (curr) << 14 | ((vbi) & 0x3f) << 7 \ | ((gi) & 0x3f) << 1 | (ct)) +/* + * Sink Load Characteristics + * ------------------------- + * <15> :: Can tolerate vbus voltage droop + * <11:14> :: Duty cycle in 5% increments when bits 4:0 are non-zero + * <10:5> :: Overload period in 20ms when bits 4:0 are 
non-zero + * <4:0> :: Percent overload in 10% increments. Values higher than 25 are + * clipped to 250% + */ +#define SINK_LOAD_CHAR(vdroop, duty_cycle, period, percent_ol) \ + (((vdroop) & 0x1) << 15 | ((duty_cycle) & 0xf) << 11 | \ + ((period) & 0x3f) << 5 | ((percent_ol) & 0x1f)) + +/* Compliance */ +#define COMPLIANCE_LPS (1 << 0) +#define COMPLIANCE_PS1 (1 << 1) +#define COMPLIANCE_PS2 (1 << 2) + #endif /* __DT_POWER_DELIVERY_H */ -- cgit v1.2.3 From b558a9cc107287bd49bd9256e5d965afa80acfd6 Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Mon, 23 Feb 2026 20:05:38 +0000 Subject: usb: typec: tcpm: add support for Sink Cap Extended msg response Add support for responding to Sink Cap Extended msg request. To achieve this, include parsing support for DT properties related to Sink Cap Extended. The request for Sink Cap Ext is a control message while the response is an extended message (chunked). As the Sink Caps Extended Data Block size (24 Byte) is less than MaxExtendedMsgChunkLen (26 Byte), a single chunk is sufficient to complete this AMS. Supporting sink cap extended messages while responding to a Get_Sink_Caps_Extended request when port is in Sink role is required in order to be compliant with at least USB PD Rev3.1 Ver1.8. 
Signed-off-by: Amit Sunil Dhamne Reviewed-by: Badhri Jagan Sridharan Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20260223-skedb-v2-2-60675765bc7e@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 253 +++++++++++++++++++++++++++++++++++++++++- include/linux/usb/pd.h | 82 +++++++++++++- 2 files changed, 332 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 3295d804cf87..5ea0b0e99e4d 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -188,7 +189,8 @@ S(STRUCTURED_VDMS), \ S(COUNTRY_INFO), \ S(COUNTRY_CODES), \ - S(REVISION_INFORMATION) + S(REVISION_INFORMATION), \ + S(GETTING_SINK_EXTENDED_CAPABILITIES) #define GENERATE_ENUM(e) e #define GENERATE_STRING(s) #s @@ -229,6 +231,7 @@ enum pd_msg_request { PD_MSG_DATA_SINK_CAP, PD_MSG_DATA_SOURCE_CAP, PD_MSG_DATA_REV, + PD_MSG_EXT_SINK_CAP_EXT }; enum adev_actions { @@ -337,6 +340,42 @@ struct pd_timings { u32 snk_bc12_cmpletion_time; }; +/* Convert microwatt to watt */ +#define UW_TO_W(pow) ((pow) / 1000000) + +/* + * struct pd_identifier - Contains info about PD identifiers + * @vid: Vendor ID (assigned by USB-IF) + * @pid: Product ID (assigned by manufacturer) + * @xid: Value assigned by USB-IF for product + */ +struct pd_identifier { + u16 vid; + u16 pid; + u32 xid; +}; + +/* + * struct sink_caps_ext_data - Sink extended capability data + * @load_step: Indicates the load step slew rate. 
Value of 0 indicates 150mA/us + * & 1 indicates 500 mA/us + * @load_char: Snk overload characteristics + * @compliance: Types of sources the sink has been tested & certified on + * @modes: Charging caps & power sources supported + * @spr_min_pdp: Sink Minimum PDP for SPR mode (in Watts) + * @spr_op_pdp: Sink Operational PDP for SPR mode (in Watts) + * @spr_max_pdp: Sink Maximum PDP for SPR mode (in Watts) + */ +struct sink_caps_ext_data { + u8 load_step; + u16 load_char; + u8 compliance; + u8 modes; + u8 spr_min_pdp; + u8 spr_op_pdp; + u8 spr_max_pdp; +}; + struct tcpm_port { struct device *dev; @@ -585,6 +624,9 @@ struct tcpm_port { /* Indicates maximum (revision, version) supported */ struct pd_revision_info pd_rev; + + struct pd_identifier pd_ident; + struct sink_caps_ext_data sink_caps_ext; #ifdef CONFIG_DEBUG_FS struct dentry *dentry; struct mutex logbuffer_lock; /* log buffer access lock */ @@ -1367,6 +1409,64 @@ static int tcpm_pd_send_sink_caps(struct tcpm_port *port) return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg); } +static int tcpm_pd_send_sink_cap_ext(struct tcpm_port *port) +{ + u16 operating_snk_watt = port->operating_snk_mw / 1000; + struct sink_caps_ext_data *data = &port->sink_caps_ext; + struct pd_identifier *pd_ident = &port->pd_ident; + struct sink_caps_ext_msg skedb = {0}; + struct pd_message msg; + u8 data_obj_cnt; + + if (!port->self_powered) + data->spr_op_pdp = operating_snk_watt; + + /* + * SPR Sink Minimum PDP indicates the minimum power required to operate + * a sink device in its lowest level of functionality without requiring + * power from the battery. We can use the operating_snk_watt value to + * populate it, as operating_snk_watt indicates device's min operating + * power. 
+ */ + data->spr_min_pdp = operating_snk_watt; + + if (data->spr_op_pdp < data->spr_min_pdp || + data->spr_max_pdp < data->spr_op_pdp) { + tcpm_log(port, + "Invalid PDP values, Min PDP:%u, Op PDP:%u, Max PDP:%u", + data->spr_min_pdp, data->spr_op_pdp, data->spr_max_pdp); + return -EOPNOTSUPP; + } + + memset(&msg, 0, sizeof(msg)); + skedb.vid = cpu_to_le16(pd_ident->vid); + skedb.pid = cpu_to_le16(pd_ident->pid); + skedb.xid = cpu_to_le32(pd_ident->xid); + skedb.skedb_ver = SKEDB_VER_1_0; + skedb.load_step = data->load_step; + skedb.load_char = cpu_to_le16(data->load_char); + skedb.compliance = data->compliance; + skedb.modes = data->modes; + skedb.spr_min_pdp = data->spr_min_pdp; + skedb.spr_op_pdp = data->spr_op_pdp; + skedb.spr_max_pdp = data->spr_max_pdp; + memcpy(msg.ext_msg.data, &skedb, sizeof(skedb)); + msg.ext_msg.header = PD_EXT_HDR_LE(sizeof(skedb), + 0, /* Denotes if request chunk */ + 0, /* Chunk Number */ + 1 /* Chunked */); + + data_obj_cnt = count_chunked_data_objs(sizeof(skedb)); + msg.header = cpu_to_le16(PD_HEADER(PD_EXT_SINK_CAP_EXT, + port->pwr_role, + port->data_role, + port->negotiated_rev, + port->message_id, + data_obj_cnt, + 1 /* Denotes if ext header */)); + return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg); +} + static void mod_tcpm_delayed_work(struct tcpm_port *port, unsigned int delay_ms) { if (delay_ms) { @@ -3646,6 +3746,19 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, PD_MSG_CTRL_NOT_SUPP, NONE_AMS); break; + case PD_CTRL_GET_SINK_CAP_EXT: + /* This is an unsupported message if port type is SRC */ + if (port->negotiated_rev >= PD_REV30 && + port->port_type != TYPEC_PORT_SRC) + tcpm_pd_handle_msg(port, PD_MSG_EXT_SINK_CAP_EXT, + GETTING_SINK_EXTENDED_CAPABILITIES); + else + tcpm_pd_handle_msg(port, + port->negotiated_rev < PD_REV30 ? + PD_MSG_CTRL_REJECT : + PD_MSG_CTRL_NOT_SUPP, + NONE_AMS); + break; default: tcpm_pd_handle_msg(port, port->negotiated_rev < PD_REV30 ? 
@@ -3898,6 +4011,16 @@ static bool tcpm_send_queued_message(struct tcpm_port *port) ret); tcpm_ams_finish(port); break; + case PD_MSG_EXT_SINK_CAP_EXT: + ret = tcpm_pd_send_sink_cap_ext(port); + if (ret == -EOPNOTSUPP) + tcpm_pd_send_control(port, PD_CTRL_NOT_SUPP, TCPC_TX_SOP); + else if (ret < 0) + tcpm_log(port, + "Unable to transmit sink cap extended, ret=%d", + ret); + tcpm_ams_finish(port); + break; default: break; } @@ -7282,6 +7405,129 @@ static void tcpm_fw_get_timings(struct tcpm_port *port, struct fwnode_handle *fw port->timings.snk_bc12_cmpletion_time = val; } +static void tcpm_fw_get_pd_ident(struct tcpm_port *port) +{ + struct pd_identifier *pd_ident = &port->pd_ident; + u32 *vdo; + + /* First 3 vdo values contain info regarding USB PID, VID & XID */ + if (port->nr_snk_vdo >= 3) + vdo = port->snk_vdo; + else if (port->nr_snk_vdo_v1 >= 3) + vdo = port->snk_vdo_v1; + else + return; + + pd_ident->vid = PD_IDH_VID(vdo[0]); + pd_ident->pid = PD_PRODUCT_PID(vdo[2]); + pd_ident->xid = PD_CSTAT_XID(vdo[1]); + tcpm_log(port, "vid:%#x pid:%#x xid:%#x", + pd_ident->vid, pd_ident->pid, pd_ident->xid); +} + +static void tcpm_parse_snk_pdos(struct tcpm_port *port) +{ + struct sink_caps_ext_data *caps = &port->sink_caps_ext; + u32 max_mv, max_ma; + u8 avs_tier1_pdp, avs_tier2_pdp; + int i, pdo_itr; + u32 *snk_pdos; + + for (i = 0; i < port->pd_count; ++i) { + snk_pdos = port->pd_list[i]->sink_desc.pdo; + for (pdo_itr = 0; pdo_itr < PDO_MAX_OBJECTS && snk_pdos[pdo_itr]; + ++pdo_itr) { + u32 pdo = snk_pdos[pdo_itr]; + u8 curr_snk_pdp = 0; + + switch (pdo_type(pdo)) { + case PDO_TYPE_FIXED: + max_mv = pdo_fixed_voltage(pdo); + max_ma = pdo_fixed_current(pdo); + curr_snk_pdp = UW_TO_W(max_mv * max_ma); + break; + case PDO_TYPE_BATT: + curr_snk_pdp = UW_TO_W(pdo_max_power(pdo)); + break; + case PDO_TYPE_VAR: + max_mv = pdo_max_voltage(pdo); + max_ma = pdo_max_current(pdo); + curr_snk_pdp = UW_TO_W(max_mv * max_ma); + break; + case PDO_TYPE_APDO: + if (pdo_apdo_type(pdo) 
== APDO_TYPE_PPS) { + max_mv = pdo_pps_apdo_max_voltage(pdo); + max_ma = pdo_pps_apdo_max_current(pdo); + curr_snk_pdp = UW_TO_W(max_mv * max_ma); + caps->modes |= SINK_MODE_PPS; + } else if (pdo_apdo_type(pdo) == + APDO_TYPE_SPR_AVS) { + avs_tier1_pdp = UW_TO_W(SPR_AVS_TIER1_MAX_VOLT_MV + * pdo_spr_avs_apdo_9v_to_15v_max_current_ma(pdo)); + avs_tier2_pdp = UW_TO_W(SPR_AVS_TIER2_MAX_VOLT_MV + * pdo_spr_avs_apdo_15v_to_20v_max_current_ma(pdo)); + curr_snk_pdp = max(avs_tier1_pdp, avs_tier2_pdp); + caps->modes |= SINK_MODE_AVS; + } + break; + default: + tcpm_log(port, "Invalid source PDO type, ignoring"); + continue; + } + + caps->spr_max_pdp = max(caps->spr_max_pdp, + curr_snk_pdp); + } + } +} + +static void tcpm_fw_get_sink_caps_ext(struct tcpm_port *port, + struct fwnode_handle *fwnode) +{ + struct sink_caps_ext_data *caps = &port->sink_caps_ext; + int ret; + u32 val; + + /* + * Load step represents the change in current per usec that a given + * source can tolerate while maintaining Vbus within the vSrcValid + * range. For a sink this represents the "preferred" load-step value. It + * can only have 2 values (150 mA/usec or 500 mA/usec) with 150 mA/usec + * being the default. + */ + ret = fwnode_property_read_u32(fwnode, "sink-load-step", &val); + if (!ret) + caps->load_step = val == 500 ? 1 : 0; + + fwnode_property_read_u16(fwnode, "sink-load-characteristics", + &caps->load_char); + fwnode_property_read_u8(fwnode, "sink-compliance", &caps->compliance); + caps->modes = SINK_MODE_VBUS; + + /* + * As per "6.5.13.14" SPR Sink Operational PDP definition, for battery + * powered devices, this value will correspond to the PDP of the + * charging adapter either shipped or recommended for use with it. For + * batteryless sink devices SPR Operational PDP indicates the power + * required to operate all the device's functional modes. Hence, this + * value may be considered equal to port's operating_snk_mw. 
As + * operating_sink_mw can change as per the pd set used thus, OP PDP + * is determined when populating Sink Caps Extended Data Block. + */ + if (port->self_powered) { + fwnode_property_read_u32(fwnode, "charging-adapter-pdp-milliwatt", + &val); + caps->spr_op_pdp = (u8)(val / 1000); + caps->modes |= SINK_MODE_BATT; + } + + tcpm_parse_snk_pdos(port); + tcpm_log(port, + "load-step:%#x load-char:%#x compl:%#x op-pdp:%#x max-pdp:%#x", + caps->load_step, caps->load_char, caps->compliance, + caps->spr_op_pdp, caps->spr_max_pdp); +} + static int tcpm_fw_get_caps(struct tcpm_port *port, struct fwnode_handle *fwnode) { struct fwnode_handle *capabilities, *caps = NULL; @@ -7455,6 +7701,9 @@ static int tcpm_fw_get_caps(struct tcpm_port *port, struct fwnode_handle *fwnode } } + if (port->port_type != TYPEC_PORT_SRC) + tcpm_fw_get_sink_caps_ext(port, fwnode); + put_caps: if (caps != fwnode) fwnode_handle_put(caps); @@ -7497,6 +7746,8 @@ static int tcpm_fw_get_snk_vdos(struct tcpm_port *port, struct fwnode_handle *fw return ret; } + tcpm_fw_get_pd_ident(port); + return 0; } diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 6ccd1b2af993..5a98983195cb 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -34,7 +34,8 @@ enum pd_ctrl_msg_type { PD_CTRL_FR_SWAP = 19, PD_CTRL_GET_PPS_STATUS = 20, PD_CTRL_GET_COUNTRY_CODES = 21, - /* 22-23 Reserved */ + PD_CTRL_GET_SINK_CAP_EXT = 22, + /* 23 Reserved */ PD_CTRL_GET_REVISION = 24, /* 25-31 Reserved */ }; @@ -72,7 +73,8 @@ enum pd_ext_msg_type { PD_EXT_PPS_STATUS = 12, PD_EXT_COUNTRY_INFO = 13, PD_EXT_COUNTRY_CODES = 14, - /* 15-31 Reserved */ + PD_EXT_SINK_CAP_EXT = 15, + /* 16-31 Reserved */ }; #define PD_REV10 0x0 @@ -205,6 +207,72 @@ struct pd_message { }; } __packed; +/* + * count_chunked_data_objs - Helper to calculate number of Data Objects on a 4 + * byte boundary. + * @size: Size of data block for extended message. Should *not* include extended + * header size. 
+ */ +static inline u8 count_chunked_data_objs(u32 size) +{ + size += offsetof(struct pd_chunked_ext_message_data, data); + return ((size / 4) + (size % 4 ? 1 : 0)); +} + +/* Sink Caps Extended Data Block Version */ +#define SKEDB_VER_1_0 1 + +/* Sink Caps Extended Sink Modes */ +#define SINK_MODE_PPS BIT(0) +#define SINK_MODE_VBUS BIT(1) +#define SINK_MODE_AC_SUPPLY BIT(2) +#define SINK_MODE_BATT BIT(3) +#define SINK_MODE_BATT_UL BIT(4) /* Unlimited battery power supply */ +#define SINK_MODE_AVS BIT(5) + +/** + * struct sink_caps_ext_msg - Sink extended capability PD message + * @vid: Vendor ID + * @pid: Product ID + * @xid: Value assigned by USB-IF for product + * @fw: Firmware version + * @hw: Hardware version + * @skedb_ver: Sink Caps Extended Data Block (SKEDB) Version + * @load_step: Indicates the load step slew rate. + * @load_char: Sink overload characteristics + * @compliance: Types of sources the sink has been tested & certified on + * @touch_temp: Indicates the IEC standard to which the touch temperature + * conforms to (if applicable). 
+ * @batt_info: Indicates number batteries and hot swappable ports + * @modes: Charging caps & power sources supported + * @spr_min_pdp: Sink Minimum PDP for SPR mode + * @spr_op_pdp: Sink Operational PDP for SPR mode + * @spr_max_pdp: Sink Maximum PDP for SPR mode + * @epr_min_pdp: Sink Minimum PDP for EPR mode + * @epr_op_pdp: Sink Operational PDP for EPR mode + * @epr_max_pdp: Sink Maximum PDP for EPR mode + */ +struct sink_caps_ext_msg { + __le16 vid; + __le16 pid; + __le32 xid; + u8 fw; + u8 hw; + u8 skedb_ver; + u8 load_step; + __le16 load_char; + u8 compliance; + u8 touch_temp; + u8 batt_info; + u8 modes; + u8 spr_min_pdp; + u8 spr_op_pdp; + u8 spr_max_pdp; + u8 epr_min_pdp; + u8 epr_op_pdp; + u8 epr_max_pdp; +} __packed; + /* PDO: Power Data Object */ #define PDO_MAX_OBJECTS 7 @@ -329,6 +397,11 @@ enum pd_apdo_type { #define PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR GENMASK(19, 10) /* 10mA unit */ #define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR GENMASK(9, 0) /* 10mA unit */ +/* SPR AVS has two different current ranges 9V - 15V, 15V - 20V */ +#define SPR_AVS_TIER1_MIN_VOLT_MV 9000 +#define SPR_AVS_TIER1_MAX_VOLT_MV 15000 +#define SPR_AVS_TIER2_MAX_VOLT_MV 20000 + static inline enum pd_pdo_type pdo_type(u32 pdo) { return (pdo >> PDO_TYPE_SHIFT) & PDO_TYPE_MASK; @@ -339,6 +412,11 @@ static inline unsigned int pdo_fixed_voltage(u32 pdo) return ((pdo >> PDO_FIXED_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; } +static inline unsigned int pdo_fixed_current(u32 pdo) +{ + return ((pdo >> PDO_FIXED_CURR_SHIFT) & PDO_CURR_MASK) * 10; +} + static inline unsigned int pdo_min_voltage(u32 pdo) { return ((pdo >> PDO_VAR_MIN_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; -- cgit v1.2.3 From e19eaffc5213fdd6179e849d3032929fae0d8c2c Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Thu, 5 Mar 2026 14:44:10 -0800 Subject: mtd: concat: replace alloc + calloc with 1 alloc A flex array can be used to reduce the allocation to 1. 
And actually mtdconcat was using the pointer + 1 trick to point to the overallocated area. Better alternatives exist. Signed-off-by: Rosen Penev Signed-off-by: Miquel Raynal --- drivers/mtd/mtd_virt_concat.c | 8 +------- drivers/mtd/mtdconcat.c | 5 +---- include/linux/mtd/concat.h | 2 +- 3 files changed, 3 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/mtd/mtd_virt_concat.c b/drivers/mtd/mtd_virt_concat.c index 72689545e48e..37075ead0f33 100644 --- a/drivers/mtd/mtd_virt_concat.c +++ b/drivers/mtd/mtd_virt_concat.c @@ -182,18 +182,12 @@ static int mtd_virt_concat_create_item(struct device_node *parts, for (i = 1; i < count; i++) item->nodes[i] = of_parse_phandle(parts, CONCAT_PROP, (i - 1)); - concat = kzalloc(sizeof(*concat), GFP_KERNEL); + concat = kzalloc_flex(*concat, subdev, count, GFP_KERNEL); if (!concat) { kfree(item); return -ENOMEM; } - concat->subdev = kcalloc(count, sizeof(*concat->subdev), GFP_KERNEL); - if (!concat->subdev) { - kfree(item); - kfree(concat); - return -ENOMEM; - } item->concat = concat; list_add_tail(&item->head, &concat_node_list); diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 241d15235d01..c97167d51fe2 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -627,7 +627,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c const char *name) { /* name for the new device */ int i; - size_t size; struct mtd_concat *concat; struct mtd_info *subdev_master = NULL; uint32_t max_erasesize, curr_erasesize; @@ -640,15 +639,13 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c printk(KERN_NOTICE "into device \"%s\"\n", name); /* allocate the device structure */ - size = SIZEOF_STRUCT_MTD_CONCAT(num_devs); - concat = kzalloc(size, GFP_KERNEL); + concat = kzalloc_flex(*concat, subdev, num_devs, GFP_KERNEL); if (!concat) { printk ("memory allocation error while creating concatenated device \"%s\"\n", name); return NULL; } 
- concat->subdev = (struct mtd_info **) (concat + 1); /* * Set up the new "super" device's MTD object structure, check for diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h index 2cd9d48958a8..f8d4d6ac1fc1 100644 --- a/include/linux/mtd/concat.h +++ b/include/linux/mtd/concat.h @@ -18,7 +18,7 @@ struct mtd_concat { struct mtd_info mtd; int num_subdev; - struct mtd_info **subdev; + struct mtd_info *subdev[]; }; struct mtd_info *mtd_concat_create( -- cgit v1.2.3 From f66d6cc6891e41be96380261943837b1909107b3 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 11 Mar 2026 10:18:42 -0700 Subject: accel/amdxdna: Support sensors for column utilization The AMD PMF driver provides realtime column utilization (npu_busy) metrics for the NPU. Extend the DRM_IOCTL_AMDXDNA_GET_INFO sensor query to expose these metrics to userspace. Add AMDXDNA_SENSOR_TYPE_COLUMN_UTILIZATION to the sensor type enum and update aie2_get_sensors() to return both the total power and up to 8 column utilization sensors if the user buffer permits. Signed-off-by: Mario Limonciello (AMD) Reviewed-by: Lizhi Hou [lizhi: support legacy tool which uses small buffer. 
checkpatch cleanup] Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260311171842.473453-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/aie2_pci.c | 34 +++++++++++++++++++++++++++++----- drivers/accel/amdxdna/aie2_pci.h | 8 ++++++++ include/uapi/drm/amdxdna_accel.h | 3 ++- 3 files changed, 39 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c index a2e586512e26..c57c785a2d15 100644 --- a/drivers/accel/amdxdna/aie2_pci.c +++ b/drivers/accel/amdxdna/aie2_pci.c @@ -787,16 +787,18 @@ static int aie2_get_clock_metadata(struct amdxdna_client *client, static int aie2_get_sensors(struct amdxdna_client *client, struct amdxdna_drm_get_info *args) { + struct amdxdna_dev_hdl *ndev = client->xdna->dev_handle; struct amdxdna_drm_query_sensor sensor = {}; + struct amd_pmf_npu_metrics npu_metrics; + u32 sensors_count = 0, i; int ret; - if (args->buffer_size < sizeof(sensor)) - return -EINVAL; - - ret = AIE2_GET_PMF_NPU_DATA(npu_power, sensor.input); + ret = AIE2_GET_PMF_NPU_METRICS(&npu_metrics); if (ret) return ret; + sensor.type = AMDXDNA_SENSOR_TYPE_POWER; + sensor.input = npu_metrics.npu_power; sensor.unitm = -3; scnprintf(sensor.label, sizeof(sensor.label), "Total Power"); scnprintf(sensor.units, sizeof(sensor.units), "mW"); @@ -804,7 +806,29 @@ static int aie2_get_sensors(struct amdxdna_client *client, if (copy_to_user(u64_to_user_ptr(args->buffer), &sensor, sizeof(sensor))) return -EFAULT; - args->buffer_size = sizeof(sensor); + sensors_count++; + if (args->buffer_size <= sensors_count * sizeof(sensor)) + goto out; + + for (i = 0; i < min_t(u32, ndev->total_col, 8); i++) { + memset(&sensor, 0, sizeof(sensor)); + sensor.input = npu_metrics.npu_busy[i]; + sensor.type = AMDXDNA_SENSOR_TYPE_COLUMN_UTILIZATION; + sensor.unitm = 0; + scnprintf(sensor.label, sizeof(sensor.label), "Column %d Utilization", i); + scnprintf(sensor.units, sizeof(sensor.units), "%%"); + + if 
(copy_to_user(u64_to_user_ptr(args->buffer) + sensors_count * sizeof(sensor), + &sensor, sizeof(sensor))) + return -EFAULT; + + sensors_count++; + if (args->buffer_size <= sensors_count * sizeof(sensor)) + goto out; + } + +out: + args->buffer_size = sensors_count * sizeof(sensor); return 0; } diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h index 1bb88711bedb..0ae174862592 100644 --- a/drivers/accel/amdxdna/aie2_pci.h +++ b/drivers/accel/amdxdna/aie2_pci.h @@ -48,6 +48,7 @@ }) #if IS_ENABLED(CONFIG_AMD_PMF) +#define AIE2_GET_PMF_NPU_METRICS(metrics) amd_pmf_get_npu_data(metrics) #define AIE2_GET_PMF_NPU_DATA(field, val) \ ({ \ struct amd_pmf_npu_metrics _npu_metrics; \ @@ -58,6 +59,13 @@ (_ret); \ }) #else +#define AIE2_GET_PMF_NPU_METRICS(metrics) \ +({ \ + typeof(metrics) _m = metrics; \ + memset(_m, 0xff, sizeof(*_m)); \ + (-EOPNOTSUPP); \ +}) + #define SENSOR_DEFAULT_npu_power U32_MAX #define AIE2_GET_PMF_NPU_DATA(field, val) \ ({ \ diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h index 9c44db2b3dcd..5bd13f4435f5 100644 --- a/include/uapi/drm/amdxdna_accel.h +++ b/include/uapi/drm/amdxdna_accel.h @@ -353,7 +353,8 @@ struct amdxdna_drm_query_clock_metadata { }; enum amdxdna_sensor_type { - AMDXDNA_SENSOR_TYPE_POWER + AMDXDNA_SENSOR_TYPE_POWER, + AMDXDNA_SENSOR_TYPE_COLUMN_UTILIZATION }; /** -- cgit v1.2.3 From 77dd8adabbc8ff845177b460de48b9d2cd579966 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 9 Mar 2026 13:52:22 +0100 Subject: efi: Drop unused efi_range_is_wc() function efi_range_is_wc() has no callers, so remove it. 
Reviewed-by: Ilias Apalodimas Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 664898d09ff5..72e76ec54641 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -832,27 +832,6 @@ extern int __init parse_efi_signature_list( const void *data, size_t size, efi_element_handler_t (*get_handler_for_guid)(const efi_guid_t *)); -/** - * efi_range_is_wc - check the WC bit on an address range - * @start: starting kvirt address - * @len: length of range - * - * Consult the EFI memory map and make sure it's ok to set this range WC. - * Returns true or false. - */ -static inline int efi_range_is_wc(unsigned long start, unsigned long len) -{ - unsigned long i; - - for (i = 0; i < len; i += (1UL << EFI_PAGE_SHIFT)) { - unsigned long paddr = __pa(start + i); - if (!(efi_mem_attributes(paddr) & EFI_MEMORY_WC)) - return 0; - } - /* The range checked out */ - return 1; -} - /* * We play games with efi_enabled so that the compiler will, if * possible, remove EFI-related code altogether. -- cgit v1.2.3 From 786ee8ddf47a2333aa5ffd16f68a3c0e9c7d1fbf Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Mon, 9 Mar 2026 16:44:44 -0400 Subject: RDMA/OPA: Update OPA link speed list Update the list of available link speeds. Fix comments. 
Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Link: https://patch.msgid.link/177308908456.1279894.16723781060261360236.stgit@awdrv-04.cornelisnetworks.com Signed-off-by: Leon Romanovsky --- include/rdma/opa_port_info.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h index 73bcac90a048..fb66d3a1dfa9 100644 --- a/include/rdma/opa_port_info.h +++ b/include/rdma/opa_port_info.h @@ -93,9 +93,11 @@ #define OPA_LINKINIT_QUARANTINED (9 << 4) #define OPA_LINKINIT_INSUFIC_CAPABILITY (10 << 4) -#define OPA_LINK_SPEED_NOP 0x0000 /* Reserved (1-5 Gbps) */ -#define OPA_LINK_SPEED_12_5G 0x0001 /* 12.5 Gbps */ -#define OPA_LINK_SPEED_25G 0x0002 /* 25.78125? Gbps (EDR) */ +#define OPA_LINK_SPEED_NOP 0x0000 /* no change */ +#define OPA_LINK_SPEED_12_5G 0x0001 /* 12.5 Gbps */ +#define OPA_LINK_SPEED_25G 0x0002 /* 25.78125 Gbps */ +#define OPA_LINK_SPEED_50G 0x0004 /* 53.125 Gbps */ +#define OPA_LINK_SPEED_100G 0x0008 /* 106.25 Gbps */ #define OPA_LINK_WIDTH_1X 0x0001 #define OPA_LINK_WIDTH_2X 0x0002 -- cgit v1.2.3 From 679eb25de4ee537f209c6d81f7808ad65b03bbbc Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Wed, 11 Mar 2026 13:28:03 -0400 Subject: RDMA/rdmavt: Add ucontext alloc/dealloc passthrough Add a private data pointer to the ucontext structure and add per-client pass-throughs. 
Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Link: https://patch.msgid.link/177325008318.52243.7367786996925601681.stgit@awdrv-04.cornelisnetworks.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rdmavt/vt.c | 8 ++++++++ include/rdma/rdma_vt.h | 7 +++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 0c28b412d81a..033d8932aff1 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -244,6 +244,10 @@ static int rvt_query_gid(struct ib_device *ibdev, u32 port_num, */ static int rvt_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { + struct rvt_dev_info *rdi = ib_to_rvt(uctx->device); + + if (rdi->driver_f.alloc_ucontext) + return rdi->driver_f.alloc_ucontext(uctx, udata); return 0; } @@ -253,6 +257,10 @@ static int rvt_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) */ static void rvt_dealloc_ucontext(struct ib_ucontext *context) { + struct rvt_dev_info *rdi = ib_to_rvt(context->device); + + if (rdi->driver_f.dealloc_ucontext) + rdi->driver_f.dealloc_ucontext(context); return; } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index c429d6ddb129..8671c6da16bb 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -149,6 +149,7 @@ struct rvt_driver_params { /* User context */ struct rvt_ucontext { struct ib_ucontext ibucontext; + void *priv; }; /* Protection domain */ @@ -359,6 +360,12 @@ struct rvt_driver_provided { /* Get and return CPU to pin CQ processing thread */ int (*comp_vect_cpu_lookup)(struct rvt_dev_info *rdi, int comp_vect); + + /* allocate a ucontext */ + int (*alloc_ucontext)(struct ib_ucontext *uctx, struct ib_udata *udata); + + /* deallocate a ucontext */ + void (*dealloc_ucontext)(struct ib_ucontext *context); }; struct rvt_dev_info { -- cgit v1.2.3 From 6be4ca0ab3a2363a850787079f2342d41d377487 Mon Sep 17 00:00:00 2001 From: Dean 
Luick Date: Mon, 9 Mar 2026 16:44:59 -0400 Subject: RDMA/rdmavt: Add driver mmap callback Add a reserved range and a driver callback to allow the driver to have custom mmaps. Generated mmap offsets are cookies and are not related to the size of the mmap. Advance the mmap offset by the minimum, PAGE_SIZE, rather than the size of the mmap. Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Link: https://patch.msgid.link/177308909972.1279894.15543003811821875042.stgit@awdrv-04.cornelisnetworks.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rdmavt/mmap.c | 22 +++++++++++++++++----- include/rdma/rdma_vt.h | 3 +++ 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/mmap.c b/drivers/infiniband/sw/rdmavt/mmap.c index 46e3b3e0643a..473f464f33fa 100644 --- a/drivers/infiniband/sw/rdmavt/mmap.c +++ b/drivers/infiniband/sw/rdmavt/mmap.c @@ -9,6 +9,11 @@ #include #include "mmap.h" +/* number of reserved mmaps for the driver */ +#define MMAP_RESERVED 256 +/* start point for dynamic offsets */ +#define MMAP_OFFSET_START (MMAP_RESERVED * PAGE_SIZE) + /** * rvt_mmap_init - init link list and lock for mem map * @rdi: rvt dev struct @@ -17,7 +22,7 @@ void rvt_mmap_init(struct rvt_dev_info *rdi) { INIT_LIST_HEAD(&rdi->pending_mmaps); spin_lock_init(&rdi->pending_lock); - rdi->mmap_offset = PAGE_SIZE; + rdi->mmap_offset = MMAP_OFFSET_START; spin_lock_init(&rdi->mmap_offset_lock); } @@ -73,6 +78,13 @@ int rvt_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) struct rvt_mmap_info *ip, *pp; int ret = -EINVAL; + /* call driver if in reserved range */ + if (offset < MMAP_OFFSET_START) { + if (rdi->driver_f.mmap) + return rdi->driver_f.mmap(context, vma); + return -EINVAL; + } + /* * Search the device's list of objects waiting for a mmap call. 
* Normally, this list is very short since a call to create a @@ -129,9 +141,9 @@ struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, u32 size, spin_lock_irq(&rdi->mmap_offset_lock); if (rdi->mmap_offset == 0) - rdi->mmap_offset = ALIGN(PAGE_SIZE, SHMLBA); + rdi->mmap_offset = MMAP_OFFSET_START; ip->offset = rdi->mmap_offset; - rdi->mmap_offset += ALIGN(size, SHMLBA); + rdi->mmap_offset += PAGE_SIZE; spin_unlock_irq(&rdi->mmap_offset_lock); INIT_LIST_HEAD(&ip->pending_mmaps); @@ -159,9 +171,9 @@ void rvt_update_mmap_info(struct rvt_dev_info *rdi, struct rvt_mmap_info *ip, spin_lock_irq(&rdi->mmap_offset_lock); if (rdi->mmap_offset == 0) - rdi->mmap_offset = PAGE_SIZE; + rdi->mmap_offset = MMAP_OFFSET_START; ip->offset = rdi->mmap_offset; - rdi->mmap_offset += size; + rdi->mmap_offset += PAGE_SIZE; spin_unlock_irq(&rdi->mmap_offset_lock); ip->size = size; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 8671c6da16bb..7d8de561f71b 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -366,6 +366,9 @@ struct rvt_driver_provided { /* deallocate a ucontext */ void (*dealloc_ucontext)(struct ib_ucontext *context); + + /* driver mmap */ + int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); }; struct rvt_dev_info { -- cgit v1.2.3 From 8e3a93e7a1a3a788109ba005edf6223d389ab04f Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Wed, 11 Mar 2026 16:46:31 +0200 Subject: dt-bindings: clock: qcom: document the Eliza Global Clock Controller Add bindings documentation for the Global Clock Controller on Qualcomm Eliza SoC. Reuse the Milos bindings schema since the controller resources are exactly the same, even though the controllers are incompatible between them. 
Signed-off-by: Taniya Das Reviewed-by: Krzysztof Kozlowski Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20260311-eliza-clocks-v6-1-453c4cf657a2@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- .../devicetree/bindings/clock/qcom,milos-gcc.yaml | 9 +- include/dt-bindings/clock/qcom,eliza-gcc.h | 210 +++++++++++++++++++++ 2 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 include/dt-bindings/clock/qcom,eliza-gcc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,milos-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,milos-gcc.yaml index cf244c155f9a..60f1c8ca2c13 100644 --- a/Documentation/devicetree/bindings/clock/qcom,milos-gcc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,milos-gcc.yaml @@ -8,16 +8,21 @@ title: Qualcomm Global Clock & Reset Controller on Milos maintainers: - Luca Weiss + - Taniya Das description: | Qualcomm global clock control module provides the clocks, resets and power domains on Milos. - See also: include/dt-bindings/clock/qcom,milos-gcc.h + See also: + - include/dt-bindings/clock/qcom,eliza-gcc.h + - include/dt-bindings/clock/qcom,milos-gcc.h properties: compatible: - const: qcom,milos-gcc + enum: + - qcom,eliza-gcc + - qcom,milos-gcc clocks: items: diff --git a/include/dt-bindings/clock/qcom,eliza-gcc.h b/include/dt-bindings/clock/qcom,eliza-gcc.h new file mode 100644 index 000000000000..4d27b329ae99 --- /dev/null +++ b/include/dt-bindings/clock/qcom,eliza-gcc.h @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+ */ + +#ifndef _DT_BINDINGS_CLK_QCOM_GCC_ELIZA_H +#define _DT_BINDINGS_CLK_QCOM_GCC_ELIZA_H + +/* GCC clocks */ +#define GCC_AGGRE_NOC_PCIE_AXI_CLK 0 +#define GCC_AGGRE_UFS_PHY_AXI_CLK 1 +#define GCC_AGGRE_USB3_PRIM_AXI_CLK 2 +#define GCC_BOOT_ROM_AHB_CLK 3 +#define GCC_CAM_BIST_MCLK_AHB_CLK 4 +#define GCC_CAMERA_AHB_CLK 5 +#define GCC_CAMERA_HF_AXI_CLK 6 +#define GCC_CAMERA_SF_AXI_CLK 7 +#define GCC_CAMERA_XO_CLK 8 +#define GCC_CFG_NOC_PCIE_ANOC_AHB_CLK 9 +#define GCC_CFG_NOC_USB3_PRIM_AXI_CLK 10 +#define GCC_CNOC_PCIE_SF_AXI_CLK 11 +#define GCC_DDRSS_GPU_AXI_CLK 12 +#define GCC_DDRSS_PCIE_SF_QTB_CLK 13 +#define GCC_DISP_AHB_CLK 14 +#define GCC_DISP_HF_AXI_CLK 15 +#define GCC_GP1_CLK 16 +#define GCC_GP1_CLK_SRC 17 +#define GCC_GP2_CLK 18 +#define GCC_GP2_CLK_SRC 19 +#define GCC_GP3_CLK 20 +#define GCC_GP3_CLK_SRC 21 +#define GCC_GPLL0 22 +#define GCC_GPLL0_OUT_EVEN 23 +#define GCC_GPLL4 24 +#define GCC_GPLL7 25 +#define GCC_GPLL8 26 +#define GCC_GPLL9 27 +#define GCC_GPU_CFG_AHB_CLK 28 +#define GCC_GPU_GEMNOC_GFX_CLK 29 +#define GCC_GPU_GPLL0_CPH_CLK_SRC 30 +#define GCC_GPU_GPLL0_DIV_CPH_CLK_SRC 31 +#define GCC_GPU_SMMU_VOTE_CLK 32 +#define GCC_MMU_TCU_VOTE_CLK 33 +#define GCC_PCIE_0_AUX_CLK 34 +#define GCC_PCIE_0_AUX_CLK_SRC 35 +#define GCC_PCIE_0_CFG_AHB_CLK 36 +#define GCC_PCIE_0_MSTR_AXI_CLK 37 +#define GCC_PCIE_0_PHY_RCHNG_CLK 38 +#define GCC_PCIE_0_PHY_RCHNG_CLK_SRC 39 +#define GCC_PCIE_0_PIPE_CLK 40 +#define GCC_PCIE_0_PIPE_CLK_SRC 41 +#define GCC_PCIE_0_PIPE_DIV2_CLK 42 +#define GCC_PCIE_0_PIPE_DIV2_CLK_SRC 43 +#define GCC_PCIE_0_SLV_AXI_CLK 44 +#define GCC_PCIE_0_SLV_Q2A_AXI_CLK 45 +#define GCC_PCIE_1_AUX_CLK 46 +#define GCC_PCIE_1_AUX_CLK_SRC 47 +#define GCC_PCIE_1_CFG_AHB_CLK 48 +#define GCC_PCIE_1_MSTR_AXI_CLK 49 +#define GCC_PCIE_1_PHY_RCHNG_CLK 50 +#define GCC_PCIE_1_PHY_RCHNG_CLK_SRC 51 +#define GCC_PCIE_1_PIPE_CLK 52 +#define GCC_PCIE_1_PIPE_CLK_SRC 53 +#define GCC_PCIE_1_PIPE_DIV2_CLK 54 +#define GCC_PCIE_1_PIPE_DIV2_CLK_SRC 55 +#define 
GCC_PCIE_1_SLV_AXI_CLK 56 +#define GCC_PCIE_1_SLV_Q2A_AXI_CLK 57 +#define GCC_PCIE_RSCC_CFG_AHB_CLK 58 +#define GCC_PCIE_RSCC_XO_CLK 59 +#define GCC_PDM2_CLK 60 +#define GCC_PDM2_CLK_SRC 61 +#define GCC_PDM_AHB_CLK 62 +#define GCC_PDM_XO4_CLK 63 +#define GCC_QMIP_CAMERA_CMD_AHB_CLK 64 +#define GCC_QMIP_CAMERA_NRT_AHB_CLK 65 +#define GCC_QMIP_CAMERA_RT_AHB_CLK 66 +#define GCC_QMIP_GPU_AHB_CLK 67 +#define GCC_QMIP_PCIE_AHB_CLK 68 +#define GCC_QMIP_VIDEO_V_CPU_AHB_CLK 69 +#define GCC_QMIP_VIDEO_VCODEC_AHB_CLK 70 +#define GCC_QUPV3_WRAP1_CORE_2X_CLK 71 +#define GCC_QUPV3_WRAP1_CORE_CLK 72 +#define GCC_QUPV3_WRAP1_QSPI_REF_CLK 73 +#define GCC_QUPV3_WRAP1_QSPI_REF_CLK_SRC 74 +#define GCC_QUPV3_WRAP1_S0_CLK 75 +#define GCC_QUPV3_WRAP1_S0_CLK_SRC 76 +#define GCC_QUPV3_WRAP1_S1_CLK 77 +#define GCC_QUPV3_WRAP1_S1_CLK_SRC 78 +#define GCC_QUPV3_WRAP1_S2_CLK 79 +#define GCC_QUPV3_WRAP1_S2_CLK_SRC 80 +#define GCC_QUPV3_WRAP1_S3_CLK 81 +#define GCC_QUPV3_WRAP1_S3_CLK_SRC 82 +#define GCC_QUPV3_WRAP1_S4_CLK 83 +#define GCC_QUPV3_WRAP1_S4_CLK_SRC 84 +#define GCC_QUPV3_WRAP1_S5_CLK 85 +#define GCC_QUPV3_WRAP1_S5_CLK_SRC 86 +#define GCC_QUPV3_WRAP1_S6_CLK 87 +#define GCC_QUPV3_WRAP1_S6_CLK_SRC 88 +#define GCC_QUPV3_WRAP1_S7_CLK 89 +#define GCC_QUPV3_WRAP1_S7_CLK_SRC 90 +#define GCC_QUPV3_WRAP2_CORE_2X_CLK 91 +#define GCC_QUPV3_WRAP2_CORE_CLK 92 +#define GCC_QUPV3_WRAP2_S0_CLK 93 +#define GCC_QUPV3_WRAP2_S0_CLK_SRC 94 +#define GCC_QUPV3_WRAP2_S1_CLK 95 +#define GCC_QUPV3_WRAP2_S1_CLK_SRC 96 +#define GCC_QUPV3_WRAP2_S2_CLK 97 +#define GCC_QUPV3_WRAP2_S2_CLK_SRC 98 +#define GCC_QUPV3_WRAP2_S3_CLK 99 +#define GCC_QUPV3_WRAP2_S3_CLK_SRC 100 +#define GCC_QUPV3_WRAP2_S4_CLK 101 +#define GCC_QUPV3_WRAP2_S4_CLK_SRC 102 +#define GCC_QUPV3_WRAP2_S5_CLK 103 +#define GCC_QUPV3_WRAP2_S5_CLK_SRC 104 +#define GCC_QUPV3_WRAP2_S6_CLK 105 +#define GCC_QUPV3_WRAP2_S6_CLK_SRC 106 +#define GCC_QUPV3_WRAP2_S7_CLK 107 +#define GCC_QUPV3_WRAP2_S7_CLK_SRC 108 +#define GCC_QUPV3_WRAP_1_M_AHB_CLK 109 +#define 
GCC_QUPV3_WRAP_1_S_AHB_CLK 110 +#define GCC_QUPV3_WRAP_2_M_AHB_CLK 111 +#define GCC_QUPV3_WRAP_2_S_AHB_CLK 112 +#define GCC_SDCC1_AHB_CLK 113 +#define GCC_SDCC1_APPS_CLK 114 +#define GCC_SDCC1_APPS_CLK_SRC 115 +#define GCC_SDCC1_ICE_CORE_CLK 116 +#define GCC_SDCC1_ICE_CORE_CLK_SRC 117 +#define GCC_SDCC2_AHB_CLK 118 +#define GCC_SDCC2_APPS_CLK 119 +#define GCC_SDCC2_APPS_CLK_SRC 120 +#define GCC_UFS_PHY_AHB_CLK 121 +#define GCC_UFS_PHY_AXI_CLK 122 +#define GCC_UFS_PHY_AXI_CLK_SRC 123 +#define GCC_UFS_PHY_ICE_CORE_CLK 124 +#define GCC_UFS_PHY_ICE_CORE_CLK_SRC 125 +#define GCC_UFS_PHY_PHY_AUX_CLK 126 +#define GCC_UFS_PHY_PHY_AUX_CLK_SRC 127 +#define GCC_UFS_PHY_RX_SYMBOL_0_CLK 128 +#define GCC_UFS_PHY_RX_SYMBOL_0_CLK_SRC 129 +#define GCC_UFS_PHY_RX_SYMBOL_1_CLK 130 +#define GCC_UFS_PHY_RX_SYMBOL_1_CLK_SRC 131 +#define GCC_UFS_PHY_TX_SYMBOL_0_CLK 132 +#define GCC_UFS_PHY_TX_SYMBOL_0_CLK_SRC 133 +#define GCC_UFS_PHY_UNIPRO_CORE_CLK 134 +#define GCC_UFS_PHY_UNIPRO_CORE_CLK_SRC 135 +#define GCC_USB30_PRIM_ATB_CLK 136 +#define GCC_USB30_PRIM_MASTER_CLK 137 +#define GCC_USB30_PRIM_MASTER_CLK_SRC 138 +#define GCC_USB30_PRIM_MOCK_UTMI_CLK 139 +#define GCC_USB30_PRIM_MOCK_UTMI_CLK_SRC 140 +#define GCC_USB30_PRIM_MOCK_UTMI_POSTDIV_CLK_SRC 141 +#define GCC_USB30_PRIM_SLEEP_CLK 142 +#define GCC_USB3_PRIM_PHY_AUX_CLK 143 +#define GCC_USB3_PRIM_PHY_AUX_CLK_SRC 144 +#define GCC_USB3_PRIM_PHY_COM_AUX_CLK 145 +#define GCC_USB3_PRIM_PHY_PIPE_CLK 146 +#define GCC_USB3_PRIM_PHY_PIPE_CLK_SRC 147 +#define GCC_VIDEO_AHB_CLK 148 +#define GCC_VIDEO_AXI0_CLK 149 +#define GCC_VIDEO_AXI1_CLK 150 +#define GCC_VIDEO_XO_CLK 151 + +/* GCC power domains */ +#define GCC_PCIE_0_GDSC 0 +#define GCC_PCIE_0_PHY_GDSC 1 +#define GCC_PCIE_1_GDSC 2 +#define GCC_PCIE_1_PHY_GDSC 3 +#define GCC_UFS_MEM_PHY_GDSC 4 +#define GCC_UFS_PHY_GDSC 5 +#define GCC_USB30_PRIM_GDSC 6 +#define GCC_USB3_PHY_GDSC 7 + +/* GCC resets */ +#define GCC_CAMERA_BCR 0 +#define GCC_DISPLAY_BCR 1 +#define GCC_GPU_BCR 2 +#define 
GCC_PCIE_0_BCR 3 +#define GCC_PCIE_0_LINK_DOWN_BCR 4 +#define GCC_PCIE_0_NOCSR_COM_PHY_BCR 5 +#define GCC_PCIE_0_PHY_BCR 6 +#define GCC_PCIE_0_PHY_NOCSR_COM_PHY_BCR 7 +#define GCC_PCIE_1_BCR 8 +#define GCC_PCIE_1_LINK_DOWN_BCR 9 +#define GCC_PCIE_1_NOCSR_COM_PHY_BCR 10 +#define GCC_PCIE_1_PHY_BCR 11 +#define GCC_PCIE_1_PHY_NOCSR_COM_PHY_BCR 12 +#define GCC_PCIE_PHY_BCR 13 +#define GCC_PCIE_PHY_CFG_AHB_BCR 14 +#define GCC_PCIE_PHY_COM_BCR 15 +#define GCC_PCIE_RSCC_BCR 16 +#define GCC_PDM_BCR 17 +#define GCC_QUPV3_WRAPPER_1_BCR 18 +#define GCC_QUPV3_WRAPPER_2_BCR 19 +#define GCC_QUSB2PHY_PRIM_BCR 20 +#define GCC_QUSB2PHY_SEC_BCR 21 +#define GCC_SDCC1_BCR 22 +#define GCC_SDCC2_BCR 23 +#define GCC_UFS_PHY_BCR 24 +#define GCC_USB30_PRIM_BCR 25 +#define GCC_USB3_DP_PHY_PRIM_BCR 26 +#define GCC_USB3_DP_PHY_SEC_BCR 27 +#define GCC_USB3_PHY_PRIM_BCR 28 +#define GCC_USB3_PHY_SEC_BCR 29 +#define GCC_USB3PHY_PHY_PRIM_BCR 30 +#define GCC_USB3PHY_PHY_SEC_BCR 31 +#define GCC_VIDEO_AXI0_CLK_ARES 32 +#define GCC_VIDEO_AXI1_CLK_ARES 33 +#define GCC_VIDEO_BCR 34 + +#endif -- cgit v1.2.3 From b7518e0d1c0f4da4c0cc7940eed4679a5ff69a2e Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Wed, 11 Mar 2026 16:46:32 +0200 Subject: dt-bindings: clock: qcom: Document the Eliza TCSR Clock Controller Add bindings documentation for TCSR Clock Controller for Eliza SoC. 
Signed-off-by: Taniya Das Acked-by: Krzysztof Kozlowski Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20260311-eliza-clocks-v6-2-453c4cf657a2@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- .../devicetree/bindings/clock/qcom,sm8550-tcsr.yaml | 2 ++ include/dt-bindings/clock/qcom,eliza-tcsr.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,eliza-tcsr.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml index 784fef830681..ae9aef0e54e8 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml @@ -15,6 +15,7 @@ description: | power domains on SM8550 See also: + - include/dt-bindings/clock/qcom,eliza-tcsr.h - include/dt-bindings/clock/qcom,glymur-tcsr.h - include/dt-bindings/clock/qcom,sm8550-tcsr.h - include/dt-bindings/clock/qcom,sm8650-tcsr.h @@ -24,6 +25,7 @@ properties: compatible: items: - enum: + - qcom,eliza-tcsr - qcom,glymur-tcsr - qcom,kaanapali-tcsr - qcom,milos-tcsr diff --git a/include/dt-bindings/clock/qcom,eliza-tcsr.h b/include/dt-bindings/clock/qcom,eliza-tcsr.h new file mode 100644 index 000000000000..aeb5e2b1a47b --- /dev/null +++ b/include/dt-bindings/clock/qcom,eliza-tcsr.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+ */ + +#ifndef _DT_BINDINGS_CLK_QCOM_TCSR_CC_ELIZA_H +#define _DT_BINDINGS_CLK_QCOM_TCSR_CC_ELIZA_H + +/* TCSR_CC clocks */ +#define TCSR_HDMI_CLKREF_EN 0 +#define TCSR_PCIE_0_CLKREF_EN 1 +#define TCSR_PCIE_1_CLKREF_EN 2 +#define TCSR_UFS_CLKREF_EN 3 +#define TCSR_USB2_CLKREF_EN 4 +#define TCSR_USB3_CLKREF_EN 5 + +#endif -- cgit v1.2.3 From a5c7b4fc8405846c613e7a01805a77d2e0cb75bd Mon Sep 17 00:00:00 2001 From: Val Packett Date: Tue, 3 Mar 2026 00:41:20 -0300 Subject: dt-bindings: clock: qcom,sm6115-dispcc: Define MDSS resets Add the missing defines for MDSS resets, which are necessary to reset the display subsystem in order to avoid issues caused by state left over from the bootloader. While here, align comment style with other SoCs. Acked-by: Krzysztof Kozlowski Signed-off-by: Val Packett Link: https://lore.kernel.org/r/20260303034847.13870-2-val@packett.cool Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,sm6115-dispcc.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,sm6115-dispcc.h b/include/dt-bindings/clock/qcom,sm6115-dispcc.h index d1a6c45b5029..ab8d312ade37 100644 --- a/include/dt-bindings/clock/qcom,sm6115-dispcc.h +++ b/include/dt-bindings/clock/qcom,sm6115-dispcc.h @@ -6,7 +6,7 @@ #ifndef _DT_BINDINGS_CLK_QCOM_DISP_CC_SM6115_H #define _DT_BINDINGS_CLK_QCOM_DISP_CC_SM6115_H -/* DISP_CC clocks */ +/* Clocks */ #define DISP_CC_PLL0 0 #define DISP_CC_PLL0_OUT_MAIN 1 #define DISP_CC_MDSS_AHB_CLK 2 @@ -30,7 +30,10 @@ #define DISP_CC_SLEEP_CLK 20 #define DISP_CC_SLEEP_CLK_SRC 21 -/* DISP_CC GDSCR */ +/* Resets */ +#define DISP_CC_MDSS_CORE_BCR 0 + +/* GDSCs */ #define MDSS_GDSC 0 #endif -- cgit v1.2.3 From 0221b14be8aae98d687efab066133a114bea02d8 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Tue, 3 Mar 2026 00:41:21 -0300 Subject: dt-bindings: clock: qcom,dispcc-sm6125: Define MDSS resets Add the missing defines for MDSS resets, which are necessary to reset 
the display subsystem in order to avoid issues caused by state left over from the bootloader. While here, align comment style with other SoCs. Acked-by: Krzysztof Kozlowski Signed-off-by: Val Packett Link: https://lore.kernel.org/r/20260303034847.13870-3-val@packett.cool Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,dispcc-sm6125.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,dispcc-sm6125.h b/include/dt-bindings/clock/qcom,dispcc-sm6125.h index 4ff974f4fcc3..f58b85d2c814 100644 --- a/include/dt-bindings/clock/qcom,dispcc-sm6125.h +++ b/include/dt-bindings/clock/qcom,dispcc-sm6125.h @@ -6,6 +6,7 @@ #ifndef _DT_BINDINGS_CLK_QCOM_DISP_CC_SM6125_H #define _DT_BINDINGS_CLK_QCOM_DISP_CC_SM6125_H +/* Clocks */ #define DISP_CC_PLL0 0 #define DISP_CC_MDSS_AHB_CLK 1 #define DISP_CC_MDSS_AHB_CLK_SRC 2 @@ -35,7 +36,10 @@ #define DISP_CC_MDSS_VSYNC_CLK_SRC 26 #define DISP_CC_XO_CLK 27 -/* DISP_CC GDSCR */ +/* Resets */ +#define DISP_CC_MDSS_CORE_BCR 0 + +/* GDSCs */ #define MDSS_GDSC 0 #endif -- cgit v1.2.3 From 12ae2c81b21cfaa193db2faf035d495807edc3a7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:59 +0100 Subject: clone: add CLONE_AUTOREAP Add a new clone3() flag CLONE_AUTOREAP that makes a child process auto-reap on exit without ever becoming a zombie. This is a per-process property in contrast to the existing auto-reap mechanism via SA_NOCLDWAIT or SIG_IGN for SIGCHLD which applies to all children of a given parent. Currently the only way to automatically reap children is to set SA_NOCLDWAIT or SIG_IGN on SIGCHLD. This is a parent-scoped property affecting all children which makes it unsuitable for libraries or applications that need selective auto-reaping of specific children while still being able to wait() on others. CLONE_AUTOREAP stores an autoreap flag in the child's signal_struct. 
When the child exits do_notify_parent() checks this flag and causes exit_notify() to transition the task directly to EXIT_DEAD. Since the flag lives on the child it survives reparenting: if the original parent exits and the child is reparented to a subreaper or init the child still auto-reaps when it eventually exits. CLONE_AUTOREAP can be combined with CLONE_PIDFD to allow the parent to monitor the child's exit via poll() and retrieve exit status via PIDFD_GET_INFO. Without CLONE_PIDFD it provides a fire-and-forget pattern where the parent simply doesn't care about the child's exit status. No exit signal is delivered so exit_signal must be zero. CLONE_AUTOREAP is rejected in combination with CLONE_PARENT. If a CLONE_AUTOREAP child were to clone(CLONE_PARENT) the new grandchild would inherit exit_signal == 0 from the autoreap parent's group leader but without signal->autoreap. This grandchild would become a zombie that never sends a signal and is never autoreaped - confusing and arguably broken behavior. The flag is not inherited by the autoreap process's own children. Each child that should be autoreaped must be explicitly created with CLONE_AUTOREAP. 
Link: https://github.com/uapi-group/kernel-features/issues/45 Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-1-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/linux/sched/signal.h | 1 + include/uapi/linux/sched.h | 5 +++-- kernel/fork.c | 17 ++++++++++++++++- kernel/ptrace.c | 3 ++- kernel/signal.c | 4 ++++ 5 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a22248aebcf9..f842c86b806f 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -132,6 +132,7 @@ struct signal_struct { */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; + unsigned int autoreap:1; #ifdef CONFIG_POSIX_TIMERS diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 359a14cc76a4..69f7b4f9eb0c 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -34,8 +34,9 @@ #define CLONE_IO 0x80000000 /* Clone io context */ /* Flags for the clone3() syscall. */ -#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ -#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. 
*/ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 diff --git a/kernel/fork.c b/kernel/fork.c index e832da9d15a4..10549574fda6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2028,6 +2028,18 @@ __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_AUTOREAP) { + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_PARENT) + return ERR_PTR(-EINVAL); + if (args->exit_signal) + return ERR_PTR(-EINVAL); + } + + if ((clone_flags & CLONE_PARENT) && current->signal->autoreap) + return ERR_PTR(-EINVAL); + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -2435,6 +2447,8 @@ __latent_entropy struct task_struct *copy_process( */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; + if (clone_flags & CLONE_AUTOREAP) + p->signal->autoreap = 1; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); @@ -2897,7 +2911,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. 
*/ if (kargs->flags & - ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | + CLONE_AUTOREAP)) return false; /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 392ec2f75f01..68c17daef8d4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -549,7 +549,8 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!dead && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) dead = do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { + else if (ignoring_children(tracer->sighand) || + p->signal->autoreap) { __wake_up_parent(p, tracer); dead = true; } diff --git a/kernel/signal.c b/kernel/signal.c index d65d0fe24bfb..e61f39fa8c8a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2251,6 +2251,10 @@ bool do_notify_parent(struct task_struct *tsk, int sig) if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = 0; } + if (!tsk->ptrace && tsk->signal->autoreap) { + autoreap = true; + sig = 0; + } /* * Send with __send_signal as si_pid and si_uid are in the * parent's namespaces. -- cgit v1.2.3 From 24baca56fafc33d4fb77cd9858a48c734183cb22 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:51:00 +0100 Subject: clone: add CLONE_NNP Add a new clone3() flag CLONE_NNP that sets no_new_privs on the child process at clone time. This is analogous to prctl(PR_SET_NO_NEW_PRIVS) but applied at process creation rather than requiring a separate step after the child starts running. CLONE_NNP is rejected with CLONE_THREAD. It's conceptually a lot simpler if the whole thread-group is forced into NNP and not have single threads running around with NNP. 
Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-2-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 1 + kernel/fork.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 69f7b4f9eb0c..386c8d7e89cb 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -37,6 +37,7 @@ #define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */ #define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ #define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ +#define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 diff --git a/kernel/fork.c b/kernel/fork.c index 10549574fda6..736798e4005a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2040,6 +2040,11 @@ __latent_entropy struct task_struct *copy_process( if ((clone_flags & CLONE_PARENT) && current->signal->autoreap) return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_NNP) { + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -2424,6 +2429,9 @@ __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + if (clone_flags & CLONE_NNP) + task_set_no_new_privs(p); + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2912,7 +2920,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) /* Verify that no unknown flags are passed along. 
*/ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | - CLONE_AUTOREAP)) + CLONE_AUTOREAP | CLONE_NNP)) return false; /* -- cgit v1.2.3 From c8134b5f13ae959de2b3c8cc278e2602b0857345 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:51:01 +0100 Subject: pidfd: add CLONE_PIDFD_AUTOKILL Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's lifetime to the pidfd returned from clone3(). When the last reference to the struct file created by clone3() is closed the kernel sends SIGKILL to the child. A pidfd obtained via pidfd_open() for the same process does not keep the child alive and does not trigger autokill - only the specific struct file from clone3() has this property. This is useful for container runtimes, service managers, and sandboxed subprocess execution - any scenario where the child must die if the parent crashes or abandons the pidfd. CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no one to reap it would become a zombie). CLONE_THREAD is rejected because autokill targets a process not a thread. The clone3 pidfd is identified by the PIDFD_AUTOKILL file flag set on the struct file at clone3() time. The pidfs .release handler checks this flag and sends SIGKILL via do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...) only when it is set. Files from pidfd_open() or open_by_handle_at() are distinct struct files that do not carry this flag. dup()/fork() share the same struct file so they extend the child's lifetime until the last reference drops. CLONE_PIDFD_AUTOKILL uses a privilege model based on CLONE_NNP: without CLONE_NNP the child could escalate privileges via setuid/setgid exec after being spawned, so the caller must have CAP_SYS_ADMIN in its user namespace. With CLONE_NNP the child can never gain new privileges so unprivileged usage is allowed. 
This is a deliberate departure from the pdeath_signal model which is reset during secureexec and commit_creds() rendering it useless for container runtimes that need to deprivilege themselves. Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-3-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- fs/pidfs.c | 38 ++++++++++++++++++++++++++++++++------ include/uapi/linux/pidfd.h | 1 + include/uapi/linux/sched.h | 1 + kernel/fork.c | 29 ++++++++++++++++++++++++++--- 4 files changed, 60 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/pidfs.c b/fs/pidfs.c index 318253344b5c..a8d1bca0395d 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -637,7 +639,28 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return open_namespace(ns_common); } +static int pidfs_file_release(struct inode *inode, struct file *file) +{ + struct pid *pid = inode->i_private; + struct task_struct *task; + + if (!(file->f_flags & PIDFD_AUTOKILL)) + return 0; + + guard(rcu)(); + task = pid_task(pid, PIDTYPE_TGID); + if (!task) + return 0; + + /* Not available for kthreads or user workers for now. */ + if (WARN_ON_ONCE(task->flags & (PF_KTHREAD | PF_USER_WORKER))) + return 0; + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID); + return 0; +} + static const struct file_operations pidfs_file_operations = { + .release = pidfs_file_release, .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, @@ -1093,11 +1116,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) int ret; /* - * Ensure that PIDFD_STALE can be passed as a flag without - * overloading other uapi pidfd flags. + * Ensure that internal pidfd flags don't overlap with each + * other or with uapi pidfd flags. 
*/ - BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD); - BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK); + BUILD_BUG_ON(hweight32(PIDFD_THREAD | PIDFD_NONBLOCK | + PIDFD_STALE | PIDFD_AUTOKILL) != 4); ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) @@ -1108,9 +1131,12 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) flags &= ~PIDFD_STALE; flags |= O_RDWR; pidfd_file = dentry_open(&path, flags, current_cred()); - /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */ + /* + * Raise PIDFD_THREAD and PIDFD_AUTOKILL explicitly as + * do_dentry_open() strips O_EXCL and O_TRUNC. + */ if (!IS_ERR(pidfd_file)) - pidfd_file->f_flags |= (flags & PIDFD_THREAD); + pidfd_file->f_flags |= (flags & (PIDFD_THREAD | PIDFD_AUTOKILL)); return pidfd_file; } diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index ea9a6811fc76..9281956a9f32 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -13,6 +13,7 @@ #ifdef __KERNEL__ #include #define PIDFD_STALE CLONE_PIDFD +#define PIDFD_AUTOKILL O_TRUNC #endif /* Flags for pidfd_send_signal(). */ diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 386c8d7e89cb..149dbc64923b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -38,6 +38,7 @@ #define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ #define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ #define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */ +#define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. 
*/ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 diff --git a/kernel/fork.c b/kernel/fork.c index 736798e4005a..99a6cb4e7ab0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2045,6 +2045,24 @@ __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_PIDFD_AUTOKILL) { + if (!(clone_flags & CLONE_PIDFD)) + return ERR_PTR(-EINVAL); + if (!(clone_flags & CLONE_AUTOREAP)) + return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + /* + * Without CLONE_NNP the child could escalate privileges + * after being spawned, so require CAP_SYS_ADMIN. + * With CLONE_NNP the child can't gain new privileges, + * so allow unprivileged usage. + */ + if (!(clone_flags & CLONE_NNP) && + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -2267,13 +2285,18 @@ __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; + unsigned flags = PIDFD_STALE; + + if (clone_flags & CLONE_THREAD) + flags |= PIDFD_THREAD; + if (clone_flags & CLONE_PIDFD_AUTOKILL) + flags |= PIDFD_AUTOKILL; /* * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. */ - retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); + retval = pidfd_prepare(pid, flags, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; @@ -2920,7 +2943,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) /* Verify that no unknown flags are passed along. 
*/ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | - CLONE_AUTOREAP | CLONE_NNP)) + CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL)) return false; /* -- cgit v1.2.3 From 4ef420b3450026b56807e5d53001f80eb495403c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 10 Mar 2026 18:01:01 -0700 Subject: cgroup: replace global cgroup_file_kn_lock with per-cgroup_file lock Replace the global cgroup_file_kn_lock with a per-cgroup_file spinlock to eliminate cross-cgroup contention as it is not really protecting data shared between different cgroups. The lock is initialized in cgroup_add_file() alongside timer_setup(). No lock acquisition is needed during initialization since the cgroup directory is being populated under cgroup_mutex and no concurrent accessors exist at that point. Reported-by: Jakub Kicinski Signed-off-by: Shakeel Butt Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + kernel/cgroup/cgroup.c | 24 ++++++++---------------- 2 files changed, 9 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index bb92f5c169ca..ba26b5d05ce3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -167,6 +167,7 @@ struct cgroup_file { struct kernfs_node *kn; unsigned long notified_at; struct timer_list notify_timer; + spinlock_t lock; }; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d161bcaa68f1..6f58efeb9016 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -99,12 +99,6 @@ static bool cgroup_debug __read_mostly; */ static DEFINE_SPINLOCK(cgroup_idr_lock); -/* - * Protects cgroup_file->kn for !self csses. It synchronizes notifications - * against file removal/re-creation across css hiding. 
- */ -static DEFINE_SPINLOCK(cgroup_file_kn_lock); - DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ @@ -1693,9 +1687,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; - spin_lock_irq(&cgroup_file_kn_lock); + spin_lock_irq(&cfile->lock); WRITE_ONCE(cfile->kn, NULL); - spin_unlock_irq(&cgroup_file_kn_lock); + spin_unlock_irq(&cfile->lock); timer_delete_sync(&cfile->notify_timer); } @@ -4373,10 +4367,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); - - spin_lock_irq(&cgroup_file_kn_lock); - WRITE_ONCE(cfile->kn, kn); - spin_unlock_irq(&cgroup_file_kn_lock); + spin_lock_init(&cfile->lock); + cfile->kn = kn; } return 0; @@ -4645,13 +4637,13 @@ void cgroup_file_notify(struct cgroup_file *cfile) return; } - spin_lock_irqsave(&cgroup_file_kn_lock, flags); + spin_lock_irqsave(&cfile->lock, flags); if (cfile->kn) { kn = cfile->kn; kernfs_get(kn); WRITE_ONCE(cfile->notified_at, jiffies); } - spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); + spin_unlock_irqrestore(&cfile->lock, flags); if (kn) { kernfs_notify(kn); @@ -4669,10 +4661,10 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; - spin_lock_irq(&cgroup_file_kn_lock); + spin_lock_irq(&cfile->lock); kn = cfile->kn; kernfs_get(kn); - spin_unlock_irq(&cgroup_file_kn_lock); + spin_unlock_irq(&cfile->lock); if (kn) kernfs_show(kn, show); -- cgit v1.2.3 From 49b76317592ecbaefd0969d51d02019966cc994b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:37 -0800 Subject: sched/wait: correct kernel-doc descriptions Use the correct function name and function parameter name to avoid these kernel-doc warnings: Warning: 
include/linux/wait_bit.h:424 expecting prototype for wait_var_event_killable(). Prototype was for wait_var_event_interruptible() instead Warning: include/linux/wait_bit.h:508 function parameter 'lock' not described in 'wait_var_event_mutex' Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260302005237.3473095-1-rdunlap@infradead.org --- include/linux/wait_bit.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 9e29d79fc790..ace7379d627d 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -406,7 +406,7 @@ do { \ schedule()) /** - * wait_var_event_killable - wait for a variable to be updated and notified + * wait_var_event_interruptible - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * @@ -492,7 +492,7 @@ do { \ * wait_var_event_mutex - wait for a variable to be updated under a mutex * @var: the address of the variable being waited on * @condition: condition to wait for - * @mutex: the mutex which protects updates to the variable + * @lock: the mutex which protects updates to the variable * * Wait for a condition which can only be reliably tested while holding * a mutex. The variables assessed in the condition will normal be -- cgit v1.2.3 From 2270bd7124f4d25497d58c293cd40ea014ddaf01 Mon Sep 17 00:00:00 2001 From: Nitin Gote Date: Wed, 4 Mar 2026 18:08:00 +0530 Subject: drm/xe: add VM_BIND DECOMPRESS uapi flag Add a new VM_BIND flag, DRM_XE_VM_BIND_FLAG_DECOMPRESS, that lets userspace express intent for the driver to perform on-device in-place decompression for the GPU mapping created by a MAP bind operation. This flag is used by subsequent driver changes to trigger scheduling of GPU work that resolves compressed VRAM pages into an uncompressed PAT VM mapping. 
Behavior and semantics: - Valid only for DRM_XE_VM_BIND_OP_MAP. IOCTLs using this flag on other ops are rejected (-EINVAL). - The bind's pat_index must select the device "no-compression" PAT entry; otherwise the ioctl is rejected (-EINVAL). - Only meaningful for VRAM-backed BOs on devices that support Flat CCS and the required hardware generation (driver will return -EOPNOTSUPP if not). - On success the driver schedules a migrate/resolve and installs the returned dma_fence into the BO's kernel reservation (DMA_RESV_USAGE_KERNEL). Compute PR: https://github.com/intel/compute-runtime/pull/898 v3: Rebase on latest drm-tip and add compute pr info v2: Add kernel doc (Matt) Cc: Matthew Brost Cc: Matthew Auld Cc: Mrozek, Michal Reviewed-by: Matthew Brost Signed-off-by: Nitin Gote Acked-by: Michal Mrozek Signed-off-by: Matthew Auld Link: https://patch.msgid.link/20260304123758.3050386-6-nitin.r.gote@intel.com --- include/uapi/drm/xe_drm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index f074871b4d96..0497b85fa12a 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -1057,6 +1057,13 @@ struct drm_xe_vm_destroy { * not invoke autoreset. Neither will stack variables going out of scope. * Therefore it's recommended to always explicitly reset the madvises when * freeing the memory backing a region used in a &DRM_IOCTL_XE_MADVISE call. + * - DRM_XE_VM_BIND_FLAG_DECOMPRESS - Request on-device decompression for a MAP. + * When set on a MAP bind operation, request the driver schedule an on-device + * in-place decompression (via the migrate/resolve path) for the GPU mapping + * created by this bind. Only valid for DRM_XE_VM_BIND_OP_MAP; usage on + * other ops is rejected. The bind's pat_index must select the device's + * "no-compression" PAT. Only meaningful for VRAM-backed BOs on devices that + * support Flat CCS and the required HW generation XE2+. 
* * The @prefetch_mem_region_instance for %DRM_XE_VM_BIND_OP_PREFETCH can also be: * - %DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC, which ensures prefetching occurs in @@ -1164,6 +1171,7 @@ struct drm_xe_vm_bind_op { #define DRM_XE_VM_BIND_FLAG_CHECK_PXP (1 << 4) #define DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR (1 << 5) #define DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET (1 << 6) +#define DRM_XE_VM_BIND_FLAG_DECOMPRESS (1 << 7) /** @flags: Bind flags */ __u32 flags; -- cgit v1.2.3 From 0de607dc4fd80ede3b2a35e8a72f99c7a0bbc321 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 4 Mar 2026 23:00:27 +0000 Subject: vsock: add G2H fallback for CIDs not owned by H2G transport When no H2G transport is loaded, vsock currently routes all CIDs to the G2H transport (commit 65b422d9b61b ("vsock: forward all packets to the host when no H2G is registered"). Extend that existing behavior: when an H2G transport is loaded but does not claim a given CID, the connection falls back to G2H in the same way. This matters in environments like Nitro Enclaves, where an instance may run nested VMs via vhost-vsock (H2G) while also needing to reach sibling enclaves at higher CIDs through virtio-vsock-pci (G2H). With the old code, any CID > 2 was unconditionally routed to H2G when vhost was loaded, making those enclaves unreachable without setting VMADDR_FLAG_TO_HOST explicitly on every connect. Requiring every application to set VMADDR_FLAG_TO_HOST creates friction: tools like socat, iperf, and others would all need to learn about it. The flag was introduced 6 years ago and I am still not aware of any tool that supports it. Even if there was support, it would be cumbersome to use. The most natural experience is a single CID address space where H2G only wins for CIDs it actually owns, and everything else falls through to G2H, extending the behavior that already exists when H2G is absent. 
To give user space at least a hint that the kernel applied this logic, automatically set the VMADDR_FLAG_TO_HOST on the remote address so it can determine the path taken via getpeername(). Add a per-network namespace sysctl net.vsock.g2h_fallback (default 1). At 0 it forces strict routing: H2G always wins for CID > VMADDR_CID_HOST, or ENODEV if H2G is not loaded. Signed-off-by: Alexander Graf Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20260304230027.59857-1-graf@amazon.com Signed-off-by: Paolo Abeni --- Documentation/admin-guide/sysctl/net.rst | 28 +++++++++++++++++++++++++ drivers/vhost/vsock.c | 13 ++++++++++++ include/net/af_vsock.h | 9 ++++++++ include/net/netns/vsock.h | 2 ++ net/vmw_vsock/af_vsock.c | 35 +++++++++++++++++++++++++++----- net/vmw_vsock/virtio_transport.c | 7 +++++++ 6 files changed, 89 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 3b2ad61995d4..0724a793798f 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -602,3 +602,31 @@ it does not modify the current namespace or any existing children. A namespace with ``ns_mode`` set to ``local`` cannot change ``child_ns_mode`` to ``global`` (returns ``-EPERM``). + +g2h_fallback +------------ + +Controls whether connections to CIDs not owned by the host-to-guest (H2G) +transport automatically fall back to the guest-to-host (G2H) transport. + +When enabled, if a connect targets a CID that the H2G transport (e.g. +vhost-vsock) does not serve, or if no H2G transport is loaded at all, the +connection is routed via the G2H transport (e.g. virtio-vsock) instead. This +allows a host running both nested VMs (via vhost-vsock) and sibling VMs +reachable through the hypervisor (e.g. 
Nitro Enclaves) to address both using +a single CID space, without requiring applications to set +``VMADDR_FLAG_TO_HOST``. + +When the fallback is taken, ``VMADDR_FLAG_TO_HOST`` is automatically set on +the remote address so that userspace can determine the path via +``getpeername()``. + +Note: With this sysctl enabled, user space that attempts to talk to a guest +CID which is not implemented by the H2G transport will create host vsock +traffic. Environments that rely on H2G-only isolation should set it to 0. + +Values: + + - 0 - Connections to CIDs <= 2 or with VMADDR_FLAG_TO_HOST use G2H; + all others use H2G (or fail with ENODEV if H2G is not loaded). + - 1 - Connections to CIDs not owned by H2G fall back to G2H. (default) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 054f7a718f50..1d8ec6bed53e 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -91,6 +91,18 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid, struct net *net) return NULL; } +static bool vhost_transport_has_remote_cid(struct vsock_sock *vsk, u32 cid) +{ + struct sock *sk = sk_vsock(vsk); + struct net *net = sock_net(sk); + bool found; + + rcu_read_lock(); + found = !!vhost_vsock_get(cid, net); + rcu_read_unlock(); + return found; +} + static void vhost_transport_do_send_pkt(struct vhost_vsock *vsock, struct vhost_virtqueue *vq) @@ -424,6 +436,7 @@ static struct virtio_transport vhost_transport = { .module = THIS_MODULE, .get_local_cid = vhost_transport_get_local_cid, + .has_remote_cid = vhost_transport_has_remote_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct, diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 533d8e75f7bb..4e40063adab4 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -179,6 +179,15 @@ struct vsock_transport { /* Addressing. */ u32 (*get_local_cid)(void); + /* Check if this transport serves a specific remote CID. 
+ * For H2G transports: return true if the CID belongs to a registered + * guest. If not implemented, all CIDs > VMADDR_CID_HOST go to H2G. + * For G2H transports: return true if the transport can reach arbitrary + * CIDs via the hypervisor (i.e. supports the fallback overlay). VMCI + * does not implement this as it only serves CIDs 0 and 2. + */ + bool (*has_remote_cid)(struct vsock_sock *vsk, u32 remote_cid); + /* Read a single skb */ int (*read_skb)(struct vsock_sock *, skb_read_actor_t); diff --git a/include/net/netns/vsock.h b/include/net/netns/vsock.h index dc8cbe45f406..7f84aad92f57 100644 --- a/include/net/netns/vsock.h +++ b/include/net/netns/vsock.h @@ -20,5 +20,7 @@ struct netns_vsock { /* 0 = unlocked, 1 = locked to global, 2 = locked to local */ int child_ns_mode_locked; + + int g2h_fallback; }; #endif /* __NET_NET_NAMESPACE_VSOCK_H */ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index f0ab2f13e9db..cc4b225250b9 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -545,9 +545,13 @@ static void vsock_deassign_transport(struct vsock_sock *vsk) * The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if * g2h is not loaded, will use local transport; - * - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field - * includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport; - * - remote CID > VMADDR_CID_HOST will use host->guest transport; + * - remote CID <= VMADDR_CID_HOST or remote flags field includes + * VMADDR_FLAG_TO_HOST, will use guest->host transport; + * - remote CID > VMADDR_CID_HOST and h2g is loaded and h2g claims that CID, + * will use host->guest transport; + * - h2g not loaded or h2g does not claim that CID and g2h claims the CID via + * has_remote_cid, will use guest->host transport (when g2h_fallback=1) + * - anything else goes to h2g or returns -ENODEV if no h2g is available */ int 
vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) { @@ -581,11 +585,21 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) case SOCK_SEQPACKET: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; - else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g || + else if (remote_cid <= VMADDR_CID_HOST || (remote_flags & VMADDR_FLAG_TO_HOST)) new_transport = transport_g2h; - else + else if (transport_h2g && + (!transport_h2g->has_remote_cid || + transport_h2g->has_remote_cid(vsk, remote_cid))) + new_transport = transport_h2g; + else if (sock_net(sk)->vsock.g2h_fallback && + transport_g2h && transport_g2h->has_remote_cid && + transport_g2h->has_remote_cid(vsk, remote_cid)) { + vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST; + new_transport = transport_g2h; + } else { new_transport = transport_h2g; + } break; default: ret = -ESOCKTNOSUPPORT; @@ -2879,6 +2893,15 @@ static struct ctl_table vsock_table[] = { .mode = 0644, .proc_handler = vsock_net_child_mode_string }, + { + .procname = "g2h_fallback", + .data = &init_net.vsock.g2h_fallback, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static int __net_init vsock_sysctl_register(struct net *net) @@ -2894,6 +2917,7 @@ static int __net_init vsock_sysctl_register(struct net *net) table[0].data = &net->vsock.mode; table[1].data = &net->vsock.child_ns_mode; + table[2].data = &net->vsock.g2h_fallback; } net->vsock.sysctl_hdr = register_net_sysctl_sz(net, "net/vsock", table, @@ -2928,6 +2952,7 @@ static void vsock_net_init(struct net *net) net->vsock.mode = vsock_net_child_mode(current->nsproxy->net_ns); net->vsock.child_ns_mode = net->vsock.mode; + net->vsock.g2h_fallback = 1; } static __net_init int vsock_sysctl_init_net(struct net *net) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 77fe5b7b066c..57f2d6ec3ffc 100644 --- 
a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -547,11 +547,18 @@ bool virtio_transport_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port) static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid); +static bool virtio_transport_has_remote_cid(struct vsock_sock *vsk, u32 cid) +{ + /* The CID could be implemented by the host. Always assume it is. */ + return true; +} + static struct virtio_transport virtio_transport = { .transport = { .module = THIS_MODULE, .get_local_cid = virtio_transport_get_local_cid, + .has_remote_cid = virtio_transport_has_remote_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct, -- cgit v1.2.3 From 754e38d2d1aeeadddac5220f34e07cf263502a46 Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:11 +0100 Subject: tracing: Use explicit array size instead of sentinel elements in symbol printing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sentinel value added by the wrapper macros __print_symbolic() et al prevents the callers from adding their own trailing comma. This makes constructing symbol lists dynamically based on kconfig values tedious. Drop the sentinel elements, so callers can either specify the trailing comma or not, just like in regular array initializers. 
Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-2-095357392669@linutronix.de --- include/linux/trace_events.h | 13 ++++++---- include/trace/stages/stage3_trace_output.h | 40 +++++++++++++++--------------- kernel/trace/trace_events_synth.c | 4 +-- kernel/trace/trace_output.c | 20 +++++++++------ kernel/trace/trace_syscalls.c | 3 +-- 5 files changed, 43 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 37eb2f0f3dd8..40a43a4c7caf 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -22,20 +22,23 @@ union bpf_attr; const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, - const struct trace_print_flags *flag_array); + const struct trace_print_flags *flag_array, + size_t flag_array_size); const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array); + const struct trace_print_flags *symbol_array, + size_t symbol_array_size); #if BITS_PER_LONG == 32 const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, - const struct trace_print_flags_u64 *flag_array); + const struct trace_print_flags_u64 *flag_array, + size_t flag_array_size); const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 - *symbol_array); + const struct trace_print_flags_u64 *symbol_array, + size_t symbol_array_size); #endif struct trace_iterator; diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h index fce85ea2df1c..b7d8ef4b9fe1 100644 --- a/include/trace/stages/stage3_trace_output.h +++ b/include/trace/stages/stage3_trace_output.h @@ -64,36 +64,36 @@ #define __get_rel_sockaddr(field) ((struct sockaddr *)__get_rel_dynamic_array(field)) #undef 
__print_flags -#define __print_flags(flag, delim, flag_array...) \ - ({ \ - static const struct trace_print_flags __flags[] = \ - { flag_array, { -1, NULL }}; \ - trace_print_flags_seq(p, delim, flag, __flags); \ +#define __print_flags(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags __flags[] = \ + { flag_array }; \ + trace_print_flags_seq(p, delim, flag, __flags, ARRAY_SIZE(__flags)); \ }) #undef __print_symbolic -#define __print_symbolic(value, symbol_array...) \ - ({ \ - static const struct trace_print_flags symbols[] = \ - { symbol_array, { -1, NULL }}; \ - trace_print_symbols_seq(p, value, symbols); \ +#define __print_symbolic(value, symbol_array...) \ + ({ \ + static const struct trace_print_flags symbols[] = \ + { symbol_array }; \ + trace_print_symbols_seq(p, value, symbols, ARRAY_SIZE(symbols)); \ }) #undef __print_flags_u64 #undef __print_symbolic_u64 #if BITS_PER_LONG == 32 -#define __print_flags_u64(flag, delim, flag_array...) \ - ({ \ - static const struct trace_print_flags_u64 __flags[] = \ - { flag_array, { -1, NULL } }; \ - trace_print_flags_seq_u64(p, delim, flag, __flags); \ +#define __print_flags_u64(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags_u64 __flags[] = \ + { flag_array }; \ + trace_print_flags_seq_u64(p, delim, flag, __flags, ARRAY_SIZE(__flags)); \ }) -#define __print_symbolic_u64(value, symbol_array...) \ - ({ \ - static const struct trace_print_flags_u64 symbols[] = \ - { symbol_array, { -1, NULL } }; \ - trace_print_symbols_seq_u64(p, value, symbols); \ +#define __print_symbolic_u64(value, symbol_array...) \ + ({ \ + static const struct trace_print_flags_u64 symbols[] = \ + { symbol_array }; \ + trace_print_symbols_seq_u64(p, value, symbols, ARRAY_SIZE(symbols)); \ }) #else #define __print_flags_u64(flag, delim, flag_array...) 
\ diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 8bb95b2a6fcf..39ac4eba0702 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -395,7 +395,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, n_u64++; } else { struct trace_print_flags __flags[] = { - __def_gfpflag_names, {-1, NULL} }; + __def_gfpflag_names }; char *space = (i == se->n_fields - 1 ? "" : " "); print_synth_event_num_val(s, print_fmt, @@ -408,7 +408,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, trace_seq_puts(s, " ("); trace_print_flags_seq(s, "|", entry->fields[n_u64].as_u64, - __flags); + __flags, ARRAY_SIZE(__flags)); trace_seq_putc(s, ')'); } n_u64++; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1996d7aba038..96e2d22b4364 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -69,14 +69,15 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) const char * trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, - const struct trace_print_flags *flag_array) + const struct trace_print_flags *flag_array, + size_t flag_array_size) { unsigned long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -106,12 +107,13 @@ EXPORT_SYMBOL(trace_print_flags_seq); const char * trace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array) + const struct trace_print_flags *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; @@ -133,14 +135,15 @@ 
EXPORT_SYMBOL(trace_print_symbols_seq); const char * trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, - const struct trace_print_flags_u64 *flag_array) + const struct trace_print_flags_u64 *flag_array, + size_t flag_array_size) { unsigned long long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -170,12 +173,13 @@ EXPORT_SYMBOL(trace_print_flags_seq_u64); const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 *symbol_array) + const struct trace_print_flags_u64 *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 37317b81fcda..8ad72e17d8eb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -174,7 +174,6 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat { O_NOFOLLOW, "O_NOFOLLOW" }, { O_NOATIME, "O_NOATIME" }, { O_CLOEXEC, "O_CLOEXEC" }, - { -1, NULL } }; trace_seq_printf(s, "%s(", entry->name); @@ -205,7 +204,7 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat trace_seq_puts(s, "O_RDONLY|"); } - trace_print_flags_seq(s, "|", bits, __flags); + trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags)); /* * trace_print_flags_seq() adds a '\0' to the * buffer, but this needs to append more to the seq. 
-- cgit v1.2.3 From 8ef2807042d0886a85bbcb0aba1a2a277680dc4a Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:15 +0100 Subject: hrtimer: Remove hrtimer_get_expires_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users left. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-6-095357392669@linutronix.de --- include/linux/hrtimer.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c087b7142330..9ced498fefaa 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -116,11 +116,6 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) return timer->_softexpires; } -static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) -{ - return ktime_to_ns(timer->node.expires); -} - ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) -- cgit v1.2.3 From b94c076dd949426d09e5d415304acb3f951d9069 Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:17 +0100 Subject: hrtimer: Drop spurious space in 'enum hrtimer_base_type' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This spurious space makes grepping for the enum definition annoying. Remove it. 
Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-8-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 0f851b2432c3..e6d4dc1b61e0 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -35,7 +35,7 @@ struct hrtimer_clock_base { ktime_t offset; } __hrtimer_clock_base_align; -enum hrtimer_base_type { +enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, -- cgit v1.2.3 From bd803783dfa7ddd5e1d44a6abfeee26fdc3a2db7 Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:18 +0100 Subject: hrtimer: Drop unnecessary pointer indirection in hrtimer_expire_entry event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This pointer indirection is a remnant from when ktime_t was a struct, today it is pointless. Drop the pointer indirection. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-9-095357392669@linutronix.de --- include/trace/events/timer.h | 7 +++---- kernel/time/hrtimer.c | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index a54613f59e55..07cbb9836b91 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -254,14 +254,13 @@ TRACE_EVENT(hrtimer_start, /** * hrtimer_expire_entry - called immediately before the hrtimer callback * @hrtimer: pointer to struct hrtimer - * @now: pointer to variable which contains current time of the - * timers base. + * @now: variable which contains current time of the timers base. 
* * Allows to determine the timer latency. */ TRACE_EVENT(hrtimer_expire_entry, - TP_PROTO(struct hrtimer *hrtimer, ktime_t *now), + TP_PROTO(struct hrtimer *hrtimer, ktime_t now), TP_ARGS(hrtimer, now), @@ -273,7 +272,7 @@ TRACE_EVENT(hrtimer_expire_entry, TP_fast_assign( __entry->hrtimer = hrtimer; - __entry->now = *now; + __entry->now = now; __entry->function = ACCESS_PRIVATE(hrtimer, function); ), diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c7e7d3a0d6bb..a71e2caa1402 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1884,7 +1884,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * __run_hrtimer() invocations. */ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, - struct hrtimer *timer, ktime_t *now, unsigned long flags) + struct hrtimer *timer, ktime_t now, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); @@ -1989,7 +1989,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, if (basenow < hrtimer_get_softexpires(timer)) break; - __run_hrtimer(cpu_base, base, timer, &basenow, flags); + __run_hrtimer(cpu_base, base, timer, basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } -- cgit v1.2.3 From f12ef5cb4e035e15f0c324c41ff402441578ffda Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:19 +0100 Subject: hrtimer: Mark index and clockid of clock base as const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These fields are initialized once and are never supposed to change. Mark them as const to make this explicit. 
Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-10-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index e6d4dc1b61e0..a03240c0b14f 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -26,8 +26,8 @@ */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; - unsigned int index; - clockid_t clockid; + const unsigned int index; + const clockid_t clockid; seqcount_raw_spinlock_t seq; ktime_t expires_next; struct hrtimer *running; -- cgit v1.2.3 From f27fc117cf8fba56e0619694e685f9bca9b9cb82 Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:20 +0100 Subject: hrtimer: Remove trailing comma after HRTIMER_MAX_CLOCK_BASES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HRTIMER_MAX_CLOCK_BASES is required to stay the last value of the enum. Drop the trailing comma so no new members are added after it by mistake. 
Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-11-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index a03240c0b14f..52ed9e46ff13 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -44,7 +44,7 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME_SOFT, HRTIMER_BASE_BOOTTIME_SOFT, HRTIMER_BASE_TAI_SOFT, - HRTIMER_MAX_CLOCK_BASES, + HRTIMER_MAX_CLOCK_BASES }; /** -- cgit v1.2.3 From 6f459eda8b60382efa0da2ca025c26a2018adc87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Mar 2026 12:44:51 +0000 Subject: tcp: add tcp_release_cb_cond() helper Majority of tcp_release_cb() calls do nothing at all. Provide tcp_release_cb_cond() helper so that release_sock() can avoid these calls. Also hint the compiler that __release_sock() and wake_up() are rarely called. 
$ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-77 (-77) Function old new delta release_sock 258 181 -77 Total: Before=25235790, After=25235713, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260310124451.2280968-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 7 +++++++ include/net/tcp.h | 14 ++++++++++++++ net/core/sock.c | 14 ++++++++------ net/ipv4/tcp_output.c | 5 ----- 4 files changed, 29 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c44cf9ae8d16..bcebc4f07532 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -548,6 +548,13 @@ enum tsq_flags { TCPF_ACK_DEFERRED = BIT(TCP_ACK_DEFERRED), }; +/* Flags of interest for tcp_release_cb() */ +#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ + TCPF_WRITE_TIMER_DEFERRED | \ + TCPF_DELACK_TIMER_DEFERRED | \ + TCPF_MTU_REDUCED_DEFERRED | \ + TCPF_ACK_DEFERRED) + #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) /* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket. 
diff --git a/include/net/tcp.h b/include/net/tcp.h index 9f0aee9e5d76..48dffcca0a71 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -375,7 +375,21 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags); int tcp_wmem_schedule(struct sock *sk, int copy); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); + void tcp_release_cb(struct sock *sk); + +static inline bool tcp_release_cb_cond(struct sock *sk) +{ +#ifdef CONFIG_INET + if (likely(sk->sk_prot->release_cb == tcp_release_cb)) { + if (unlikely(smp_load_acquire(&sk->sk_tsq_flags) & TCP_DEFERRED_ALL)) + tcp_release_cb(sk); + return true; + } +#endif + return false; +} + void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index f4e2ff23d60e..fdaf66e6dc18 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3807,16 +3807,18 @@ EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); - if (sk->sk_backlog.tail) - __release_sock(sk); - if (sk->sk_prot->release_cb) - INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, - tcp_release_cb, sk); + if (unlikely(sk->sk_backlog.tail)) + __release_sock(sk); + if (sk->sk_prot->release_cb) { + if (!tcp_release_cb_cond(sk)) + sk->sk_prot->release_cb(sk); + } sock_release_ownership(sk); - if (waitqueue_active(&sk->sk_lock.wq)) + if (unlikely(waitqueue_active(&sk->sk_lock.wq))) wake_up(&sk->sk_lock.wq); + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a53802f28dd1..34a25ef61006 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1320,11 +1320,6 @@ static void tcp_tsq_workfn(struct work_struct *work) } } -#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ - TCPF_WRITE_TIMER_DEFERRED | \ - TCPF_DELACK_TIMER_DEFERRED | \ - TCPF_MTU_REDUCED_DEFERRED | \ - TCPF_ACK_DEFERRED) 
/** * tcp_release_cb - tcp release_sock() callback * @sk: socket -- cgit v1.2.3 From 5e8969bd192712419aae511dd5ba26855c2c78db Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Jan 2026 11:48:48 +0100 Subject: mount: add FSMOUNT_NAMESPACE Add FSMOUNT_NAMESPACE flag to fsmount() that creates a new mount namespace with the newly created filesystem attached to a copy of the real rootfs. This returns a namespace file descriptor instead of an O_PATH mount fd, similar to how OPEN_TREE_NAMESPACE works for open_tree(). This allows creating a new filesystem and immediately placing it in a new mount namespace in a single operation, which is useful for container runtimes and other namespace-based isolation mechanisms. The rootfs mount is created before copying the real rootfs for the new namespace meaning that the mount namespace id for the mount of the root of the namespace is bigger than the child mounted on top of it. We've never explicitly given the guarantee for such ordering and I doubt anyone relies on it. Accepting that lets us avoid copying the mount again and also avoids having to massage may_copy_tree() to grant an exception for fsmount->mnt->mnt_ns being NULL. Link: https://patch.msgid.link/20260122-work-fsmount-namespace-v1-3-5ef0a886e646@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 37 ++++++++++++++++++++++++++++++------- include/uapi/linux/mount.h | 1 + 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index b098d1131e69..702e93243505 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3118,11 +3118,26 @@ static struct mnt_namespace *create_new_namespace(struct path *path, } /* - * We don't emulate unshare()ing a mount namespace. We stick - * to the restrictions of creating detached bind-mounts. It - * has a lot saner and simpler semantics. + * We don't emulate unshare()ing a mount namespace. We stick to + * the restrictions of creating detached bind-mounts. 
It has a + * lot saner and simpler semantics. */ - mnt = __do_loopback(path, recurse, copy_flags); + mnt = real_mount(path->mnt); + if (!mnt->mnt_ns) { + /* + * If we're moving into a new mount namespace via + * fsmount() swap the mount ids so the nullfs mount id + * is the lowest in the mount namespace avoiding another + * useless copy. This is fine we're not attached to any + * mount namespace so the mount ids are pure decoration + * at that point. + */ + swap(mnt->mnt_id_unique, new_ns_root->mnt_id_unique); + swap(mnt->mnt_id, new_ns_root->mnt_id); + mntget(&mnt->mnt); + } else { + mnt = __do_loopback(path, recurse, copy_flags); + } scoped_guard(mount_writer) { if (IS_ERR(mnt)) { emptied_ns = new_ns; @@ -4401,11 +4416,15 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int mnt_flags = 0; long ret; - if (!may_mount()) + if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0) + return -EINVAL; + + if ((flags & FSMOUNT_NAMESPACE) && + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; - if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) - return -EINVAL; + if (!(flags & FSMOUNT_NAMESPACE) && !may_mount()) + return -EPERM; if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; @@ -4472,6 +4491,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, */ vfs_clean_context(fc); + if (flags & FSMOUNT_NAMESPACE) + return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0, + open_new_namespace(&new_path, 0)); + ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); if (IS_ERR(ns)) return PTR_ERR(ns); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index d9d86598d100..2204708dbf7a 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -110,6 +110,7 @@ enum fsconfig_command { * fsmount() flags. */ #define FSMOUNT_CLOEXEC 0x00000001 +#define FSMOUNT_NAMESPACE 0x00000002 /* Create the mount in a new mount namespace */ /* * Mount attributes. 
-- cgit v1.2.3 From 9d4e752a24f740b31ca827bfab07010e4e7f34b0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 6 Mar 2026 17:28:37 +0100 Subject: namespace: allow creating empty mount namespaces Add support for creating a mount namespace that contains only a copy of the root mount from the caller's mount namespace, with none of the child mounts. This is useful for containers and sandboxes that want to start with a minimal mount table and populate it from scratch rather than inheriting and then tearing down the full mount tree. Two new flags are introduced: - CLONE_EMPTY_MNTNS for clone3(), using the 64-bit flag space. - UNSHARE_EMPTY_MNTNS for unshare(), reusing the CLONE_PARENT_SETTID bit which has no meaning for unshare. Both flags imply CLONE_NEWNS. For the unshare path, UNSHARE_EMPTY_MNTNS is converted to CLONE_EMPTY_MNTNS in unshare_nsproxy_namespaces() before it reaches copy_mnt_ns(), so the mount namespace code only needs to handle a single flag. In copy_mnt_ns(), when CLONE_EMPTY_MNTNS is set, clone_mnt() is used instead of copy_tree() to clone only the root mount. The caller's root and working directory are both reset to the root dentry of the new mount. The cleanup variables are changed from vfsmount pointers with __free(mntput) to struct path with __free(path_put) because the empty mount namespace path needs to release both mount and dentry references when replacing the caller's root and pwd. In the normal (non-empty) path only the mount component is set, and dput(NULL) is a no-op so path_put remains correct there as well. 
Link: https://patch.msgid.link/20260306-work-empty-mntns-consolidated-v1-1-6eb30529bbb0@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 85 ++++++++++++++++++++++++++++++---------------- include/uapi/linux/sched.h | 7 ++++ kernel/fork.c | 17 ++++++++-- kernel/nsproxy.c | 21 +++++++++--- 4 files changed, 94 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/fs/namespace.c b/fs/namespace.c index 702e93243505..555f0a10de9a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4233,8 +4233,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt __free(mntput) = NULL; - struct vfsmount *pwdmnt __free(mntput) = NULL; + struct path old_root __free(path_put) = {}; + struct path old_pwd __free(path_put) = {}; struct mount *p, *q; struct mount *old; struct mount *new; @@ -4254,11 +4254,18 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, return new_ns; guard(namespace_excl)(); - /* First pass: copy the tree topology */ - copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; + + if (flags & CLONE_EMPTY_MNTNS) + copy_flags = 0; + else + copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; - new = copy_tree(old, old->mnt.mnt_root, copy_flags); + + if (flags & CLONE_EMPTY_MNTNS) + new = clone_mnt(old, old->mnt.mnt_root, copy_flags); + else + new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { emptied_ns = new_ns; return ERR_CAST(new); @@ -4269,33 +4276,53 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, } new_ns->root = new; - /* - * Second pass: switch the tsk->fs->* elements and mark new vfsmounts - * as belonging to new namespace. We have already acquired a private - * fs_struct, so tsk->fs->lock is not needed. 
- */ - p = old; - q = new; - while (p) { - mnt_add_to_ns(new_ns, q); - new_ns->nr_mounts++; + if (flags & CLONE_EMPTY_MNTNS) { + /* + * Empty mount namespace: only the root mount exists. + * Reset root and pwd to the cloned mount's root dentry. + */ if (new_fs) { - if (&p->mnt == new_fs->root.mnt) { - new_fs->root.mnt = mntget(&q->mnt); - rootmnt = &p->mnt; - } - if (&p->mnt == new_fs->pwd.mnt) { - new_fs->pwd.mnt = mntget(&q->mnt); - pwdmnt = &p->mnt; + old_root = new_fs->root; + old_pwd = new_fs->pwd; + + new_fs->root.mnt = mntget(&new->mnt); + new_fs->root.dentry = dget(new->mnt.mnt_root); + + new_fs->pwd.mnt = mntget(&new->mnt); + new_fs->pwd.dentry = dget(new->mnt.mnt_root); + } + mnt_add_to_ns(new_ns, new); + new_ns->nr_mounts++; + } else { + /* + * Full copy: walk old and new trees in parallel, switching + * the tsk->fs->* elements and marking new vfsmounts as + * belonging to new namespace. We have already acquired a + * private fs_struct, so tsk->fs->lock is not needed. + */ + p = old; + q = new; + while (p) { + mnt_add_to_ns(new_ns, q); + new_ns->nr_mounts++; + if (new_fs) { + if (&p->mnt == new_fs->root.mnt) { + old_root.mnt = new_fs->root.mnt; + new_fs->root.mnt = mntget(&q->mnt); + } + if (&p->mnt == new_fs->pwd.mnt) { + old_pwd.mnt = new_fs->pwd.mnt; + new_fs->pwd.mnt = mntget(&q->mnt); + } } + p = next_mnt(p, old); + q = next_mnt(q, new); + if (!q) + break; + // an mntns binding we'd skipped? + while (p->mnt.mnt_root != q->mnt.mnt_root) + p = next_mnt(skip_mnt_tree(p), old); } - p = next_mnt(p, old); - q = next_mnt(q, new); - if (!q) - break; - // an mntns binding we'd skipped? - while (p->mnt.mnt_root != q->mnt.mnt_root) - p = next_mnt(skip_mnt_tree(p), old); } ns_tree_add_raw(new_ns); return new_ns; diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 359a14cc76a4..4e76fce9f777 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -36,6 +36,7 @@ /* Flags for the clone3() syscall. 
*/ #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ #define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_EMPTY_MNTNS (1ULL << 37) /* Create an empty mount namespace. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 @@ -43,6 +44,12 @@ */ #define CLONE_NEWTIME 0x00000080 /* New time namespace */ +/* + * unshare flags share the bit space with clone flags but only apply to the + * unshare syscall: + */ +#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */ + #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall diff --git a/kernel/fork.c b/kernel/fork.c index 65113a304518..dea6b3454447 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2619,6 +2619,16 @@ pid_t kernel_clone(struct kernel_clone_args *args) int trace = 0; pid_t nr; + /* + * Creating an empty mount namespace implies creating a new mount + * namespace. Set this before copy_process() so that the + * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly. + */ + if (clone_flags & CLONE_EMPTY_MNTNS) { + clone_flags |= CLONE_NEWNS; + args->flags = clone_flags; + } + /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are @@ -2897,7 +2907,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. 
*/ if (kargs->flags & - ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | + CLONE_INTO_CGROUP | CLONE_EMPTY_MNTNS)) return false; /* @@ -3050,7 +3061,7 @@ static int check_unshare_flags(unsigned long unshare_flags) CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| - CLONE_NEWTIME)) + CLONE_NEWTIME | UNSHARE_EMPTY_MNTNS)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing @@ -3149,6 +3160,8 @@ int ksys_unshare(unsigned long unshare_flags) /* * If unsharing namespace, must also unshare filesystem information. */ + if (unshare_flags & UNSHARE_EMPTY_MNTNS) + unshare_flags |= CLONE_NEWNS; if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 259c4b4f1eeb..1bdc5be2dd20 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -95,7 +95,8 @@ static struct nsproxy *create_new_namespaces(u64 flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, + user_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; @@ -212,18 +213,28 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { struct user_namespace *user_ns; + u64 flags = unshare_flags; int err = 0; - if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP | - CLONE_NEWTIME))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP | + CLONE_NEWTIME))) return 0; user_ns = new_cred ? 
new_cred->user_ns : current_user_ns(); if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, + /* + * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases + * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS. + */ + if (flags & UNSHARE_EMPTY_MNTNS) { + flags &= ~(u64)UNSHARE_EMPTY_MNTNS; + flags |= CLONE_EMPTY_MNTNS; + } + + *new_nsp = create_new_namespaces(flags, current, user_ns, new_fs ? new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); -- cgit v1.2.3 From c670267ff50d5f9beb486f0203cdede580a99ae3 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Fri, 6 Feb 2026 14:20:03 +0800 Subject: tty: constify tty_ldisc_ops tty_ldisc_ops is not modified once registered, so make it const. Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20260206062004.1273890-1-dqfext@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_ldisc.c | 16 ++++++++-------- include/linux/tty_ldisc.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 888f2f8f9481..27fe8236f662 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -44,7 +44,7 @@ enum { static DEFINE_RAW_SPINLOCK(tty_ldiscs_lock); /* Line disc dispatch table */ -static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; +static const struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; /** * tty_register_ldisc - install a line discipline @@ -55,7 +55,7 @@ static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; * * Locking: takes %tty_ldiscs_lock to guard against ldisc races */ -int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc) +int tty_register_ldisc(const struct tty_ldisc_ops *new_ldisc) { unsigned long flags; @@ -80,7 +80,7 @@ EXPORT_SYMBOL(tty_register_ldisc); * Locking: takes %tty_ldiscs_lock to guard against ldisc races */ -void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc) +void 
tty_unregister_ldisc(const struct tty_ldisc_ops *ldisc) { unsigned long flags; @@ -90,10 +90,10 @@ void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc) } EXPORT_SYMBOL(tty_unregister_ldisc); -static struct tty_ldisc_ops *get_ldops(int disc) +static const struct tty_ldisc_ops *get_ldops(int disc) { unsigned long flags; - struct tty_ldisc_ops *ldops, *ret; + const struct tty_ldisc_ops *ldops, *ret; raw_spin_lock_irqsave(&tty_ldiscs_lock, flags); ret = ERR_PTR(-EINVAL); @@ -107,7 +107,7 @@ static struct tty_ldisc_ops *get_ldops(int disc) return ret; } -static void put_ldops(struct tty_ldisc_ops *ldops) +static void put_ldops(const struct tty_ldisc_ops *ldops) { unsigned long flags; @@ -139,7 +139,7 @@ int tty_ldisc_autoload = IS_BUILTIN(CONFIG_LDISC_AUTOLOAD); static struct tty_ldisc *tty_ldisc_get(struct tty_struct *tty, int disc) { struct tty_ldisc *ld; - struct tty_ldisc_ops *ldops; + const struct tty_ldisc_ops *ldops; if (disc < N_TTY || disc >= NR_LDISCS) return ERR_PTR(-EINVAL); @@ -202,7 +202,7 @@ static void tty_ldiscs_seq_stop(struct seq_file *m, void *v) static int tty_ldiscs_seq_show(struct seq_file *m, void *v) { int i = *(loff_t *)v; - struct tty_ldisc_ops *ldops; + const struct tty_ldisc_ops *ldops; ldops = get_ldops(i); if (IS_ERR(ldops)) diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index c5cccc3fc1e8..d227a58e3e49 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -266,7 +266,7 @@ struct tty_ldisc_ops { }; struct tty_ldisc { - struct tty_ldisc_ops *ops; + const struct tty_ldisc_ops *ops; struct tty_struct *tty; }; @@ -281,8 +281,8 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *); void tty_ldisc_flush(struct tty_struct *tty); -int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc); -void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc); +int tty_register_ldisc(const struct tty_ldisc_ops *new_ldisc); +void tty_unregister_ldisc(const struct tty_ldisc_ops *ldisc); int tty_set_ldisc(struct 
tty_struct *tty, int disc); #endif /* _LINUX_TTY_LDISC_H */ -- cgit v1.2.3 From 24728b93fafe0949b5353e1a7b3a94175fe26d6e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Mar 2026 22:23:47 -0700 Subject: serdev: serdev.h: clean up kernel-doc comments Correct kernel-doc comment format and add a missing description to avoid kernel-doc warnings: Warning: include/linux/serdev.h:49 struct member 'write_comp' not described in 'serdev_device' Warning: include/linux/serdev.h:49 struct member 'write_lock' not described in 'serdev_device' Warning: include/linux/serdev.h:68 struct member 'shutdown' not described in 'serdev_device_driver' Warning: include/linux/serdev.h:134 function parameter 'serdev' not described in 'serdev_device_put' Warning: include/linux/serdev.h:162 function parameter 'ctrl' not described in 'serdev_controller_put' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260311052347.305612-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 5654c58eb73c..090c93c08045 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -37,8 +37,8 @@ struct serdev_device_ops { * @nr: Device number on serdev bus. * @ctrl: serdev controller managing this device. * @ops: Device operations. - * @write_comp Completion used by serdev_device_write() internally - * @write_lock Lock to serialize access when writing data + * @write_comp: Completion used by serdev_device_write() internally + * @write_lock: Lock to serialize access when writing data */ struct serdev_device { struct device dev; @@ -60,6 +60,7 @@ static inline struct serdev_device *to_serdev_device(struct device *d) * structure. * @probe: binds this driver to a serdev device. * @remove: unbinds this driver from the serdev device. + * @shutdown: shut down this serdev device. 
*/ struct serdev_device_driver { struct device_driver driver; @@ -129,7 +130,7 @@ static inline void serdev_device_set_drvdata(struct serdev_device *serdev, void /** * serdev_device_put() - decrement serdev device refcount - * @serdev serdev device. + * @serdev: serdev device. */ static inline void serdev_device_put(struct serdev_device *serdev) { @@ -157,7 +158,7 @@ static inline void serdev_controller_set_drvdata(struct serdev_controller *ctrl, /** * serdev_controller_put() - decrement controller refcount - * @ctrl serdev controller. + * @ctrl: serdev controller. */ static inline void serdev_controller_put(struct serdev_controller *ctrl) { -- cgit v1.2.3 From 5cba06c71c713a5beb4aafab7973287d8a248ddb Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 2 Feb 2026 23:52:47 -0500 Subject: vt: add KT_CSI keysym type for modifier-aware CSI sequences Add a new keysym type KT_CSI that generates CSI tilde sequences with automatic modifier encoding. The keysym value encodes the CSI parameter number, producing sequences like ESC [ n ~ or ESC [ n ; m ~ when Shift, Alt, or Ctrl modifiers are held. This allows navigation keys (Home, End, Insert, Delete, PgUp, PgDn) and function keys to generate modifier-aware escape sequences without consuming string table entries for each modifier combination. Define key symbols for navigation keys (K_CSI_HOME, K_CSI_END, etc.) and function keys (K_CSI_F1 through K_CSI_F20) using standard xterm CSI parameter values. The modifier encoding follows the xterm convention: mod = 1 + (shift ? 1 : 0) + (alt ? 2 : 0) + (ctrl ? 4 : 0) Allowed CSI parameter values range from 0 to 99. Note: The Linux console historically uses a non-standard double-bracket format for F1-F5 (ESC [ [ A through ESC [ [ E) rather than the xterm tilde format (ESC [ 11 ~ through ESC [ 15 ~). The K_CSI_F1 through K_CSI_F5 definitions use the xterm format. Converting F1-F5 to KT_CSI would require updating the "linux" terminfo entry to match. 
Navigation keys and F6-F20 already use the tilde format and are fully compatible. Signed-off-by: Nicolas Pitre Link: https://patch.msgid.link/20260203045457.1049793-3-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/keyboard.c | 38 +++++++++++++++++++++++++++++++++----- include/uapi/linux/keyboard.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index cb907a3b9d3d..44fd67eb723a 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -74,7 +74,7 @@ static inline int kbd_defleds(void) k_self, k_fn, k_spec, k_pad,\ k_dead, k_cons, k_cur, k_shift,\ k_meta, k_ascii, k_lock, k_lowercase,\ - k_slock, k_dead2, k_brl, k_ignore + k_slock, k_dead2, k_brl, k_csi typedef void (k_handler_fn)(struct vc_data *vc, unsigned char value, char up_flag); @@ -127,6 +127,7 @@ static const unsigned char max_vals[] = { [ KT_SLOCK ] = NR_LOCK - 1, [ KT_DEAD2 ] = 255, [ KT_BRL ] = NR_BRL - 1, + [ KT_CSI ] = 99, }; static const int NR_TYPES = ARRAY_SIZE(max_vals); @@ -644,10 +645,6 @@ static void fn_null(struct vc_data *vc) /* * Special key handlers */ -static void k_ignore(struct vc_data *vc, unsigned char value, char up_flag) -{ -} - static void k_spec(struct vc_data *vc, unsigned char value, char up_flag) { if (up_flag) @@ -1029,6 +1026,37 @@ static void k_brl(struct vc_data *vc, unsigned char value, char up_flag) } } +/* + * Handle KT_CSI keysym type: generate CSI tilde sequences with modifier + * support. The value encodes the CSI parameter number, producing sequences + * like ESC [ ~ or ESC [ ; ~ when modifiers are held. 
+ */ +static void k_csi(struct vc_data *vc, unsigned char value, char up_flag) +{ + char buf[10]; + int i = 0; + int mod; + + if (up_flag) + return; + + mod = csi_modifier_param(); + + buf[i++] = 0x1b; + buf[i++] = '['; + if (value >= 10) + buf[i++] = '0' + value / 10; + buf[i++] = '0' + value % 10; + if (mod > 1) { + buf[i++] = ';'; + buf[i++] = '0' + mod; + } + buf[i++] = '~'; + buf[i] = 0x00; + + puts_queue(vc, buf); +} + #if IS_ENABLED(CONFIG_INPUT_LEDS) && IS_ENABLED(CONFIG_LEDS_TRIGGERS) struct kbd_led_trigger { diff --git a/include/uapi/linux/keyboard.h b/include/uapi/linux/keyboard.h index 36d230cedf12..48ecb0cefb45 100644 --- a/include/uapi/linux/keyboard.h +++ b/include/uapi/linux/keyboard.h @@ -41,6 +41,7 @@ #define KT_SLOCK 12 #define KT_DEAD2 13 #define KT_BRL 14 +#define KT_CSI 15 /* CSI sequences with modifier support */ #define K(t,v) (((t)<<8)|(v)) #define KTYP(x) ((x) >> 8) @@ -461,5 +462,33 @@ #define NR_BRL 11 +/* KT_CSI keys: value is the CSI parameter number for ESC [ ~ */ +#define K_CSI_HOME K(KT_CSI, 1) /* ESC [ 1 ~ */ +#define K_CSI_INSERT K(KT_CSI, 2) /* ESC [ 2 ~ */ +#define K_CSI_DELETE K(KT_CSI, 3) /* ESC [ 3 ~ */ +#define K_CSI_END K(KT_CSI, 4) /* ESC [ 4 ~ */ +#define K_CSI_PGUP K(KT_CSI, 5) /* ESC [ 5 ~ */ +#define K_CSI_PGDN K(KT_CSI, 6) /* ESC [ 6 ~ */ +#define K_CSI_F1 K(KT_CSI, 11) /* ESC [ 11 ~ */ +#define K_CSI_F2 K(KT_CSI, 12) /* ESC [ 12 ~ */ +#define K_CSI_F3 K(KT_CSI, 13) /* ESC [ 13 ~ */ +#define K_CSI_F4 K(KT_CSI, 14) /* ESC [ 14 ~ */ +#define K_CSI_F5 K(KT_CSI, 15) /* ESC [ 15 ~ */ +#define K_CSI_F6 K(KT_CSI, 17) /* ESC [ 17 ~ */ +#define K_CSI_F7 K(KT_CSI, 18) /* ESC [ 18 ~ */ +#define K_CSI_F8 K(KT_CSI, 19) /* ESC [ 19 ~ */ +#define K_CSI_F9 K(KT_CSI, 20) /* ESC [ 20 ~ */ +#define K_CSI_F10 K(KT_CSI, 21) /* ESC [ 21 ~ */ +#define K_CSI_F11 K(KT_CSI, 23) /* ESC [ 23 ~ */ +#define K_CSI_F12 K(KT_CSI, 24) /* ESC [ 24 ~ */ +#define K_CSI_F13 K(KT_CSI, 25) /* ESC [ 25 ~ */ +#define K_CSI_F14 K(KT_CSI, 26) /* ESC [ 26 ~ */ 
+#define K_CSI_F15 K(KT_CSI, 28) /* ESC [ 28 ~ */ +#define K_CSI_F16 K(KT_CSI, 29) /* ESC [ 29 ~ */ +#define K_CSI_F17 K(KT_CSI, 31) /* ESC [ 31 ~ */ +#define K_CSI_F18 K(KT_CSI, 32) /* ESC [ 32 ~ */ +#define K_CSI_F19 K(KT_CSI, 33) /* ESC [ 33 ~ */ +#define K_CSI_F20 K(KT_CSI, 34) /* ESC [ 34 ~ */ + #define MAX_DIACR 256 #endif /* _UAPI__LINUX_KEYBOARD_H */ -- cgit v1.2.3 From eb3b0d92c9c39890592cca6647601fe5c631efea Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 13 Feb 2026 16:50:39 +0800 Subject: tty: tty_port: add workqueue to flip TTY buffer On the embedded platform, certain critical data, such as IMU data, is transmitted through UART. The tty_flip_buffer_push() interface in the TTY layer uses system_dfl_wq to handle the flipping of the TTY buffer. Although the unbound workqueue can create new threads on demand and wake up the kworker thread on an idle CPU, it may be preempted by real-time tasks or other high-prio tasks. flush_to_ldisc() needs to wake up the relevant data handle thread. When executing __wake_up_common_lock(), it calls spin_lock_irqsave(), which does not disable preemption but disables migration in RT-Linux. This prevents the kworker thread from being migrated to other cores by CPU's balancing logic, resulting in long delays. The call trace is as follows: __wake_up_common_lock __wake_up ep_poll_callback __wake_up_common __wake_up_common_lock __wake_up n_tty_receive_buf_common n_tty_receive_buf2 tty_ldisc_receive_buf tty_port_default_receive_buf flush_to_ldisc In our system, the processing interval for each frame of IMU data transmitted via UART can experience significant jitter due to this issue. Instead of the expected 10 to 15 ms frame processing interval, we see spikes up to 30 to 35 ms. Moreover, in just one or two hours, there can be 2 to 3 occurrences of such high jitter, which is quite frequent. This jitter exceeds the software's tolerable limit of 20 ms. 
Introduce flip_wq in tty_port which can be set by tty_port_link_wq() or as default linked to default workqueue allocated when tty_register_driver(). The default workqueue is allocated with flag WQ_SYSFS, so that cpumask and nice can be set dynamically. The execution timing of tty_port_link_wq() is not clearly restricted. The newly added function tty_port_link_driver_wq() checks whether the flip_wq of the tty_port has already been assigned when linking the default tty_driver's workqueue to the port. After the user has set a custom workqueue for a certain tty_port using tty_port_link_wq(), the system will only use this custom workqueue, even if tty_driver does not have %TTY_DRIVER_NO_WORKQUEUE flag. When tty_port registers device, flip_wq link operation is done by tty_port_link_driver_wq(), but for in-memory devices the link operation cannot cover all the cases. Although tty_port_install() is dedicated for in-memory devices like PTY to link port allocated on demand, the logic of tty_port_install() is so simple that people may not call it, vc_cons[0].d->port is one such case. We check the buf.flip_wq when flip TTY buffer, if buf.flip_wq of TTY port is NULL, use system_dfl_wq as a backup. To avoid naming conflict of the default tty_driver's workqueue, using '"%s-%s", driver->name, driver->driver_name' as the workqueue name. In cases where driver_name is not specified and therefore is NULL, the workqueue is not created. Drivers that do not define driver_name are potentially in-memory devices like vty, which generally do not require special workqueue settings. Even with the combination of name and driver_name, the workqueue names can still be duplicated, as many tty serial drivers use "ttyS" as dev_name and "serial" as driver_name. I modified the conflicting driver_name of these drivers by appending a suffix of _xx based on the corresponding .c file.
If this modification is not made, it could not only lead to duplicate workqueue names but also result in duplicate entries for the /proc/tty/driver/ nodes. Introduce %TTY_DRIVER_NO_WORKQUEUE flag meaning not to create the default single tty_driver workqueue. Two reasons why need to introduce the %TTY_DRIVER_NO_WORKQUEUE flag: 1. If the WQ_SYSFS parameter is enabled, workqueue_sysfs_register() will fail when trying to create a workqueue with the same name. The pty is an example of this; if both CONFIG_LEGACY_PTYS and CONFIG_UNIX98_PTYS are enabled, the call to tty_register_driver() in unix98_pty_init() will fail. 2. Different TTY ports may be used for different tasks, which may require separate core binding control via workqueues. In this case, the workqueue created by default in the TTY driver is unnecessary. Enabling this flag prevents the creation of this redundant workqueue. After applying this patch, we can set the related UART TTY flip buffer workqueue by sysfs. We set the cpumask to CPU cores associated with the IMU tasks, and set the nice to -20. Testing has shown significant improvement in the previously described issue, with almost no stuttering occurring anymore. 
Tested-by: Tommaso Merciai Tested-by: Marek Szyprowski Signed-off-by: Xin Zhao Link: https://patch.msgid.link/20260213085039.3274704-1-jackzxcui1989@163.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/pty.c | 12 ++++++++---- drivers/tty/serial/8250/8250_core.c | 2 +- drivers/tty/serial/apbuart.c | 2 +- drivers/tty/serial/dz.c | 2 +- drivers/tty/serial/ip22zilog.c | 2 +- drivers/tty/serial/zs.c | 2 +- drivers/tty/tty_buffer.c | 15 +++++++++++---- drivers/tty/tty_io.c | 25 ++++++++++++++++++++++++- drivers/tty/tty_port.c | 22 ++++++++++++++++++++++ include/linux/tty_buffer.h | 1 + include/linux/tty_driver.h | 7 +++++++ include/linux/tty_port.h | 13 +++++++++++++ 12 files changed, 91 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index cb427e93372d..cc7f7091ed9a 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -532,14 +532,16 @@ static void __init legacy_pty_init(void) pty_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | - TTY_DRIVER_DYNAMIC_ALLOC); + TTY_DRIVER_DYNAMIC_ALLOC | + TTY_DRIVER_NO_WORKQUEUE); if (IS_ERR(pty_driver)) panic("Couldn't allocate pty driver"); pty_slave_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | - TTY_DRIVER_DYNAMIC_ALLOC); + TTY_DRIVER_DYNAMIC_ALLOC | + TTY_DRIVER_NO_WORKQUEUE); if (IS_ERR(pty_slave_driver)) panic("Couldn't allocate pty slave driver"); @@ -849,7 +851,8 @@ static void __init unix98_pty_init(void) TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | - TTY_DRIVER_DYNAMIC_ALLOC); + TTY_DRIVER_DYNAMIC_ALLOC | + TTY_DRIVER_NO_WORKQUEUE); if (IS_ERR(ptm_driver)) panic("Couldn't allocate Unix98 ptm driver"); pts_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, @@ -857,7 +860,8 @@ static void __init unix98_pty_init(void) TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | - TTY_DRIVER_DYNAMIC_ALLOC); + TTY_DRIVER_DYNAMIC_ALLOC | + 
TTY_DRIVER_NO_WORKQUEUE); if (IS_ERR(pts_driver)) panic("Couldn't allocate Unix98 pts driver"); diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index d2e2c5dfef99..a428e88938eb 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -524,7 +524,7 @@ console_initcall(univ8250_console_init); struct uart_driver serial8250_reg = { .owner = THIS_MODULE, - .driver_name = "serial", + .driver_name = "serial_8250", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, diff --git a/drivers/tty/serial/apbuart.c b/drivers/tty/serial/apbuart.c index 364599f256db..3e46341cfff8 100644 --- a/drivers/tty/serial/apbuart.c +++ b/drivers/tty/serial/apbuart.c @@ -505,7 +505,7 @@ console_initcall(apbuart_console_init); static struct uart_driver grlib_apbuart_driver = { .owner = THIS_MODULE, - .driver_name = "serial", + .driver_name = "serial_apbuart", .dev_name = "ttyS", .major = SERIAL_APBUART_MAJOR, .minor = SERIAL_APBUART_MINOR, diff --git a/drivers/tty/serial/dz.c b/drivers/tty/serial/dz.c index eba91daedef8..e53c54353c3e 100644 --- a/drivers/tty/serial/dz.c +++ b/drivers/tty/serial/dz.c @@ -914,7 +914,7 @@ console_initcall(dz_serial_console_init); static struct uart_driver dz_reg = { .owner = THIS_MODULE, - .driver_name = "serial", + .driver_name = "serial_dz", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, diff --git a/drivers/tty/serial/ip22zilog.c b/drivers/tty/serial/ip22zilog.c index 6e19c6713849..a69b06893d9e 100644 --- a/drivers/tty/serial/ip22zilog.c +++ b/drivers/tty/serial/ip22zilog.c @@ -1015,7 +1015,7 @@ static struct console ip22zilog_console = { static struct uart_driver ip22zilog_reg = { .owner = THIS_MODULE, - .driver_name = "serial", + .driver_name = "serial_ip22zilog", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, diff --git a/drivers/tty/serial/zs.c b/drivers/tty/serial/zs.c index 79ea7108a0f3..72a3c0d90f40 100644 --- a/drivers/tty/serial/zs.c +++ b/drivers/tty/serial/zs.c @@ 
-1252,7 +1252,7 @@ console_initcall(zs_serial_console_init); static struct uart_driver zs_reg = { .owner = THIS_MODULE, - .driver_name = "serial", + .driver_name = "serial_zs", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 79ec953824d5..96be90db53b7 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -59,6 +59,13 @@ void tty_buffer_lock_exclusive(struct tty_port *port) } EXPORT_SYMBOL_GPL(tty_buffer_lock_exclusive); +static bool tty_buffer_queue_work(struct tty_bufhead *buf) +{ + struct workqueue_struct *flip_wq = READ_ONCE(buf->flip_wq); + + return queue_work(flip_wq ?: system_dfl_wq, &buf->work); +} + /** * tty_buffer_unlock_exclusive - release exclusive access * @port: tty port owning the flip buffer @@ -76,7 +83,7 @@ void tty_buffer_unlock_exclusive(struct tty_port *port) mutex_unlock(&buf->lock); if (restart) - queue_work(system_dfl_wq, &buf->work); + tty_buffer_queue_work(buf); } EXPORT_SYMBOL_GPL(tty_buffer_unlock_exclusive); @@ -530,7 +537,7 @@ void tty_flip_buffer_push(struct tty_port *port) struct tty_bufhead *buf = &port->buf; tty_flip_buffer_commit(buf->tail); - queue_work(system_dfl_wq, &buf->work); + tty_buffer_queue_work(buf); } EXPORT_SYMBOL(tty_flip_buffer_push); @@ -560,7 +567,7 @@ int tty_insert_flip_string_and_push_buffer(struct tty_port *port, tty_flip_buffer_commit(buf->tail); spin_unlock_irqrestore(&port->lock, flags); - queue_work(system_dfl_wq, &buf->work); + tty_buffer_queue_work(buf); return size; } @@ -613,7 +620,7 @@ void tty_buffer_set_lock_subclass(struct tty_port *port) bool tty_buffer_restart_work(struct tty_port *port) { - return queue_work(system_dfl_wq, &port->buf.work); + return tty_buffer_queue_work(&port->buf); } bool tty_buffer_cancel_work(struct tty_port *port) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index a5d0457e0e28..6b283fd03ff8 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -3443,10 
+3443,27 @@ int tty_register_driver(struct tty_driver *driver) if (error < 0) goto err; + /* + * Drivers that do not define driver_name are potentially in-memory devices + * like vty, which generally do not require special workqueue settings. + */ + if (!(driver->flags & TTY_DRIVER_NO_WORKQUEUE) && driver->driver_name) { + driver->flip_wq = alloc_workqueue("%s-%s", WQ_UNBOUND | WQ_SYSFS, + 0, driver->name, driver->driver_name); + if (!driver->flip_wq) { + error = -ENOMEM; + goto err_unreg_char; + } + for (i = 0; i < driver->num; i++) { + if (driver->ports[i]) + tty_port_link_driver_wq(driver->ports[i], driver); + } + } + if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) { error = tty_cdev_add(driver, dev, 0, driver->num); if (error) - goto err_unreg_char; + goto err_destroy_wq; } scoped_guard(mutex, &tty_mutex) @@ -3472,6 +3489,10 @@ err_unreg_devs: scoped_guard(mutex, &tty_mutex) list_del(&driver->tty_drivers); +err_destroy_wq: + if (driver->flip_wq) + destroy_workqueue(driver->flip_wq); + err_unreg_char: unregister_chrdev_region(dev, driver->num); err: @@ -3491,6 +3512,8 @@ void tty_unregister_driver(struct tty_driver *driver) driver->num); scoped_guard(mutex, &tty_mutex) list_del(&driver->tty_drivers); + if (driver->flip_wq) + destroy_workqueue(driver->flip_wq); } EXPORT_SYMBOL(tty_unregister_driver); diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index fe67c5cb0a3f..54359310e293 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -99,6 +99,23 @@ void tty_port_init(struct tty_port *port) } EXPORT_SYMBOL(tty_port_init); +/** + * tty_port_link_wq - link tty_port and flip workqueue + * @port: tty_port of the device + * @flip_wq: workqueue to queue flip buffer work on + * + * Whenever %TTY_DRIVER_NO_WORKQUEUE is used, every tty_port can be linked to + * a workqueue manually by this function. + * tty_port will use system_dfl_wq when buf.flip_wq is NULL. + * + * Note that tty_port API will NOT destroy the workqueue. 
+ */ +void tty_port_link_wq(struct tty_port *port, struct workqueue_struct *flip_wq) +{ + port->buf.flip_wq = flip_wq; +} +EXPORT_SYMBOL_GPL(tty_port_link_wq); + /** * tty_port_link_device - link tty and tty_port * @port: tty_port of the device @@ -157,6 +174,7 @@ struct device *tty_port_register_device_attr(struct tty_port *port, const struct attribute_group **attr_grp) { tty_port_link_device(port, driver, index); + tty_port_link_driver_wq(port, driver); return tty_register_device_attr(driver, index, device, drvdata, attr_grp); } @@ -183,6 +201,7 @@ struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct device *dev; tty_port_link_device(port, driver, index); + tty_port_link_driver_wq(port, driver); dev = serdev_tty_port_register(port, host, parent, driver, index); if (PTR_ERR(dev) != -ENODEV) { @@ -210,6 +229,7 @@ void tty_port_unregister_device(struct tty_port *port, { int ret; + WRITE_ONCE(port->buf.flip_wq, NULL); ret = serdev_tty_port_unregister(port); if (ret == 0) return; @@ -257,6 +277,7 @@ void tty_port_destroy(struct tty_port *port) { tty_buffer_cancel_work(port); tty_buffer_free_all(port); + WRITE_ONCE(port->buf.flip_wq, NULL); } EXPORT_SYMBOL(tty_port_destroy); @@ -703,6 +724,7 @@ int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty) { tty->port = port; + tty_port_link_driver_wq(port, driver); return tty_standard_install(driver, tty); } EXPORT_SYMBOL_GPL(tty_port_install); diff --git a/include/linux/tty_buffer.h b/include/linux/tty_buffer.h index 31125e3be3c5..48adcb0e8ff3 100644 --- a/include/linux/tty_buffer.h +++ b/include/linux/tty_buffer.h @@ -34,6 +34,7 @@ static inline u8 *flag_buf_ptr(struct tty_buffer *b, unsigned int ofs) struct tty_bufhead { struct tty_buffer *head; /* Queue head */ + struct workqueue_struct *flip_wq; struct work_struct work; struct mutex lock; atomic_t priority; diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 
188ee9b768eb..1f2896e56e77 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -69,6 +69,10 @@ struct serial_struct; * Do not create numbered ``/dev`` nodes. For example, create * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a * driver for a single tty device is being allocated. + * + * @TTY_DRIVER_NO_WORKQUEUE: + * Do not create workqueue when tty_register_driver(). Whenever set, flip + * buffer workqueue can be set by tty_port_link_wq() for every port. */ enum tty_driver_flag { TTY_DRIVER_INSTALLED = BIT(0), @@ -79,6 +83,7 @@ enum tty_driver_flag { TTY_DRIVER_HARDWARE_BREAK = BIT(5), TTY_DRIVER_DYNAMIC_ALLOC = BIT(6), TTY_DRIVER_UNNUMBERED_NODE = BIT(7), + TTY_DRIVER_NO_WORKQUEUE = BIT(8), }; enum tty_driver_type { @@ -506,6 +511,7 @@ struct tty_operations { * @flags: tty driver flags (%TTY_DRIVER_) * @proc_entry: proc fs entry, used internally * @other: driver of the linked tty; only used for the PTY driver + * @flip_wq: workqueue to queue flip buffer work on * @ttys: array of active &struct tty_struct, set by tty_standard_install() * @ports: array of &struct tty_port; can be set during initialization by * tty_port_link_device() and similar @@ -539,6 +545,7 @@ struct tty_driver { unsigned long flags; struct proc_dir_entry *proc_entry; struct tty_driver *other; + struct workqueue_struct *flip_wq; /* * Pointer to the tty data structures diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 660c254f1efe..d2a7882c0b58 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -138,6 +138,7 @@ struct tty_port { kernel */ void tty_port_init(struct tty_port *port); +void tty_port_link_wq(struct tty_port *port, struct workqueue_struct *flip_wq); void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index); struct device *tty_port_register_device(struct tty_port *port, @@ -165,6 +166,18 @@ static inline struct tty_port *tty_port_get(struct tty_port *port) return 
NULL; } +/* + * Never overwrite the workqueue set by tty_port_link_wq(). + * No effect when %TTY_DRIVER_NO_WORKQUEUE is set, as driver->flip_wq is + * %NULL. + */ +static inline void tty_port_link_driver_wq(struct tty_port *port, + struct tty_driver *driver) +{ + if (!port->buf.flip_wq) + tty_port_link_wq(port, driver->flip_wq); +} + /* If the cts flow control is enabled, return true. */ static inline bool tty_port_cts_enabled(const struct tty_port *port) { -- cgit v1.2.3 From 59621105ffca7a33955f56bc7dee0923992f5832 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 14:37:16 +0100 Subject: of: provide of_machine_read_compatible() Provide a helper function allowing users to read the compatible string of the machine, hiding the access to the root node. Reviewed-by: Christophe Leroy (CS GROUP) Signed-off-by: Bartosz Golaszewski Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260223-soc-of-root-v2-1-b45da45903c8@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/of/base.c | 15 +++++++++++++++ include/linux/of.h | 8 ++++++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/drivers/of/base.c b/drivers/of/base.c index 57420806c1a2..b70aec32e0e3 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -434,6 +434,21 @@ bool of_machine_compatible_match(const char *const *compats) } EXPORT_SYMBOL(of_machine_compatible_match); +/** + * of_machine_read_compatible - Get the compatible string of this machine + * @compatible: address at which the address of the compatible string will be + * stored + * @index: index of the compatible entry in the list + * + * Returns: + * 0 on success, negative error number on failure. 
+ */ +int of_machine_read_compatible(const char **compatible, unsigned int index) +{ + return of_property_read_string_index(of_root, "compatible", index, compatible); +} +EXPORT_SYMBOL_GPL(of_machine_read_compatible); + /** * of_machine_device_match - Test root of device tree against a of_device_id array * @matches: NULL terminated array of of_device_id match structures to search in diff --git a/include/linux/of.h b/include/linux/of.h index be6ec4916adf..7df971d52b55 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -426,6 +426,8 @@ static inline bool of_machine_is_compatible(const char *compat) return of_machine_compatible_match(compats); } +int of_machine_read_compatible(const char **compatible, unsigned int index); + extern int of_add_property(struct device_node *np, struct property *prop); extern int of_remove_property(struct device_node *np, struct property *prop); extern int of_update_property(struct device_node *np, struct property *newprop); @@ -851,6 +853,12 @@ static inline int of_machine_is_compatible(const char *compat) return 0; } +static inline int of_machine_read_compatible(const char **compatible, + unsigned int index) +{ + return -ENOSYS; +} + static inline int of_add_property(struct device_node *np, struct property *prop) { return 0; -- cgit v1.2.3 From c86d3b7b847cc9b32a17117cfd71679e4315fd9f Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 14:37:17 +0100 Subject: of: provide of_machine_read_model() Provide a helper function allowing users to read the model string of the machine, hiding the access to the root node. 
Signed-off-by: Bartosz Golaszewski Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260223-soc-of-root-v2-2-b45da45903c8@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/of/base.c | 13 +++++++++++++ include/linux/of.h | 6 ++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/drivers/of/base.c b/drivers/of/base.c index b70aec32e0e3..bf4a51887d74 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -449,6 +449,19 @@ int of_machine_read_compatible(const char **compatible, unsigned int index) } EXPORT_SYMBOL_GPL(of_machine_read_compatible); +/** + * of_machine_read_model - Get the model string of this machine + * @model: address at which the address of the model string will be stored + * + * Returns: + * 0 on success, negative error number on failure. + */ +int of_machine_read_model(const char **model) +{ + return of_property_read_string(of_root, "model", model); +} +EXPORT_SYMBOL_GPL(of_machine_read_model); + /** * of_machine_device_match - Test root of device tree against a of_device_id array * @matches: NULL terminated array of of_device_id match structures to search in diff --git a/include/linux/of.h b/include/linux/of.h index 7df971d52b55..2b95777f16f6 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -427,6 +427,7 @@ static inline bool of_machine_is_compatible(const char *compat) } int of_machine_read_compatible(const char **compatible, unsigned int index); +int of_machine_read_model(const char **model); extern int of_add_property(struct device_node *np, struct property *prop); extern int of_remove_property(struct device_node *np, struct property *prop); @@ -859,6 +860,11 @@ static inline int of_machine_read_compatible(const char **compatible, return -ENOSYS; } +static inline int of_machine_read_model(const char **model) +{ + return -ENOSYS; +} + static inline int of_add_property(struct device_node *np, struct property *prop) { return 0; -- cgit v1.2.3 From 030706e954c10749da8c75464c6b02cb30cb00aa 
Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 14:37:19 +0100 Subject: base: soc: rename and export soc_device_get_machine() Some SoC drivers reimplement the functionality of soc_device_get_machine(). Make this function accessible through the sys_soc.h header and rename it to a more descriptive name. Reviewed-by: Christophe Leroy (CS GROUP) Signed-off-by: Bartosz Golaszewski Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260223-soc-of-root-v2-4-b45da45903c8@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/soc.c | 13 +++++-------- include/linux/sys_soc.h | 10 ++++++++++ 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/base/soc.c b/drivers/base/soc.c index 48e2f0dbd330..65ce72d49230 100644 --- a/drivers/base/soc.c +++ b/drivers/base/soc.c @@ -111,17 +111,14 @@ static void soc_release(struct device *dev) kfree(soc_dev); } -static void soc_device_get_machine(struct soc_device_attribute *soc_dev_attr) +int soc_attr_read_machine(struct soc_device_attribute *soc_dev_attr) { - struct device_node *np; - if (soc_dev_attr->machine) - return; + return -EBUSY; - np = of_find_node_by_path("/"); - of_property_read_string(np, "model", &soc_dev_attr->machine); - of_node_put(np); + return of_machine_read_model(&soc_dev_attr->machine); } +EXPORT_SYMBOL_GPL(soc_attr_read_machine); static struct soc_device_attribute *early_soc_dev_attr; @@ -131,7 +128,7 @@ struct soc_device *soc_device_register(struct soc_device_attribute *soc_dev_attr const struct attribute_group **soc_attr_groups; int ret; - soc_device_get_machine(soc_dev_attr); + soc_attr_read_machine(soc_dev_attr); if (!soc_bus_registered) { if (early_soc_dev_attr) diff --git a/include/linux/sys_soc.h b/include/linux/sys_soc.h index d9b3cf0f410c..f19f5cec18e2 100644 --- a/include/linux/sys_soc.h +++ b/include/linux/sys_soc.h @@ -37,6 +37,16 @@ void soc_device_unregister(struct soc_device *soc_dev); */ struct device 
*soc_device_to_device(struct soc_device *soc); +/** + * soc_attr_read_machine - retrieve the machine model and store it in + * the soc_device_attribute structure + * @soc_dev_attr: SoC attribute structure to store the model in + * + * Returns: + * 0 on success, negative error number on failure. + */ +int soc_attr_read_machine(struct soc_device_attribute *soc_dev_attr); + #ifdef CONFIG_SOC_BUS const struct soc_device_attribute *soc_device_match( const struct soc_device_attribute *matches); -- cgit v1.2.3 From bb729bf1d6fdf5c2087c1651165c74cef0da1742 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Tue, 10 Mar 2026 23:57:53 +0800 Subject: driver core: Add conditional guard support for device_lock() Introduce conditional guard version of device_lock() for scenarios that require conditional device lock holding. Suggested-by: Dan Williams Reviewed-by: Dan Williams Acked-by: Greg Kroah-Hartman Signed-off-by: Li Ming Link: https://patch.msgid.link/20260310-fix_access_endpoint_without_drv_check-v1-1-94fe919a0b87@zohomail.com Signed-off-by: Danilo Krummrich --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..4fafee80524b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -911,6 +911,7 @@ static inline void device_unlock(struct device *dev) } DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T)) +DEFINE_GUARD_COND(device, _intr, device_lock_interruptible(_T), _RET == 0) static inline void device_lock_assert(struct device *dev) { -- cgit v1.2.3 From 15abbe7c82661209c1dc67c21903c07e2fff5aae Mon Sep 17 00:00:00 2001 From: Nimrod Oren Date: Mon, 9 Mar 2026 10:13:01 +0200 Subject: net: page_pool: scale alloc cache with PAGE_SIZE The current page_pool alloc-cache size and refill values were chosen to match the NAPI budget and to leave headroom for XDP_DROP recycling. 
These fixed values do not scale well with large pages, as they significantly increase a given page_pool's memory footprint. Scale these values to better balance memory footprint across page sizes, while keeping behavior on 4KB-page systems unchanged. Reviewed-by: Dragos Tatulea Reviewed-by: Tariq Toukan Signed-off-by: Nimrod Oren Link: https://patch.msgid.link/20260309081301.103152-1-noren@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index cdd95477af7a..03da138722f5 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -44,6 +44,8 @@ * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX * ring is usually refilled and the max consumed elements will be 64, * thus a natural max size of objects needed in the cache. + * The refill watermark is set to 64 for 4KB pages, + * and scales to balance its size in bytes across page sizes. * * Keeping room for more objects, is due to XDP_DROP use-case. As * XDP_DROP allows the opportunity to recycle objects directly into @@ -51,8 +53,15 @@ * cache is already full (or partly full) then the XDP_DROP recycles * would have to take a slower code path. */ -#define PP_ALLOC_CACHE_SIZE 128 +#if PAGE_SIZE >= SZ_64K +#define PP_ALLOC_CACHE_REFILL 4 +#elif PAGE_SIZE >= SZ_16K +#define PP_ALLOC_CACHE_REFILL 16 +#else #define PP_ALLOC_CACHE_REFILL 64 +#endif + +#define PP_ALLOC_CACHE_SIZE (PP_ALLOC_CACHE_REFILL * 2) struct pp_alloc_cache { u32 count; netmem_ref cache[PP_ALLOC_CACHE_SIZE]; -- cgit v1.2.3 From e4b993f2bca78357b430170574f8de7bc7874088 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 3 Mar 2026 22:17:09 +0100 Subject: wifi: nl80211: split out UHR operation information The beacon doesn't contain the full UHR operation, a number of fields (such as NPCA) are only partially there. 
Add a new attribute to contain the full information, so it's available to the driver/mac80211. Link: https://patch.msgid.link/20260303221710.866bacf82639.Iafdf37fb0f4304bdcdb824977d61e17b38c47685@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 6 ++++++ net/wireless/nl80211.c | 26 ++++++++++++++++---------- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0b7a06c2b9f7..67d764023988 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3001,6 +3001,10 @@ enum nl80211_commands { * interference detection is not performed on these sub-channels, their * corresponding bits are consistently set to zero. * + * @NL80211_ATTR_UHR_OPERATION: Full UHR Operation element, as it appears in + * association response etc., since it's abridged in the beacon. Used + * for START_AP etc. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3576,6 +3580,8 @@ enum nl80211_attrs { NL80211_ATTR_INCUMBENT_SIGNAL_INTERFERENCE_BITMAP, + NL80211_ATTR_UHR_OPERATION, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 699687a0caa9..3e867930e253 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -344,6 +344,17 @@ static int validate_uhr_capa(const struct nlattr *attr, return 0; } +static int validate_uhr_operation(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + + if (!ieee80211_uhr_oper_size_ok(data, len, false)) + return -EINVAL; + return 0; +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -949,6 +960,8 @@ static const struct nla_policy 
nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_UHR_CAPABILITY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_uhr_capa, 255), [NL80211_ATTR_DISABLE_UHR] = { .type = NLA_FLAG }, + [NL80211_ATTR_UHR_OPERATION] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_uhr_operation), }; /* policy for the key attributes */ @@ -6501,16 +6514,6 @@ static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) return -EINVAL; } - cap = cfg80211_find_ext_elem(WLAN_EID_EXT_UHR_OPER, ies, ies_len); - if (cap) { - if (!cap->datalen) - return -EINVAL; - params->uhr_oper = (void *)(cap->data + 1); - if (!ieee80211_uhr_oper_size_ok((const u8 *)params->uhr_oper, - cap->datalen - 1, true)) - return -EINVAL; - } - return 0; } @@ -6952,6 +6955,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (err) goto out; + if (info->attrs[NL80211_ATTR_UHR_OPERATION]) + params->uhr_oper = nla_data(info->attrs[NL80211_ATTR_UHR_OPERATION]); + err = nl80211_validate_ap_phy_operation(params); if (err) goto out; -- cgit v1.2.3 From 341a2c99c87ce6f62c6f4423fa641a39f0966bff Mon Sep 17 00:00:00 2001 From: Nitin Gote Date: Thu, 12 Mar 2026 21:32:45 +0530 Subject: drm/xe/uapi: Fix kernel-doc for DRM_XE_VM_BIND_FLAG_DECOMPRESS There is kernel-doc warning for DRM_XE_VM_BIND_FLAG_DECOMPRESS: ./include/uapi/drm/xe_drm.h:1060: WARNING: Block quote ends without a blank line; unexpected unindent. Fix the warning by adding the missing '%' prefix to DRM_XE_VM_BIND_FLAG_DECOMPRESS in the kernel-doc list entry for struct drm_xe_vm_bind_op. 
Fixes: 2270bd7124f4 ("drm/xe: add VM_BIND DECOMPRESS uapi flag") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202603121515.gEMrFlTL-lkp@intel.com/ Cc: Matthew Auld Signed-off-by: Nitin Gote Reviewed-by: Matthew Auld Link: https://patch.msgid.link/20260312160244.809849-2-nitin.r.gote@intel.com Signed-off-by: Tejas Upadhyay --- include/uapi/drm/xe_drm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 0497b85fa12a..f8b2afb20540 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -1057,7 +1057,7 @@ struct drm_xe_vm_destroy { * not invoke autoreset. Neither will stack variables going out of scope. * Therefore it's recommended to always explicitly reset the madvises when * freeing the memory backing a region used in a &DRM_IOCTL_XE_MADVISE call. - * - DRM_XE_VM_BIND_FLAG_DECOMPRESS - Request on-device decompression for a MAP. + * - %DRM_XE_VM_BIND_FLAG_DECOMPRESS - Request on-device decompression for a MAP. * When set on a MAP bind operation, request the driver schedule an on-device * in-place decompression (via the migrate/resolve path) for the GPU mapping * created by this bind. Only valid for DRM_XE_VM_BIND_OP_MAP; usage on -- cgit v1.2.3 From e416e7fa417b2d2604c1526a2f9cc38da7ced166 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Mar 2026 22:29:53 -0700 Subject: tee: clean up tee_core.h kernel-doc Use the correct struct member name and function parameter name in kernel-doc comments. Move a macro that was between a struct's documentation and its declaration. These eliminate the following kernel-doc warnings: Warning: include/linux/tee_core.h:73 struct member 'c_no_users' not described in 'tee_device' Warning: include/linux/tee_core.h:132 #define TEE_DESC_PRIVILEGED 0x1; error: Cannot parse struct or union! 
Warning: include/linux/tee_core.h:257 function parameter 'connection_data' not described in 'tee_session_calc_client_uuid' Warning: include/linux/tee_core.h:320 function parameter 'teedev' not described in 'tee_get_drvdata' Signed-off-by: Randy Dunlap Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- include/linux/tee_core.h | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index ee5f0bd41f43..f993d5118edd 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -50,7 +50,7 @@ enum tee_dma_heap_id { * @dev: embedded basic device structure * @cdev: embedded cdev * @num_users: number of active users of this device - * @c_no_user: completion used when unregistering the device + * @c_no_users: completion used when unregistering the device * @mutex: mutex protecting @num_users and @idr * @idr: register of user space shared memory objects allocated or * registered on this device @@ -132,6 +132,7 @@ struct tee_driver_ops { /* Size for TEE revision string buffer used by get_tee_revision(). */ #define TEE_REVISION_STR_SIZE 128 +#define TEE_DESC_PRIVILEGED 0x1 /** * struct tee_desc - Describes the TEE driver to the subsystem * @name: name of driver @@ -139,7 +140,6 @@ struct tee_driver_ops { * @owner: module providing the driver * @flags: Extra properties of driver, defined by TEE_DESC_* below */ -#define TEE_DESC_PRIVILEGED 0x1 struct tee_desc { const char *name; const struct tee_driver_ops *ops; @@ -187,7 +187,7 @@ struct tee_protmem_pool_ops { * Allocates a new struct tee_device instance. The device is * removed by tee_device_unregister(). 
* - * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure + * @returns: a pointer to a 'struct tee_device' or an ERR_PTR on failure */ struct tee_device *tee_device_alloc(const struct tee_desc *teedesc, struct device *dev, @@ -201,7 +201,7 @@ struct tee_device *tee_device_alloc(const struct tee_desc *teedesc, * tee_device_unregister() need to be called to remove the @teedev if * this function fails. * - * @returns < 0 on failure + * @returns: < 0 on failure */ int tee_device_register(struct tee_device *teedev); @@ -254,14 +254,14 @@ void tee_device_set_dev_groups(struct tee_device *teedev, * tee_session_calc_client_uuid() - Calculates client UUID for session * @uuid: Resulting UUID * @connection_method: Connection method for session (TEE_IOCTL_LOGIN_*) - * @connectuon_data: Connection data for opening session + * @connection_data: Connection data for opening session * * Based on connection method calculates UUIDv5 based client UUID. * * For group based logins verifies that calling process has specified * credentials. * - * @return < 0 on failure + * @returns: < 0 on failure */ int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method, const u8 connection_data[TEE_IOCTL_UUID_LEN]); @@ -295,7 +295,7 @@ struct tee_shm_pool_ops { * @paddr: Physical address of start of pool * @size: Size in bytes of the pool * - * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure. + * @returns: pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure. */ struct tee_shm_pool *tee_shm_pool_alloc_res_mem(unsigned long vaddr, phys_addr_t paddr, size_t size, @@ -318,14 +318,16 @@ static inline void tee_shm_pool_free(struct tee_shm_pool *pool) * @paddr: Physical address of start of pool * @size: Size in bytes of the pool * - * @returns pointer to a 'struct tee_protmem_pool' or an ERR_PTR on failure. + * @returns: pointer to a 'struct tee_protmem_pool' or an ERR_PTR on failure. 
*/ struct tee_protmem_pool *tee_protmem_static_pool_alloc(phys_addr_t paddr, size_t size); /** * tee_get_drvdata() - Return driver_data pointer - * @returns the driver_data pointer supplied to tee_register(). + * @teedev: Pointer to the tee_device + * + * @returns: the driver_data pointer supplied to tee_register(). */ void *tee_get_drvdata(struct tee_device *teedev); @@ -334,7 +336,7 @@ void *tee_get_drvdata(struct tee_device *teedev); * TEE driver * @ctx: The TEE context for shared memory allocation * @size: Shared memory allocation size - * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure + * @returns: a pointer to 'struct tee_shm' on success or an ERR_PTR on failure */ struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size); @@ -354,7 +356,7 @@ void tee_dyn_shm_free_helper(struct tee_shm *shm, /** * tee_shm_is_dynamic() - Check if shared memory object is of the dynamic kind * @shm: Shared memory handle - * @returns true if object is dynamic shared memory + * @returns: true if object is dynamic shared memory */ static inline bool tee_shm_is_dynamic(struct tee_shm *shm) { @@ -370,7 +372,7 @@ void tee_shm_put(struct tee_shm *shm); /** * tee_shm_get_id() - Get id of a shared memory object * @shm: Shared memory handle - * @returns id + * @returns: id */ static inline int tee_shm_get_id(struct tee_shm *shm) { @@ -382,7 +384,7 @@ static inline int tee_shm_get_id(struct tee_shm *shm) * count * @ctx: Context owning the shared memory * @id: Id of shared memory object - * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure + * @returns: a pointer to 'struct tee_shm' on success or an ERR_PTR on failure */ struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id); @@ -402,7 +404,7 @@ static inline bool tee_param_is_memref(struct tee_param *param) * teedev_open() - Open a struct tee_device * @teedev: Device to open * - * @return a pointer to struct tee_context on success or an ERR_PTR on 
failure. + * @returns: pointer to struct tee_context on success or an ERR_PTR on failure. */ struct tee_context *teedev_open(struct tee_device *teedev); -- cgit v1.2.3 From fe2511adb1fc1814df06ca11e0d8a92f792e4029 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Mar 2026 13:30:17 +0100 Subject: sysfs: constify group arrays in function arguments Constify the groups array argument where applicable. This allows to pass constant arrays as arguments. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/17035265-8882-4101-b7a7-16b3eb94f8b5@gmail.com Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 10 +++++----- include/linux/sysfs.h | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index e1e639f515a0..b3edae0578c0 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -217,7 +217,7 @@ int sysfs_create_group(struct kobject *kobj, EXPORT_SYMBOL_GPL(sysfs_create_group); static int internal_create_groups(struct kobject *kobj, int update, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { int error = 0; int i; @@ -250,7 +250,7 @@ static int internal_create_groups(struct kobject *kobj, int update, * Returns 0 on success or error code from sysfs_create_group on failure. */ int sysfs_create_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { return internal_create_groups(kobj, 0, groups); } @@ -268,7 +268,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_groups); * Returns 0 on success or error code from sysfs_update_group on failure. */ int sysfs_update_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { return internal_create_groups(kobj, 1, groups); } @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(sysfs_remove_group); * If groups is not NULL, remove the specified groups from the kobject. 
*/ void sysfs_remove_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { int i; @@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(sysfs_group_change_owner); * Returns 0 on success or error code on failure. */ int sysfs_groups_change_owner(struct kobject *kobj, - const struct attribute_group **groups, + const struct attribute_group *const *groups, kuid_t kuid, kgid_t kgid) { int error = 0, i; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 99b775f3ff46..9777e9445dd5 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -445,15 +445,15 @@ void sysfs_delete_link(struct kobject *dir, struct kobject *targ, int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, - const struct attribute_group **groups); + const struct attribute_group *const *groups); int __must_check sysfs_update_groups(struct kobject *kobj, - const struct attribute_group **groups); + const struct attribute_group *const *groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, - const struct attribute_group **groups); + const struct attribute_group *const *groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, @@ -486,7 +486,7 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, - const struct attribute_group **groups, + const struct attribute_group *const *groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group 
*groups, kuid_t kuid, @@ -629,13 +629,13 @@ static inline int sysfs_create_group(struct kobject *kobj, } static inline int sysfs_create_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { return 0; } @@ -652,7 +652,7 @@ static inline void sysfs_remove_group(struct kobject *kobj, } static inline void sysfs_remove_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { } @@ -733,7 +733,7 @@ static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t k } static inline int sysfs_groups_change_owner(struct kobject *kobj, - const struct attribute_group **groups, + const struct attribute_group *const *groups, kuid_t kuid, kgid_t kgid) { return 0; -- cgit v1.2.3 From ece5283706aff6791a37894bafbb0c134a94c0f3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Mar 2026 13:31:02 +0100 Subject: driver: core: constify groups array argument in device_add_groups and device_remove_groups Now that sysfs_create_groups() and sysfs_remove_groups() allow to pass constant groups arrays, we can constify the groups array argument also here. 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/8ea2d6d1-0adb-4d7f-92bc-751e93ce08d6@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 5 +++-- include/linux/device.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/base/core.c b/drivers/base/core.c index 791f9e444df8..f497b724332a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2831,14 +2831,15 @@ static ssize_t removable_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(removable); -int device_add_groups(struct device *dev, const struct attribute_group **groups) +int device_add_groups(struct device *dev, + const struct attribute_group *const *groups) { return sysfs_create_groups(&dev->kobj, groups); } EXPORT_SYMBOL_GPL(device_add_groups); void device_remove_groups(struct device *dev, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { sysfs_remove_groups(&dev->kobj, groups); } diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..48a0444ccc1e 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1131,9 +1131,9 @@ device_create_with_groups(const struct class *cls, struct device *parent, dev_t void device_destroy(const struct class *cls, dev_t devt); int __must_check device_add_groups(struct device *dev, - const struct attribute_group **groups); + const struct attribute_group *const *groups); void device_remove_groups(struct device *dev, - const struct attribute_group **groups); + const struct attribute_group *const *groups); static inline int __must_check device_add_group(struct device *dev, const struct attribute_group *grp) -- cgit v1.2.3 From 10f874dc92b3f3bf96470d997bdf157b289c9d4c Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Mar 2026 13:31:56 +0100 Subject: driver core: make struct class groups members constant arrays Constify the groups arrays, allowing to assign constant arrays. 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/7ff56b07-09ca-4948-b98f-5bd37ceef21e@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 65880e60c720..2079239a5aa5 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -50,8 +50,8 @@ struct fwnode_handle; struct class { const char *name; - const struct attribute_group **class_groups; - const struct attribute_group **dev_groups; + const struct attribute_group *const *class_groups; + const struct attribute_group *const *dev_groups; int (*dev_uevent)(const struct device *dev, struct kobj_uevent_env *env); char *(*devnode)(const struct device *dev, umode_t *mode); -- cgit v1.2.3 From cb0caadb64ca0894c4a24e1a34841f260d462f90 Mon Sep 17 00:00:00 2001 From: Shayne Chen Date: Fri, 13 Mar 2026 14:21:49 +0800 Subject: wifi: ieee80211: fix definition of EHT-MCS 15 in MRU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the definition in IEEE Std 802.11be-2024, Table 9-417r, each bit indicates support for the transmission and reception of EHT-MCS 15 in: - B0: 52+26-tone and 106+26-tone MRUs. - B1: a 484+242-tone MRU if 80 MHz is supported. - B2: a 996+484-tone MRU and a 996+484+242-tone MRU if 160 MHz is supported. - B3: a 3×996-tone MRU if 320 MHz is supported. 
Fixes: 6239da18d2f9 ("wifi: mac80211: adjust EHT capa when lowering bandwidth") Signed-off-by: Shayne Chen Link: https://patch.msgid.link/20260313062150.3165433-1-shayne.chen@mediatek.com Signed-off-by: Johannes Berg --- include/linux/ieee80211-eht.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211-eht.h b/include/linux/ieee80211-eht.h index f8e9f5d36d2a..a97b1d01f3ac 100644 --- a/include/linux/ieee80211-eht.h +++ b/include/linux/ieee80211-eht.h @@ -251,8 +251,8 @@ struct ieee80211_eht_operation_info { #define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF 0x40 #define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x08 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x30 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x10 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x20 #define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ 0x40 #define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 #define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 -- cgit v1.2.3 From 7caedbb5ade345df0eec0bf01035c780919a9f56 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 9 Mar 2026 13:37:02 -0700 Subject: integrity: Eliminate weak definition of arch_get_secureboot() security/integrity/secure_boot.c contains a single __weak function, which breaks recordmcount when building with clang: $ make -skj"$(nproc)" ARCH=powerpc LLVM=1 ppc64_defconfig security/integrity/secure_boot.o Cannot find symbol for section 2: .text. security/integrity/secure_boot.o: failed Introduce a Kconfig symbol, CONFIG_HAVE_ARCH_GET_SECUREBOOT, to indicate that an architecture provides a definition of arch_get_secureboot(). Provide a static inline stub when this symbol is not defined to achieve the same effect as the __weak function, allowing secure_boot.c to be removed altogether. 
Move the s390 definition of arch_get_secureboot() out of the CONFIG_KEXEC_FILE block to ensure it is always available, as it does not actually depend on KEXEC_FILE. Reported-by: Arnd Bergmann Fixes: 31a6a07eefeb ("integrity: Make arch_ima_get_secureboot integrity-wide") Signed-off-by: Nathan Chancellor Acked-by: Arnd Bergmann Signed-off-by: Mimi Zohar --- arch/Kconfig | 3 +++ arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/s390/kernel/ipl.c | 10 +++++----- include/linux/secure_boot.h | 4 ++++ security/integrity/Makefile | 2 +- security/integrity/secure_boot.c | 16 ---------------- 7 files changed, 15 insertions(+), 22 deletions(-) delete mode 100644 security/integrity/secure_boot.c (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 102ddbd4298e..a6d1c8cc1d64 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1841,4 +1841,7 @@ config ARCH_WANTS_PRE_LINK_VMLINUX config ARCH_HAS_CPU_ATTACK_VECTORS bool +config HAVE_ARCH_GET_SECUREBOOT + def_bool EFI + endmenu diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index ad7a2fe63a2a..da1eafb64354 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1061,6 +1061,7 @@ config PPC_SECURE_BOOT depends on IMA_ARCH_POLICY imply IMA_SECURE_AND_OR_TRUSTED_BOOT select PSERIES_PLPKS if PPC_PSERIES + select HAVE_ARCH_GET_SECUREBOOT help Systems with firmware secure boot enabled need to define security policies to extend secure boot to the OS. 
This config allows a user diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2101cc738b5e..4197c20d34b4 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -181,6 +181,7 @@ config S390 select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_GET_SECUREBOOT select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 2d01a1713938..3c346b02ceb9 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -2388,6 +2388,11 @@ void __no_stack_protector s390_reset_system(void) diag_amode31_ops.diag308_reset(); } +bool arch_get_secureboot(void) +{ + return ipl_secure_flag; +} + #ifdef CONFIG_KEXEC_FILE int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, @@ -2505,11 +2510,6 @@ out: return buf; } -bool arch_get_secureboot(void) -{ - return ipl_secure_flag; -} - int ipl_report_free(struct ipl_report *report) { struct ipl_report_component *comp, *ncomp; diff --git a/include/linux/secure_boot.h b/include/linux/secure_boot.h index 3ded3f03655c..d17e92351567 100644 --- a/include/linux/secure_boot.h +++ b/include/linux/secure_boot.h @@ -10,10 +10,14 @@ #include +#ifdef CONFIG_HAVE_ARCH_GET_SECUREBOOT /* * Returns true if the platform secure boot is enabled. * Returns false if disabled or not supported. 
*/ bool arch_get_secureboot(void); +#else +static inline bool arch_get_secureboot(void) { return false; } +#endif #endif /* _LINUX_SECURE_BOOT_H */ diff --git a/security/integrity/Makefile b/security/integrity/Makefile index 548665e2b702..45dfdedbdad4 100644 --- a/security/integrity/Makefile +++ b/security/integrity/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_INTEGRITY) += integrity.o -integrity-y := iint.o secure_boot.o +integrity-y := iint.o integrity-$(CONFIG_INTEGRITY_AUDIT) += integrity_audit.o integrity-$(CONFIG_INTEGRITY_SIGNATURE) += digsig.o integrity-$(CONFIG_INTEGRITY_ASYMMETRIC_KEYS) += digsig_asymmetric.o diff --git a/security/integrity/secure_boot.c b/security/integrity/secure_boot.c deleted file mode 100644 index fc2693c286f8..000000000000 --- a/security/integrity/secure_boot.c +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2026 Red Hat, Inc. All Rights Reserved. - * - * Author: Coiby Xu - */ -#include - -/* - * Default weak implementation. - * Architectures that support secure boot must override this. - */ -__weak bool arch_get_secureboot(void) -{ - return false; -} -- cgit v1.2.3 From 97e6fabee5dcb2d86d4ff45f20606b8a73181f74 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Mar 2026 13:53:03 +0100 Subject: driver core: auxiliary bus: Introduce dev_is_auxiliary() Introduce dev_is_auxiliary() in analogy with dev_is_platform() to facilitate subsequent changes. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Danilo Krummrich Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/5079467.GXAFRqVoOG@rafael.j.wysocki --- drivers/base/auxiliary.c | 10 ++++++++++ include/linux/auxiliary_bus.h | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index 9fd3820d1f8a..11949d6bcda4 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -502,6 +502,16 @@ struct auxiliary_device *__devm_auxiliary_device_create(struct device *dev, } EXPORT_SYMBOL_GPL(__devm_auxiliary_device_create); +/** + * dev_is_auxiliary - check if the device is an auxiliary one + * @dev: device to check + */ +bool dev_is_auxiliary(struct device *dev) +{ + return dev->bus == &auxiliary_bus_type; +} +EXPORT_SYMBOL_GPL(dev_is_auxiliary); + void __init auxiliary_bus_init(void) { WARN_ON(bus_register(&auxiliary_bus_type)); diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 4086afd0cc6b..bc09b55e3682 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -271,6 +271,8 @@ struct auxiliary_device *__devm_auxiliary_device_create(struct device *dev, __devm_auxiliary_device_create(dev, KBUILD_MODNAME, devname, \ platform_data, 0) +bool dev_is_auxiliary(struct device *dev); + /** * module_auxiliary_driver() - Helper macro for registering an auxiliary driver * @__auxiliary_driver: auxiliary driver struct -- cgit v1.2.3 From 97892d5f0690f588bbcf755efe922c72cd248639 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Mar 2026 13:58:30 +0100 Subject: ACPI: driver: Do not set acpi_device_name() unnecessarily ACPI drivers usually set acpi_device_name() for the given struct acpi_device to whatever they like, but that value is never used unless the driver itself uses it and, quite unfortunately, drivers neglect to clear it on remove. 
Some drivers use it for printing messages or initializing the names of subordinate devices, but it is better to use string literals for that, especially if the given one is used just once. To eliminate unnecessary overhead related to acpi_device_name() handling, rework multiple core ACPI device drivers to stop setting acpi_device_name() for struct acpi_device objects manipulated by them and use a string literal instead of it where applicable. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/10840483.nUPlyArG6x@rafael.j.wysocki --- drivers/acpi/ac.c | 6 ++---- drivers/acpi/acpi_memhotplug.c | 2 -- drivers/acpi/acpi_pad.c | 2 -- drivers/acpi/acpi_processor.c | 1 - drivers/acpi/acpi_video.c | 16 +++++----------- drivers/acpi/battery.c | 3 --- drivers/acpi/button.c | 7 +++---- drivers/acpi/ec.c | 2 -- drivers/acpi/pci_link.c | 2 -- drivers/acpi/pci_root.c | 7 ++----- drivers/acpi/power.c | 2 -- drivers/acpi/sbs.c | 1 - drivers/acpi/sbshc.c | 2 -- drivers/acpi/thermal.c | 6 ++---- include/acpi/processor.h | 1 - 15 files changed, 14 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index e0560a2c71a0..4985c8890609 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -22,7 +22,6 @@ #include #define ACPI_AC_CLASS "ac_adapter" -#define ACPI_AC_DEVICE_NAME "AC Adapter" #define ACPI_AC_FILE_STATE "state" #define ACPI_AC_NOTIFY_STATUS 0x80 #define ACPI_AC_STATUS_OFFLINE 0x00 @@ -203,7 +202,6 @@ static int acpi_ac_probe(struct platform_device *pdev) return -ENOMEM; ac->device = adev; - strscpy(acpi_device_name(adev), ACPI_AC_DEVICE_NAME); strscpy(acpi_device_class(adev), ACPI_AC_CLASS); platform_set_drvdata(pdev, ac); @@ -226,8 +224,8 @@ static int acpi_ac_probe(struct platform_device *pdev) goto err_release_ac; } - pr_info("%s [%s] (%s-line)\n", acpi_device_name(adev), - acpi_device_bid(adev), str_on_off(ac->state)); + pr_info("AC Adapter [%s] (%s-line)\n", 
acpi_device_bid(adev), + str_on_off(ac->state)); ac->battery_nb.notifier_call = acpi_ac_battery_notify; register_acpi_notifier(&ac->battery_nb); diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 02a472fa85fc..7f021e6d8b0e 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -20,7 +20,6 @@ #define ACPI_MEMORY_DEVICE_CLASS "memory" #define ACPI_MEMORY_DEVICE_HID "PNP0C80" -#define ACPI_MEMORY_DEVICE_NAME "Hotplug Mem Device" static const struct acpi_device_id memory_device_ids[] = { {ACPI_MEMORY_DEVICE_HID, 0}, @@ -297,7 +296,6 @@ static int acpi_memory_device_add(struct acpi_device *device, INIT_LIST_HEAD(&mem_device->res_list); mem_device->device = device; mem_device->mgid = -1; - sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); device->driver_data = mem_device; diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index 407a0d68525c..1f735f77fd1a 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -24,7 +24,6 @@ #include #define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad" -#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" #define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 #define ACPI_PROCESSOR_AGGREGATOR_STATUS_SUCCESS 0 @@ -427,7 +426,6 @@ static int acpi_pad_probe(struct platform_device *pdev) { struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); - strscpy(acpi_device_name(adev), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME); strscpy(acpi_device_class(adev), ACPI_PROCESSOR_AGGREGATOR_CLASS); return acpi_dev_install_notify_handler(adev, ACPI_DEVICE_NOTIFY, diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index b34a48068a8d..46020a49a7ed 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -438,7 +438,6 @@ static int acpi_processor_add(struct acpi_device *device, } pr->handle = device->handle; - 
strscpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); device->driver_data = pr; diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index f48bc7817417..30822d46a71e 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -30,9 +30,6 @@ #include #include -#define ACPI_VIDEO_BUS_NAME "Video Bus" -#define ACPI_VIDEO_DEVICE_NAME "Video Device" - #define MAX_NAME_LEN 20 MODULE_AUTHOR("Bruno Ducrot"); @@ -1144,7 +1141,6 @@ static int acpi_video_bus_get_one_device(struct acpi_device *device, void *arg) return -ENOMEM; } - strscpy(acpi_device_name(device), ACPI_VIDEO_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_VIDEO_CLASS); data->device_id = device_id; @@ -1882,7 +1878,7 @@ static int acpi_video_bus_add_notify_handler(struct acpi_video_bus *video, snprintf(video->phys, sizeof(video->phys), "%s/video/input0", acpi_device_hid(video->device)); - input->name = acpi_device_name(video->device); + input->name = "Video Bus"; input->phys = video->phys; input->id.bustype = BUS_HOST; input->id.product = 0x06; @@ -2019,7 +2015,6 @@ static int acpi_video_bus_probe(struct auxiliary_device *aux_dev, auxiliary_set_drvdata(aux_dev, video); video->device = device; - strscpy(acpi_device_name(device), ACPI_VIDEO_BUS_NAME); strscpy(acpi_device_class(device), ACPI_VIDEO_CLASS); device->driver_data = video; @@ -2041,11 +2036,10 @@ static int acpi_video_bus_probe(struct auxiliary_device *aux_dev, */ acpi_device_fix_up_power_children(device); - pr_info("%s [%s] (multi-head: %s rom: %s post: %s)\n", - ACPI_VIDEO_DEVICE_NAME, acpi_device_bid(device), - str_yes_no(video->flags.multihead), - str_yes_no(video->flags.rom), - str_yes_no(video->flags.post)); + pr_info("Video Device [%s] (multi-head: %s rom: %s post: %s)\n", + acpi_device_bid(device), str_yes_no(video->flags.multihead), + str_yes_no(video->flags.rom), str_yes_no(video->flags.post)); + mutex_lock(&video_list_lock); 
list_add_tail(&video->entry, &video_bus_head); mutex_unlock(&video_list_lock); diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 8fbad8bc4650..acf5dd2177a1 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -33,8 +33,6 @@ #define ACPI_BATTERY_CAPACITY_VALID(capacity) \ ((capacity) != 0 && (capacity) != ACPI_BATTERY_VALUE_UNKNOWN) -#define ACPI_BATTERY_DEVICE_NAME "Battery" - /* Battery power unit: 0 means mW, 1 means mA */ #define ACPI_BATTERY_POWER_UNIT_MA 1 @@ -1229,7 +1227,6 @@ static int acpi_battery_probe(struct platform_device *pdev) platform_set_drvdata(pdev, battery); battery->device = device; - strscpy(acpi_device_name(device), ACPI_BATTERY_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_BATTERY_CLASS); result = devm_mutex_init(&pdev->dev, &battery->update_lock); diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 97b05246efab..c57bd9c63057 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -558,27 +558,26 @@ static int acpi_button_probe(struct platform_device *pdev) goto err_free_button; } - name = acpi_device_name(device); class = acpi_device_class(device); if (!strcmp(hid, ACPI_BUTTON_HID_POWER) || !strcmp(hid, ACPI_BUTTON_HID_POWERF)) { button->type = ACPI_BUTTON_TYPE_POWER; handler = acpi_button_notify; - strscpy(name, ACPI_BUTTON_DEVICE_NAME_POWER, MAX_ACPI_DEVICE_NAME_LEN); + name = ACPI_BUTTON_DEVICE_NAME_POWER; sprintf(class, "%s/%s", ACPI_BUTTON_CLASS, ACPI_BUTTON_SUBCLASS_POWER); } else if (!strcmp(hid, ACPI_BUTTON_HID_SLEEP) || !strcmp(hid, ACPI_BUTTON_HID_SLEEPF)) { button->type = ACPI_BUTTON_TYPE_SLEEP; handler = acpi_button_notify; - strscpy(name, ACPI_BUTTON_DEVICE_NAME_SLEEP, MAX_ACPI_DEVICE_NAME_LEN); + name = ACPI_BUTTON_DEVICE_NAME_SLEEP; sprintf(class, "%s/%s", ACPI_BUTTON_CLASS, ACPI_BUTTON_SUBCLASS_SLEEP); } else if (!strcmp(hid, ACPI_BUTTON_HID_LID)) { button->type = ACPI_BUTTON_TYPE_LID; handler = acpi_lid_notify; - strscpy(name, ACPI_BUTTON_DEVICE_NAME_LID, 
MAX_ACPI_DEVICE_NAME_LEN); + name = ACPI_BUTTON_DEVICE_NAME_LID; sprintf(class, "%s/%s", ACPI_BUTTON_CLASS, ACPI_BUTTON_SUBCLASS_LID); input->open = acpi_lid_input_open; diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 5f63ed120a2c..4b21279012a7 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -36,7 +36,6 @@ #include "internal.h" #define ACPI_EC_CLASS "embedded_controller" -#define ACPI_EC_DEVICE_NAME "Embedded Controller" /* EC status register */ #define ACPI_EC_FLAG_OBF 0x01 /* Output buffer full */ @@ -1681,7 +1680,6 @@ static int acpi_ec_probe(struct platform_device *pdev) struct acpi_ec *ec; int ret; - strscpy(acpi_device_name(device), ACPI_EC_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_EC_CLASS); if (boot_ec && (boot_ec->handle == device->handle || diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 45bdfd06bd21..5745de24024c 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -30,7 +30,6 @@ #include "internal.h" #define ACPI_PCI_LINK_CLASS "pci_irq_routing" -#define ACPI_PCI_LINK_DEVICE_NAME "PCI Interrupt Link" #define ACPI_PCI_LINK_MAX_POSSIBLE 16 static int acpi_pci_link_add(struct acpi_device *device, @@ -725,7 +724,6 @@ static int acpi_pci_link_add(struct acpi_device *device, return -ENOMEM; link->device = device; - strscpy(acpi_device_name(device), ACPI_PCI_LINK_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_PCI_LINK_CLASS); device->driver_data = link; diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 4a882e939525..f4aa5b624d9b 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -25,7 +25,6 @@ #include "internal.h" #define ACPI_PCI_ROOT_CLASS "pci_bridge" -#define ACPI_PCI_ROOT_DEVICE_NAME "PCI Root Bridge" static int acpi_pci_root_add(struct acpi_device *device, const struct acpi_device_id *not_used); static void acpi_pci_root_remove(struct acpi_device *device); @@ -689,7 +688,6 @@ static int acpi_pci_root_add(struct acpi_device *device, 
root->device = device; root->segment = segment & 0xFFFF; - strscpy(acpi_device_name(device), ACPI_PCI_ROOT_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_PCI_ROOT_CLASS); device->driver_data = root; @@ -698,9 +696,8 @@ static int acpi_pci_root_add(struct acpi_device *device, goto end; } - pr_info("%s [%s] (domain %04x %pR)\n", - acpi_device_name(device), acpi_device_bid(device), - root->segment, &root->secondary); + pr_info("PCI Root Bridge [%s] (domain %04x %pR)\n", + acpi_device_bid(device), root->segment, &root->secondary); root->mcfg_addr = acpi_pci_root_get_mcfg_addr(handle); diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index 4611159ee734..dcc9ad7790f0 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -38,7 +38,6 @@ #include "internal.h" #define ACPI_POWER_CLASS "power_resource" -#define ACPI_POWER_DEVICE_NAME "Power Resource" #define ACPI_POWER_RESOURCE_STATE_OFF 0x00 #define ACPI_POWER_RESOURCE_STATE_ON 0x01 #define ACPI_POWER_RESOURCE_STATE_UNKNOWN 0xFF @@ -955,7 +954,6 @@ struct acpi_device *acpi_add_power_resource(acpi_handle handle) mutex_init(&resource->resource_lock); INIT_LIST_HEAD(&resource->list_node); INIT_LIST_HEAD(&resource->dependents); - strscpy(acpi_device_name(device), ACPI_POWER_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_POWER_CLASS); device->power.state = ACPI_STATE_UNKNOWN; device->flags.match_driver = true; diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index bbd3938f7b52..7e789290c5ad 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -648,7 +648,6 @@ static int acpi_sbs_probe(struct platform_device *pdev) sbs->hc = dev_get_drvdata(pdev->dev.parent); sbs->device = device; - strscpy(acpi_device_name(device), ACPI_SBS_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_SBS_CLASS); result = acpi_charger_add(sbs); diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c index 36850831910b..97eaa2fc31f2 100644 --- a/drivers/acpi/sbshc.c +++ b/drivers/acpi/sbshc.c @@ -19,7 +19,6 @@ 
#include "internal.h" #define ACPI_SMB_HC_CLASS "smbus_host_ctl" -#define ACPI_SMB_HC_DEVICE_NAME "ACPI SMBus HC" struct acpi_smb_hc { struct acpi_ec *ec; @@ -251,7 +250,6 @@ static int acpi_smbus_hc_probe(struct platform_device *pdev) return -EIO; } - strscpy(acpi_device_name(device), ACPI_SMB_HC_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_SMB_HC_CLASS); hc = kzalloc_obj(struct acpi_smb_hc); diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 64356b004a57..6ccb364665d1 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -35,7 +35,6 @@ #include "internal.h" #define ACPI_THERMAL_CLASS "thermal_zone" -#define ACPI_THERMAL_DEVICE_NAME "Thermal Zone" #define ACPI_THERMAL_NOTIFY_TEMPERATURE 0x80 #define ACPI_THERMAL_NOTIFY_THRESHOLDS 0x81 #define ACPI_THERMAL_NOTIFY_DEVICES 0x82 @@ -800,7 +799,6 @@ static int acpi_thermal_probe(struct platform_device *pdev) tz->device = device; strscpy(tz->name, device->pnp.bus_id); - strscpy(acpi_device_name(device), ACPI_THERMAL_DEVICE_NAME); strscpy(acpi_device_class(device), ACPI_THERMAL_CLASS); acpi_thermal_aml_dependency_fix(tz); @@ -879,8 +877,8 @@ static int acpi_thermal_probe(struct platform_device *pdev) mutex_init(&tz->thermal_check_lock); INIT_WORK(&tz->thermal_check_work, acpi_thermal_check_fn); - pr_info("%s [%s] (%ld C)\n", acpi_device_name(device), - acpi_device_bid(device), deci_kelvin_to_celsius(tz->temp_dk)); + pr_info("Thermal Zone [%s] (%ld C)\n", acpi_device_bid(device), + deci_kelvin_to_celsius(tz->temp_dk)); result = acpi_dev_install_notify_handler(device, ACPI_DEVICE_NOTIFY, acpi_thermal_notify, tz); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 7146a8e9e9c2..43fe4a85fc0f 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -15,7 +15,6 @@ #include #define ACPI_PROCESSOR_CLASS "processor" -#define ACPI_PROCESSOR_DEVICE_NAME "Processor" #define ACPI_PROCESSOR_DEVICE_HID "ACPI0007" #define ACPI_PROCESSOR_CONTAINER_HID "ACPI0010" -- 
cgit v1.2.3 From 69652f32c9ac71e2b0c8ed407e13ad905e00e947 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Mar 2026 13:59:17 +0100 Subject: ACPI: event: Redefine acpi_notifier_call_chain() Notice that acpi_notifier_call_chain() only uses its device argument to retrieve the pnp.device_class and pnp.bus_id values from there, so it can be redefined to take pointers to those two strings as parameters instead of a struct acpi_device pointer. That allows all of its callers to pass a string literal as its first argument, so they won't need to initialize pnp.device_class in struct acpi_device objects operated by them any more, and its signature becomes more similar to acpi_bus_generate_netlink_event() then. Update the code as per the above. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/2056097.PYKUYFuaPT@rafael.j.wysocki --- drivers/acpi/ac.c | 3 ++- drivers/acpi/acpi_video.c | 9 ++++++--- drivers/acpi/battery.c | 3 ++- drivers/acpi/event.c | 7 ++++--- include/acpi/acpi_bus.h | 3 ++- 5 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 4985c8890609..2b500e89169f 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -133,7 +133,8 @@ static void acpi_ac_notify(acpi_handle handle, u32 event, void *data) acpi_bus_generate_netlink_event(adev->pnp.device_class, dev_name(&adev->dev), event, (u32) ac->state); - acpi_notifier_call_chain(adev, event, (u32) ac->state); + acpi_notifier_call_chain(ACPI_AC_CLASS, acpi_device_bid(adev), + event, ac->state); power_supply_changed(ac->charger); } } diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index 30822d46a71e..c747827653d9 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -1566,7 +1566,8 @@ static void acpi_video_bus_notify(acpi_handle handle, u32 event, void *data) break; } - if (acpi_notifier_call_chain(device, event, 0)) + if
(acpi_notifier_call_chain(ACPI_VIDEO_CLASS, acpi_device_bid(device), + event, 0)) /* Something vetoed the keypress. */ keycode = 0; @@ -1607,7 +1608,8 @@ static void acpi_video_device_notify(acpi_handle handle, u32 event, void *data) if (video_device->backlight) backlight_force_update(video_device->backlight, BACKLIGHT_UPDATE_HOTKEY); - acpi_notifier_call_chain(device, event, 0); + acpi_notifier_call_chain(ACPI_VIDEO_CLASS, acpi_device_bid(device), + event, 0); return; } @@ -1640,7 +1642,8 @@ static void acpi_video_device_notify(acpi_handle handle, u32 event, void *data) if (keycode) may_report_brightness_keys = true; - acpi_notifier_call_chain(device, event, 0); + acpi_notifier_call_chain(ACPI_VIDEO_CLASS, acpi_device_bid(device), + event, 0); if (keycode && (report_key_events & REPORT_BRIGHTNESS_KEY_EVENTS)) { input_report_key(input, keycode, 1); diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index acf5dd2177a1..1bfc4179e885 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -1081,7 +1081,8 @@ static void acpi_battery_notify(acpi_handle handle, u32 event, void *data) acpi_bus_generate_netlink_event(device->pnp.device_class, dev_name(&device->dev), event, acpi_battery_present(battery)); - acpi_notifier_call_chain(device, event, acpi_battery_present(battery)); + acpi_notifier_call_chain(ACPI_BATTERY_CLASS, acpi_device_bid(device), + event, acpi_battery_present(battery)); /* acpi_battery_update could remove power_supply object */ if (old && battery->bat) power_supply_changed(battery->bat); diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index 96a9aaaaf9f7..4d840d2e7b98 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -24,12 +24,13 @@ /* ACPI notifier chain */ static BLOCKING_NOTIFIER_HEAD(acpi_chain_head); -int acpi_notifier_call_chain(struct acpi_device *dev, u32 type, u32 data) +int acpi_notifier_call_chain(const char *device_class, + const char *bus_id, u32 type, u32 data) { struct acpi_bus_event event; - 
strscpy(event.device_class, dev->pnp.device_class); - strscpy(event.bus_id, dev->pnp.bus_id); + strscpy(event.device_class, device_class); + strscpy(event.bus_id, bus_id); event.type = type; event.data = data; return (blocking_notifier_call_chain(&acpi_chain_head, 0, (void *)&event) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index aad1a95e6863..ff14c9362122 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -625,7 +625,8 @@ int acpi_dev_install_notify_handler(struct acpi_device *adev, void acpi_dev_remove_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler); -extern int acpi_notifier_call_chain(struct acpi_device *, u32, u32); +extern int acpi_notifier_call_chain(const char *device_class, + const char *bus_id, u32 type, u32 data); extern int register_acpi_notifier(struct notifier_block *); extern int unregister_acpi_notifier(struct notifier_block *); -- cgit v1.2.3 From e18947038bf4f39d47cdba511f85a9af668d56e1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Mar 2026 14:00:41 +0100 Subject: ACPI: driver: Do not set acpi_device_class() unnecessarily Several core ACPI device drivers set acpi_device_class() for the given struct acpi_device to whatever they like, but that value is never used unless the driver itself uses it and, sadly, they neglect to clear it on remove. Since the only one of them still using acpi_device_class() after previous changes is the button driver, update the others to stop setting it in vain. Also drop the related device class symbols that become redundant. Since the ACPI button driver continues to use acpi_device_class(), make it clear the struct field represented by acpi_device_class() in its remove callback. No intentional functional impact. Signed-off-by: Rafael J.
Wysocki Link: https://patch.msgid.link/3706295.iIbC2pHGDl@rafael.j.wysocki --- drivers/acpi/ac.c | 1 - drivers/acpi/acpi_memhotplug.c | 2 -- drivers/acpi/acpi_pad.c | 5 +---- drivers/acpi/acpi_processor.c | 1 - drivers/acpi/acpi_video.c | 3 --- drivers/acpi/battery.c | 1 - drivers/acpi/button.c | 2 ++ drivers/acpi/ec.c | 4 ---- drivers/acpi/pci_link.c | 2 -- drivers/acpi/pci_root.c | 2 -- drivers/acpi/power.c | 2 -- drivers/acpi/processor_driver.c | 4 ++-- drivers/acpi/sbs.c | 2 -- drivers/acpi/sbshc.c | 4 ---- drivers/acpi/thermal.c | 1 - include/acpi/processor.h | 1 - 16 files changed, 5 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 41a085562c63..2825db974bd8 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -203,7 +203,6 @@ static int acpi_ac_probe(struct platform_device *pdev) return -ENOMEM; ac->device = adev; - strscpy(acpi_device_class(adev), ACPI_AC_CLASS); platform_set_drvdata(pdev, ac); diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 7f021e6d8b0e..1d7dfe4ee9a6 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -18,7 +18,6 @@ #include "internal.h" -#define ACPI_MEMORY_DEVICE_CLASS "memory" #define ACPI_MEMORY_DEVICE_HID "PNP0C80" static const struct acpi_device_id memory_device_ids[] = { @@ -296,7 +295,6 @@ static int acpi_memory_device_add(struct acpi_device *device, INIT_LIST_HEAD(&mem_device->res_list); mem_device->device = device; mem_device->mgid = -1; - sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); device->driver_data = mem_device; /* Get the range from the _CRS */ diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index b46c4dd65fbe..0a8e02bc8c8b 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -23,7 +23,6 @@ #include #include -#define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad" #define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 #define 
ACPI_PROCESSOR_AGGREGATOR_STATUS_SUCCESS 0 @@ -413,7 +412,7 @@ static void acpi_pad_notify(acpi_handle handle, u32 event, void *data) switch (event) { case ACPI_PROCESSOR_AGGREGATOR_NOTIFY: acpi_pad_handle_notify(handle); - acpi_bus_generate_netlink_event(ACPI_PROCESSOR_AGGREGATOR_CLASS, + acpi_bus_generate_netlink_event("acpi_pad", dev_name(&adev->dev), event, 0); break; default: @@ -426,8 +425,6 @@ static int acpi_pad_probe(struct platform_device *pdev) { struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); - strscpy(acpi_device_class(adev), ACPI_PROCESSOR_AGGREGATOR_CLASS); - return acpi_dev_install_notify_handler(adev, ACPI_DEVICE_NOTIFY, acpi_pad_notify, adev); } diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 46020a49a7ed..2ac76f3b1cfd 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -438,7 +438,6 @@ static int acpi_processor_add(struct acpi_device *device, } pr->handle = device->handle; - strscpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); device->driver_data = pr; result = acpi_processor_get_info(device); diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index c747827653d9..05793ddef787 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -1141,8 +1141,6 @@ static int acpi_video_bus_get_one_device(struct acpi_device *device, void *arg) return -ENOMEM; } - strscpy(acpi_device_class(device), ACPI_VIDEO_CLASS); - data->device_id = device_id; data->video = video; data->dev = device; @@ -2018,7 +2016,6 @@ static int acpi_video_bus_probe(struct auxiliary_device *aux_dev, auxiliary_set_drvdata(aux_dev, video); video->device = device; - strscpy(acpi_device_class(device), ACPI_VIDEO_CLASS); device->driver_data = video; acpi_video_bus_find_cap(video); diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 54048438b5da..b4c25474f42f 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -1228,7 +1228,6 @@ static int 
acpi_battery_probe(struct platform_device *pdev) platform_set_drvdata(pdev, battery); battery->device = device; - strscpy(acpi_device_class(device), ACPI_BATTERY_CLASS); result = devm_mutex_init(&pdev->dev, &battery->update_lock); if (result) diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index cc17d9d843ec..dc064a388c23 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -697,6 +697,8 @@ static void acpi_button_remove(struct platform_device *pdev) acpi_button_remove_fs(button); input_unregister_device(button->input); kfree(button); + + memset(acpi_device_class(adev), 0, sizeof(acpi_device_class)); } static int param_set_lid_init_state(const char *val, diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 4b21279012a7..0624d8673679 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -35,8 +35,6 @@ #include "internal.h" -#define ACPI_EC_CLASS "embedded_controller" - /* EC status register */ #define ACPI_EC_FLAG_OBF 0x01 /* Output buffer full */ #define ACPI_EC_FLAG_IBF 0x02 /* Input buffer full */ @@ -1680,8 +1678,6 @@ static int acpi_ec_probe(struct platform_device *pdev) struct acpi_ec *ec; int ret; - strscpy(acpi_device_class(device), ACPI_EC_CLASS); - if (boot_ec && (boot_ec->handle == device->handle || !strcmp(acpi_device_hid(device), ACPI_ECDT_HID))) { /* Fast path: this device corresponds to the boot EC. 
*/ diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 5745de24024c..e6ed13aee48d 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -29,7 +29,6 @@ #include "internal.h" -#define ACPI_PCI_LINK_CLASS "pci_irq_routing" #define ACPI_PCI_LINK_MAX_POSSIBLE 16 static int acpi_pci_link_add(struct acpi_device *device, @@ -724,7 +723,6 @@ static int acpi_pci_link_add(struct acpi_device *device, return -ENOMEM; link->device = device; - strscpy(acpi_device_class(device), ACPI_PCI_LINK_CLASS); device->driver_data = link; mutex_lock(&acpi_link_lock); diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index f4aa5b624d9b..a0ba64e45e8a 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -24,7 +24,6 @@ #include #include "internal.h" -#define ACPI_PCI_ROOT_CLASS "pci_bridge" static int acpi_pci_root_add(struct acpi_device *device, const struct acpi_device_id *not_used); static void acpi_pci_root_remove(struct acpi_device *device); @@ -688,7 +687,6 @@ static int acpi_pci_root_add(struct acpi_device *device, root->device = device; root->segment = segment & 0xFFFF; - strscpy(acpi_device_class(device), ACPI_PCI_ROOT_CLASS); device->driver_data = root; if (hotadd && dmar_device_add(handle)) { diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index dcc9ad7790f0..6b1680ec3694 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -37,7 +37,6 @@ #include "sleep.h" #include "internal.h" -#define ACPI_POWER_CLASS "power_resource" #define ACPI_POWER_RESOURCE_STATE_OFF 0x00 #define ACPI_POWER_RESOURCE_STATE_ON 0x01 #define ACPI_POWER_RESOURCE_STATE_UNKNOWN 0xFF @@ -954,7 +953,6 @@ struct acpi_device *acpi_add_power_resource(acpi_handle handle) mutex_init(&resource->resource_lock); INIT_LIST_HEAD(&resource->list_node); INIT_LIST_HEAD(&resource->dependents); - strscpy(acpi_device_class(device), ACPI_POWER_CLASS); device->power.state = ACPI_STATE_UNKNOWN; device->flags.match_driver = true; diff --git 
a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index bbe2c5afb8ba..cda8fd720000 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -85,8 +85,8 @@ static void acpi_processor_notify(acpi_handle handle, u32 event, void *data) return; } - acpi_bus_generate_netlink_event(ACPI_PROCESSOR_CLASS, - dev_name(&device->dev), event, ev_data); + acpi_bus_generate_netlink_event("processor", dev_name(&device->dev), + event, ev_data); } static int __acpi_processor_start(struct acpi_device *device); diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 7e789290c5ad..e301d73ac420 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -26,7 +26,6 @@ #include "sbshc.h" -#define ACPI_SBS_CLASS "sbs" #define ACPI_AC_CLASS "ac_adapter" #define ACPI_SBS_DEVICE_NAME "Smart Battery System" #define ACPI_BATTERY_DIR_NAME "BAT%i" @@ -648,7 +647,6 @@ static int acpi_sbs_probe(struct platform_device *pdev) sbs->hc = dev_get_drvdata(pdev->dev.parent); sbs->device = device; - strscpy(acpi_device_class(device), ACPI_SBS_CLASS); result = acpi_charger_add(sbs); if (result && result != -ENODEV) diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c index 97eaa2fc31f2..f413270415b6 100644 --- a/drivers/acpi/sbshc.c +++ b/drivers/acpi/sbshc.c @@ -18,8 +18,6 @@ #include "sbshc.h" #include "internal.h" -#define ACPI_SMB_HC_CLASS "smbus_host_ctl" - struct acpi_smb_hc { struct acpi_ec *ec; struct mutex lock; @@ -250,8 +248,6 @@ static int acpi_smbus_hc_probe(struct platform_device *pdev) return -EIO; } - strscpy(acpi_device_class(device), ACPI_SMB_HC_CLASS); - hc = kzalloc_obj(struct acpi_smb_hc); if (!hc) return -ENOMEM; diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index e764641a43c1..b8b487d89d25 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -799,7 +799,6 @@ static int acpi_thermal_probe(struct platform_device *pdev) tz->device = device; strscpy(tz->name, device->pnp.bus_id); - 
strscpy(acpi_device_class(device), ACPI_THERMAL_CLASS); acpi_thermal_aml_dependency_fix(tz); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 43fe4a85fc0f..554be224ce76 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -14,7 +14,6 @@ #include -#define ACPI_PROCESSOR_CLASS "processor" #define ACPI_PROCESSOR_DEVICE_HID "ACPI0007" #define ACPI_PROCESSOR_CONTAINER_HID "ACPI0010" -- cgit v1.2.3 From 236ad358166cca167e6ed33639bb7948e7a2f6fd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Mar 2026 14:03:46 +0100 Subject: ACPI: AC: Define ACPI_AC_CLASS in one place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ACPI_AC_CLASS symbol is defined in several places in the same way which is rather unfortunate. Instead, define it in one common header file (acpi_bus.h) so that it is accessible to all of its users. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/6163384.MhkbZ0Pkbq@rafael.j.wysocki --- drivers/acpi/ac.c | 1 - drivers/acpi/sbs.c | 1 - drivers/gpu/drm/amd/include/amd_acpi.h | 2 -- drivers/gpu/drm/radeon/radeon_acpi.c | 2 -- drivers/platform/x86/hp/hp-wmi.c | 2 -- drivers/platform/x86/lenovo/wmi-capdata.c | 1 - include/acpi/acpi_bus.h | 2 ++ 7 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 2825db974bd8..e9e970fd8f33 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -21,7 +21,6 @@ #include #include -#define ACPI_AC_CLASS "ac_adapter" #define ACPI_AC_FILE_STATE "state" #define ACPI_AC_NOTIFY_STATUS 0x80 #define ACPI_AC_STATUS_OFFLINE 0x00 diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index e301d73ac420..440f1d69aca8 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -26,7 +26,6 @@ #include "sbshc.h" -#define ACPI_AC_CLASS "ac_adapter" #define ACPI_SBS_DEVICE_NAME "Smart Battery System" 
#define ACPI_BATTERY_DIR_NAME "BAT%i" #define ACPI_AC_DIR_NAME "AC0" diff --git a/drivers/gpu/drm/amd/include/amd_acpi.h b/drivers/gpu/drm/amd/include/amd_acpi.h index 84933c07f720..4225640131f2 100644 --- a/drivers/gpu/drm/amd/include/amd_acpi.h +++ b/drivers/gpu/drm/amd/include/amd_acpi.h @@ -26,8 +26,6 @@ #include -#define ACPI_AC_CLASS "ac_adapter" - struct atif_verify_interface { u16 size; /* structure size in bytes (includes size field) */ u16 version; /* version */ diff --git a/drivers/gpu/drm/radeon/radeon_acpi.c b/drivers/gpu/drm/radeon/radeon_acpi.c index 08f8ba4fd148..9f511ff08822 100644 --- a/drivers/gpu/drm/radeon/radeon_acpi.c +++ b/drivers/gpu/drm/radeon/radeon_acpi.c @@ -44,8 +44,6 @@ bool radeon_atpx_dgpu_req_power_for_displays(void); static inline bool radeon_atpx_dgpu_req_power_for_displays(void) { return false; } #endif -#define ACPI_AC_CLASS "ac_adapter" - struct atif_verify_interface { u16 size; /* structure size in bytes (includes size field) */ u16 version; /* version */ diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 68ede7e5757a..1ee8e2a5c738 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -58,8 +58,6 @@ enum hp_ec_offsets { #define HP_POWER_LIMIT_DEFAULT 0x00 #define HP_POWER_LIMIT_NO_CHANGE 0xFF -#define ACPI_AC_CLASS "ac_adapter" - #define zero_if_sup(tmp) (zero_insize_support?0:sizeof(tmp)) // use when zero insize is required enum hp_thermal_profile_omen_v0 { diff --git a/drivers/platform/x86/lenovo/wmi-capdata.c b/drivers/platform/x86/lenovo/wmi-capdata.c index ee1fb02d8e31..b73d378f0e8b 100644 --- a/drivers/platform/x86/lenovo/wmi-capdata.c +++ b/drivers/platform/x86/lenovo/wmi-capdata.c @@ -53,7 +53,6 @@ #define LENOVO_CAPABILITY_DATA_01_GUID "7A8F5407-CB67-4D6E-B547-39B3BE018154" #define LENOVO_FAN_TEST_DATA_GUID "B642801B-3D21-45DE-90AE-6E86F164FB21" -#define ACPI_AC_CLASS "ac_adapter" #define ACPI_AC_NOTIFY_STATUS 0x80 #define LWMI_FEATURE_ID_FAN_TEST 
0x05 diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index ff14c9362122..f7c2d3daed44 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -613,6 +613,8 @@ struct acpi_bus_event { u32 data; }; +#define ACPI_AC_CLASS "ac_adapter" + extern struct kobject *acpi_kobj; extern int acpi_bus_generate_netlink_event(const char*, const char*, u8, int); void acpi_bus_private_data_handler(acpi_handle, void *); -- cgit v1.2.3 From 98d709cba3193f0bec54da4cd76ef499ea2f1ef7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Mar 2026 09:43:22 -1000 Subject: sched_ext: Implement SCX_ENQ_IMMED Add SCX_ENQ_IMMED enqueue flag for local DSQ insertions. Once a task is dispatched with IMMED, it either gets on the CPU immediately and stays on it, or gets reenqueued back to the BPF scheduler. It will never linger on a local DSQ behind other tasks or on a CPU taken by a higher-priority class. rq_is_open() uses rq->next_class to determine whether the rq is available, and wakeup_preempt_scx() triggers reenqueue when a higher-priority class task arrives. These capture all higher class preemptions. Combined with reenqueue points in the dispatch path, all cases where an IMMED task would not execute immediately are covered. SCX_TASK_IMMED persists in p->scx.flags until the next fresh enqueue, so the guarantee survives SAVE/RESTORE cycles. If preempted while running, put_prev_task_scx() reenqueues through ops.enqueue() with SCX_TASK_REENQ_PREEMPTED instead of silently placing the task back on the local DSQ. This enables tighter scheduling latency control by preventing tasks from piling up on local DSQs. It also enables opportunistic CPU sharing across sub-schedulers - without this, a sub-scheduler can stuff the local DSQ of a shared CPU, making it difficult for others to use. v2: - Rewrite is_curr_done() as rq_is_open() using rq->next_class and implement wakeup_preempt_scx() to achieve complete coverage of all cases where IMMED tasks could get stranded. 
- Track IMMED persistently in p->scx.flags and reenqueue preempted-while-running tasks through ops.enqueue(). - Bound deferred reenq cycles (SCX_REENQ_LOCAL_MAX_REPEAT). - Misc renames, documentation. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 5 + kernel/sched/ext.c | 271 ++++++++++++++++++++++++++++--- kernel/sched/ext_internal.h | 47 ++++++ kernel/sched/sched.h | 2 + tools/sched_ext/include/scx/compat.bpf.h | 5 + 5 files changed, 311 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 60a4f65d0174..602dc83cab36 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -100,6 +100,7 @@ enum scx_ent_flags { SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ + SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ /* * Bits 8 and 9 are used to carry task state: @@ -125,6 +126,8 @@ enum scx_ent_flags { * * NONE not being reenqueued * KFUNC reenqueued by scx_bpf_dsq_reenq() and friends + * IMMED reenqueued due to failed ENQ_IMMED + * PREEMPTED preempted while running */ SCX_TASK_REENQ_REASON_SHIFT = 12, SCX_TASK_REENQ_REASON_BITS = 2, @@ -132,6 +135,8 @@ enum scx_ent_flags { SCX_TASK_REENQ_NONE = 0 << SCX_TASK_REENQ_REASON_SHIFT, SCX_TASK_REENQ_KFUNC = 1 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_IMMED = 2 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_PREEMPTED = 3 << SCX_TASK_REENQ_REASON_SHIFT, /* iteration cursor, not a task */ SCX_TASK_CURSOR = 1 << 31, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2f59265b9b57..c75c35b67a18 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -406,6 +406,62 @@ static bool bypass_dsp_enabled(struct scx_sched *sch) return unlikely(atomic_read(&sch->bypass_dsp_enable_depth)); } +/** + * rq_is_open - Is 
the rq available for immediate execution of an SCX task? + * @rq: rq to test + * @enq_flags: optional %SCX_ENQ_* of the task being enqueued + * + * Returns %true if @rq is currently open for executing an SCX task. After a + * %false return, @rq is guaranteed to invoke SCX dispatch path at least once + * before going to idle and not inserting a task into @rq's local DSQ after a + * %false return doesn't cause @rq to stall. + */ +static bool rq_is_open(struct rq *rq, u64 enq_flags) +{ + lockdep_assert_rq_held(rq); + + /* + * A higher-priority class task is either running or in the process of + * waking up on @rq. + */ + if (sched_class_above(rq->next_class, &ext_sched_class)) + return false; + + /* + * @rq is either in transition to or in idle and there is no + * higher-priority class task waking up on it. + */ + if (sched_class_above(&ext_sched_class, rq->next_class)) + return true; + + /* + * @rq is either picking, in transition to, or running an SCX task. + */ + + /* + * If we're in the dispatch path holding rq lock, $curr may or may not + * be ready depending on whether the on-going dispatch decides to extend + * $curr's slice. We say yes here and resolve it at the end of dispatch. + * See balance_one(). + */ + if (rq->scx.flags & SCX_RQ_IN_BALANCE) + return true; + + /* + * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch, + * so allow it to avoid spuriously triggering reenq on a combined + * PREEMPT|IMMED insertion. + */ + if (enq_flags & SCX_ENQ_PREEMPT) + return true; + + /* + * @rq is either in transition to or running an SCX task and can't go + * idle without another SCX dispatch cycle. + */ + return false; +} + /* * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX * ops. 
When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate @@ -1220,6 +1276,16 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq } } +static void schedule_reenq_local(struct rq *rq, u64 reenq_flags) +{ + struct scx_sched *root = rcu_dereference_sched(scx_root); + + if (WARN_ON_ONCE(!root)) + return; + + schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags); +} + /** * touch_core_sched - Update timestamp used for core-sched task ordering * @rq: rq to read clock from, must be locked @@ -1296,10 +1362,58 @@ static bool scx_dsq_priq_less(struct rb_node *node_a, return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); } -static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) +static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ - WRITE_ONCE(dsq->nr, dsq->nr + delta); + WRITE_ONCE(dsq->nr, dsq->nr + 1); + + /* + * Once @p reaches a local DSQ, it can only leave it by being dispatched + * to the CPU or dequeued. In both cases, the only way @p can go back to + * the BPF sched is through enqueueing. If being inserted into a local + * DSQ with IMMED, persist the state until the next enqueueing event in + * do_enqueue_task() so that we can maintain IMMED protection through + * e.g. SAVE/RESTORE cycles and slice extensions. + */ + if (enq_flags & SCX_ENQ_IMMED) { + if (unlikely(dsq->id != SCX_DSQ_LOCAL)) { + WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK)); + return; + } + p->scx.flags |= SCX_TASK_IMMED; + } + + if (p->scx.flags & SCX_TASK_IMMED) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + + if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) + return; + + rq->scx.nr_immed++; + + /* + * If @rq already had other tasks or the current task is not + * done yet, @p can't go on the CPU immediately. Re-enqueue. 
+ */ + if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags))) + schedule_reenq_local(rq, 0); + } +} + +static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p) +{ + /* see dsq_inc_nr() */ + WRITE_ONCE(dsq->nr, dsq->nr - 1); + + if (p->scx.flags & SCX_TASK_IMMED) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + + if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) || + WARN_ON_ONCE(rq->scx.nr_immed <= 0)) + return; + + rq->scx.nr_immed--; + } } static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) @@ -1458,7 +1572,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, WRITE_ONCE(dsq->seq, dsq->seq + 1); p->scx.dsq_seq = dsq->seq; - dsq_mod_nr(dsq, 1); + dsq_inc_nr(dsq, p, enq_flags); p->scx.dsq = dsq; /* @@ -1512,7 +1626,7 @@ static void task_unlink_from_dsq(struct task_struct *p, } list_del_init(&p->scx.dsq_list.node); - dsq_mod_nr(dsq, -1); + dsq_dec_nr(dsq, p); if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { struct task_struct *first_task; @@ -1723,10 +1837,18 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); - /* rq migration */ + /* internal movements - rq migration / RESTORE */ if (sticky_cpu == cpu_of(rq)) goto local_norefill; + /* + * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr(). + * Note that exiting and migration-disabled tasks that skip + * ops.enqueue() below will lose IMMED protection unless + * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set. + */ + p->scx.flags &= ~SCX_TASK_IMMED; + /* * If !scx_rq_online(), we already told the BPF scheduler that the CPU * is offline and are just running the hotplug path. 
Don't bother the @@ -2032,6 +2154,30 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) return false; } +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) +{ + /* + * Preemption between SCX tasks is implemented by resetting the victim + * task's slice to 0 and triggering reschedule on the target CPU. + * Nothing to do. + */ + if (p->sched_class == &ext_sched_class) + return; + + /* + * Getting preempted by a higher-priority class. Reenqueue IMMED tasks. + * This captures all preemption cases including: + * + * - A SCX task is currently running. + * + * - @rq is waking from idle due to a SCX task waking to it. + * + * - A higher-priority wakes up while SCX dispatch is in progress. + */ + if (rq->scx.nr_immed) + schedule_reenq_local(rq, 0); +} + static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct rq *dst_rq) @@ -2049,7 +2195,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, else list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); - dsq_mod_nr(dst_dsq, 1); + dsq_inc_nr(dst_dsq, p, enq_flags); p->scx.dsq = dst_dsq; local_dsq_post_enq(dst_dsq, p, enq_flags); @@ -2257,6 +2403,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { dst_dsq = find_global_dsq(sch, task_cpu(p)); dst_rq = src_rq; + enq_flags |= SCX_ENQ_GDSQ_FALLBACK; } } else { /* no need to migrate if destination is a non-local DSQ */ @@ -2385,7 +2532,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p, - enq_flags | SCX_ENQ_CLEAR_OPSS); + enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK); return; } @@ -2738,6 +2885,19 @@ static int balance_one(struct rq *rq, struct task_struct *prev) return false; 
has_tasks: + /* + * @rq may have extra IMMED tasks without reenq scheduled: + * + * - rq_is_open() can't reliably tell when and how slice is going to be + * modified for $curr and allows IMMED tasks to be queued while + * dispatch is in progress. + * + * - A non-IMMED HEAD task can get queued in front of an IMMED task + * between the IMMED queueing and the subsequent scheduling event. + */ + if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed)) + schedule_reenq_local(rq, 0); + rq->scx.flags &= ~SCX_RQ_IN_BALANCE; return true; } @@ -2859,11 +3019,17 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * If @p has slice left and is being put, @p is getting * preempted by a higher priority scheduler class or core-sched * forcing a different task. Leave it at the head of the local - * DSQ. + * DSQ unless it was an IMMED task. IMMED tasks should not + * linger on a busy CPU, reenqueue them to the BPF scheduler. */ if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) { - dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, - SCX_ENQ_HEAD); + if (p->scx.flags & SCX_TASK_IMMED) { + p->scx.flags |= SCX_TASK_REENQ_PREEMPTED; + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + } else { + dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD); + } goto switch_class; } @@ -3682,8 +3848,6 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) scx_disable_task(scx_task_sched(p), p); } -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {} - static void switched_to_scx(struct rq *rq, struct task_struct *p) {} int scx_check_setscheduler(struct task_struct *p, int policy) @@ -3725,9 +3889,45 @@ static void process_ddsp_deferred_locals(struct rq *rq) } } +/* + * Determine whether @p should be reenqueued from a local DSQ. + * + * @reenq_flags is mutable and accumulates state across the DSQ walk: + * + * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. 
"First" + * tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at + * the head consumes the first slot. + * + * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if + * rq_is_open() is true. + * + * An IMMED task is kept (returns %false) only if it's the first task in the DSQ + * AND the current task is done — i.e. it will execute immediately. All other + * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head, + * every IMMED task behind it gets reenqueued. + * + * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ | + * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local + * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers + * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT + * in process_deferred_reenq_locals(). + */ static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason) { + bool first; + + first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST); + *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST; + *reason = SCX_TASK_REENQ_KFUNC; + + if ((p->scx.flags & SCX_TASK_IMMED) && + (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) { + __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1); + *reason = SCX_TASK_REENQ_IMMED; + return true; + } + return *reenq_flags & SCX_REENQ_ANY; } @@ -3739,6 +3939,11 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) lockdep_assert_rq_held(rq); + if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK)) + reenq_flags &= ~__SCX_REENQ_TSR_MASK; + if (rq_is_open(rq, 0)) + reenq_flags |= SCX_REENQ_TSR_RQ_OPEN; + /* * The BPF scheduler may choose to dispatch tasks back to * @rq->scx.local_dsq. 
Move all candidate tasks off to a private list @@ -3792,11 +3997,14 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) static void process_deferred_reenq_locals(struct rq *rq) { + u64 seq = ++rq->scx.deferred_reenq_locals_seq; + lockdep_assert_rq_held(rq); while (true) { struct scx_sched *sch; u64 reenq_flags; + bool skip = false; scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { struct scx_deferred_reenq_local *drl = @@ -3811,15 +4019,31 @@ static void process_deferred_reenq_locals(struct rq *rq) sch_pcpu = container_of(drl, struct scx_sched_pcpu, deferred_reenq_local); sch = sch_pcpu->sch; + reenq_flags = drl->flags; WRITE_ONCE(drl->flags, 0); list_del_init(&drl->node); + + if (likely(drl->seq != seq)) { + drl->seq = seq; + drl->cnt = 0; + } else { + if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) { + scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times", + drl->cnt); + skip = true; + } + + __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1); + } } - /* see schedule_dsq_reenq() */ - smp_mb(); + if (!skip) { + /* see schedule_dsq_reenq() */ + smp_mb(); - reenq_local(sch, rq, reenq_flags); + reenq_local(sch, rq, reenq_flags); + } } } @@ -4208,10 +4432,6 @@ static void scx_cgroup_unlock(void) {} /* * Omitted operations: * - * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task - * isn't tied to the CPU at that point. Preemption is implemented by resetting - * the victim task's slice to 0 and triggering reschedule on the target CPU. - * * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. 
* * - task_fork/dead: We need fork/dead notifications for all tasks regardless of @@ -4580,6 +4800,8 @@ static ssize_t scx_attr_events_show(struct kobject *kobj, at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT); at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); @@ -6019,6 +6241,8 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_dump_event(s, &events, SCX_EV_REENQ_IMMED); + scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT); scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); @@ -7532,6 +7756,13 @@ void __init init_sched_ext_class(void) */ static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 enq_flags) { + if ((enq_flags & SCX_ENQ_IMMED) && + unlikely(dsq_id != SCX_DSQ_LOCAL && + (dsq_id & SCX_DSQ_LOCAL_ON) != SCX_DSQ_LOCAL_ON)) { + scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); + return false; + } + return true; } @@ -9101,6 +9332,8 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED); + 
scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT); scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index c78dadaadab8..2ef855f7c861 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -31,6 +31,8 @@ enum scx_consts { SCX_BYPASS_LB_MIN_DELTA_DIV = 4, SCX_BYPASS_LB_BATCH = 256, + SCX_REENQ_LOCAL_MAX_REPEAT = 256, + SCX_SUB_MAX_DEPTH = 4, }; @@ -887,6 +889,24 @@ struct scx_event_stats { */ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + /* + * The number of times a task, enqueued on a local DSQ with + * SCX_ENQ_IMMED, was re-enqueued because the CPU was not available for + * immediate execution. + */ + s64 SCX_EV_REENQ_IMMED; + + /* + * The number of times a reenq of local DSQ caused another reenq of + * local DSQ. This can happen when %SCX_ENQ_IMMED races against a higher + * priority class task even if the BPF scheduler always satisfies the + * prerequisites for %SCX_ENQ_IMMED at the time of enqueue. However, + * that scenario is very unlikely and this count going up regularly + * indicates that the BPF scheduler is handling %SCX_ENQ_REENQ + * incorrectly causing recursive reenqueues. + */ + s64 SCX_EV_REENQ_LOCAL_REPEAT; + /* * Total number of times a task's time slice was refilled with the * default value (SCX_SLICE_DFL). @@ -951,6 +971,8 @@ struct scx_dsp_ctx { struct scx_deferred_reenq_local { struct list_head node; u64 flags; + u64 seq; + u32 cnt; }; struct scx_sched_pcpu { @@ -1074,6 +1096,24 @@ enum scx_enq_flags { */ SCX_ENQ_PREEMPT = 1LLU << 32, + /* + * Only allowed on local DSQs. Guarantees that the task either gets + * on the CPU immediately and stays on it, or gets reenqueued back + * to the BPF scheduler. It will never linger on a local DSQ or be + * silently put back after preemption. 
+ * + * The protection persists until the next fresh enqueue - it + * survives SAVE/RESTORE cycles, slice extensions and preemption. + * If the task can't stay on the CPU for any reason, it gets + * reenqueued back to the BPF scheduler. + * + * Exiting and migration-disabled tasks bypass ops.enqueue() and + * are placed directly on a local DSQ without IMMED protection + * unless %SCX_OPS_ENQ_EXITING and %SCX_OPS_ENQ_MIGRATION_DISABLED + * are set respectively. + */ + SCX_ENQ_IMMED = 1LLU << 33, + /* * The task being enqueued was previously enqueued on a DSQ, but was * removed and is being re-enqueued. See SCX_TASK_REENQ_* flags to find @@ -1098,6 +1138,7 @@ enum scx_enq_flags { SCX_ENQ_CLEAR_OPSS = 1LLU << 56, SCX_ENQ_DSQ_PRIQ = 1LLU << 57, SCX_ENQ_NESTED = 1LLU << 58, + SCX_ENQ_GDSQ_FALLBACK = 1LLU << 59, /* fell back to global DSQ */ }; enum scx_deq_flags { @@ -1127,6 +1168,12 @@ enum scx_reenq_flags { __SCX_REENQ_FILTER_MASK = 0xffffLLU, __SCX_REENQ_USER_MASK = SCX_REENQ_ANY, + + /* bits 32-35 used by task_should_reenq() */ + SCX_REENQ_TSR_RQ_OPEN = 1LLU << 32, + SCX_REENQ_TSR_NOT_FIRST = 1LLU << 33, + + __SCX_REENQ_TSR_MASK = 0xfLLU << 32, }; enum scx_pick_idle_cpu_flags { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 60627119d0ab..5b93f6190d31 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -800,6 +800,7 @@ struct scx_rq { u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ bool cpu_released; u32 flags; + u32 nr_immed; /* ENQ_IMMED tasks on local_dsq */ u64 clock; /* current per-rq clock -- see scx_bpf_now() */ cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_kick_if_idle; @@ -810,6 +811,7 @@ struct scx_rq { struct task_struct *sub_dispatch_prev; raw_spinlock_t deferred_reenq_lock; + u64 deferred_reenq_locals_seq; struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */ struct list_head deferred_reenq_users; /* user DSQs requesting reenq */ struct balance_callback deferred_bal_cb; diff --git 
a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 704728864d83..cba37432eec0 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -404,6 +404,11 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags) scx_bpf_error("kernel too old to reenqueue foreign local or user DSQs"); } +/* + * v7.1: %SCX_ENQ_IMMED. + */ +#define SCX_ENQ_IMMED __COMPAT_ENUM_OR_ZERO(enum scx_enq_flags, SCX_ENQ_IMMED) + /* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). -- cgit v1.2.3 From 82b6c1b542ea0530318c6f2a880d884eb4dce49f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 2 Mar 2026 17:29:05 +0100 Subject: of: Add of_machine_get_match() helper Currently, there are two helpers to match the root compatible value against an of_device_id array: - of_machine_device_match() returns true if a match is found, - of_machine_get_match_data() returns the match data if a match is found. However, there is no helper that returns the actual of_device_id structure corresponding to the match, leading to code duplication in various drivers. Fix this by reworking of_machine_device_match() to return the actual match structure, and renaming it to of_machine_get_match(). Retain the old of_machine_device_match() functionality using a cheap static inline wrapper around the new of_machine_get_match() helper. 
Signed-off-by: Geert Uytterhoeven Acked-by: Viresh Kumar Link: https://patch.msgid.link/14e1c03d443b1a5f210609ec3a1ebbaeab8fb3d9.1772468323.git.geert+renesas@glider.be Signed-off-by: Rob Herring (Arm) --- drivers/of/base.c | 11 +++++------ include/linux/of.h | 11 ++++++++--- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/of/base.c b/drivers/of/base.c index 57420806c1a2..2a01d2a66eed 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -435,13 +435,12 @@ bool of_machine_compatible_match(const char *const *compats) EXPORT_SYMBOL(of_machine_compatible_match); /** - * of_machine_device_match - Test root of device tree against a of_device_id array + * of_machine_get_match - Test root of device tree against an of_device_id array * @matches: NULL terminated array of of_device_id match structures to search in * - * Returns true if the root node has any of the given compatible values in its - * compatible property. + * Returns matched entry or NULL */ -bool of_machine_device_match(const struct of_device_id *matches) +const struct of_device_id *of_machine_get_match(const struct of_device_id *matches) { struct device_node *root; const struct of_device_id *match = NULL; @@ -452,9 +451,9 @@ bool of_machine_device_match(const struct of_device_id *matches) of_node_put(root); } - return match != NULL; + return match; } -EXPORT_SYMBOL(of_machine_device_match); +EXPORT_SYMBOL(of_machine_get_match); /** * of_machine_get_match_data - Tell if root of device tree has a matching of_match structure diff --git a/include/linux/of.h b/include/linux/of.h index be6ec4916adf..b4d7d33b0ceb 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -410,7 +410,7 @@ extern int of_alias_get_id(const struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); bool of_machine_compatible_match(const char *const *compats); -bool of_machine_device_match(const struct of_device_id *matches); +const struct 
of_device_id *of_machine_get_match(const struct of_device_id *matches); const void *of_machine_get_match_data(const struct of_device_id *matches); /** @@ -866,9 +866,9 @@ static inline bool of_machine_compatible_match(const char *const *compats) return false; } -static inline bool of_machine_device_match(const struct of_device_id *matches) +static inline const struct of_device_id *of_machine_get_match(const struct of_device_id *matches) { - return false; + return NULL; } static inline const void * @@ -976,6 +976,11 @@ static inline int of_numa_init(void) } #endif +static inline bool of_machine_device_match(const struct of_device_id *matches) +{ + return of_machine_get_match(matches) != NULL; +} + static inline struct device_node *of_find_matching_node( struct device_node *from, const struct of_device_id *matches) -- cgit v1.2.3 From d7eafe655b741dfc241d5b920f6d2cea45b568d9 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sun, 1 Mar 2026 06:13:16 +0800 Subject: dma-mapping: Separate DMA sync issuing and completion waiting Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device always wait for the completion of each DMA buffer. That is, issuing the DMA sync and waiting for completion is done in a single API call. For scatter-gather lists with multiple entries, this means issuing and waiting is repeated for each entry, which can hurt performance. Architectures like ARM64 may be able to issue all DMA sync operations for all entries first and then wait for completion together. To address this, arch_sync_dma_for_* now batches DMA operations and performs a flush afterward. On ARM64, the flush is implemented with a dsb instruction in arch_sync_dma_flush(). On other architectures, arch_sync_dma_flush() is currently a nop. 
Cc: Leon Romanovsky Cc: Catalin Marinas Cc: Will Deacon Cc: Marek Szyprowski Cc: Robin Murphy Cc: Ada Couprie Diaz Cc: Ard Biesheuvel Cc: Marc Zyngier Cc: Anshuman Khandual Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Joerg Roedel Cc: Stefano Stabellini Cc: Oleksandr Tyshchenko Cc: Tangquan Zheng Reviewed-by: Juergen Gross # drivers/xen/swiotlb-xen.c Tested-by: Xueyuan Chen Signed-off-by: Barry Song Reviewed-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260228221316.59934-1-21cnbao@gmail.com --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/cache.h | 5 +++++ arch/arm64/mm/dma-mapping.c | 4 ++-- drivers/iommu/dma-iommu.c | 35 +++++++++++++++++++++++++++-------- drivers/xen/swiotlb-xen.c | 24 ++++++++++++++++-------- include/linux/dma-map-ops.h | 6 ++++++ kernel/dma/Kconfig | 3 +++ kernel/dma/direct.c | 6 +++++- kernel/dma/direct.h | 9 +++++++-- kernel/dma/swiotlb.c | 7 ++++++- 10 files changed, 78 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 38dba5f7e4d2..ceafaac6532c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -55,6 +55,7 @@ config ARM64 select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_SYNC_DMA_FOR_CPU + select ARCH_HAS_BATCHED_DMA_SYNC select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_ZONE_DMA_SET if EXPERT diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index dd2c8586a725..10a7ffadee3d 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -87,6 +87,11 @@ int cache_line_size(void); #define dma_get_cache_alignment cache_line_size +static inline void arch_sync_dma_flush(void) +{ + dsb(sy); +} + /* Compress a u64 MPIDR value into 32 bits. 
*/ static inline u64 arch_compact_of_hwid(u64 id) { diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index b2b5792b2caa..ae1ae0280eef 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, { unsigned long start = (unsigned long)phys_to_virt(paddr); - dcache_clean_poc(start, start + size); + dcache_clean_poc_nosync(start, start + size); } void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, @@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, if (dir == DMA_TO_DEVICE) return; - dcache_inval_poc(start, start + size); + dcache_inval_poc_nosync(start, start + size); } void arch_dma_prep_coherent(struct page *page, size_t size) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 5dac64be61bb..66fc25bae85b 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1095,8 +1095,10 @@ void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - if (!dev_is_dma_coherent(dev)) + if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_cpu(phys, size, dir); + arch_sync_dma_flush(); + } swiotlb_sync_single_for_cpu(dev, phys, size, dir); } @@ -1112,8 +1114,10 @@ void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); swiotlb_sync_single_for_device(dev, phys, size, dir); - if (!dev_is_dma_coherent(dev)) + if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_device(phys, size, dir); + arch_sync_dma_flush(); + } } void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, @@ -1122,13 +1126,15 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, struct scatterlist *sg; int i; - if (sg_dma_is_swiotlb(sgl)) + if (sg_dma_is_swiotlb(sgl)) { for_each_sg(sgl, sg, nelems, i) 
iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg), sg->length, dir); - else if (!dev_is_dma_coherent(dev)) + } else if (!dev_is_dma_coherent(dev)) { for_each_sg(sgl, sg, nelems, i) arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); + arch_sync_dma_flush(); + } } void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, @@ -1137,14 +1143,16 @@ void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, struct scatterlist *sg; int i; - if (sg_dma_is_swiotlb(sgl)) + if (sg_dma_is_swiotlb(sgl)) { for_each_sg(sgl, sg, nelems, i) iommu_dma_sync_single_for_device(dev, sg_dma_address(sg), sg->length, dir); - else if (!dev_is_dma_coherent(dev)) + } else if (!dev_is_dma_coherent(dev)) { for_each_sg(sgl, sg, nelems, i) arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); + arch_sync_dma_flush(); + } } static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, @@ -1219,8 +1227,10 @@ dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, return DMA_MAPPING_ERROR; } - if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) + if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) { arch_sync_dma_for_device(phys, size, dir); + arch_sync_dma_flush(); + } iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO)) @@ -1242,8 +1252,10 @@ void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, if (WARN_ON(!phys)) return; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) { arch_sync_dma_for_cpu(phys, size, dir); + arch_sync_dma_flush(); + } __iommu_dma_unmap(dev, dma_handle, size); @@ -1980,6 +1992,8 @@ int dma_iova_sync(struct device *dev, struct dma_iova_state *state, dma_addr_t addr = state->addr + offset; size_t iova_start_pad = iova_offset(iovad, addr); + if (!dev_is_dma_coherent(dev)) + 
arch_sync_dma_flush(); return iommu_sync_map(domain, addr - iova_start_pad, iova_align(iovad, size + iova_start_pad)); } @@ -1993,6 +2007,8 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev, struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iova_domain *iovad = &cookie->iovad; size_t iova_start_pad = iova_offset(iovad, addr); + bool need_sync_dma = !dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)); dma_addr_t end = addr + size; do { @@ -2016,6 +2032,9 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev, addr += len; iova_start_pad = 0; } while (addr < end); + + if (need_sync_dma) + arch_sync_dma_flush(); } static void __iommu_dma_iova_unlink(struct device *dev, diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index ccf25027bec1..b79917e785a5 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -262,10 +262,12 @@ static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys, done: if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) { arch_sync_dma_for_device(phys, size, dir); - else + arch_sync_dma_flush(); + } else { xen_dma_sync_for_device(dev, dev_addr, size, dir); + } } return dev_addr; } @@ -287,10 +289,12 @@ static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { - if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) + if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) { arch_sync_dma_for_cpu(paddr, size, dir); - else + arch_sync_dma_flush(); + } else { xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir); + } } /* NOTE: We use dev_addr here, not paddr! 
*/ @@ -308,10 +312,12 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr, struct io_tlb_pool *pool; if (!dev_is_dma_coherent(dev)) { - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) { arch_sync_dma_for_cpu(paddr, size, dir); - else + arch_sync_dma_flush(); + } else { xen_dma_sync_for_cpu(dev, dma_addr, size, dir); + } } pool = xen_swiotlb_find_pool(dev, dma_addr); @@ -331,10 +337,12 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr, __swiotlb_sync_single_for_device(dev, paddr, size, dir, pool); if (!dev_is_dma_coherent(dev)) { - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) { arch_sync_dma_for_device(paddr, size, dir); - else + arch_sync_dma_flush(); + } else { xen_dma_sync_for_device(dev, dma_addr, size, dir); + } } } diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 60b63756df82..8a07df5a9ef6 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, } #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */ +#ifndef CONFIG_ARCH_HAS_BATCHED_DMA_SYNC +static inline void arch_sync_dma_flush(void) +{ +} +#endif + #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL void arch_sync_dma_for_cpu_all(void); #else diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 159900736f25..bfef21b4a9ae 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -72,6 +72,9 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool +config ARCH_HAS_BATCHED_DMA_SYNC + bool + # # Select this option if the architecture assumes DMA devices are coherent # by default. 
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8f43a930716d..c7666e5d5e7c 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -406,6 +406,8 @@ void dma_direct_sync_sg_for_device(struct device *dev, arch_sync_dma_for_device(paddr, sg->length, dir); } + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_flush(); } #endif @@ -427,8 +429,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); } - if (!dev_is_dma_coherent(dev)) + if (!dev_is_dma_coherent(dev)) { + arch_sync_dma_flush(); arch_sync_dma_for_cpu_all(); + } } /* diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index f476c63b668c..f925a7e8b000 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -60,8 +60,10 @@ static inline void dma_direct_sync_single_for_device(struct device *dev, swiotlb_sync_single_for_device(dev, paddr, size, dir); - if (!dev_is_dma_coherent(dev)) + if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_device(paddr, size, dir); + arch_sync_dma_flush(); + } } static inline void dma_direct_sync_single_for_cpu(struct device *dev, @@ -71,6 +73,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_flush(); arch_sync_dma_for_cpu_all(); } @@ -106,8 +109,10 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev, } if (!dev_is_dma_coherent(dev) && - !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) { arch_sync_dma_for_device(phys, size, dir); + arch_sync_dma_flush(); + } return dma_addr; err_overflow: diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d8e6f1d889d5..1105db1689d5 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -867,6 +867,9 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size if (orig_addr == INVALID_PHYS_ADDR) return; + if (dir == DMA_FROM_DEVICE && 
!dev_is_dma_coherent(dev)) + arch_sync_dma_flush(); + /* * It's valid for tlb_offset to be negative. This can happen when the * "offset" returned by swiotlb_align_offset() is non-zero, and the @@ -1595,8 +1598,10 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, return DMA_MAPPING_ERROR; } - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { arch_sync_dma_for_device(swiotlb_addr, size, dir); + arch_sync_dma_flush(); + } return dma_addr; } -- cgit v1.2.3 From 2e03c0c5c59a086df534e15ddde03cb33bc475c4 Mon Sep 17 00:00:00 2001 From: Francois Dugast Date: Thu, 12 Mar 2026 20:20:12 +0100 Subject: drm/pagemap: Add helper to access zone_device_data This new helper helps ensure all accesses to zone_device_data use the correct API whether the page is part of a folio or not. v2: - Move to drm_pagemap.h, stick to folio_zone_device_data (Matthew Brost) - Return struct drm_pagemap_zdd * (Matthew Brost) v3: - Add stub for !CONFIG_ZONE_DEVICE (CI) Cc: Andrew Morton Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Liam R. 
Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Zi Yan Cc: Alistair Popple Cc: Balbir Singh Cc: linux-mm@kvack.org Suggested-by: Matthew Brost Reviewed-by: Matthew Brost Signed-off-by: Francois Dugast Signed-off-by: Matthew Brost Link: https://patch.msgid.link/20260312192126.2024853-3-francois.dugast@intel.com --- drivers/gpu/drm/drm_gpusvm.c | 7 +++++-- drivers/gpu/drm/drm_pagemap.c | 21 ++++++++++++--------- include/drm/drm_pagemap.h | 21 +++++++++++++++++++++ 3 files changed, 38 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c index 35dd07297dd0..4b928fda5b12 100644 --- a/drivers/gpu/drm/drm_gpusvm.c +++ b/drivers/gpu/drm/drm_gpusvm.c @@ -1488,12 +1488,15 @@ map_pages: order = drm_gpusvm_hmm_pfn_to_order(pfns[i], i, npages); if (is_device_private_page(page) || is_device_coherent_page(page)) { + struct drm_pagemap_zdd *__zdd = + drm_pagemap_page_zone_device_data(page); + if (!ctx->allow_mixed && - zdd != page->zone_device_data && i > 0) { + zdd != __zdd && i > 0) { err = -EOPNOTSUPP; goto err_unmap; } - zdd = page->zone_device_data; + zdd = __zdd; if (pagemap != page_pgmap(page)) { if (pagemap) { err = -EOPNOTSUPP; diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index f453a12b6a8e..733a3857947c 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -252,7 +252,7 @@ static int drm_pagemap_migrate_map_pages(struct device *dev, order = folio_order(folio); if (is_device_private_page(page)) { - struct drm_pagemap_zdd *zdd = page->zone_device_data; + struct drm_pagemap_zdd *zdd = drm_pagemap_page_zone_device_data(page); struct drm_pagemap *dpagemap = zdd->dpagemap; struct drm_pagemap_addr addr; @@ -323,7 +323,7 @@ static void drm_pagemap_migrate_unmap_pages(struct device *dev, goto next; if (is_zone_device_page(page)) { - struct drm_pagemap_zdd *zdd = page->zone_device_data; + struct 
drm_pagemap_zdd *zdd = drm_pagemap_page_zone_device_data(page); struct drm_pagemap *dpagemap = zdd->dpagemap; dpagemap->ops->device_unmap(dpagemap, dev, &pagemap_addr[i]); @@ -601,7 +601,8 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, pages[i] = NULL; if (src_page && is_device_private_page(src_page)) { - struct drm_pagemap_zdd *src_zdd = src_page->zone_device_data; + struct drm_pagemap_zdd *src_zdd = + drm_pagemap_page_zone_device_data(src_page); if (page_pgmap(src_page) == pagemap && !mdetails->can_migrate_same_pagemap) { @@ -723,8 +724,8 @@ static int drm_pagemap_migrate_populate_ram_pfn(struct vm_area_struct *vas, goto next; if (fault_page) { - if (src_page->zone_device_data != - fault_page->zone_device_data) + if (drm_pagemap_page_zone_device_data(src_page) != + drm_pagemap_page_zone_device_data(fault_page)) goto next; } @@ -1065,7 +1066,7 @@ static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas, void *buf; int i, err = 0; - zdd = page->zone_device_data; + zdd = drm_pagemap_page_zone_device_data(page); if (time_before64(get_jiffies_64(), zdd->devmem_allocation->timeslice_expiration)) return 0; @@ -1148,7 +1149,9 @@ err_out: */ static void drm_pagemap_folio_free(struct folio *folio) { - drm_pagemap_zdd_put(folio->page.zone_device_data); + struct page *page = folio_page(folio, 0); + + drm_pagemap_zdd_put(drm_pagemap_page_zone_device_data(page)); } /** @@ -1164,7 +1167,7 @@ static void drm_pagemap_folio_free(struct folio *folio) */ static vm_fault_t drm_pagemap_migrate_to_ram(struct vm_fault *vmf) { - struct drm_pagemap_zdd *zdd = vmf->page->zone_device_data; + struct drm_pagemap_zdd *zdd = drm_pagemap_page_zone_device_data(vmf->page); int err; err = __drm_pagemap_migrate_to_ram(vmf->vma, @@ -1230,7 +1233,7 @@ EXPORT_SYMBOL_GPL(drm_pagemap_devmem_init); */ struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page) { - struct drm_pagemap_zdd *zdd = page->zone_device_data; + struct drm_pagemap_zdd *zdd = 
drm_pagemap_page_zone_device_data(page); return zdd->devmem_allocation->dpagemap; } diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h index c848f578e3da..75e6ca58922d 100644 --- a/include/drm/drm_pagemap.h +++ b/include/drm/drm_pagemap.h @@ -4,6 +4,7 @@ #include #include +#include #include #define NR_PAGES(order) (1U << (order)) @@ -367,6 +368,26 @@ void drm_pagemap_destroy(struct drm_pagemap *dpagemap, bool is_atomic_or_reclaim int drm_pagemap_reinit(struct drm_pagemap *dpagemap); +/** + * drm_pagemap_page_zone_device_data() - Page to zone_device_data + * @page: Pointer to the page + * + * Return: Page's zone_device_data + */ +static inline struct drm_pagemap_zdd *drm_pagemap_page_zone_device_data(struct page *page) +{ + struct folio *folio = page_folio(page); + + return folio_zone_device_data(folio); +} + +#else + +static inline struct drm_pagemap_zdd *drm_pagemap_page_zone_device_data(struct page *page) +{ + return NULL; +} + #endif /* IS_ENABLED(CONFIG_ZONE_DEVICE) */ #endif -- cgit v1.2.3 From 86a41d957ba058932d58c2d7729451afe8625ce9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:48 +0000 Subject: udp: Make udp[46]_seq_show() static. Since commit a3d2599b2446 ("ipv{4,6}/udp{,lite}: simplify proc registration"), udp4_seq_show() and udp6_seq_show() are not used in net/ipv4/udplite.c and net/ipv6/udplite.c. Instead, udp_seq_ops and udp6_seq_ops are exposed to UDP-Lite. Let's make udp4_seq_show() and udp6_seq_show() static. udp_seq_ops and udp6_seq_ops are moved to udp_impl.h so that we can make them static when the header is removed. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/udp.h | 3 --- net/ipv4/udp.c | 3 +-- net/ipv4/udp_impl.h | 2 +- net/ipv6/udp.c | 3 +-- net/ipv6/udp_impl.h | 2 +- 5 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index b648003e5792..f51a51c0e468 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -576,9 +576,6 @@ void *udp_seq_start(struct seq_file *seq, loff_t *pos); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void udp_seq_stop(struct seq_file *seq, void *v); -extern const struct seq_operations udp_seq_ops; -extern const struct seq_operations udp6_seq_ops; - int udp4_proc_init(void); void udp4_proc_exit(void); #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 668a0284c3d5..1fcdc5482594 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3443,7 +3443,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, sk_drops_read(sp)); } -int udp4_seq_show(struct seq_file *seq, void *v) +static int udp4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) @@ -3753,7 +3753,6 @@ const struct seq_operations udp_seq_ops = { .stop = udp_seq_stop, .show = udp4_seq_show, }; -EXPORT_IPV6_MOD(udp_seq_ops); static struct udp_seq_afinfo udp4_seq_afinfo = { .family = AF_INET, diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 17a6fa8b1409..0ca4384f9afa 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -22,6 +22,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS -int udp4_seq_show(struct seq_file *seq, void *v); +extern const struct seq_operations udp_seq_ops; #endif #endif /* _UDP4_IMPL_H */ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 
5a3984e59c90..5fef1c226697 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1903,7 +1903,7 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname, /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS -int udp6_seq_show(struct seq_file *seq, void *v) +static int udp6_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); @@ -1924,7 +1924,6 @@ const struct seq_operations udp6_seq_ops = { .stop = udp_seq_stop, .show = udp6_seq_show, }; -EXPORT_SYMBOL(udp6_seq_ops); static struct udp_seq_afinfo udp6_seq_afinfo = { .family = AF_INET6, diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 1bd4a573e1bb..525ea600228a 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -26,6 +26,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags); void udpv6_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS -int udp6_seq_show(struct seq_file *seq, void *v); +extern const struct seq_operations udp6_seq_ops; #endif #endif /* _UDP6_IMPL_H */ -- cgit v1.2.3 From 62554a51c5844feebe0466d8b31980e110b481de Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:49 +0000 Subject: ipv6: Retire UDP-Lite. As announced in commit be28c14ac8bb ("udplite: Print deprecation notice."), it's time to deprecate UDP-Lite. As a first step, let's drop support for IPv6 UDP-Lite sockets. We will remove the remaining dead code gradually. Along with the removal of udplite.c, most of the functions exposed via udp_impl.h are made static. The prototypes of udpv6_sendmsg() and udpv6_recvmsg() are moved to udp.h, but only udpv6_recvmsg() has INDIRECT_CALLABLE_DECLARE() because udpv6_sendmsg() is exported for rxrpc since commit ed472b0c8783 ("rxrpc: Call udp_sendmsg() directly"). Also, udpv6_recvmsg() needs INDIRECT_CALLABLE_SCOPE for CONFIG_MITIGATION_RETPOLINE=n. Note that udplite.h is included temporarily for udplite_csum(). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 2 - include/net/transp_v6.h | 3 -- include/net/udp.h | 4 ++ net/ipv6/Makefile | 2 +- net/ipv6/af_inet6.c | 23 +------- net/ipv6/proc.c | 2 - net/ipv6/udp.c | 32 +++++------ net/ipv6/udp_impl.h | 31 ----------- net/ipv6/udplite.c | 139 ------------------------------------------------ net/rxrpc/output.c | 2 - 10 files changed, 23 insertions(+), 217 deletions(-) delete mode 100644 net/ipv6/udp_impl.h delete mode 100644 net/ipv6/udplite.c (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 1c0ce5151275..0958cc5c6ec3 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1179,8 +1179,6 @@ int tcp6_proc_init(struct net *net); void tcp6_proc_exit(struct net *net); int udp6_proc_init(struct net *net); void udp6_proc_exit(struct net *net); -int udplite6_proc_init(void); -void udplite6_proc_exit(void); int ipv6_misc_proc_init(void); void ipv6_misc_proc_exit(void); int snmp6_register_dev(struct inet6_dev *idev); diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index 1a97e3f32029..c0a421fe0c2a 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -8,7 +8,6 @@ /* IPv6 transport protocols */ extern struct proto rawv6_prot; extern struct proto udpv6_prot; -extern struct proto udplitev6_prot; extern struct proto tcpv6_prot; extern struct proto pingv6_prot; @@ -28,8 +27,6 @@ int rawv6_init(void); void rawv6_exit(void); int udpv6_init(void); void udpv6_exit(void); -int udplitev6_init(void); -void udplitev6_exit(void); int tcpv6_init(void); void tcpv6_exit(void); diff --git a/include/net/udp.h b/include/net/udp.h index f51a51c0e468..05f63e9e00a7 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -282,6 +282,10 @@ typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport, void udp_v6_early_demux(struct 
sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); +int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags)); + struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 0492f1a0b491..2c9ce2ccbde1 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_IPV6) += ipv6.o ipv6-y := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ addrlabel.o \ - route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ + route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ udp_offload.o seg6.o fib6_notifier.o rpl.o ioam6.o diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 03c175cbbdb6..61f7bc88526a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -636,8 +635,6 @@ int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) EXPORT_SYMBOL_GPL(inet6_compat_ioctl); #endif /* CONFIG_COMPAT */ -INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, - size_t)); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; @@ -652,8 +649,6 @@ int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) sk, msg, size); } -INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *, - size_t, int)); int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -1080,13 +1075,9 @@ static int __init inet6_init(void) if (err) goto out_unregister_tcp_proto; - err = proto_register(&udplitev6_prot, 1); - if (err) - goto out_unregister_udp_proto; - err = 
proto_register(&rawv6_prot, 1); if (err) - goto out_unregister_udplite_proto; + goto out_unregister_udp_proto; err = proto_register(&pingv6_prot, 1); if (err) @@ -1137,8 +1128,6 @@ static int __init inet6_init(void) err = -ENOMEM; if (raw6_proc_init()) goto proc_raw6_fail; - if (udplite6_proc_init()) - goto proc_udplite6_fail; if (ipv6_misc_proc_init()) goto proc_misc6_fail; if (if6_proc_init()) @@ -1174,10 +1163,6 @@ static int __init inet6_init(void) if (err) goto udpv6_fail; - err = udplitev6_init(); - if (err) - goto udplitev6_fail; - err = udpv6_offload_init(); if (err) goto udpv6_offload_fail; @@ -1248,8 +1233,6 @@ ipv6_packet_fail: tcpv6_fail: udpv6_offload_exit(); udpv6_offload_fail: - udplitev6_exit(); -udplitev6_fail: udpv6_exit(); udpv6_fail: ipv6_frag_exit(); @@ -1271,8 +1254,6 @@ ip6_route_fail: proc_if6_fail: ipv6_misc_proc_exit(); proc_misc6_fail: - udplite6_proc_exit(); -proc_udplite6_fail: raw6_proc_exit(); proc_raw6_fail: #endif @@ -1296,8 +1277,6 @@ out_unregister_ping_proto: proto_unregister(&pingv6_prot); out_unregister_raw_proto: proto_unregister(&rawv6_prot); -out_unregister_udplite_proto: - proto_unregister(&udplitev6_prot); out_unregister_udp_proto: proto_unregister(&udpv6_prot); out_unregister_tcp_proto: diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 73296f38c252..21bfc73152f0 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -39,8 +39,6 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &tcpv6_prot)); seq_printf(seq, "UDP6: inuse %d\n", sock_prot_inuse_get(net, &udpv6_prot)); - seq_printf(seq, "UDPLITE6: inuse %d\n", - sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %u memory %lu\n", diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5fef1c226697..aa859bb0527d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -57,7 
+58,7 @@ #include #include #include -#include "udp_impl.h" +#include static void udpv6_destruct_sock(struct sock *sk) { @@ -65,7 +66,7 @@ static void udpv6_destruct_sock(struct sock *sk) inet6_sock_destruct(sk); } -int udpv6_init_sock(struct sock *sk) +static int udpv6_init_sock(struct sock *sk) { int res = udp_lib_init_sock(sk); @@ -95,7 +96,7 @@ u32 udp6_ehashfn(const struct net *net, udp6_ehash_secret + net_hash_mix(net)); } -int udp_v6_get_port(struct sock *sk, unsigned short snum) +static int udp_v6_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum); @@ -107,7 +108,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, hash2_nulladdr); } -void udp_v6_rehash(struct sock *sk) +static void udp_v6_rehash(struct sock *sk) { u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, @@ -464,6 +465,7 @@ static int udp6_skb_len(struct sk_buff *skb) * return it, otherwise we block. 
*/ +INDIRECT_CALLABLE_SCOPE int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { @@ -700,9 +702,9 @@ out: return sk; } -int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info, - struct udp_table *udptable) +static int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info, + struct udp_table *udptable) { struct ipv6_pinfo *np; const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; @@ -1115,8 +1117,8 @@ static int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) return 0; } -int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, - int proto) +static int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, + int proto) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct in6_addr *saddr, *daddr; @@ -1854,7 +1856,7 @@ static void udpv6_splice_eof(struct socket *sock) release_sock(sk); } -void udpv6_destroy_sock(struct sock *sk) +static void udpv6_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); lock_sock(sk); @@ -1882,8 +1884,8 @@ void udpv6_destroy_sock(struct sock *sk) /* * Socket option code for UDP */ -int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen) +static int udpv6_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, @@ -1892,8 +1894,8 @@ int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, return ipv6_setsockopt(sk, level, optname, optval, optlen); } -int udpv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) +static int udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) { if (level == SOL_UDP || level == 
SOL_UDPLITE) return udp_lib_getsockopt(sk, level, optname, optval, optlen); @@ -1918,7 +1920,7 @@ static int udp6_seq_show(struct seq_file *seq, void *v) return 0; } -const struct seq_operations udp6_seq_ops = { +static const struct seq_operations udp6_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, .stop = udp_seq_stop, diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h deleted file mode 100644 index 525ea600228a..000000000000 --- a/net/ipv6/udp_impl.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _UDP6_IMPL_H -#define _UDP6_IMPL_H -#include -#include -#include -#include -#include -#include -#include - -int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int); -int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, - __be32, struct udp_table *); - -int udpv6_init_sock(struct sock *sk); -int udp_v6_get_port(struct sock *sk, unsigned short snum); -void udp_v6_rehash(struct sock *sk); - -int udpv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen); -int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags); -void udpv6_destroy_sock(struct sock *sk); - -#ifdef CONFIG_PROC_FS -extern const struct seq_operations udp6_seq_ops; -#endif -#endif /* _UDP6_IMPL_H */ diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c deleted file mode 100644 index e867721cda4d..000000000000 --- a/net/ipv6/udplite.c +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * UDPLITEv6 An implementation of the UDP-Lite protocol over IPv6. 
- * See also net/ipv4/udplite.c - * - * Authors: Gerrit Renker - * - * Changes: - * Fixes: - */ -#define pr_fmt(fmt) "UDPLite6: " fmt - -#include -#include -#include "udp_impl.h" - -static int udplitev6_sk_init(struct sock *sk) -{ - pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, " - "please contact the netdev mailing list\n"); - return udpv6_init_sock(sk); -} - -static int udplitev6_rcv(struct sk_buff *skb) -{ - return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); -} - -static int udplitev6_err(struct sk_buff *skb, - struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) -{ - return __udp6_lib_err(skb, opt, type, code, offset, info, - &udplite_table); -} - -static const struct inet6_protocol udplitev6_protocol = { - .handler = udplitev6_rcv, - .err_handler = udplitev6_err, - .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, -}; - -struct proto udplitev6_prot = { - .name = "UDPLITEv6", - .owner = THIS_MODULE, - .close = udp_lib_close, - .connect = ip6_datagram_connect, - .disconnect = udp_disconnect, - .ioctl = udp_ioctl, - .init = udplitev6_sk_init, - .destroy = udpv6_destroy_sock, - .setsockopt = udpv6_setsockopt, - .getsockopt = udpv6_getsockopt, - .sendmsg = udpv6_sendmsg, - .recvmsg = udpv6_recvmsg, - .hash = udp_lib_hash, - .unhash = udp_lib_unhash, - .rehash = udp_v6_rehash, - .get_port = udp_v6_get_port, - - .memory_allocated = &net_aligned_data.udp_memory_allocated, - .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, - - .sysctl_mem = sysctl_udp_mem, - .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), - .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), - .obj_size = sizeof(struct udp6_sock), - .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), - .h.udp_table = &udplite_table, -}; - -static struct inet_protosw udplite6_protosw = { - .type = SOCK_DGRAM, - .protocol = IPPROTO_UDPLITE, - .prot = &udplitev6_prot, - .ops = &inet6_dgram_ops, - .flags = 
INET_PROTOSW_PERMANENT, -}; - -int __init udplitev6_init(void) -{ - int ret; - - ret = inet6_add_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); - if (ret) - goto out; - - ret = inet6_register_protosw(&udplite6_protosw); - if (ret) - goto out_udplitev6_protocol; -out: - return ret; - -out_udplitev6_protocol: - inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); - goto out; -} - -void udplitev6_exit(void) -{ - inet6_unregister_protosw(&udplite6_protosw); - inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); -} - -#ifdef CONFIG_PROC_FS -static struct udp_seq_afinfo udplite6_seq_afinfo = { - .family = AF_INET6, - .udp_table = &udplite_table, -}; - -static int __net_init udplite6_proc_init_net(struct net *net) -{ - if (!proc_create_net_data("udplite6", 0444, net->proc_net, - &udp6_seq_ops, sizeof(struct udp_iter_state), - &udplite6_seq_afinfo)) - return -ENOMEM; - return 0; -} - -static void __net_exit udplite6_proc_exit_net(struct net *net) -{ - remove_proc_entry("udplite6", net->proc_net); -} - -static struct pernet_operations udplite6_net_ops = { - .init = udplite6_proc_init_net, - .exit = udplite6_proc_exit_net, -}; - -int __init udplite6_proc_init(void) -{ - return register_pernet_subsys(&udplite6_net_ops); -} - -void udplite6_proc_exit(void) -{ - unregister_pernet_subsys(&udplite6_net_ops); -} -#endif diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index d70db367e358..e5880116e087 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -16,8 +16,6 @@ #include #include "ar-internal.h" -extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); - ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) { struct sockaddr *sa = msg->msg_name; -- cgit v1.2.3 From 56520b398e5e6ee129c6279d8649ca959765a0a0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:51 +0000 Subject: ipv4: Retire UDP-Lite. We have deprecated IPv6 UDP-Lite sockets. 
Let's drop support for IPv4 UDP-Lite sockets as well. Most of the changes are similar to the IPv6 patch: removing udplite.c and udp_impl.h, marking most functions in udp_impl.h as static, moving the prototype for udp_recvmsg() to udp.h, and adding INDIRECT_CALLABLE_SCOPE for it. In addition, the INET_DIAG support for UDP-Lite is dropped. We will remove the remaining dead code in the following patches. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 +- include/net/udp.h | 7 +-- include/net/udplite.h | 4 -- net/ipv4/Makefile | 2 +- net/ipv4/af_inet.c | 6 --- net/ipv4/proc.c | 3 -- net/ipv4/udp.c | 33 ++++++------ net/ipv4/udp_bpf.c | 2 - net/ipv4/udp_diag.c | 47 +----------------- net/ipv4/udp_impl.h | 27 ---------- net/ipv4/udplite.c | 135 -------------------------------------------------- 11 files changed, 26 insertions(+), 244 deletions(-) delete mode 100644 net/ipv4/udp_impl.h delete mode 100644 net/ipv4/udplite.c (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 6c3f1340e8ef..16a1b8895206 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -126,14 +126,14 @@ typedef __u64 __bitwise __addrpair; * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables - * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol + * @skc_portaddr_node: second hash linkage for UDP * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket * @skc_v6_daddr: IPV6 destination address * @skc_v6_rcv_saddr: IPV6 source address * @skc_cookie: socket's cookie value * @skc_node: main hash linkage for various protocol lookup tables - * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol + * 
@skc_nulls_node: main hash linkage for TCP * @skc_tx_queue_mapping: tx queue number for this connection * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags diff --git a/include/net/udp.h b/include/net/udp.h index 05f63e9e00a7..39223e2692e9 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -105,7 +105,7 @@ struct udp_table { unsigned int log; }; extern struct udp_table udp_table; -void udp_table_init(struct udp_table *, const char *); + static inline struct udp_hslot *udp_hashslot(struct udp_table *table, const struct net *net, unsigned int num) @@ -312,7 +312,7 @@ static inline void udp_drops_inc(struct sock *sk) numa_drop_add(&udp_sk(sk)->drop_counters, 1); } -/* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ +/* hash routines shared between UDPv4/6 */ static inline int udp_lib_hash(struct sock *sk) { BUG(); @@ -420,6 +420,8 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags)); void udp_splice_eof(struct socket *sock); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); @@ -427,7 +429,6 @@ int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, int *karg); -int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); diff --git a/include/net/udplite.h b/include/net/udplite.h index 786919d29f8d..fdd769745ac4 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -12,9 +12,6 @@ #define 
UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */ #define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ -extern struct proto udplite_prot; -extern struct udp_table udplite_table; - /* * Checksum computation is all in software, hence simpler getfrag. */ @@ -84,5 +81,4 @@ static inline __wsum udplite_csum(struct sk_buff *skb) return skb_checksum(skb, off, len, 0); } -void udplite4_register(void); #endif /* _UDPLITE_H */ diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 18108a6f0499..7f9f98813986 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ tcp_recovery.o tcp_ulp.o \ - tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ + tcp_offload.o tcp_plb.o datagram.o raw.o udp.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4a112f0ee269..5d4bc4c1a731 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -104,7 +104,6 @@ #include #include #include -#include #include #include #include @@ -884,8 +883,6 @@ void inet_splice_eof(struct socket *sock) } EXPORT_SYMBOL_GPL(inet_splice_eof); -INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, - size_t, int)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -1985,9 +1982,6 @@ static int __init inet_init(void) /* Setup UDP memory threshold */ udp_init(); - /* Add UDP-Lite (RFC 3828) */ - udplite4_register(); - raw_init(); ping_init(); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 974afc4ecbe2..cf51f8fcf34b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -65,8 +64,6 @@ 
static int sockstat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), proto_memory_allocated(&udp_prot)); - seq_printf(seq, "UDPLITE: inuse %d\n", - sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); seq_printf(seq, "FRAG: inuse %u memory %lu\n", diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1fcdc5482594..b3f63a5ea2a9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -98,8 +98,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -112,10 +114,10 @@ #include #include #include -#include "udp_impl.h" #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6) #include @@ -229,7 +231,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) } /** - * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 + * udp_lib_get_port - UDP port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up @@ -353,7 +355,7 @@ fail: } EXPORT_IPV6_MOD(udp_lib_get_port); -int udp_v4_get_port(struct sock *sk, unsigned short snum) +static int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); @@ -928,7 +930,7 @@ out: * to find the appropriate port. */ -int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) +static int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; const struct iphdr *iph = (const struct iphdr *)skb->data; @@ -1855,7 +1857,7 @@ static void udp_destruct_sock(struct sock *sk) inet_sock_destruct(sk); } -int udp_init_sock(struct sock *sk) +static int udp_init_sock(struct sock *sk) { int res = udp_lib_init_sock(sk); @@ -2070,6 +2072,7 @@ EXPORT_IPV6_MOD(udp_read_skb); * return it, otherwise we block. 
*/ +INDIRECT_CALLABLE_SCOPE int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2342,7 +2345,7 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) } EXPORT_IPV6_MOD(udp_lib_rehash); -void udp_v4_rehash(struct sock *sk) +static void udp_v4_rehash(struct sock *sk) { u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, @@ -2688,8 +2691,8 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, * All we need to do is get the socket, and then do a checksum. */ -int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, - int proto) +static int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, + int proto) { struct sock *sk = NULL; struct udphdr *uh; @@ -2935,7 +2938,7 @@ int udp_rcv(struct sk_buff *skb) return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP); } -void udp_destroy_sock(struct sock *sk) +static void udp_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); @@ -3125,8 +3128,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } EXPORT_IPV6_MOD(udp_lib_setsockopt); -int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen) +static int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, @@ -3196,8 +3199,8 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, } EXPORT_IPV6_MOD(udp_lib_getsockopt); -int udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) +static int udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) return udp_lib_getsockopt(sk, level, optname, optval, optlen); @@ 
-3747,7 +3750,7 @@ static unsigned short seq_file_family(const struct seq_file *seq) return afinfo->family; } -const struct seq_operations udp_seq_ops = { +static const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, .stop = udp_seq_stop, @@ -3806,7 +3809,7 @@ static int __init set_uhash_entries(char *str) } __setup("uhash_entries=", set_uhash_entries); -void __init udp_table_init(struct udp_table *table, const char *name) +static void __init udp_table_init(struct udp_table *table, const char *name) { unsigned int i, slot_size; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index d328a3078ae0..9f33b07b1481 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -7,8 +7,6 @@ #include #include -#include "udp_impl.h" - static struct proto *udpv6_prot_saved __read_mostly; static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 6e491c720c90..a010d05062a0 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -10,7 +10,6 @@ #include #include #include -#include #include static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, @@ -224,12 +223,6 @@ static int udp_diag_destroy(struct sk_buff *in_skb, return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table); } -static int udplite_diag_destroy(struct sk_buff *in_skb, - const struct inet_diag_req_v2 *req) -{ - return __udp_diag_destroy(in_skb, req, &udplite_table); -} - #endif static const struct inet_diag_handler udp_diag_handler = { @@ -244,50 +237,13 @@ static const struct inet_diag_handler udp_diag_handler = { #endif }; -static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - udp_dump(&udplite_table, skb, cb, r); -} - -static int udplite_diag_dump_one(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - return udp_dump_one(&udplite_table, cb, req); -} - -static const struct 
inet_diag_handler udplite_diag_handler = { - .owner = THIS_MODULE, - .dump = udplite_diag_dump, - .dump_one = udplite_diag_dump_one, - .idiag_get_info = udp_diag_get_info, - .idiag_type = IPPROTO_UDPLITE, - .idiag_info_size = 0, -#ifdef CONFIG_INET_DIAG_DESTROY - .destroy = udplite_diag_destroy, -#endif -}; - static int __init udp_diag_init(void) { - int err; - - err = inet_diag_register(&udp_diag_handler); - if (err) - goto out; - err = inet_diag_register(&udplite_diag_handler); - if (err) - goto out_lite; -out: - return err; -out_lite: - inet_diag_unregister(&udp_diag_handler); - goto out; + return inet_diag_register(&udp_diag_handler); } static void __exit udp_diag_exit(void) { - inet_diag_unregister(&udplite_diag_handler); inet_diag_unregister(&udp_diag_handler); } @@ -296,4 +252,3 @@ module_exit(udp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */); diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h deleted file mode 100644 index 0ca4384f9afa..000000000000 --- a/net/ipv4/udp_impl.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _UDP4_IMPL_H -#define _UDP4_IMPL_H -#include -#include -#include -#include -#include - -int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); -int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); - -int udp_v4_get_port(struct sock *sk, unsigned short snum); -void udp_v4_rehash(struct sock *sk); - -int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen); -int udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); - -int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags); -void udp_destroy_sock(struct sock *sk); - -#ifdef CONFIG_PROC_FS 
-extern const struct seq_operations udp_seq_ops; -#endif -#endif /* _UDP4_IMPL_H */ diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c deleted file mode 100644 index 826e9e79eb19..000000000000 --- a/net/ipv4/udplite.c +++ /dev/null @@ -1,135 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828). - * - * Authors: Gerrit Renker - * - * Changes: - * Fixes: - */ - -#define pr_fmt(fmt) "UDPLite: " fmt - -#include -#include -#include "udp_impl.h" - -struct udp_table udplite_table __read_mostly; -EXPORT_SYMBOL(udplite_table); - -/* Designate sk as UDP-Lite socket */ -static int udplite_sk_init(struct sock *sk) -{ - pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, " - "please contact the netdev mailing list\n"); - return udp_init_sock(sk); -} - -static int udplite_rcv(struct sk_buff *skb) -{ - return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); -} - -static int udplite_err(struct sk_buff *skb, u32 info) -{ - return __udp4_lib_err(skb, info, &udplite_table); -} - -static const struct net_protocol udplite_protocol = { - .handler = udplite_rcv, - .err_handler = udplite_err, - .no_policy = 1, -}; - -struct proto udplite_prot = { - .name = "UDP-Lite", - .owner = THIS_MODULE, - .close = udp_lib_close, - .connect = ip4_datagram_connect, - .disconnect = udp_disconnect, - .ioctl = udp_ioctl, - .init = udplite_sk_init, - .destroy = udp_destroy_sock, - .setsockopt = udp_setsockopt, - .getsockopt = udp_getsockopt, - .sendmsg = udp_sendmsg, - .recvmsg = udp_recvmsg, - .hash = udp_lib_hash, - .unhash = udp_lib_unhash, - .rehash = udp_v4_rehash, - .get_port = udp_v4_get_port, - - .memory_allocated = &net_aligned_data.udp_memory_allocated, - .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, - - .sysctl_mem = sysctl_udp_mem, - .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), - .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), - .obj_size = 
sizeof(struct udp_sock), - .h.udp_table = &udplite_table, -}; -EXPORT_SYMBOL(udplite_prot); - -static struct inet_protosw udplite4_protosw = { - .type = SOCK_DGRAM, - .protocol = IPPROTO_UDPLITE, - .prot = &udplite_prot, - .ops = &inet_dgram_ops, - .flags = INET_PROTOSW_PERMANENT, -}; - -#ifdef CONFIG_PROC_FS -static struct udp_seq_afinfo udplite4_seq_afinfo = { - .family = AF_INET, - .udp_table = &udplite_table, -}; - -static int __net_init udplite4_proc_init_net(struct net *net) -{ - if (!proc_create_net_data("udplite", 0444, net->proc_net, &udp_seq_ops, - sizeof(struct udp_iter_state), &udplite4_seq_afinfo)) - return -ENOMEM; - return 0; -} - -static void __net_exit udplite4_proc_exit_net(struct net *net) -{ - remove_proc_entry("udplite", net->proc_net); -} - -static struct pernet_operations udplite4_net_ops = { - .init = udplite4_proc_init_net, - .exit = udplite4_proc_exit_net, -}; - -static __init int udplite4_proc_init(void) -{ - return register_pernet_subsys(&udplite4_net_ops); -} -#else -static inline int udplite4_proc_init(void) -{ - return 0; -} -#endif - -void __init udplite4_register(void) -{ - udp_table_init(&udplite_table, "UDP-Lite"); - if (proto_register(&udplite_prot, 1)) - goto out_register_err; - - if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0) - goto out_unregister_proto; - - inet_register_protosw(&udplite4_protosw); - - if (udplite4_proc_init()) - pr_err("%s: Cannot register /proc!\n", __func__); - return; - -out_unregister_proto: - proto_unregister(&udplite_prot); -out_register_err: - pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__); -} -- cgit v1.2.3 From 7accba6fd1ab60fb4f3a5c15c52d6fbb3af7f3a3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:52 +0000 Subject: udp: Remove UDP-Lite SNMP stats. Since UDP and UDP-Lite shared most of the code, we have had to check the protocol every time we increment SNMP stats. Now that the UDP-Lite paths are dead, let's remove UDP-Lite SNMP stats. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/mib.h | 5 ---- include/net/udp.h | 46 ++++++++++++------------------ net/ipv4/af_inet.c | 6 ---- net/ipv4/proc.c | 13 --------- net/ipv4/udp.c | 75 +++++++++++++++++++++---------------------------- net/ipv6/af_inet6.c | 9 ++---- net/ipv6/proc.c | 14 --------- net/ipv6/udp.c | 48 +++++++++++++------------------ 8 files changed, 71 insertions(+), 145 deletions(-) (limited to 'include') diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h index 7e373664b1e7..dce05f8e6a33 100644 --- a/include/net/netns/mib.h +++ b/include/net/netns/mib.h @@ -28,11 +28,6 @@ struct netns_mib { DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics); #endif - DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics); -#if IS_ENABLED(CONFIG_IPV6) - DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6); -#endif - DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); DEFINE_SNMP_STAT_ATOMIC(struct icmpmsg_mib, icmpmsg_statistics); #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/udp.h b/include/net/udp.h index 39223e2692e9..264c10607d2e 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -529,38 +529,28 @@ static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, } /* - * SNMP statistics for UDP and UDP-Lite + * SNMP statistics for UDP */ -#define UDP_INC_STATS(net, field, is_udplite) do { \ - if (unlikely(is_udplite)) SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ - else SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) -#define __UDP_INC_STATS(net, field, is_udplite) do { \ - if (unlikely(is_udplite)) __SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ - else __SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) - -#define __UDP6_INC_STATS(net, field, is_udplite) do { \ - if (unlikely(is_udplite)) __SNMP_INC_STATS((net)->mib.udplite_stats_in6, 
field); \ - else __SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ -} while(0) -#define UDP6_INC_STATS(net, field, __lite) do { \ - if (unlikely(__lite)) SNMP_INC_STATS((net)->mib.udplite_stats_in6, field); \ - else SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ -} while(0) +#define __UDP_INC_STATS(net, field) \ + __SNMP_INC_STATS((net)->mib.udp_statistics, field) +#define UDP_INC_STATS(net, field) \ + SNMP_INC_STATS((net)->mib.udp_statistics, field) +#define __UDP6_INC_STATS(net, field) \ + __SNMP_INC_STATS((net)->mib.udp_stats_in6, field) +#define UDP6_INC_STATS(net, field) \ + SNMP_INC_STATS((net)->mib.udp_stats_in6, field) #if IS_ENABLED(CONFIG_IPV6) -#define __UDPX_MIB(sk, ipv4) \ -({ \ - ipv4 ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \ - sock_net(sk)->mib.udp_statistics) : \ - (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 : \ - sock_net(sk)->mib.udp_stats_in6); \ -}) +#define __UDPX_MIB(sk, ipv4) \ + ({ \ + ipv4 ? sock_net(sk)->mib.udp_statistics : \ + sock_net(sk)->mib.udp_stats_in6; \ + }) #else -#define __UDPX_MIB(sk, ipv4) \ -({ \ - IS_UDPLITE(sk) ? 
sock_net(sk)->mib.udplite_statistics : \ - sock_net(sk)->mib.udp_statistics; \ -}) +#define __UDPX_MIB(sk, ipv4) \ + ({ \ + sock_net(sk)->mib.udp_statistics; \ + }) #endif #define __UDPX_INC_STATS(sk, field) \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5d4bc4c1a731..f98e46ae3e30 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1733,9 +1733,6 @@ static __net_init int ipv4_mib_init_net(struct net *net) net->mib.udp_statistics = alloc_percpu(struct udp_mib); if (!net->mib.udp_statistics) goto err_udp_mib; - net->mib.udplite_statistics = alloc_percpu(struct udp_mib); - if (!net->mib.udplite_statistics) - goto err_udplite_mib; net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); if (!net->mib.icmp_statistics) goto err_icmp_mib; @@ -1749,8 +1746,6 @@ static __net_init int ipv4_mib_init_net(struct net *net) err_icmpmsg_mib: free_percpu(net->mib.icmp_statistics); err_icmp_mib: - free_percpu(net->mib.udplite_statistics); -err_udplite_mib: free_percpu(net->mib.udp_statistics); err_udp_mib: free_percpu(net->mib.net_statistics); @@ -1766,7 +1761,6 @@ static __net_exit void ipv4_mib_exit_net(struct net *net) { kfree(net->mib.icmpmsg_statistics); free_percpu(net->mib.icmp_statistics); - free_percpu(net->mib.udplite_statistics); free_percpu(net->mib.udp_statistics); free_percpu(net->mib.net_statistics); free_percpu(net->mib.ip_statistics); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index cf51f8fcf34b..bfc06d1713ec 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -444,19 +444,6 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %lu", buff[i]); - memset(buff, 0, udp_cnt * sizeof(unsigned long)); - - /* the UDP and UDP-Lite MIBs are the same */ - seq_puts(seq, "\nUdpLite:"); - snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list, - udp_cnt, - net->mib.udplite_statistics); - for (i = 0; i < udp_cnt; i++) - seq_printf(seq, " %s", snmp4_udp_list[i].name); - seq_puts(seq, 
"\nUdpLite:"); - for (i = 0; i < udp_cnt; i++) - seq_printf(seq, " %lu", buff[i]); - seq_putc(seq, '\n'); return 0; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b3f63a5ea2a9..10082095e633 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1198,13 +1198,12 @@ send: if (unlikely(err)) { if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) { - UDP_INC_STATS(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS); err = 0; } - } else - UDP_INC_STATS(sock_net(sk), - UDP_MIB_OUTDATAGRAMS, is_udplite); + } else { + UDP_INC_STATS(sock_net(sk), UDP_MIB_OUTDATAGRAMS); + } return err; } @@ -1535,10 +1534,9 @@ out_free: * things). We could add another new stat but at least for now that * seems like overkill. */ - if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - UDP_INC_STATS(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); - } + if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS); + return err; do_confirm: @@ -1897,10 +1895,10 @@ static struct sk_buff *__first_packet_length(struct sock *sk, while ((skb = skb_peek(rcvq)) != NULL) { if (udp_lib_checksum_complete(skb)) { - __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, - IS_UDPLITE(sk)); - __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); + struct net *net = sock_net(sk); + + __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS); + __UDP_INC_STATS(net, UDP_MIB_INERRORS); udp_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; @@ -2052,11 +2050,10 @@ try_again: return err; if (udp_lib_checksum_complete(skb)) { - int is_udplite = IS_UDPLITE(sk); struct net *net = sock_net(sk); - __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); - __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); + __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS); + __UDP_INC_STATS(net, UDP_MIB_INERRORS); udp_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; @@ -2081,6 
+2078,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) unsigned int ulen, copied; int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); + struct net *net = sock_net(sk); bool checksum_valid = false; if (flags & MSG_ERRQUEUE) @@ -2128,16 +2126,14 @@ try_again: if (unlikely(err)) { if (!peeking) { udp_drops_inc(sk); - UDP_INC_STATS(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(net, UDP_MIB_INERRORS); } kfree_skb(skb); return err; } if (!peeking) - UDP_INC_STATS(sock_net(sk), - UDP_MIB_INDATAGRAMS, is_udplite); + UDP_INC_STATS(net, UDP_MIB_INDATAGRAMS); sock_recv_cmsgs(msg, sk, skb); @@ -2170,8 +2166,8 @@ try_again: csum_copy_err: if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, udp_skb_destructor)) { - UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); - UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(net, UDP_MIB_CSUMERRORS); + UDP_INC_STATS(net, UDP_MIB_INERRORS); } kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2371,20 +2367,18 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { - int is_udplite = IS_UDPLITE(sk); + struct net *net = sock_net(sk); int drop_reason; /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) { - UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, - is_udplite); + UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS); drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; } else { - UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS, - is_udplite); + UDP_INC_STATS(net, UDP_MIB_MEMERRORS); drop_reason = SKB_DROP_REASON_PROTO_MEM; } - UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(net, UDP_MIB_INERRORS); trace_udp_fail_queue_rcv_skb(rc, sk, skb); sk_skb_reason_drop(sk, skb, drop_reason); return -1; @@ -2405,7 +2399,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason drop_reason = 
SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); - int is_udplite = IS_UDPLITE(sk); + struct net *net = sock_net(sk); /* * Charge it to the socket, dropping if the queue is full. @@ -2442,9 +2436,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) ret = encap_rcv(sk, skb); if (ret <= 0) { - __UDP_INC_STATS(sock_net(sk), - UDP_MIB_INDATAGRAMS, - is_udplite); + __UDP_INC_STATS(net, UDP_MIB_INDATAGRAMS); return -ret; } } @@ -2503,9 +2495,9 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) csum_error: drop_reason = SKB_DROP_REASON_UDP_CSUM; - __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); + __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS); drop: - __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + __UDP_INC_STATS(net, UDP_MIB_INERRORS); udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; @@ -2592,10 +2584,8 @@ start_lookup: if (unlikely(!nskb)) { udp_drops_inc(sk); - __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - __UDP_INC_STATS(net, UDP_MIB_INERRORS, - IS_UDPLITE(sk)); + __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS); + __UDP_INC_STATS(net, UDP_MIB_INERRORS); continue; } if (udp_queue_rcv_skb(sk, nskb) > 0) @@ -2613,8 +2603,7 @@ start_lookup: consume_skb(skb); } else { kfree_skb(skb); - __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI); } return 0; } @@ -2764,7 +2753,7 @@ no_sk: goto csum_error; drop_reason = SKB_DROP_REASON_NO_SOCKET; - __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_NOPORTS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* @@ -2793,9 +2782,9 @@ csum_error: proto == IPPROTO_UDPLITE ? 
"Lite" : "", &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), ulen); - __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS); drop: - __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_INERRORS); sk_skb_reason_drop(sk, skb, drop_reason); return 0; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 61f7bc88526a..eb9fff86baa1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -886,9 +886,7 @@ static int __net_init ipv6_init_mibs(struct net *net) net->mib.udp_stats_in6 = alloc_percpu(struct udp_mib); if (!net->mib.udp_stats_in6) return -ENOMEM; - net->mib.udplite_stats_in6 = alloc_percpu(struct udp_mib); - if (!net->mib.udplite_stats_in6) - goto err_udplite_mib; + net->mib.ipv6_statistics = alloc_percpu(struct ipstats_mib); if (!net->mib.ipv6_statistics) goto err_ip_mib; @@ -899,10 +897,10 @@ static int __net_init ipv6_init_mibs(struct net *net) u64_stats_init(&af_inet6_stats->syncp); } - net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib); if (!net->mib.icmpv6_statistics) goto err_icmp_mib; + net->mib.icmpv6msg_statistics = kzalloc_obj(struct icmpv6msg_mib); if (!net->mib.icmpv6msg_statistics) goto err_icmpmsg_mib; @@ -913,8 +911,6 @@ err_icmpmsg_mib: err_icmp_mib: free_percpu(net->mib.ipv6_statistics); err_ip_mib: - free_percpu(net->mib.udplite_stats_in6); -err_udplite_mib: free_percpu(net->mib.udp_stats_in6); return -ENOMEM; } @@ -922,7 +918,6 @@ err_udplite_mib: static void ipv6_cleanup_mibs(struct net *net) { free_percpu(net->mib.udp_stats_in6); - free_percpu(net->mib.udplite_stats_in6); free_percpu(net->mib.ipv6_statistics); free_percpu(net->mib.icmpv6_statistics); kfree(net->mib.icmpv6msg_statistics); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 21bfc73152f0..813013ca4e75 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -108,17 +108,6 @@ static const struct snmp_mib snmp6_udp6_list[] = { 
SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), }; -static const struct snmp_mib snmp6_udplite6_list[] = { - SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS), - SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS), - SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS), - SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), - SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), - SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), - SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), - SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), -}; - static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) { char name[32]; @@ -226,9 +215,6 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) snmp6_seq_show_item(seq, net->mib.udp_stats_in6, NULL, snmp6_udp6_list, ARRAY_SIZE(snmp6_udp6_list)); - snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, - NULL, snmp6_udplite6_list, - ARRAY_SIZE(snmp6_udplite6_list)); return 0; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index aa859bb0527d..07308b7156a6 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -796,20 +796,18 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { - int is_udplite = IS_UDPLITE(sk); enum skb_drop_reason drop_reason; + struct net *net = sock_net(sk); /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) { - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_RCVBUFERRORS, is_udplite); + UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS); drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; } else { - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_MEMERRORS, is_udplite); + UDP6_INC_STATS(net, UDP_MIB_MEMERRORS); drop_reason = SKB_DROP_REASON_PROTO_MEM; } - UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + UDP6_INC_STATS(net, UDP_MIB_INERRORS); trace_udp_fail_queue_rcv_skb(rc, sk, skb); sk_skb_reason_drop(sk, skb, drop_reason); return -1; @@ -830,7 +828,7 @@ static int 
udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); - int is_udplite = IS_UDPLITE(sk); + struct net *net = sock_net(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; @@ -864,9 +862,7 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) ret = encap_rcv(sk, skb); if (ret <= 0) { - __UDP6_INC_STATS(sock_net(sk), - UDP_MIB_INDATAGRAMS, - is_udplite); + __UDP6_INC_STATS(net, UDP_MIB_INDATAGRAMS); return -ret; } } @@ -909,9 +905,9 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) csum_error: drop_reason = SKB_DROP_REASON_UDP_CSUM; - __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); + __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS); drop: - __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + __UDP6_INC_STATS(net, UDP_MIB_INERRORS); udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; @@ -1018,10 +1014,8 @@ start_lookup: nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { udp_drops_inc(sk); - __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - __UDP6_INC_STATS(net, UDP_MIB_INERRORS, - IS_UDPLITE(sk)); + __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS); + __UDP6_INC_STATS(net, UDP_MIB_INERRORS); continue; } @@ -1040,8 +1034,7 @@ start_lookup: consume_skb(skb); } else { kfree_skb(skb); - __UDP6_INC_STATS(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_IGNOREDMULTI); } return 0; } @@ -1213,7 +1206,7 @@ no_sk: if (udp_lib_checksum_complete(skb)) goto csum_error; - __UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_NOPORTS); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); sk_skb_reason_drop(sk, skb, reason); @@ -1234,9 +1227,9 @@ report_csum_error: csum_error: if (reason == SKB_DROP_REASON_NOT_SPECIFIED) reason = 
SKB_DROP_REASON_UDP_CSUM; - __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS); discard: - __UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_INERRORS); sk_skb_reason_drop(sk, skb, reason); return 0; } @@ -1490,13 +1483,11 @@ send: err = ip6_send_skb(skb); if (unlikely(err)) { if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) { - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS); err = 0; } } else { - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_OUTDATAGRAMS, is_udplite); + UDP6_INC_STATS(sock_net(sk), UDP_MIB_OUTDATAGRAMS); } return err; } @@ -1826,10 +1817,9 @@ out_no_dst: * things). We could add another new stat but at least for now that * seems like overkill. */ - if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - UDP6_INC_STATS(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); - } + if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + UDP6_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS); + return err; do_confirm: -- cgit v1.2.3 From c2539d4f2df7a9889b71bad97b97ddfd9e47add1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:54 +0000 Subject: udp: Remove partial csum code in RX. UDP-Lite supports the partial checksum and the coverage is stored in the position of the length field of struct udphdr. In RX paths, udp4_csum_init() / udp6_csum_init() save the value in UDP_SKB_CB(skb)->cscov and set UDP_SKB_CB(skb)->partial_cov to 1 if the coverage is not full. The subsequent processing diverges depending on the value, but such paths are now dead. Also, these functions have some code guarded for UDP: * udp_unicast_rcv_skb / udp6_unicast_rcv_skb * __udp4_lib_rcv() and __udp6_lib_rcv(). Let's remove the partial csum code and the unnecessary guard for UDP-Lite in RX. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-8-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/udp.h | 17 ++--------- include/net/udplite.h | 34 --------------------- net/ipv4/udp.c | 85 ++++++++++----------------------------------------- net/ipv6/udp.c | 82 ++++++++++++------------------------------------- 4 files changed, 39 insertions(+), 179 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 264c10607d2e..bc275cda9f8c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -32,11 +32,9 @@ #include /** - * struct udp_skb_cb - UDP(-Lite) private variables + * struct udp_skb_cb - UDP private variables * * @header: private variables used by IPv4/IPv6 - * @cscov: checksum coverage length (UDP-Lite only) - * @partial_cov: if set indicates partial csum coverage */ struct udp_skb_cb { union { @@ -45,8 +43,6 @@ struct udp_skb_cb { struct inet6_skb_parm h6; #endif } header; - __u16 cscov; - __u8 partial_cov; }; #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) @@ -216,13 +212,11 @@ extern int sysctl_udp_wmem_min; struct sk_buff; /* - * Generic checksumming routines for UDP(-Lite) v4 and v6 + * Generic checksumming routines for UDP v4 and v6 */ static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb) { - return (UDP_SKB_CB(skb)->cscov == skb->len ? 
- __skb_checksum_complete(skb) : - __skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov)); + return __skb_checksum_complete(skb); } static inline int udp_lib_checksum_complete(struct sk_buff *skb) @@ -273,7 +267,6 @@ static inline void udp_csum_pull_header(struct sk_buff *skb) skb->csum = csum_partial(skb->data, sizeof(struct udphdr), skb->csum); skb_pull_rcsum(skb, sizeof(struct udphdr)); - UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr); } typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport, @@ -641,9 +634,6 @@ drop: static inline void udp_post_segment_fix_csum(struct sk_buff *skb) { - /* UDP-lite can't land here - no GRO */ - WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov); - /* UDP packets generated with UDP_SEGMENT and traversing: * * UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx) @@ -657,7 +647,6 @@ static inline void udp_post_segment_fix_csum(struct sk_buff *skb) * a valid csum after the segmentation. * Additionally fixup the UDP CB. */ - UDP_SKB_CB(skb)->cscov = skb->len; if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid) skb->csum_valid = 1; } diff --git a/include/net/udplite.h b/include/net/udplite.h index fdd769745ac4..0456a14c993b 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -25,40 +25,6 @@ static __inline__ int udplite_getfrag(void *from, char *to, int offset, /* * Checksumming routines */ -static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) -{ - u16 cscov; - - /* In UDPv4 a zero checksum means that the transmitter generated no - * checksum. UDP-Lite (like IPv6) mandates checksums, hence packets - * with a zero checksum field are illegal. */ - if (uh->check == 0) { - net_dbg_ratelimited("UDPLite: zeroed checksum field\n"); - return 1; - } - - cscov = ntohs(uh->len); - - if (cscov == 0) /* Indicates that full coverage is required. */ - ; - else if (cscov < 8 || cscov > skb->len) { - /* - * Coverage length violates RFC 3828: log and discard silently. 
- */ - net_dbg_ratelimited("UDPLite: bad csum coverage %d/%d\n", - cscov, skb->len); - return 1; - - } else if (cscov < skb->len) { - UDP_SKB_CB(skb)->partial_cov = 1; - UDP_SKB_CB(skb)->cscov = cscov; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - skb->csum_valid = 0; - } - - return 0; -} /* Fast-path computation of checksum. Socket may not be locked. */ static inline __wsum udplite_csum(struct sk_buff *skb) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 10082095e633..d42fb9330c22 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2072,14 +2072,13 @@ EXPORT_IPV6_MOD(udp_read_skb); INDIRECT_CALLABLE_SCOPE int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { - struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); - struct sk_buff *skb; - unsigned int ulen, copied; int off, err, peeking = flags & MSG_PEEK; - int is_udplite = IS_UDPLITE(sk); + struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); bool checksum_valid = false; + unsigned int ulen, copied; + struct sk_buff *skb; if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len); @@ -2097,14 +2096,10 @@ try_again: else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; - /* - * If checksum is needed at all, try to do it while copying the - * data. If the data is truncated, or if we only want a partial - * coverage checksum (UDP-Lite), do it before the copy. + /* If checksum is needed at all, try to do it while copying the + * data. If the data is truncated, do it before the copy. 
*/ - - if (copied < ulen || peeking || - (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { + if (copied < ulen || peeking) { checksum_valid = udp_skb_csum_unnecessary(skb) || !__udp_lib_checksum_complete(skb); if (!checksum_valid) @@ -2444,42 +2439,6 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) /* FALLTHROUGH -- it's a UDP Packet */ } - /* - * UDP-Lite specific tests, ignored on UDP sockets - */ - if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) && - UDP_SKB_CB(skb)->partial_cov)) { - u16 pcrlen = READ_ONCE(up->pcrlen); - - /* - * MIB statistics other than incrementing the error count are - * disabled for the following two types of errors: these depend - * on the application settings, not on the functioning of the - * protocol stack as such. - * - * RFC 3828 here recommends (sec 3.3): "There should also be a - * way ... to ... at least let the receiving application block - * delivery of packets with coverage values less than a value - * provided by the application." - */ - if (pcrlen == 0) { /* full coverage was set */ - net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n", - UDP_SKB_CB(skb)->cscov, skb->len); - goto drop; - } - /* The next case involves violating the min. coverage requested - * by the receiver. This is subtle: if receiver wants x and x is - * greater than the buffersize/MTU then receiver will complain - * that it wants x while sender emits packets of smaller size y. - * Therefore the above ...()->partial_cov statement is essential. - */ - if (UDP_SKB_CB(skb)->cscov < pcrlen) { - net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n", - UDP_SKB_CB(skb)->cscov, pcrlen); - goto drop; - } - } - prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) @@ -2613,29 +2572,14 @@ start_lookup: * Otherwise, csum completion requires checksumming packet body, * including udp header and folding it to skb->csum. 
*/ -static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, - int proto) +static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh) { int err; - UDP_SKB_CB(skb)->partial_cov = 0; - UDP_SKB_CB(skb)->cscov = skb->len; - - if (proto == IPPROTO_UDPLITE) { - err = udplite_checksum_init(skb, uh); - if (err) - return err; - - if (UDP_SKB_CB(skb)->partial_cov) { - skb->csum = inet_compute_pseudo(skb, proto); - return 0; - } - } - /* Note, we are only interested in != 0 or == 0, thus the * force to int. */ - err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + err = (__force int)skb_checksum_init_zero_check(skb, IPPROTO_UDP, uh->check, inet_compute_pseudo); if (err) return err; @@ -2663,7 +2607,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, { int ret; - if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) + if (inet_get_convert_csum(sk) && uh->check) skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); @@ -2708,14 +2652,17 @@ static int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (ulen > skb->len) goto short_packet; - if (proto == IPPROTO_UDP) { - /* UDP validates ulen. 
*/ - if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen)) + if (ulen < sizeof(*uh)) + goto short_packet; + + if (ulen < skb->len) { + if (pskb_trim_rcsum(skb, ulen)) goto short_packet; + uh = udp_hdr(skb); } - if (udp4_csum_init(skb, uh, proto)) + if (udp4_csum_init(skb, uh)) goto csum_error; sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 07308b7156a6..bf5430ea66f0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -469,15 +469,13 @@ INDIRECT_CALLABLE_SCOPE int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { + int off, is_udp4, err, peeking = flags & MSG_PEEK; struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); - struct sk_buff *skb; - unsigned int ulen, copied; - int off, err, peeking = flags & MSG_PEEK; - int is_udplite = IS_UDPLITE(sk); struct udp_mib __percpu *mib; bool checksum_valid = false; - int is_udp4; + unsigned int ulen, copied; + struct sk_buff *skb; if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len); @@ -501,14 +499,10 @@ try_again: is_udp4 = (skb->protocol == htons(ETH_P_IP)); mib = __UDPX_MIB(sk, is_udp4); - /* - * If checksum is needed at all, try to do it while copying the - * data. If the data is truncated, or if we only want a partial - * coverage checksum (UDP-Lite), do it before the copy. + /* If checksum is needed at all, try to do it while copying the + * data. If the data is truncated, do it before the copy. */ - - if (copied < ulen || peeking || - (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { + if (copied < ulen || peeking) { checksum_valid = udp_skb_csum_unnecessary(skb) || !__udp_lib_checksum_complete(skb); if (!checksum_valid) @@ -870,25 +864,6 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) /* FALLTHROUGH -- it's a UDP Packet */ } - /* - * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). 
- */ - if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) && - UDP_SKB_CB(skb)->partial_cov)) { - u16 pcrlen = READ_ONCE(up->pcrlen); - - if (pcrlen == 0) { /* full coverage was set */ - net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n", - UDP_SKB_CB(skb)->cscov, skb->len); - goto drop; - } - if (UDP_SKB_CB(skb)->cscov < pcrlen) { - net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n", - UDP_SKB_CB(skb)->cscov, pcrlen); - goto drop; - } - } - prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) @@ -1053,7 +1028,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, { int ret; - if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) + if (inet_get_convert_csum(sk) && uh->check) skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo); ret = udpv6_queue_rcv_skb(sk, skb); @@ -1064,24 +1039,10 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, return 0; } -static int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) +static int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh) { int err; - UDP_SKB_CB(skb)->partial_cov = 0; - UDP_SKB_CB(skb)->cscov = skb->len; - - if (proto == IPPROTO_UDPLITE) { - err = udplite_checksum_init(skb, uh); - if (err) - return err; - - if (UDP_SKB_CB(skb)->partial_cov) { - skb->csum = ip6_compute_pseudo(skb, proto); - return 0; - } - } - /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) * we accept a checksum of zero here. When we find the socket * for the UDP packet we'll check if that socket allows zero checksum @@ -1090,7 +1051,7 @@ static int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) * Note, we are only interested in != 0 or == 0, thus the * force to int. 
*/ - err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + err = (__force int)skb_checksum_init_zero_check(skb, IPPROTO_UDP, uh->check, ip6_compute_pseudo); if (err) return err; @@ -1132,26 +1093,23 @@ static int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (ulen > skb->len) goto short_packet; - if (proto == IPPROTO_UDP) { - /* UDP validates ulen. */ + /* Check for jumbo payload */ + if (ulen == 0) + ulen = skb->len; - /* Check for jumbo payload */ - if (ulen == 0) - ulen = skb->len; + if (ulen < sizeof(*uh)) + goto short_packet; - if (ulen < sizeof(*uh)) + if (ulen < skb->len) { + if (pskb_trim_rcsum(skb, ulen)) goto short_packet; - if (ulen < skb->len) { - if (pskb_trim_rcsum(skb, ulen)) - goto short_packet; - saddr = &ipv6_hdr(skb)->saddr; - daddr = &ipv6_hdr(skb)->daddr; - uh = udp_hdr(skb); - } + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; + uh = udp_hdr(skb); } - if (udp6_csum_init(skb, uh, proto)) + if (udp6_csum_init(skb, uh)) goto csum_error; /* Check if the socket is already available, e.g. due to early demux */ -- cgit v1.2.3 From b2a1d719be4f8e9d970038ecd4db983f6e42d377 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:55 +0000 Subject: udp: Remove partial csum code in TX. UDP TX paths also have some code for UDP-Lite partial checksum: * udplite_csum() in udp_send_skb() and udp_v6_send_skb() * udplite_getfrag() in udp_sendmsg() and udpv6_sendmsg() Let's remove such code. Now, we can use IPPROTO_UDP directly instead of sk->sk_protocol or fl6->flowi6_proto for csum_tcpudp_magic() and csum_ipv6_magic(). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-9-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/udplite.h | 35 ------------------------------- net/ipv4/udp.c | 58 +++++++++++++++++++-------------------------------- net/ipv6/udp.c | 54 ++++++++++++++++++++++------------------------- 3 files changed, 47 insertions(+), 100 deletions(-) (limited to 'include') diff --git a/include/net/udplite.h b/include/net/udplite.h index 0456a14c993b..6bfa1d6833d1 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -12,39 +12,4 @@ #define UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */ #define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ -/* - * Checksum computation is all in software, hence simpler getfrag. - */ -static __inline__ int udplite_getfrag(void *from, char *to, int offset, - int len, int odd, struct sk_buff *skb) -{ - struct msghdr *msg = from; - return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT; -} - -/* - * Checksumming routines - */ - -/* Fast-path computation of checksum. Socket may not be locked. 
*/ -static inline __wsum udplite_csum(struct sk_buff *skb) -{ - const int off = skb_transport_offset(skb); - const struct sock *sk = skb->sk; - int len = skb->len - off; - - if (udp_test_bit(UDPLITE_SEND_CC, sk)) { - u16 pcslen = READ_ONCE(udp_sk(sk)->pcslen); - - if (pcslen < len) { - if (pcslen > 0) - len = pcslen; - udp_hdr(skb)->len = htons(pcslen); - } - } - skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ - - return skb_checksum(skb, off, len, 0); -} - #endif /* _UDPLITE_H */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d42fb9330c22..9a2c8ff96e83 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1120,20 +1120,19 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, struct inet_cork *cork) { struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(sk); + int offset, len, datalen; struct udphdr *uh; int err; - int is_udplite = IS_UDPLITE(sk); - int offset = skb_transport_offset(skb); - int len = skb->len - offset; - int datalen = len - sizeof(*uh); - __wsum csum = 0; + + offset = skb_transport_offset(skb); + len = skb->len - offset; + datalen = len - sizeof(*uh); /* * Create a UDP header */ uh = udp_hdr(skb); - uh->source = inet->inet_sport; + uh->source = inet_sk(sk)->inet_sport; uh->dest = fl4->fl4_dport; uh->len = htons(len); uh->check = 0; @@ -1154,7 +1153,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, kfree_skb(skb); return -EINVAL; } - if (is_udplite || dst_xfrm(skb_dst(skb))) { + if (dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } @@ -1170,26 +1169,18 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, } } - if (is_udplite) /* UDP-Lite */ - csum = udplite_csum(skb); - - else if (sk->sk_no_check_tx) { /* UDP csum off */ - + if (sk->sk_no_check_tx) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; - } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ csum_partial: - udp4_hwcsum(skb, fl4->saddr, fl4->daddr); goto send; - - } 
else - csum = udp_csum(skb); + } /* add protocol-dependent pseudo-header */ uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len, - sk->sk_protocol, csum); + IPPROTO_UDP, udp_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; @@ -1270,26 +1261,23 @@ EXPORT_IPV6_MOD_GPL(udp_cmsg_send); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); + int ulen = len, free = 0, connected = 0; struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); + __be32 daddr, faddr, saddr; + struct rtable *rt = NULL; struct flowi4 fl4_stack; - struct flowi4 *fl4; - int ulen = len; struct ipcm_cookie ipc; - struct rtable *rt = NULL; - int free = 0; - int connected = 0; - __be32 daddr, faddr, saddr; - u8 scope; - __be16 dport; - int err, is_udplite = IS_UDPLITE(sk); - int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; - int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; + struct flowi4 *fl4; + __be16 dport; int uc_index; + u8 scope; + int err; if (len > 0xFFFF) return -EMSGSIZE; @@ -1301,8 +1289,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; - getfrag = is_udplite ? 
udplite_getfrag : ip_generic_getfrag; - fl4 = &inet->cork.fl.u.ip4; if (READ_ONCE(up->pending)) { /* @@ -1444,7 +1430,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, - sk->sk_protocol, flow_flags, faddr, saddr, + IPPROTO_UDP, flow_flags, faddr, saddr, dport, inet->inet_sport, sk_uid(sk)); @@ -1478,7 +1464,7 @@ back_from_confirm: if (!corkreq) { struct inet_cork cork; - skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, + skb = ip_make_skb(sk, fl4, ip_generic_getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, &cork, msg->msg_flags); err = PTR_ERR(skb); @@ -1509,7 +1495,7 @@ back_from_confirm: do_append_data: up->len += ulen; - err = ip_append_data(sk, fl4, getfrag, msg, ulen, + err = ip_append_data(sk, fl4, ip_generic_getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bf5430ea66f0..511e3f898be5 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1370,13 +1370,13 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, struct inet_cork *cork) { struct sock *sk = skb->sk; + int offset, len, datalen; struct udphdr *uh; int err = 0; - int is_udplite = IS_UDPLITE(sk); - __wsum csum = 0; - int offset = skb_transport_offset(skb); - int len = skb->len - offset; - int datalen = len - sizeof(*uh); + + offset = skb_transport_offset(skb); + len = skb->len - offset; + datalen = len - sizeof(*uh); /* * Create a UDP header @@ -1403,7 +1403,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -EINVAL; } - if (is_udplite || dst_xfrm(skb_dst(skb))) { + if (dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } @@ -1419,21 +1419,18 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, } } - if (is_udplite) - csum = udplite_csum(skb); - else if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled 
*/ + if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ csum_partial: udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len); goto send; - } else - csum = udp_csum(skb); + } /* add protocol-dependent pseudo-header */ uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, - len, fl6->flowi6_proto, csum); + len, IPPROTO_UDP, udp_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; @@ -1473,27 +1470,26 @@ out: int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { - struct ipv6_txoptions opt_space; - struct udp_sock *up = udp_sk(sk); - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); + int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); - struct in6_addr *daddr, *final_p, final; - struct ipv6_txoptions *opt = NULL; struct ipv6_txoptions *opt_to_free = NULL; + struct in6_addr *daddr, *final_p, final; struct ip6_flowlabel *flowlabel = NULL; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt = NULL; + struct udp_sock *up = udp_sk(sk); + struct ipv6_txoptions opt_space; + int addr_len = msg->msg_namelen; struct inet_cork_full cork; - struct flowi6 *fl6 = &cork.fl.u.ip6; - struct dst_entry *dst; struct ipcm6_cookie ipc6; - int addr_len = msg->msg_namelen; bool connected = false; + struct dst_entry *dst; + struct flowi6 *fl6; int ulen = len; - int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int err; - int is_udplite = IS_UDPLITE(sk); - int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); + fl6 = &cork.fl.u.ip6; ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); @@ -1552,7 +1548,6 @@ do_udp_sendmsg: if (len > INT_MAX - sizeof(struct udphdr)) return -EMSGSIZE; - getfrag = is_udplite ? 
udplite_getfrag : ip_generic_getfrag; if (READ_ONCE(up->pending)) { if (READ_ONCE(up->pending) == AF_INET) return udp_sendmsg(sk, msg, len); @@ -1654,7 +1649,7 @@ do_udp_sendmsg: opt = ipv6_fixup_options(&opt_space, opt); ipc6.opt = opt; - fl6->flowi6_proto = sk->sk_protocol; + fl6->flowi6_proto = IPPROTO_UDP; fl6->flowi6_mark = ipc6.sockc.mark; fl6->daddr = *daddr; if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr)) @@ -1721,7 +1716,7 @@ back_from_confirm: if (!corkreq) { struct sk_buff *skb; - skb = ip6_make_skb(sk, getfrag, msg, ulen, + skb = ip6_make_skb(sk, ip_generic_getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, dst_rt6_info(dst), msg->msg_flags, &cork); @@ -1747,8 +1742,9 @@ back_from_confirm: do_append_data: up->len += ulen; - err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), - &ipc6, fl6, dst_rt6_info(dst), + err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, + sizeof(struct udphdr), &ipc6, fl6, + dst_rt6_info(dst), corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); -- cgit v1.2.3 From 74f0cca1100b6d1f1ea28178435aff8078d06603 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:56 +0000 Subject: udp: Remove UDPLITE_SEND_CSCOV and UDPLITE_RECV_CSCOV. UDP-Lite supports variable-length checksum and has two socket options, UDPLITE_SEND_CSCOV and UDPLITE_RECV_CSCOV, to control the checksum coverage. Let's remove the support. setsockopt(UDPLITE_SEND_CSCOV / UDPLITE_RECV_CSCOV) was only available for UDP-Lite and returned -ENOPROTOOPT for UDP. Now, the options are handled in ip_setsockopt() and ipv6_setsockopt(), which still return the same error. getsockopt(UDPLITE_SEND_CSCOV / UDPLITE_RECV_CSCOV) was available for UDP and always returned 0, meaning full checksum, but now -ENOPROTOOPT is returned. Given that getsockopt() is meaningless for UDP and even the options are not defined under include/uapi/, this should not be a problem. $ man 7 udplite ... 
BUGS Where glibc support is missing, the following definitions are needed: #define IPPROTO_UDPLITE 136 #define UDPLITE_SEND_CSCOV 10 #define UDPLITE_RECV_CSCOV 11 Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-10-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 10 +--------- include/net/udplite.h | 15 --------------- include/uapi/linux/udp.h | 2 ++ net/ipv4/udp.c | 46 ++-------------------------------------------- net/ipv6/ip6_checksum.c | 2 +- net/ipv6/udp.c | 5 ++--- 6 files changed, 8 insertions(+), 72 deletions(-) delete mode 100644 include/net/udplite.h (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 1cbf6b4d3aab..ce56ebcee5cb 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -40,8 +40,6 @@ enum { UDP_FLAGS_ACCEPT_FRAGLIST, UDP_FLAGS_ACCEPT_L4, UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */ - UDP_FLAGS_UDPLITE_SEND_CC, /* set via udplite setsockopt */ - UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */ }; /* per NUMA structure for lockless producer usage. */ @@ -74,11 +72,7 @@ struct udp_sock { */ __u16 len; /* total length of pending frames */ __u16 gso_size; - /* - * Fields specific to UDP-Lite. - */ - __u16 pcslen; - __u16 pcrlen; + /* * For encapsulation sockets. */ @@ -236,8 +230,6 @@ static inline void udp_allow_gso(struct sock *sk) hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node) #endif -#define IS_UDPLITE(__sk) (unlikely(__sk->sk_protocol == IPPROTO_UDPLITE)) - static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) { #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) diff --git a/include/net/udplite.h b/include/net/udplite.h deleted file mode 100644 index 6bfa1d6833d1..000000000000 --- a/include/net/udplite.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Definitions for the UDP-Lite (RFC 3828) code. 
- */ -#ifndef _UDPLITE_H -#define _UDPLITE_H - -#include -#include - -/* UDP-Lite socket options */ -#define UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */ -#define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ - -#endif /* _UDPLITE_H */ diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index edca3e430305..877fb02df8fb 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -29,6 +29,8 @@ struct udphdr { /* UDP socket options */ #define UDP_CORK 1 /* Never send partially complete segments */ +/* Deprecated, reserved for UDPLITE_SEND_CSCOV 10 */ +/* Deprecated, reserved for UDPLITE_RECV_CSCOV 11 */ #define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */ #define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */ #define UDP_NO_CHECK6_RX 102 /* Disable accepting checksum for UDP6 */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9a2c8ff96e83..d47ca721ef0d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -117,7 +117,6 @@ #include #include #include -#include #include #if IS_ENABLED(CONFIG_IPV6) #include @@ -2924,7 +2923,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, struct udp_sock *up = udp_sk(sk); int val, valbool; int err = 0; - int is_udplite = IS_UDPLITE(sk); if (level == SOL_SOCKET) { err = sk_setsockopt(sk, level, optname, optval, optlen); @@ -3011,36 +3009,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockopt_release_sock(sk); break; - /* - * UDP-Lite's partial checksum coverage (RFC 3828). - */ - /* The sender sets actual checksum coverage length via this option. - * The case coverage > packet length is handled by send module. 
*/ - case UDPLITE_SEND_CSCOV: - if (!is_udplite) /* Disable the option on UDP sockets */ - return -ENOPROTOOPT; - if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ - val = 8; - else if (val > USHRT_MAX) - val = USHRT_MAX; - WRITE_ONCE(up->pcslen, val); - udp_set_bit(UDPLITE_SEND_CC, sk); - break; - - /* The receiver specifies a minimum checksum coverage value. To make - * sense, this should be set to at least 8 (as done below). If zero is - * used, this again means full checksum coverage. */ - case UDPLITE_RECV_CSCOV: - if (!is_udplite) /* Disable the option on UDP sockets */ - return -ENOPROTOOPT; - if (val != 0 && val < 8) /* Avoid silly minimal values. */ - val = 8; - else if (val > USHRT_MAX) - val = USHRT_MAX; - WRITE_ONCE(up->pcrlen, val); - udp_set_bit(UDPLITE_RECV_CC, sk); - break; - default: err = -ENOPROTOOPT; break; @@ -3053,7 +3021,7 @@ EXPORT_IPV6_MOD(udp_lib_setsockopt); static int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) + if (level == SOL_UDP || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_push_pending_frames); @@ -3099,16 +3067,6 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, val = udp_test_bit(GRO_ENABLED, sk); break; - /* The following two cannot be changed on UDP sockets, the return is - * always 0 (which corresponds to the full checksum coverage of UDP). 
*/ - case UDPLITE_SEND_CSCOV: - val = READ_ONCE(up->pcslen); - break; - - case UDPLITE_RECV_CSCOV: - val = READ_ONCE(up->pcrlen); - break; - default: return -ENOPROTOOPT; } @@ -3124,7 +3082,7 @@ EXPORT_IPV6_MOD(udp_lib_getsockopt); static int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP) return udp_lib_getsockopt(sk, level, optname, optval, optlen); return ip_getsockopt(sk, level, optname, optval, optlen); } diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 8bb68a0cdfd6..e1a594873675 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include -#include #include #ifndef _HAVE_ARCH_IPV6_CSUM diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 511e3f898be5..c3d8b5ede164 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -58,7 +58,6 @@ #include #include #include -#include static void udpv6_destruct_sock(struct sock *sk) { @@ -1831,7 +1830,7 @@ static void udpv6_destroy_sock(struct sock *sk) static int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) + if (level == SOL_UDP || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_v6_push_pending_frames); @@ -1841,7 +1840,7 @@ static int udpv6_setsockopt(struct sock *sk, int level, int optname, static int udpv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP) return udp_lib_getsockopt(sk, level, optname, optval, optlen); return ipv6_getsockopt(sk, level, optname, optval, optlen); } -- cgit v1.2.3 From 5c2738588621a4a53e3a1e87860abcaf9190194a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:57 +0000 
Subject: udp: Remove struct proto.h.udp_table. Since UDP and UDP-Lite had dedicated socket hash tables for each, we have had to fetch them from different pointers. UDP always has its global or per-netns table in net->ipv4.udp_table and struct proto.h.udp_table is NULL. OTOH, UDP-Lite had only one global table in the pointer. We no longer use the field. Let's remove it and udp_get_table_prot(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-11-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 - net/ipv4/udp.c | 26 ++++++++++++-------------- net/ipv6/udp.c | 1 - 3 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 16a1b8895206..7d51ac9e7d9a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1392,7 +1392,6 @@ struct proto { union { struct inet_hashinfo *hashinfo; - struct udp_table *udp_table; struct raw_hashinfo *raw_hash; struct smc_hashinfo *smc_hash; } h; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d47ca721ef0d..a7ca727347ce 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -134,11 +134,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET) -static struct udp_table *udp_get_table_prot(struct sock *sk) -{ - return sk->sk_prot->h.udp_table ? 
: sock_net(sk)->ipv4.udp_table; -} - static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, @@ -240,11 +235,13 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr) { - struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); + struct udp_table *udptable; int error = -EADDRINUSE; + udptable = net->ipv4.udp_table; + if (!snum) { DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); unsigned short first, last; @@ -2224,12 +2221,13 @@ EXPORT_IPV6_MOD(udp_disconnect); void udp_lib_unhash(struct sock *sk) { if (sk_hashed(sk)) { - struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; + struct net *net = sock_net(sk); + struct udp_table *udptable; sock_rps_delete_flow(sk); - hslot = udp_hashslot(udptable, sock_net(sk), - udp_sk(sk)->udp_port_hash); + udptable = net->ipv4.udp_table; + hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); @@ -2238,7 +2236,7 @@ void udp_lib_unhash(struct sock *sk) if (sk_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(&hslot2->lock); hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); @@ -2258,11 +2256,12 @@ EXPORT_IPV6_MOD(udp_lib_unhash); void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) { if (sk_hashed(sk)) { - struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2, *nhslot2; + struct net *net = sock_net(sk); + struct udp_table *udptable; - hslot = udp_hashslot(udptable, sock_net(sk), - udp_sk(sk)->udp_port_hash); + udptable = net->ipv4.udp_table; + hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); hslot2 
= udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); @@ -3175,7 +3174,6 @@ struct proto udp_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), - .h.udp_table = NULL, .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c3d8b5ede164..5bddbf457b61 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1924,7 +1924,6 @@ struct proto udpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), - .h.udp_table = NULL, .diag_destroy = udp_abort, }; -- cgit v1.2.3 From c570bd25d88a02c249be23850315435ec69808f5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:58 +0000 Subject: udp: Remove udp_table in struct udp_seq_afinfo. Since UDP and UDP-Lite had dedicated socket hash tables for each, we have had to fetch them from different pointers for procfs or bpf iterator. UDP always has its global or per-netns table in net->ipv4.udp_table and struct udp_seq_afinfo.udp_table is NULL. OTOH, UDP-Lite had only one global table in the pointer. We no longer use the field. Let's remove it and udp_get_table_seq(). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-12-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/udp.h | 1 - net/ipv4/udp.c | 22 ++++------------------ net/ipv6/udp.c | 1 - 3 files changed, 4 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index bc275cda9f8c..76f401988353 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -552,7 +552,6 @@ static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, #ifdef CONFIG_PROC_FS struct udp_seq_afinfo { sa_family_t family; - struct udp_table *udp_table; }; struct udp_iter_state { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a7ca727347ce..14b372b211be 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3193,21 +3193,8 @@ static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) #ifdef CONFIG_BPF_SYSCALL static const struct seq_operations bpf_iter_udp_seq_ops; -#endif -static struct udp_table *udp_get_table_seq(struct seq_file *seq, - struct net *net) -{ - const struct udp_seq_afinfo *afinfo; - -#ifdef CONFIG_BPF_SYSCALL - if (seq->op == &bpf_iter_udp_seq_ops) - return net->ipv4.udp_table; #endif - afinfo = pde_data(file_inode(seq->file)); - return afinfo->udp_table ? 
: net->ipv4.udp_table; -} - static struct sock *udp_get_first(struct seq_file *seq, int start) { struct udp_iter_state *state = seq->private; @@ -3215,7 +3202,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) struct udp_table *udptable; struct sock *sk; - udptable = udp_get_table_seq(seq, net); + udptable = net->ipv4.udp_table; for (state->bucket = start; state->bucket <= udptable->mask; ++state->bucket) { @@ -3247,7 +3234,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) } while (sk && !seq_sk_match(seq, sk)); if (!sk) { - udptable = udp_get_table_seq(seq, net); + udptable = net->ipv4.udp_table; if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); @@ -3295,7 +3282,7 @@ void udp_seq_stop(struct seq_file *seq, void *v) struct udp_iter_state *state = seq->private; struct udp_table *udptable; - udptable = udp_get_table_seq(seq, seq_file_net(seq)); + udptable = seq_file_net(seq)->ipv4.udp_table; if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); @@ -3399,7 +3386,7 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq) if (iter->cur_sk == iter->end_sk) state->bucket++; - udptable = udp_get_table_seq(seq, net); + udptable = net->ipv4.udp_table; again: /* New batch for the next bucket. 
@@ -3637,7 +3624,6 @@ static const struct seq_operations udp_seq_ops = { static struct udp_seq_afinfo udp4_seq_afinfo = { .family = AF_INET, - .udp_table = NULL, }; static int __net_init udp4_proc_init_net(struct net *net) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5bddbf457b61..eeb77363a556 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1872,7 +1872,6 @@ static const struct seq_operations udp6_seq_ops = { static struct udp_seq_afinfo udp6_seq_afinfo = { .family = AF_INET6, - .udp_table = NULL, }; int __net_init udp6_proc_init(struct net *net) -- cgit v1.2.3 From deffb85478a4076226f0213c7b9f7b7cf5dfe9f8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:20:00 +0000 Subject: udp: Don't pass udptable to IPv6 socket lookup functions. Since UDP and UDP-Lite had dedicated socket hash tables for each, we have had to pass the pointer down to many socket lookup functions. UDP-Lite gone, and we do not need to do that. Let's fetch net->ipv4.udp_table only where needed in IPv6 stack: __udp6_lib_lookup() and __udp6_lib_mcast_deliver(). __udp6_lib_err() is renamed to udpv6_err() as its wrapper is no longer needed. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-14-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/ipv6_stubs.h | 7 ++--- include/net/udp.h | 3 +- net/core/filter.c | 3 +- net/ipv4/udp_diag.c | 20 ++++++------ net/ipv6/udp.c | 82 ++++++++++++++++++++++-------------------------- net/ipv6/udp_offload.c | 3 +- 6 files changed, 53 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index d3013e721b14..907681cecde8 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -83,10 +83,9 @@ struct ipv6_bpf_stub { int (*inet6_bind)(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags); struct sock *(*udp6_lib_lookup)(const struct net *net, - const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, __be16 dport, - int dif, int sdif, struct udp_table *tbl, - struct sk_buff *skb); + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif, int sdif, struct sk_buff *skb); int (*ipv6_setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*ipv6_getsockopt)(struct sock *sk, int level, int optname, diff --git a/include/net/udp.h b/include/net/udp.h index 76f401988353..adec74531ee1 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -449,8 +449,7 @@ struct sock *udp6_lib_lookup(const struct net *net, struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, - int dif, int sdif, struct udp_table *tbl, - struct sk_buff *skb); + int dif, int sdif, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); diff --git a/net/core/filter.c b/net/core/filter.c index 78b548158fb0..b66d985785f7 100644 --- 
a/net/core/filter.c +++ b/net/core/filter.c @@ -6904,8 +6904,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, dst6, tuple->ipv6.dport, - dif, sdif, - net->ipv4.udp_table, NULL); + dif, sdif, NULL); #endif } diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index a010d05062a0..0899c60cce53 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -44,11 +44,11 @@ static int udp_dump_one(struct udp_table *tbl, #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - req->id.idiag_if, 0, tbl, NULL); + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + req->id.idiag_if, 0, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -185,11 +185,11 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, else sk = __udp6_lib_lookup(net, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if, 0, tbl, NULL); + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if, 0, NULL); } #endif else { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5ba399218d07..dd958e2b552b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -344,9 +344,9 @@ static void udp6_hash4(struct sock *sk) struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, - int dif, int sdif, struct udp_table *udptable, - struct sk_buff *skb) + int dif, int sdif, struct sk_buff *skb) { + struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(dport); struct udp_hslot 
*hslot2; struct sock *result, *sk; @@ -406,14 +406,13 @@ done: EXPORT_SYMBOL_GPL(__udp6_lib_lookup); static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, - __be16 sport, __be16 dport, - struct udp_table *udptable) + __be16 sport, __be16 dport) { const struct ipv6hdr *iph = ipv6_hdr(skb); return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), - inet6_sdif(skb), udptable, skb); + inet6_sdif(skb), skb); } struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, @@ -421,14 +420,12 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); - struct net *net = dev_net(skb->dev); int iif, sdif; inet6_get_iif_sdif(skb, &iif, &sdif); - return __udp6_lib_lookup(net, &iph->saddr, sport, - &iph->daddr, dport, iif, - sdif, net->ipv4.udp_table, NULL); + return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, + &iph->daddr, dport, iif, sdif, NULL); } /* Must be called under rcu_read_lock(). 
@@ -440,8 +437,7 @@ struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr { struct sock *sk; - sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, - dif, 0, net->ipv4.udp_table, NULL); + sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, 0, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -642,7 +638,6 @@ static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb, static struct sock *__udp6_lib_err_encap(struct net *net, const struct ipv6hdr *hdr, int offset, struct udphdr *uh, - struct udp_table *udptable, struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt, @@ -673,7 +668,7 @@ static struct sock *__udp6_lib_err_encap(struct net *net, sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source, &hdr->saddr, uh->dest, - inet6_iif(skb), 0, udptable, skb); + inet6_iif(skb), 0, skb); if (sk) { up = udp_sk(sk); @@ -694,29 +689,28 @@ out: return sk; } -static int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info, - struct udp_table *udptable) +static int udpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) { - struct ipv6_pinfo *np; const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; - const struct in6_addr *saddr = &hdr->saddr; - const struct in6_addr *daddr = seg6_get_daddr(skb, opt) ? : &hdr->daddr; - struct udphdr *uh = (struct udphdr *)(skb->data+offset); + struct udphdr *uh = (struct udphdr *)(skb->data + offset); + const struct in6_addr *saddr, *daddr; + struct net *net = dev_net(skb->dev); + struct ipv6_pinfo *np; bool tunnel = false; struct sock *sk; int harderr; int err; - struct net *net = dev_net(skb->dev); + daddr = seg6_get_daddr(skb, opt) ? 
: &hdr->daddr; + saddr = &hdr->saddr; sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, - inet6_iif(skb), inet6_sdif(skb), udptable, NULL); + inet6_iif(skb), inet6_sdif(skb), NULL); if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udpv6_encap_needed_key)) { - sk = __udp6_lib_err_encap(net, hdr, offset, uh, - udptable, sk, skb, + sk = __udp6_lib_err_encap(net, hdr, offset, uh, sk, skb, opt, type, code, info); if (!sk) return 0; @@ -808,14 +802,6 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -static __inline__ int udpv6_err(struct sk_buff *skb, - struct inet6_skb_parm *opt, u8 type, - u8 code, int offset, __be32 info) -{ - return __udp6_lib_err(skb, opt, type, code, offset, info, - dev_net(skb->dev)->ipv4.udp_table); -} - static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -947,19 +933,27 @@ static void udp6_csum_zero_error(struct sk_buff *skb) * so we don't need to lock the hashes. 
*/ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, - const struct in6_addr *saddr, const struct in6_addr *daddr, - struct udp_table *udptable, int proto) + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int proto) { - struct sock *sk, *first = NULL; + struct udp_table *udptable = net->ipv4.udp_table; const struct udphdr *uh = udp_hdr(skb); + unsigned int hash2, hash2_any, offset; unsigned short hnum = ntohs(uh->dest); - struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - unsigned int offset = offsetof(typeof(*sk), sk_node); - unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - int dif = inet6_iif(skb); + struct sock *sk, *first = NULL; int sdif = inet6_sdif(skb); + int dif = inet6_iif(skb); struct hlist_node *node; + struct udp_hslot *hslot; struct sk_buff *nskb; + bool use_hash2; + + hash2_any = 0; + hash2 = 0; + hslot = udp_hashslot(udptable, net, hnum); + use_hash2 = hslot->count > 10; + offset = offsetof(typeof(*sk), sk_node); if (use_hash2) { hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) & @@ -1069,8 +1063,7 @@ static int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh) return 0; } -static int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, - int proto) +static int __udp6_lib_rcv(struct sk_buff *skb, int proto) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct in6_addr *saddr, *daddr; @@ -1139,11 +1132,10 @@ static int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, * Multicast receive code */ if (ipv6_addr_is_multicast(daddr)) - return __udp6_lib_mcast_deliver(net, skb, - saddr, daddr, udptable, proto); + return __udp6_lib_mcast_deliver(net, skb, saddr, daddr, proto); /* Unicast */ - sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest); if (sk) { if (!uh->check && !udp_get_no_check6_rx(sk)) goto report_csum_error; @@ -1261,7 +1253,7 @@ void 
udp_v6_early_demux(struct sk_buff *skb) INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb) { - return __udp6_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP); + return __udp6_lib_rcv(skb, IPPROTO_UDP); } /* diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index e003b8494dc0..778afc7453ce 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -128,8 +128,7 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, - &iph->daddr, dport, iif, - sdif, net->ipv4.udp_table, NULL); + &iph->daddr, dport, iif, sdif, NULL); } struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) -- cgit v1.2.3 From 68aeb21ef0e183ff3675fb82e22573e959505f95 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:20:01 +0000 Subject: udp: Don't pass udptable to IPv4 socket lookup functions. Since UDP and UDP-Lite had dedicated socket hash tables for each, we have had to pass the pointer down to many socket lookup functions. UDP-Lite gone, and we do not need to do that. Let's fetch net->ipv4.udp_table only where needed in IPv4 stack: __udp4_lib_lookup(), __udp4_lib_mcast_deliver(), and udp_diag_dump(). Some functions are renamed as the wrapper functions are no longer needed. 
__udp4_lib_err() -> udp_err() __udp_diag_destroy() -> udp_diag_destroy() udp_dump_one() -> udp_diag_dump_one() udp_dump() -> udp_diag_dump() Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-15-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/udp.h | 5 ++-- net/core/filter.c | 2 +- net/ipv4/udp.c | 75 +++++++++++++++++++++++--------------------------- net/ipv4/udp_diag.c | 61 ++++++++++++++-------------------------- net/ipv4/udp_offload.c | 3 +- 5 files changed, 59 insertions(+), 87 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index adec74531ee1..8262e2b215b4 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -437,9 +437,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, - __be16 sport, - __be32 daddr, __be16 dport, int dif, int sdif, - struct udp_table *tbl, struct sk_buff *skb); + __be16 sport, __be32 daddr, __be16 dport, + int dif, int sdif, struct sk_buff *skb); struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); struct sock *udp6_lib_lookup(const struct net *net, diff --git a/net/core/filter.c b/net/core/filter.c index b66d985785f7..2f023999f046 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6889,7 +6889,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, - dif, sdif, net->ipv4.udp_table, NULL); + dif, sdif, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fd0bad44d111..dff803e7d6a8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -673,9 +673,10 @@ 
EXPORT_IPV6_MOD(udp4_hash4); * harder than this. -DaveM */ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, - __be16 sport, __be32 daddr, __be16 dport, int dif, - int sdif, struct udp_table *udptable, struct sk_buff *skb) + __be16 sport, __be32 daddr, __be16 dport, + int dif, int sdif, struct sk_buff *skb) { + struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(dport); struct udp_hslot *hslot2; struct sock *result, *sk; @@ -741,14 +742,13 @@ done: EXPORT_SYMBOL_GPL(__udp4_lib_lookup); static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, - __be16 sport, __be16 dport, - struct udp_table *udptable) + __be16 sport, __be16 dport) { const struct iphdr *iph = ip_hdr(skb); return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), - inet_sdif(skb), udptable, skb); + inet_sdif(skb), skb); } struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, @@ -756,14 +756,12 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); - struct net *net = dev_net(skb->dev); int iif, sdif; inet_get_iif_sdif(skb, &iif, &sdif); - return __udp4_lib_lookup(net, iph->saddr, sport, - iph->daddr, dport, iif, - sdif, net->ipv4.udp_table, NULL); + return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, + iph->daddr, dport, iif, sdif, NULL); } /* Must be called under rcu_read_lock(). 
@@ -775,8 +773,7 @@ struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, { struct sock *sk; - sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, - dif, 0, net->ipv4.udp_table, NULL); + sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, 0, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -866,7 +863,6 @@ static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) static struct sock *__udp4_lib_err_encap(struct net *net, const struct iphdr *iph, struct udphdr *uh, - struct udp_table *udptable, struct sock *sk, struct sk_buff *skb, u32 info) { @@ -894,8 +890,7 @@ static struct sock *__udp4_lib_err_encap(struct net *net, } sk = __udp4_lib_lookup(net, iph->daddr, uh->source, - iph->saddr, uh->dest, skb->dev->ifindex, 0, - udptable, NULL); + iph->saddr, uh->dest, skb->dev->ifindex, 0, NULL); if (sk) { up = udp_sk(sk); @@ -924,29 +919,28 @@ out: * header points to the first 8 bytes of the udp header. We need * to find the appropriate port. 
*/ - -static int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) +int udp_err(struct sk_buff *skb, u32 info) { - struct inet_sock *inet; const struct iphdr *iph = (const struct iphdr *)skb->data; - struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + struct net *net = dev_net(skb->dev); + struct inet_sock *inet; bool tunnel = false; + struct udphdr *uh; struct sock *sk; int harderr; int err; - struct net *net = dev_net(skb->dev); + uh = (struct udphdr *)(skb->data + (iph->ihl << 2)); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, - inet_sdif(skb), udptable, NULL); + inet_sdif(skb), NULL); if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udp_encap_needed_key)) { - sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb, - info); + sk = __udp4_lib_err_encap(net, iph, uh, sk, skb, info); if (!sk) return 0; } else @@ -1019,11 +1013,6 @@ out: return 0; } -int udp_err(struct sk_buff *skb, u32 info) -{ - return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table); -} - /* * Throw away all pending data and cancel the corking. Socket is locked. 
*/ @@ -2491,18 +2480,24 @@ EXPORT_IPV6_MOD(udp_sk_rx_dst_set); static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, - struct udp_table *udptable, int proto) { - struct sock *sk, *first = NULL; + struct udp_table *udptable = net->ipv4.udp_table; + unsigned int hash2, hash2_any, offset; unsigned short hnum = ntohs(uh->dest); - struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - unsigned int offset = offsetof(typeof(*sk), sk_node); + struct sock *sk, *first = NULL; int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); struct hlist_node *node; + struct udp_hslot *hslot; struct sk_buff *nskb; + bool use_hash2; + + hash2_any = 0; + hash2 = 0; + hslot = udp_hashslot(udptable, net, hnum); + use_hash2 = hslot->count > 10; + offset = offsetof(typeof(*sk), sk_node); if (use_hash2) { hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & @@ -2607,15 +2602,14 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, * All we need to do is get the socket, and then do a checksum. 
*/ -static int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, - int proto) +static int __udp4_lib_rcv(struct sk_buff *skb, int proto) { + struct rtable *rt = skb_rtable(skb); + struct net *net = dev_net(skb->dev); struct sock *sk = NULL; - struct udphdr *uh; unsigned short ulen; - struct rtable *rt = skb_rtable(skb); __be32 saddr, daddr; - struct net *net = dev_net(skb->dev); + struct udphdr *uh; bool refcounted; int drop_reason; @@ -2667,10 +2661,9 @@ static int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, } if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - return __udp4_lib_mcast_deliver(net, skb, uh, - saddr, daddr, udptable, proto); + return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, proto); - sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest); if (sk) return udp_unicast_rcv_skb(sk, skb, uh); no_sk: @@ -2854,7 +2847,7 @@ enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb) int udp_rcv(struct sk_buff *skb) { - return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP); + return __udp4_lib_rcv(skb, IPPROTO_UDP); } static void udp_destroy_sock(struct sock *sk) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 0899c60cce53..f4b24e628cf8 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -24,23 +24,24 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, net_admin); } -static int udp_dump_one(struct udp_table *tbl, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) +static int udp_diag_dump_one(struct netlink_callback *cb, + const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; - int err; struct sock *sk = NULL; struct sk_buff *rep; - struct net *net = sock_net(in_skb->sk); + struct net *net; + int err; + + net = sock_net(in_skb->sk); rcu_read_lock(); if (req->sdiag_family == AF_INET) /* src and dst are swapped for historical reasons */ sk = 
__udp4_lib_lookup(net, - req->id.idiag_src[0], req->id.idiag_sport, - req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_if, 0, tbl, NULL); + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_if, 0, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, @@ -85,14 +86,15 @@ out_nosk: return err; } -static void udp_dump(struct udp_table *table, struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) +static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); int num, s_num, slot, s_slot; + struct udp_table *table; + table = net->ipv4.udp_table; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -139,18 +141,6 @@ done: cb->args[1] = num; } -static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r); -} - -static int udp_diag_dump_one(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req); -} - static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { @@ -159,9 +149,8 @@ static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } #ifdef CONFIG_INET_DIAG_DESTROY -static int __udp_diag_destroy(struct sk_buff *in_skb, - const struct inet_diag_req_v2 *req, - struct udp_table *tbl) +static int udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) { struct net *net = sock_net(in_skb->sk); struct sock *sk; @@ -171,18 +160,17 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, - req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_src[0], 
req->id.idiag_sport, - req->id.idiag_if, 0, tbl, NULL); + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, 0, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, - req->id.idiag_dst[3], req->id.idiag_dport, - req->id.idiag_src[3], req->id.idiag_sport, - req->id.idiag_if, 0, tbl, NULL); - + req->id.idiag_dst[3], req->id.idiag_dport, + req->id.idiag_src[3], req->id.idiag_sport, + req->id.idiag_if, 0, NULL); else sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_dst, @@ -216,13 +204,6 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, return err; } - -static int udp_diag_destroy(struct sk_buff *in_skb, - const struct inet_diag_req_v2 *req) -{ - return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table); -} - #endif static const struct inet_diag_handler udp_diag_handler = { diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6b1654c1ad4a..98e92da726b5 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -869,8 +869,7 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, - iph->daddr, dport, iif, - sdif, net->ipv4.udp_table, NULL); + iph->daddr, dport, iif, sdif, NULL); } INDIRECT_CALLABLE_SCOPE -- cgit v1.2.3 From 3ac949881396361b6462a717f6cbbd97f368af02 Mon Sep 17 00:00:00 2001 From: "Tycho Andersen (AMD)" Date: Mon, 2 Mar 2026 08:02:24 -0700 Subject: include/psp-sev.h: fix structure member in comment The member is 'data', not 'opaque'. 
Signed-off-by: Tycho Andersen (AMD) Reviewed-by: Tom Lendacky Signed-off-by: Herbert Xu --- include/uapi/linux/psp-sev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 2b5b042eb73b..52dae70b058b 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -277,7 +277,7 @@ struct sev_user_data_snp_wrapped_vlek_hashstick { * struct sev_issue_cmd - SEV ioctl parameters * * @cmd: SEV commands to execute - * @opaque: pointer to the command structure + * @data: pointer to the command structure * @error: SEV FW return code on failure */ struct sev_issue_cmd { -- cgit v1.2.3 From 203247c5cb972af5d46bdb7d41ef40078048810b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Mar 2026 07:47:00 -0700 Subject: blk-integrity: support arbitrary buffer alignment A bio segment may have partial interval block data with the rest continuing into the next segments because direct-io data payloads only need to align in memory to the device's DMA limits. At the same time, the protection information may also be split in multiple segments. The most likely way that may happen is if two requests merge, or if we're directly using the io_uring user metadata. The generate/verify, however, only ever accessed the first bip_vec. Further, it may be possible to unalign the protection fields from the user space buffer, or if there are odd additional opaque bytes in front or in back of the protection information metadata region. Change up the iteration to allow spanning multiple segments. This patch is mostly a re-write of the protection information handling to allow any arbitrary alignments, so it's probably easier to review the end result rather than the diff. 
Many controllers are not able to handle interval data composed of multiple segments when PI is used, so this patch introduces a new integrity limit that a low level driver can set to notify that it is capable, default to false. The nvme driver is the first one to enable it in this patch. Everyone else will force DMA alignment to the logical block size as before to ensure interval data is always aligned within a single segment. Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Link: https://patch.msgid.link/20260313144701.1221652-2-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 12 +- block/t10-pi.c | 816 +++++++++++++++++++++++------------------- drivers/nvme/host/core.c | 1 + include/linux/blk-integrity.h | 1 + 4 files changed, 465 insertions(+), 365 deletions(-) (limited to 'include') diff --git a/block/blk-settings.c b/block/blk-settings.c index dabfab97fbab..78c83817b9d3 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -189,11 +189,11 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) } /* - * The PI generation / validation helpers do not expect intervals to - * straddle multiple bio_vecs. Enforce alignment so that those are + * Some IO controllers can not handle data intervals straddling + * multiple bio_vecs. For those, enforce alignment so that those are * never generated, and that each buffer is aligned as expected. 
*/ - if (bi->csum_type) { + if (!(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE) && bi->csum_type) { lim->dma_alignment = max(lim->dma_alignment, (1U << bi->interval_exp) - 1); } @@ -992,10 +992,14 @@ bool queue_limits_stack_integrity(struct queue_limits *t, if ((ti->flags & BLK_INTEGRITY_REF_TAG) != (bi->flags & BLK_INTEGRITY_REF_TAG)) goto incompatible; + if ((ti->flags & BLK_SPLIT_INTERVAL_CAPABLE) && + !(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE)) + ti->flags &= ~BLK_SPLIT_INTERVAL_CAPABLE; } else { ti->flags = BLK_INTEGRITY_STACKED; ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | - (bi->flags & BLK_INTEGRITY_REF_TAG); + (bi->flags & BLK_INTEGRITY_REF_TAG) | + (bi->flags & BLK_SPLIT_INTERVAL_CAPABLE); ti->csum_type = bi->csum_type; ti->pi_tuple_size = bi->pi_tuple_size; ti->metadata_size = bi->metadata_size; diff --git a/block/t10-pi.c b/block/t10-pi.c index d27be6041fd3..a19b4e102a83 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -12,462 +12,556 @@ #include #include "blk.h" +#define APP_TAG_ESCAPE 0xffff +#define REF_TAG_ESCAPE 0xffffffff + +/* + * This union is used for onstack allocations when the pi field is split across + * segments. blk_validate_integrity_limits() guarantees pi_tuple_size matches + * the sizeof one of these two types. 
+ */ +union pi_tuple { + struct crc64_pi_tuple crc64_pi; + struct t10_pi_tuple t10_pi; +}; + struct blk_integrity_iter { - void *prot_buf; - void *data_buf; - sector_t seed; - unsigned int data_size; - unsigned short interval; - const char *disk_name; + struct bio *bio; + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct bvec_iter data_iter; + struct bvec_iter prot_iter; + unsigned int interval_remaining; + u64 seed; + u64 csum; }; -static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len, - unsigned char csum_type) +static void blk_calculate_guard(struct blk_integrity_iter *iter, void *data, + unsigned int len) { - if (csum_type == BLK_INTEGRITY_CSUM_IP) - return (__force __be16)ip_compute_csum(data, len); - return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len)); + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + iter->csum = crc64_nvme(iter->csum, data, len); + break; + case BLK_INTEGRITY_CSUM_CRC: + iter->csum = crc_t10dif_update(iter->csum, data, len); + break; + case BLK_INTEGRITY_CSUM_IP: + iter->csum = (__force u32)csum_partial(data, len, + (__force __wsum)iter->csum); + break; + default: + WARN_ON_ONCE(1); + iter->csum = U64_MAX; + break; + } +} + +static void blk_integrity_csum_finish(struct blk_integrity_iter *iter) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + iter->csum = (__force u16)csum_fold((__force __wsum)iter->csum); + break; + default: + break; + } } /* - * Type 1 and Type 2 protection use the same format: 16 bit guard tag, - * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref - * tag. 
+ * Update the csum for formats that have metadata padding in front of the data + * integrity field */ -static void t10_pi_generate(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +static void blk_integrity_csum_offset(struct blk_integrity_iter *iter) { - u8 offset = bi->pi_offset; - unsigned int i; + unsigned int offset = iter->bi->pi_offset; + struct bio_vec *bvec = iter->bip->bip_vec; + + while (offset > 0) { + struct bio_vec pbv = bvec_iter_bvec(bvec, iter->prot_iter); + unsigned int len = min(pbv.bv_len, offset); + void *prot_buf = bvec_kmap_local(&pbv); + + blk_calculate_guard(iter, prot_buf, len); + kunmap_local(prot_buf); + offset -= len; + bvec_iter_advance_single(bvec, &iter->prot_iter, len); + } + blk_integrity_csum_finish(iter); +} - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf + offset; +static void blk_integrity_copy_from_tuple(struct bio_integrity_payload *bip, + struct bvec_iter *iter, void *tuple, + unsigned int tuple_size) +{ + while (tuple_size) { + struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(tuple_size, pbv.bv_len); + void *prot_buf = bvec_kmap_local(&pbv); + + memcpy(prot_buf, tuple, len); + kunmap_local(prot_buf); + bvec_iter_advance_single(bip->bip_vec, iter, len); + tuple_size -= len; + tuple += len; + } +} - pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval, - bi->csum_type); - if (offset) - pi->guard_tag = t10_pi_csum(pi->guard_tag, - iter->prot_buf, offset, bi->csum_type); - pi->app_tag = 0; +static void blk_integrity_copy_to_tuple(struct bio_integrity_payload *bip, + struct bvec_iter *iter, void *tuple, + unsigned int tuple_size) +{ + while (tuple_size) { + struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(tuple_size, pbv.bv_len); + void *prot_buf = bvec_kmap_local(&pbv); + + memcpy(tuple, prot_buf, len); + kunmap_local(prot_buf); + bvec_iter_advance_single(bip->bip_vec, iter, len); + 
tuple_size -= len; + tuple += len; + } +} - if (bi->flags & BLK_INTEGRITY_REF_TAG) - pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); - else - pi->ref_tag = 0; +static bool ext_pi_ref_escape(const u8 ref_tag[6]) +{ + static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; - } + return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0; } -static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - struct blk_integrity *bi) -{ - u8 offset = bi->pi_offset; - unsigned int i; - - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf + offset; - __be16 csum; - - if (bi->flags & BLK_INTEGRITY_REF_TAG) { - if (pi->app_tag == T10_PI_APP_ESCAPE) - goto next; - - if (be32_to_cpu(pi->ref_tag) != - lower_32_bits(iter->seed)) { - pr_err("%s: ref tag error at location %llu " \ - "(rcvd %u)\n", iter->disk_name, - (unsigned long long) - iter->seed, be32_to_cpu(pi->ref_tag)); - return BLK_STS_PROTECTION; - } - } else { - if (pi->app_tag == T10_PI_APP_ESCAPE && - pi->ref_tag == T10_PI_REF_ESCAPE) - goto next; +static blk_status_t blk_verify_ext_pi(struct blk_integrity_iter *iter, + struct crc64_pi_tuple *pi) +{ + u64 seed = lower_48_bits(iter->seed); + u64 guard = get_unaligned_be64(&pi->guard_tag); + u64 ref = get_unaligned_be48(pi->ref_tag); + u16 app = get_unaligned_be16(&pi->app_tag); + + if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) { + if (app == APP_TAG_ESCAPE) + return BLK_STS_OK; + if (ref != seed) { + pr_err("%s: ref tag error at location %llu (rcvd %llu)\n", + iter->bio->bi_bdev->bd_disk->disk_name, seed, + ref); + return BLK_STS_PROTECTION; } + } else if (app == APP_TAG_ESCAPE && ext_pi_ref_escape(pi->ref_tag)) { + return BLK_STS_OK; + } + + if (guard != iter->csum) { + pr_err("%s: guard tag error at sector %llu (rcvd %016llx, want %016llx)\n", + iter->bio->bi_bdev->bd_disk->disk_name, iter->seed, + 
guard, iter->csum); + return BLK_STS_PROTECTION; + } + + return BLK_STS_OK; +} - csum = t10_pi_csum(0, iter->data_buf, iter->interval, - bi->csum_type); - if (offset) - csum = t10_pi_csum(csum, iter->prot_buf, offset, - bi->csum_type); - - if (pi->guard_tag != csum) { - pr_err("%s: guard tag error at sector %llu " \ - "(rcvd %04x, want %04x)\n", iter->disk_name, - (unsigned long long)iter->seed, - be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); +static blk_status_t blk_verify_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi, u16 guard) +{ + u32 seed = lower_32_bits(iter->seed); + u32 ref = get_unaligned_be32(&pi->ref_tag); + u16 app = get_unaligned_be16(&pi->app_tag); + + if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) { + if (app == APP_TAG_ESCAPE) + return BLK_STS_OK; + if (ref != seed) { + pr_err("%s: ref tag error at location %u (rcvd %u)\n", + iter->bio->bi_bdev->bd_disk->disk_name, seed, + ref); return BLK_STS_PROTECTION; } + } else if (app == APP_TAG_ESCAPE && ref == REF_TAG_ESCAPE) { + return BLK_STS_OK; + } -next: - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + if (guard != (u16)iter->csum) { + pr_err("%s: guard tag error at sector %llu (rcvd %04x, want %04x)\n", + iter->bio->bi_bdev->bd_disk->disk_name, iter->seed, + guard, (u16)iter->csum); + return BLK_STS_PROTECTION; } return BLK_STS_OK; } -/** - * t10_pi_type1_prepare - prepare PI prior submitting request to device - * @rq: request with PI that should be prepared - * - * For Type 1/Type 2, the virtual start sector is the one that was - * originally submitted by the block layer for the ref_tag usage. Due to - * partitioning, MD/DM cloning, etc. the actual physical start sector is - * likely to be different. Remap protection information to match the - * physical LBA. 
- */ -static void t10_pi_type1_prepare(struct request *rq) +static blk_status_t blk_verify_t10_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) { - struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->metadata_size; - u32 ref_tag = t10_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + u16 guard = get_unaligned_be16(&pi->guard_tag); - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u32 virt = bip_get_seed(bip) & 0xffffffff; - struct bio_vec iv; - struct bvec_iter iter; + return blk_verify_pi(iter, pi, guard); +} - /* Already remapped? */ - if (bip->bip_flags & BIP_MAPPED_INTEGRITY) - break; +static blk_status_t blk_verify_ip_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) +{ + u16 guard = get_unaligned((u16 *)&pi->guard_tag); - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct t10_pi_tuple *pi = p + offset; - - if (be32_to_cpu(pi->ref_tag) == virt) - pi->ref_tag = cpu_to_be32(ref_tag); - virt++; - ref_tag++; - p += tuple_sz; - } - kunmap_local(p); - } + return blk_verify_pi(iter, pi, guard); +} - bip->bip_flags |= BIP_MAPPED_INTEGRITY; +static blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, + union pi_tuple *tuple) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return blk_verify_ext_pi(iter, &tuple->crc64_pi); + case BLK_INTEGRITY_CSUM_CRC: + return blk_verify_t10_pi(iter, &tuple->t10_pi); + case BLK_INTEGRITY_CSUM_IP: + return blk_verify_ip_pi(iter, &tuple->t10_pi); + default: + return BLK_STS_OK; } } -/** - * t10_pi_type1_complete - prepare PI prior returning request to the blk layer - * @rq: request with PI that should be prepared - * @nr_bytes: total bytes to prepare - * - * For Type 1/Type 2, the virtual start sector is the one that was - * originally submitted by the block layer for the ref_tag usage. 
Due to - * partitioning, MD/DM cloning, etc. the actual physical start sector is - * likely to be different. Since the physical start sector was submitted - * to the device, we should remap it back to virtual values expected by the - * block layer. - */ -static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) +static void blk_set_ext_pi(struct blk_integrity_iter *iter, + struct crc64_pi_tuple *pi) { - struct blk_integrity *bi = &rq->q->limits.integrity; - unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->metadata_size; - u32 ref_tag = t10_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + put_unaligned_be64(iter->csum, &pi->guard_tag); + put_unaligned_be16(0, &pi->app_tag); + put_unaligned_be48(iter->seed, &pi->ref_tag); +} - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u32 virt = bip_get_seed(bip) & 0xffffffff; - struct bio_vec iv; - struct bvec_iter iter; - - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct t10_pi_tuple *pi = p + offset; - - if (be32_to_cpu(pi->ref_tag) == ref_tag) - pi->ref_tag = cpu_to_be32(virt); - virt++; - ref_tag++; - intervals--; - p += tuple_sz; - } - kunmap_local(p); - } - } +static void blk_set_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi, __be16 csum) +{ + put_unaligned(csum, &pi->guard_tag); + put_unaligned_be16(0, &pi->app_tag); + put_unaligned_be32(iter->seed, &pi->ref_tag); } -static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) +static void blk_set_t10_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) { - return cpu_to_be64(crc64_nvme(crc, data, len)); + blk_set_pi(iter, pi, cpu_to_be16((u16)iter->csum)); } -static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +static void blk_set_ip_pi(struct blk_integrity_iter *iter, + struct 
t10_pi_tuple *pi) { - u8 offset = bi->pi_offset; - unsigned int i; + blk_set_pi(iter, pi, (__force __be16)(u16)iter->csum); +} - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf + offset; +static void blk_integrity_set(struct blk_integrity_iter *iter, + union pi_tuple *tuple) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return blk_set_ext_pi(iter, &tuple->crc64_pi); + case BLK_INTEGRITY_CSUM_CRC: + return blk_set_t10_pi(iter, &tuple->t10_pi); + case BLK_INTEGRITY_CSUM_IP: + return blk_set_ip_pi(iter, &tuple->t10_pi); + default: + WARN_ON_ONCE(1); + return; + } +} - pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval); - if (offset) - pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag), - iter->prot_buf, offset); - pi->app_tag = 0; +static blk_status_t blk_integrity_interval(struct blk_integrity_iter *iter, + bool verify) +{ + blk_status_t ret = BLK_STS_OK; + union pi_tuple tuple; + void *ptuple = &tuple; + struct bio_vec pbv; + + blk_integrity_csum_offset(iter); + pbv = bvec_iter_bvec(iter->bip->bip_vec, iter->prot_iter); + if (pbv.bv_len >= iter->bi->pi_tuple_size) { + ptuple = bvec_kmap_local(&pbv); + bvec_iter_advance_single(iter->bip->bip_vec, &iter->prot_iter, + iter->bi->metadata_size - iter->bi->pi_offset); + } else if (verify) { + blk_integrity_copy_to_tuple(iter->bip, &iter->prot_iter, + ptuple, iter->bi->pi_tuple_size); + } - if (bi->flags & BLK_INTEGRITY_REF_TAG) - put_unaligned_be48(iter->seed, pi->ref_tag); - else - put_unaligned_be48(0ULL, pi->ref_tag); + if (verify) + ret = blk_integrity_verify(iter, ptuple); + else + blk_integrity_set(iter, ptuple); - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + if (likely(ptuple != &tuple)) { + kunmap_local(ptuple); + } else if (!verify) { + blk_integrity_copy_from_tuple(iter->bip, &iter->prot_iter, + ptuple, iter->bi->pi_tuple_size); } + + iter->interval_remaining = 1 << 
iter->bi->interval_exp; + iter->csum = 0; + iter->seed++; + return ret; } -static bool ext_pi_ref_escape(const u8 ref_tag[6]) +static blk_status_t blk_integrity_iterate(struct bio *bio, + struct bvec_iter *data_iter, + bool verify) { - static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity_iter iter = { + .bio = bio, + .bip = bip, + .bi = bi, + .data_iter = *data_iter, + .prot_iter = bip->bip_iter, + .interval_remaining = 1 << bi->interval_exp, + .seed = data_iter->bi_sector, + .csum = 0, + }; + blk_status_t ret = BLK_STS_OK; + + while (iter.data_iter.bi_size && ret == BLK_STS_OK) { + struct bio_vec bv = bvec_iter_bvec(iter.bio->bi_io_vec, + iter.data_iter); + void *kaddr = bvec_kmap_local(&bv); + void *data = kaddr; + unsigned int len; + + bvec_iter_advance_single(iter.bio->bi_io_vec, &iter.data_iter, + bv.bv_len); + while (bv.bv_len && ret == BLK_STS_OK) { + len = min(iter.interval_remaining, bv.bv_len); + blk_calculate_guard(&iter, data, len); + bv.bv_len -= len; + data += len; + iter.interval_remaining -= len; + if (!iter.interval_remaining) + ret = blk_integrity_interval(&iter, verify); + } + kunmap_local(kaddr); + } - return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0; + return ret; } -static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, - struct blk_integrity *bi) -{ - u8 offset = bi->pi_offset; - unsigned int i; - - for (i = 0; i < iter->data_size; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf + offset; - u64 ref, seed; - __be64 csum; - - if (bi->flags & BLK_INTEGRITY_REF_TAG) { - if (pi->app_tag == T10_PI_APP_ESCAPE) - goto next; - - ref = get_unaligned_be48(pi->ref_tag); - seed = lower_48_bits(iter->seed); - if (ref != seed) { - pr_err("%s: ref tag error at location %llu (rcvd %llu)\n", - iter->disk_name, seed, ref); - return 
BLK_STS_PROTECTION; - } - } else { - if (pi->app_tag == T10_PI_APP_ESCAPE && - ext_pi_ref_escape(pi->ref_tag)) - goto next; - } +void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - csum = ext_pi_crc64(0, iter->data_buf, iter->interval); - if (offset) - csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf, - offset); + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_integrity_iterate(bio, &bio->bi_iter, false); + break; + default: + break; + } +} - if (pi->guard_tag != csum) { - pr_err("%s: guard tag error at sector %llu " \ - "(rcvd %016llx, want %016llx)\n", - iter->disk_name, (unsigned long long)iter->seed, - be64_to_cpu(pi->guard_tag), be64_to_cpu(csum)); - return BLK_STS_PROTECTION; - } +blk_status_t bio_integrity_verify(struct bio *bio, struct bvec_iter *saved_iter) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); -next: - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + return blk_integrity_iterate(bio, saved_iter, true); + default: + break; } return BLK_STS_OK; } -static void ext_pi_type1_prepare(struct request *rq) +/* + * Advance @iter past the protection offset for protection formats that + * contain front padding on the metadata region. 
+ */ +static void blk_pi_advance_offset(struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) { - struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->metadata_size; - u64 ref_tag = ext_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + unsigned int offset = bi->pi_offset; - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u64 virt = lower_48_bits(bip_get_seed(bip)); - struct bio_vec iv; - struct bvec_iter iter; + while (offset > 0) { + struct bio_vec bv = mp_bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(bv.bv_len, offset); - /* Already remapped? */ - if (bip->bip_flags & BIP_MAPPED_INTEGRITY) - break; + bvec_iter_advance_single(bip->bip_vec, iter, len); + offset -= len; + } +} - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct crc64_pi_tuple *pi = p + offset; - u64 ref = get_unaligned_be48(pi->ref_tag); - - if (ref == virt) - put_unaligned_be48(ref_tag, pi->ref_tag); - virt++; - ref_tag++; - p += tuple_sz; - } - kunmap_local(p); - } +static void *blk_tuple_remap_begin(union pi_tuple *tuple, + struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) +{ + struct bvec_iter titer; + struct bio_vec pbv; - bip->bip_flags |= BIP_MAPPED_INTEGRITY; + blk_pi_advance_offset(bi, bip, iter); + pbv = bvec_iter_bvec(bip->bip_vec, *iter); + if (likely(pbv.bv_len >= bi->pi_tuple_size)) + return bvec_kmap_local(&pbv); + + /* + * We need to preserve the state of the original iter for the + * copy_from_tuple at the end, so make a temp iter for here. 
+ */ + titer = *iter; + blk_integrity_copy_to_tuple(bip, &titer, tuple, bi->pi_tuple_size); + return tuple; +} + +static void blk_tuple_remap_end(union pi_tuple *tuple, void *ptuple, + struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) +{ + unsigned int len = bi->metadata_size - bi->pi_offset; + + if (likely(ptuple != tuple)) { + kunmap_local(ptuple); + } else { + blk_integrity_copy_from_tuple(bip, iter, ptuple, + bi->pi_tuple_size); + len -= bi->pi_tuple_size; } + + bvec_iter_advance(bip->bip_vec, iter, len); } -static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) +static void blk_set_ext_unmap_ref(struct crc64_pi_tuple *pi, u64 virt, + u64 ref_tag) { - struct blk_integrity *bi = &rq->q->limits.integrity; - unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->metadata_size; - u64 ref_tag = ext_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + u64 ref = get_unaligned_be48(&pi->ref_tag); - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u64 virt = lower_48_bits(bip_get_seed(bip)); - struct bio_vec iv; - struct bvec_iter iter; - - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct crc64_pi_tuple *pi = p + offset; - u64 ref = get_unaligned_be48(pi->ref_tag); - - if (ref == ref_tag) - put_unaligned_be48(virt, pi->ref_tag); - virt++; - ref_tag++; - intervals--; - p += tuple_sz; - } - kunmap_local(p); - } + if (ref == lower_48_bits(ref_tag) && ref != lower_48_bits(virt)) + put_unaligned_be48(virt, pi->ref_tag); +} + +static void blk_set_t10_unmap_ref(struct t10_pi_tuple *pi, u32 virt, + u32 ref_tag) +{ + u32 ref = get_unaligned_be32(&pi->ref_tag); + + if (ref == ref_tag && ref != virt) + put_unaligned_be32(virt, &pi->ref_tag); +} + +static void blk_reftag_remap_complete(struct blk_integrity *bi, + union pi_tuple *tuple, 
u64 virt, u64 ref) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + blk_set_ext_unmap_ref(&tuple->crc64_pi, virt, ref); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_set_t10_unmap_ref(&tuple->t10_pi, virt, ref); + break; + default: + WARN_ON_ONCE(1); + break; } } -void bio_integrity_generate(struct bio *bio) +static void blk_set_ext_map_ref(struct crc64_pi_tuple *pi, u64 virt, + u64 ref_tag) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; - - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.seed = bio->bi_iter.bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - bio_for_each_segment(bv, bio, bviter) { - void *kaddr = bvec_kmap_local(&bv); + u64 ref = get_unaligned_be48(&pi->ref_tag); - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - ext_pi_crc64_generate(&iter, bi); - break; - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - t10_pi_generate(&iter, bi); - break; - default: - break; - } - kunmap_local(kaddr); + if (ref == lower_48_bits(virt) && ref != ref_tag) + put_unaligned_be48(ref_tag, pi->ref_tag); +} + +static void blk_set_t10_map_ref(struct t10_pi_tuple *pi, u32 virt, u32 ref_tag) +{ + u32 ref = get_unaligned_be32(&pi->ref_tag); + + if (ref == virt && ref != ref_tag) + put_unaligned_be32(ref_tag, &pi->ref_tag); +} + +static void blk_reftag_remap_prepare(struct blk_integrity *bi, + union pi_tuple *tuple, + u64 virt, u64 ref) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + blk_set_ext_map_ref(&tuple->crc64_pi, virt, ref); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_set_t10_map_ref(&tuple->t10_pi, virt, ref); + break; + default: + WARN_ON_ONCE(1); + break; } } -blk_status_t 
bio_integrity_verify(struct bio *bio, struct bvec_iter *saved_iter) +static void __blk_reftag_remap(struct bio *bio, struct blk_integrity *bi, + unsigned *intervals, u64 *ref, bool prep) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; + struct bvec_iter iter = bip->bip_iter; + u64 virt = bip_get_seed(bip); + union pi_tuple *ptuple; + union pi_tuple tuple; - /* - * At the moment verify is called bi_iter has been advanced during split - * and completion, so use the copy created during submission here. - */ - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.seed = saved_iter->bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - __bio_for_each_segment(bv, bio, bviter, *saved_iter) { - void *kaddr = bvec_kmap_local(&bv); - blk_status_t ret = BLK_STS_OK; + if (prep && bip->bip_flags & BIP_MAPPED_INTEGRITY) { + *ref += bio->bi_iter.bi_size >> bi->interval_exp; + return; + } - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - ret = ext_pi_crc64_verify(&iter, bi); - break; - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - ret = t10_pi_verify(&iter, bi); - break; - default: - break; - } - kunmap_local(kaddr); + while (iter.bi_size && *intervals) { + ptuple = blk_tuple_remap_begin(&tuple, bi, bip, &iter); + + if (prep) + blk_reftag_remap_prepare(bi, ptuple, virt, *ref); + else + blk_reftag_remap_complete(bi, ptuple, virt, *ref); - if (ret) - return ret; + blk_tuple_remap_end(&tuple, ptuple, bi, bip, &iter); + (*intervals)--; + (*ref)++; + virt++; } - return BLK_STS_OK; + if (prep) + bip->bip_flags |= BIP_MAPPED_INTEGRITY; } -void blk_integrity_prepare(struct request *rq) +static void blk_integrity_remap(struct request *rq, unsigned int nr_bytes, + bool prep) { struct blk_integrity *bi = 
&rq->q->limits.integrity; + u64 ref = blk_rq_pos(rq) >> (bi->interval_exp - SECTOR_SHIFT); + unsigned intervals = nr_bytes >> bi->interval_exp; + struct bio *bio; if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) return; - if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) - ext_pi_type1_prepare(rq); - else - t10_pi_type1_prepare(rq); + __rq_for_each_bio(bio, rq) { + __blk_reftag_remap(bio, bi, &intervals, &ref, prep); + if (!intervals) + break; + } } -void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) +void blk_integrity_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->limits.integrity; - - if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) - return; + blk_integrity_remap(rq, blk_rq_bytes(rq), true); +} - if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) - ext_pi_type1_complete(rq, nr_bytes); - else - t10_pi_type1_complete(rq, nr_bytes); +void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ + blk_integrity_remap(rq, nr_bytes, false); } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 58bf432ec5e6..3de52f1d2723 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1875,6 +1875,7 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, break; } + bi->flags |= BLK_SPLIT_INTERVAL_CAPABLE; bi->metadata_size = head->ms; if (bi->csum_type) { bi->pi_tuple_size = head->pi_size; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index ea6d7d322ae3..b1b530613c34 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -14,6 +14,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, BLK_INTEGRITY_STACKED = 1 << 4, + BLK_SPLIT_INTERVAL_CAPABLE = 1 << 5, }; const char *blk_integrity_profile_name(struct blk_integrity *bi); -- cgit v1.2.3 From 0e24d17bd9668f9dad78ede6a0e8f13dab176682 Mon Sep 17 00:00:00 2001 From: Simon Baatz Date: Mon, 9 Mar 2026 09:02:26 +0100 Subject: tcp: implement RFC 7323 window 
retraction receiver requirements By default, the Linux TCP implementation does not shrink the advertised window (RFC 7323 calls this "window retraction") with the following exceptions: - When an incoming segment cannot be added due to the receive buffer running out of memory. Since commit 8c670bdfa58e ("tcp: correct handling of extreme memory squeeze") a zero window will be advertised in this case. It turns out that reaching the required memory pressure is easy when window scaling is in use. In the simplest case, sending a sufficient number of segments smaller than the scale factor to a receiver that does not read data is enough. - Commit b650d953cd39 ("tcp: enforce receive buffer memory limits by allowing the tcp window to shrink") addressed the "eating memory" problem by introducing a sysctl knob that allows shrinking the window before running out of memory. However, RFC 7323 does not only state that shrinking the window is necessary in some cases, it also formulates requirements for TCP implementations when doing so (Section 2.4). This commit addresses the receiver-side requirements: After retracting the window, the peer may have a snd_nxt that lies within a previously advertised window but is now beyond the retracted window. This means that all incoming segments (including pure ACKs) will be rejected until the application happens to read enough data to let the peer's snd_nxt be in window again (which may be never). To comply with RFC 7323, the receiver MUST honor any segment that would have been in window for any ACK sent by the receiver and, when window scaling is in effect, SHOULD track the maximum window sequence number it has advertised. This patch tracks that maximum window sequence number rcv_mwnd_seq throughout the connection and uses it in tcp_sequence() when deciding whether a segment is acceptable. rcv_mwnd_seq is updated together with rcv_wup and rcv_wnd in tcp_select_window(). If we count tcp_sequence() as fast path, it is read in the fast path. 
Therefore, rcv_mwnd_seq is put into rcv_wnd's cacheline group. The logic for handling received data in tcp_data_queue() is already sufficient and does not need to be updated. Signed-off-by: Simon Baatz Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260309-tcp_rfc7323_retract_wnd_rfc-v3-1-4c7f96b1ec69@gmail.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 3 +++ include/net/tcp.h | 22 ++++++++++++++++++++++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_fastopen.c | 1 + net/ipv4/tcp_input.c | 10 +++++----- net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 3 +++ .../net/packetdrill/tcp_rcv_big_endseq.pkt | 2 +- 9 files changed, 39 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 563daea10d6c..fecf61166a54 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -121,6 +121,7 @@ u64 delivered_mstamp read_write u32 rate_delivered read_mostly tcp_rate_gen u32 rate_interval_us read_mostly rate_delivered,rate_app_limited u32 rcv_wnd read_write read_mostly tcp_select_window,tcp_receive_window,tcp_fast_path_check +u32 rcv_mwnd_seq read_write tcp_select_window u32 write_seq read_write tcp_rate_check_app_limited,tcp_write_queue_empty,tcp_skb_entail,forced_push,tcp_mark_push u32 notsent_lowat read_mostly tcp_stream_memory_free u32 pushed_seq read_write tcp_mark_push,forced_push diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bcebc4f07532..6982f10e826b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -316,6 +316,9 @@ struct tcp_sock { */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ + u32 rcv_mwnd_seq; /* Maximum window sequence number (RFC 7323, + * section 2.4, receiver requirements) + */ u32 rcv_tstamp; /* timestamp of last 
received ACK (for keepalives) */ /* * Options received (usually on last packet, some only on SYN packets). diff --git a/include/net/tcp.h b/include/net/tcp.h index 48dffcca0a71..f87bdacb5a69 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -934,6 +934,28 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp) return (u32) win; } +/* Compute the maximum receive window we ever advertised. + * Rcv_nxt can be after the window if our peer push more data + * than the offered window. + */ +static inline u32 tcp_max_receive_window(const struct tcp_sock *tp) +{ + s32 win = tp->rcv_mwnd_seq - tp->rcv_nxt; + + if (win < 0) + win = 0; + return (u32) win; +} + +/* Check if we need to update the maximum receive window sequence number */ +static inline void tcp_update_max_rcv_wnd_seq(struct tcp_sock *tp) +{ + u32 wre = tp->rcv_wup + tp->rcv_wnd; + + if (after(wre, tp->rcv_mwnd_seq)) + tp->rcv_mwnd_seq = wre; +} + /* Choose a new window, without checks for shrinking, and without * scaling applied to the result. The caller does these things * if necessary. This is a "raw" window selection. 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ed6f6712f060..516087c622ad 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3561,6 +3561,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) tp->rcv_wnd = opt.rcv_wnd; tp->rcv_wup = opt.rcv_wup; + tp->rcv_mwnd_seq = opt.rcv_wup + opt.rcv_wnd; return 0; } @@ -5275,6 +5276,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_mwnd_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 9fdc19accafd..4e389d609f91 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -377,6 +377,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; tp->rcv_wup = tp->rcv_nxt; + tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. 
*/ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 71ac69b7b75e..2e1b23760815 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4808,20 +4808,18 @@ static enum skb_drop_reason tcp_sequence(const struct sock *sk, const struct tcphdr *th) { const struct tcp_sock *tp = tcp_sk(sk); - u32 seq_limit; if (before(end_seq, tp->rcv_wup)) return SKB_DROP_REASON_TCP_OLD_SEQUENCE; - seq_limit = tp->rcv_nxt + tcp_receive_window(tp); - if (unlikely(after(end_seq, seq_limit))) { + if (unlikely(after(end_seq, tp->rcv_nxt + tcp_max_receive_window(tp)))) { /* Some stacks are known to handle FIN incorrectly; allow the * FIN to extend beyond the window and check it in detail later. */ - if (!after(end_seq - th->fin, seq_limit)) + if (!after(end_seq - th->fin, tp->rcv_nxt + tcp_receive_window(tp))) return SKB_NOT_DROPPED_YET; - if (after(seq, seq_limit)) + if (after(seq, tp->rcv_nxt + tcp_max_receive_window(tp))) return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; /* Only accept this packet if receive queue is empty. */ @@ -6903,6 +6901,7 @@ consume: */ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. @@ -7015,6 +7014,7 @@ consume: WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dafb63b923d0..d350d794a959 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -604,6 +604,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rcv_mwnd_seq = newtp->rcv_wup + req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 34a25ef61006..35c3b0ab5a0c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -293,6 +293,7 @@ static u16 tcp_select_window(struct sock *sk) tp->pred_flags = 0; tp->rcv_wnd = 0; tp->rcv_wup = tp->rcv_nxt; + tcp_update_max_rcv_wnd_seq(tp); return 0; } @@ -316,6 +317,7 @@ static u16 tcp_select_window(struct sock *sk) tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; + tcp_update_max_rcv_wnd_seq(tp); /* Make sure we do not exceed the maximum possible * scaled window. @@ -4165,6 +4167,7 @@ static void tcp_connect_init(struct sock *sk) else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; + tp->rcv_mwnd_seq = tp->rcv_nxt + tp->rcv_wnd; WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt index 6c0f32c40f19..12882be10f2e 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt @@ -36,7 +36,7 @@ +0 read(4, ..., 100000) = 4000 -// If queue is empty, accept a packet even if its end_seq is above wup + rcv_wnd +// If queue is empty, accept a packet even if its end_seq is above rcv_mwnd_seq +0 < P. 4001:54001(50000) ack 1 win 257 * > . 
1:1(0) ack 54001 win 0 -- cgit v1.2.3 From d15d3de94a4766fb43d7fe7a72ed0479fb268131 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Mar 2026 20:18:23 +0000 Subject: net: dropreason: add SKB_DROP_REASON_RECURSION_LIMIT ip[6]tunnel_xmit() can drop packets if a too deep recursion level is detected. Add SKB_DROP_REASON_RECURSION_LIMIT drop reason. We will use this reason later in __dev_queue_xmit(). Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260312201824.203093-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 3 +++ include/net/ip6_tunnel.h | 2 +- net/ipv4/ip_tunnel_core.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 5c8c2eb3d2c5..de61dd5dbfd9 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -123,6 +123,7 @@ FN(PFMEMALLOC) \ FN(PSP_INPUT) \ FN(PSP_OUTPUT) \ + FN(RECURSION_LIMIT) \ FNe(MAX) /** @@ -582,6 +583,8 @@ enum skb_drop_reason { SKB_DROP_REASON_PSP_INPUT, /** @SKB_DROP_REASON_PSP_OUTPUT: PSP output checks failed */ SKB_DROP_REASON_PSP_OUTPUT, + /** @SKB_DROP_REASON_RECURSION_LIMIT: Dead loop on virtual device. 
*/ + SKB_DROP_REASON_RECURSION_LIMIT, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 359b595f1df9..b99805ee2fd1 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -162,7 +162,7 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, dev->name); DEV_STATS_INC(dev, tx_errors); } - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_RECURSION_LIMIT); return; } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 5683c328990f..f430d6f0463e 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -65,7 +65,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, DEV_STATS_INC(dev, tx_errors); } ip_rt_put(rt); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_RECURSION_LIMIT); return; } -- cgit v1.2.3 From f807b5b9b89eb9220d034115c272c312251cbcac Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 12 Mar 2026 12:13:52 +0000 Subject: net: stmmac: avoid passing pci_dev The pci_dev is only used to provide the ethtool bus_info using pci_name(priv->plat->pdev). This is the same as dev_name(priv->device). Thus, rather than passing the pci_dev, make use of what we already have. To avoid unexpectedly exposing the device name through ethtool where it wasn't provided before, add a flag priv->plat->provide_bus_info to enable this, which only dwmac-intel needs to set. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Simon Horman Link: https://patch.msgid.link/E1w0evI-0000000CzY7-1fyo@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 5 ++--- include/linux/stmmac.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 421c6c81ca5e..f621077c30a4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -589,7 +589,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, int ret; int i; - plat->pdev = pdev; + plat->provide_bus_info = true; plat->phy_addr = -1; plat->clk_csr = STMMAC_CSR_250_300M; plat->core_type = DWMAC_CORE_GMAC4; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index c1e26965d9b5..92585d27ab88 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -312,10 +312,9 @@ static void stmmac_ethtool_getdrvinfo(struct net_device *dev, strscpy(info->driver, MAC100_ETHTOOL_NAME, sizeof(info->driver)); - if (priv->plat->pdev) { - strscpy(info->bus_info, pci_name(priv->plat->pdev), + if (priv->plat->provide_bus_info) + strscpy(info->bus_info, dev_name(priv->device), sizeof(info->bus_info)); - } } static int stmmac_ethtool_get_link_ksettings(struct net_device *dev, diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 937985276e6b..72febd246bdb 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -348,7 +348,7 @@ struct plat_stmmacenet_data { int rss_en; int mac_port_sel_speed; u8 vlan_fail_q; - struct pci_dev *pdev; + bool provide_bus_info; int int_snapshot_num; int msi_mac_vec; int msi_wol_vec; -- cgit v1.2.3 From 
6df1459605cedd2112ebf660c77f42bb87d5c306 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 9 Mar 2026 18:03:31 +0100 Subject: net: phy: make mdio_device.c part of libphy This patch - makes mdio_device.c part of libphy - makes mdio_device_(un)register_reset() static - moves mdiobus_(un)register_device() from mdio_bus.c to mdio_device.c, stops exporting both functions and makes them private to phylib This further decouples the MDIO consumer functionality from libphy. Note: This makes MDIO driver registration part of phylib, therefore adjust Kconfig dependencies where needed. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/c6dbf9b3-3ca0-434b-ad3a-71fe602ab809@gmail.com Signed-off-by: Jakub Kicinski --- drivers/clk/qcom/Kconfig | 2 +- drivers/net/phy/Makefile | 6 +++--- drivers/net/phy/mdio-private.h | 11 ----------- drivers/net/phy/mdio_bus.c | 36 ------------------------------------ drivers/net/phy/mdio_device.c | 39 ++++++++++++++++++++++++++++++++++++--- drivers/net/phy/phylib-internal.h | 4 ++++ drivers/phy/broadcom/Kconfig | 4 ++-- include/linux/mdio.h | 2 -- 8 files changed, 46 insertions(+), 58 deletions(-) delete mode 100644 drivers/net/phy/mdio-private.h (limited to 'include') diff --git a/drivers/clk/qcom/Kconfig b/drivers/clk/qcom/Kconfig index a8a86ea6bb74..a277c434d641 100644 --- a/drivers/clk/qcom/Kconfig +++ b/drivers/clk/qcom/Kconfig @@ -392,7 +392,7 @@ config IPQ_NSSCC_9574 config IPQ_NSSCC_QCA8K tristate "QCA8K(QCA8386 or QCA8084) NSS Clock Controller" - depends on MDIO_BUS + depends on PHYLIB help Support for NSS(Network SubSystem) clock controller on qca8386/qca8084 chip. 
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 3a34917adea7..8d262b4e2be2 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -3,8 +3,8 @@ libphy-y := phy.o phy-c45.o phy-core.o phy_device.o \ linkmode.o phy_link_topology.o \ - phy_caps.o mdio_bus_provider.o phy_port.o -mdio-bus-y += mdio_bus.o mdio_device.o + phy_caps.o mdio_bus_provider.o phy_port.o \ + mdio_device.o ifdef CONFIG_PHYLIB # built-in whenever PHYLIB is built-in or module @@ -15,7 +15,7 @@ libphy-$(CONFIG_SWPHY) += swphy.o libphy-$(CONFIG_LED_TRIGGER_PHY) += phy_led_triggers.o libphy-$(CONFIG_OPEN_ALLIANCE_HELPERS) += open_alliance_helpers.o -obj-$(CONFIG_MDIO_BUS) += mdio-bus.o +obj-$(CONFIG_MDIO_BUS) += mdio_bus.o obj-$(CONFIG_PHYLINK) += phylink.o obj-$(CONFIG_PHYLIB) += libphy.o obj-$(CONFIG_PHYLIB) += mdio_devres.o diff --git a/drivers/net/phy/mdio-private.h b/drivers/net/phy/mdio-private.h deleted file mode 100644 index 8bc6d9088af1..000000000000 --- a/drivers/net/phy/mdio-private.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef __MDIO_PRIVATE_H -#define __MDIO_PRIVATE_H - -/* MDIO internal helpers - */ - -int mdio_device_register_reset(struct mdio_device *mdiodev); -void mdio_device_unregister_reset(struct mdio_device *mdiodev); - -#endif /* __MDIO_PRIVATE_H */ diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 48c0447e6a8f..a30c679feeca 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -29,46 +29,10 @@ #include #include #include -#include "mdio-private.h" #define CREATE_TRACE_POINTS #include -int mdiobus_register_device(struct mdio_device *mdiodev) -{ - int err; - - if (mdiodev->bus->mdio_map[mdiodev->addr]) - return -EBUSY; - - if (mdiodev->flags & MDIO_DEVICE_FLAG_PHY) { - err = mdio_device_register_reset(mdiodev); - if (err) - return err; - - /* Assert the reset signal */ - mdio_device_reset(mdiodev, 1); - } - - mdiodev->bus->mdio_map[mdiodev->addr] = 
mdiodev; - - return 0; -} -EXPORT_SYMBOL(mdiobus_register_device); - -int mdiobus_unregister_device(struct mdio_device *mdiodev) -{ - if (mdiodev->bus->mdio_map[mdiodev->addr] != mdiodev) - return -EINVAL; - - mdio_device_unregister_reset(mdiodev); - - mdiodev->bus->mdio_map[mdiodev->addr] = NULL; - - return 0; -} -EXPORT_SYMBOL(mdiobus_unregister_device); - static struct mdio_device *mdiobus_find_device(struct mii_bus *bus, int addr) { bool addr_valid = addr >= 0 && addr < ARRAY_SIZE(bus->mdio_map); diff --git a/drivers/net/phy/mdio_device.c b/drivers/net/phy/mdio_device.c index da4fb7484c7c..56080d3d2d25 100644 --- a/drivers/net/phy/mdio_device.c +++ b/drivers/net/phy/mdio_device.c @@ -22,7 +22,7 @@ #include #include #include -#include "mdio-private.h" +#include "phylib-internal.h" /** * mdio_device_register_reset - Read and initialize the reset properties of @@ -31,7 +31,7 @@ * * Return: Zero if successful, negative error code on failure */ -int mdio_device_register_reset(struct mdio_device *mdiodev) +static int mdio_device_register_reset(struct mdio_device *mdiodev) { struct reset_control *reset; @@ -67,7 +67,7 @@ int mdio_device_register_reset(struct mdio_device *mdiodev) * an mdio device * @mdiodev: mdio_device structure */ -void mdio_device_unregister_reset(struct mdio_device *mdiodev) +static void mdio_device_unregister_reset(struct mdio_device *mdiodev) { gpiod_put(mdiodev->reset_gpio); mdiodev->reset_gpio = NULL; @@ -189,6 +189,39 @@ void mdio_device_remove(struct mdio_device *mdiodev) } EXPORT_SYMBOL(mdio_device_remove); +int mdiobus_register_device(struct mdio_device *mdiodev) +{ + int err; + + if (mdiodev->bus->mdio_map[mdiodev->addr]) + return -EBUSY; + + if (mdiodev->flags & MDIO_DEVICE_FLAG_PHY) { + err = mdio_device_register_reset(mdiodev); + if (err) + return err; + + /* Assert the reset signal */ + mdio_device_reset(mdiodev, 1); + } + + mdiodev->bus->mdio_map[mdiodev->addr] = mdiodev; + + return 0; +} + +int mdiobus_unregister_device(struct 
mdio_device *mdiodev) +{ + if (mdiodev->bus->mdio_map[mdiodev->addr] != mdiodev) + return -EINVAL; + + mdio_device_unregister_reset(mdiodev); + + mdiodev->bus->mdio_map[mdiodev->addr] = NULL; + + return 0; +} + /** * mdio_probe - probe an MDIO device * @dev: device to probe diff --git a/drivers/net/phy/phylib-internal.h b/drivers/net/phy/phylib-internal.h index dc9592c6bb8e..bfb1aa823868 100644 --- a/drivers/net/phy/phylib-internal.h +++ b/drivers/net/phy/phylib-internal.h @@ -6,6 +6,7 @@ #ifndef __PHYLIB_INTERNAL_H #define __PHYLIB_INTERNAL_H +struct mdio_device; struct phy_device; /* @@ -20,6 +21,9 @@ void of_set_phy_timing_role(struct phy_device *phydev); int phy_speed_down_core(struct phy_device *phydev); void phy_check_downshift(struct phy_device *phydev); +int mdiobus_register_device(struct mdio_device *mdiodev); +int mdiobus_unregister_device(struct mdio_device *mdiodev); + int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv); #endif /* __PHYLIB_INTERNAL_H */ diff --git a/drivers/phy/broadcom/Kconfig b/drivers/phy/broadcom/Kconfig index 1d89a2fd9b79..46371a8940d7 100644 --- a/drivers/phy/broadcom/Kconfig +++ b/drivers/phy/broadcom/Kconfig @@ -52,7 +52,7 @@ config PHY_BCM_NS_USB3 tristate "Broadcom Northstar USB 3.0 PHY Driver" depends on ARCH_BCM_IPROC || COMPILE_TEST depends on HAS_IOMEM && OF - depends on MDIO_BUS + depends on PHYLIB select GENERIC_PHY help Enable this to support Broadcom USB 3.0 PHY connected to the USB @@ -60,7 +60,7 @@ config PHY_BCM_NS_USB3 config PHY_NS2_PCIE tristate "Broadcom Northstar2 PCIe PHY driver" - depends on (OF && MDIO_BUS_MUX_BCM_IPROC) || (COMPILE_TEST && MDIO_BUS) + depends on (OF && MDIO_BUS_MUX_BCM_IPROC) || (COMPILE_TEST && PHYLIB) select GENERIC_PHY default ARCH_BCM_IPROC help diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 5d1203b9af20..f4f9d9609448 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -688,8 +688,6 @@ static inline int mdiodev_c45_write(struct 
mdio_device *mdiodev, u32 devad, val); } -int mdiobus_register_device(struct mdio_device *mdiodev); -int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); -- cgit v1.2.3 From c4399af5e55658e832779b256d8458323011f983 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 9 Mar 2026 18:06:00 +0100 Subject: net: phy: move remaining provider code to mdio_bus_provider.c This moves definition of mdio_bus class and bus_type to the provider side, what allows to make them private to libphy. As a prerequisite MDIO statistics handling is moved to the provider side as well. Note: This patch causes a checkpatch error "Macros with complex values should be enclosed in parentheses" for MDIO_BUS_STATS_ADDR_ATTR_GROUP. I consider this a false positive here, in addition the patch just moves existing code. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/47b85676-b349-4aa0-a5ef-cd37769a4c69@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mdio_bus.c | 282 ------------------------------------ drivers/net/phy/mdio_bus_provider.c | 275 +++++++++++++++++++++++++++++++++++ drivers/net/phy/phylib-internal.h | 3 + include/linux/phy.h | 3 - 4 files changed, 278 insertions(+), 285 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 9fb473326027..00d0e4159e9b 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -10,20 +10,14 @@ #include #include -#include #include -#include #include #include #include #include #include #include -#include -#include -#include #include -#include #include #include #include @@ -64,218 +58,6 @@ bool mdiobus_is_registered_device(struct mii_bus *bus, int addr) } EXPORT_SYMBOL(mdiobus_is_registered_device); -/** - * mdiobus_release - mii_bus device release callback - * @d: the target struct device that contains the mii_bus - * 
- * Description: called when the last reference to an mii_bus is - * dropped, to free the underlying memory. - */ -static void mdiobus_release(struct device *d) -{ - struct mii_bus *bus = to_mii_bus(d); - - WARN(bus->state != MDIOBUS_RELEASED && - /* for compatibility with error handling in drivers */ - bus->state != MDIOBUS_ALLOCATED, - "%s: not in RELEASED or ALLOCATED state\n", - bus->id); - - if (bus->state == MDIOBUS_RELEASED) - fwnode_handle_put(dev_fwnode(d)); - - kfree(bus); -} - -struct mdio_bus_stat_attr { - struct device_attribute attr; - int address; - unsigned int field_offset; -}; - -static struct mdio_bus_stat_attr *to_sattr(struct device_attribute *attr) -{ - return container_of(attr, struct mdio_bus_stat_attr, attr); -} - -static u64 mdio_bus_get_stat(struct mdio_bus_stats *s, unsigned int offset) -{ - const u64_stats_t *stats = (const void *)s + offset; - unsigned int start; - u64 val = 0; - - do { - start = u64_stats_fetch_begin(&s->syncp); - val = u64_stats_read(stats); - } while (u64_stats_fetch_retry(&s->syncp, start)); - - return val; -} - -static ssize_t mdio_bus_stat_field_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct mdio_bus_stat_attr *sattr = to_sattr(attr); - struct mii_bus *bus = to_mii_bus(dev); - u64 val = 0; - - if (sattr->address < 0) { - /* get global stats */ - for (int i = 0; i < PHY_MAX_ADDR; i++) - val += mdio_bus_get_stat(&bus->stats[i], - sattr->field_offset); - } else { - val = mdio_bus_get_stat(&bus->stats[sattr->address], - sattr->field_offset); - } - - return sysfs_emit(buf, "%llu\n", val); -} - -static ssize_t mdio_bus_device_stat_field_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct mdio_bus_stat_attr *sattr = to_sattr(attr); - struct mdio_device *mdiodev = to_mdio_device(dev); - struct mii_bus *bus = mdiodev->bus; - int addr = mdiodev->addr; - u64 val; - - val = mdio_bus_get_stat(&bus->stats[addr], sattr->field_offset); - - return sysfs_emit(buf, 
"%llu\n", val); -} - -#define MDIO_BUS_STATS_ATTR(field) \ -static const struct mdio_bus_stat_attr dev_attr_mdio_bus_##field = { \ - .attr = __ATTR(field, 0444, mdio_bus_stat_field_show, NULL), \ - .address = -1, \ - .field_offset = offsetof(struct mdio_bus_stats, field), \ -}; \ -static const struct mdio_bus_stat_attr dev_attr_mdio_bus_device_##field = { \ - .attr = __ATTR(field, 0444, mdio_bus_device_stat_field_show, NULL), \ - .field_offset = offsetof(struct mdio_bus_stats, field), \ -} - -MDIO_BUS_STATS_ATTR(transfers); -MDIO_BUS_STATS_ATTR(errors); -MDIO_BUS_STATS_ATTR(writes); -MDIO_BUS_STATS_ATTR(reads); - -#define MDIO_BUS_STATS_ADDR_ATTR_DECL(field, addr, file) \ -static const struct mdio_bus_stat_attr \ -dev_attr_mdio_bus_addr_##field##_##addr = { \ - .attr = { .attr = { .name = file, .mode = 0444 }, \ - .show = mdio_bus_stat_field_show, \ - }, \ - .address = addr, \ - .field_offset = offsetof(struct mdio_bus_stats, field), \ -} - -#define MDIO_BUS_STATS_ADDR_ATTR(field, addr) \ - MDIO_BUS_STATS_ADDR_ATTR_DECL(field, addr, \ - __stringify(field) "_" __stringify(addr)) - -#define MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(addr) \ - MDIO_BUS_STATS_ADDR_ATTR(transfers, addr); \ - MDIO_BUS_STATS_ADDR_ATTR(errors, addr); \ - MDIO_BUS_STATS_ADDR_ATTR(writes, addr); \ - MDIO_BUS_STATS_ADDR_ATTR(reads, addr) \ - -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(0); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(1); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(2); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(3); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(4); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(5); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(6); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(7); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(8); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(9); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(10); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(11); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(12); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(13); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(14); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(15); 
-MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(16); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(17); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(18); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(19); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(20); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(21); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(22); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(23); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(24); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(25); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(26); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(27); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(28); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(29); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(30); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(31); - -#define MDIO_BUS_STATS_ADDR_ATTR_GROUP(addr) \ - &dev_attr_mdio_bus_addr_transfers_##addr.attr.attr, \ - &dev_attr_mdio_bus_addr_errors_##addr.attr.attr, \ - &dev_attr_mdio_bus_addr_writes_##addr.attr.attr, \ - &dev_attr_mdio_bus_addr_reads_##addr.attr.attr \ - -static const struct attribute *const mdio_bus_statistics_attrs[] = { - &dev_attr_mdio_bus_transfers.attr.attr, - &dev_attr_mdio_bus_errors.attr.attr, - &dev_attr_mdio_bus_writes.attr.attr, - &dev_attr_mdio_bus_reads.attr.attr, - MDIO_BUS_STATS_ADDR_ATTR_GROUP(0), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(1), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(2), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(3), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(4), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(5), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(6), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(7), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(8), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(9), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(10), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(11), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(12), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(13), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(14), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(15), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(16), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(17), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(18), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(19), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(20), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(21), - 
MDIO_BUS_STATS_ADDR_ATTR_GROUP(22), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(23), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(24), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(25), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(26), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(27), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(28), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(29), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(30), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(31), - NULL, -}; - -static const struct attribute_group mdio_bus_statistics_group = { - .name = "statistics", - .attrs_const = mdio_bus_statistics_attrs, -}; -__ATTRIBUTE_GROUPS(mdio_bus_statistics); - -const struct class mdio_bus_class = { - .name = "mdio_bus", - .dev_release = mdiobus_release, - .dev_groups = mdio_bus_statistics_groups, -}; -EXPORT_SYMBOL_GPL(mdio_bus_class); - static void mdiobus_stats_acct(struct mdio_bus_stats *stats, bool op, int ret) { preempt_disable(); @@ -841,69 +623,5 @@ int mdiobus_c45_modify_changed(struct mii_bus *bus, int addr, int devad, } EXPORT_SYMBOL_GPL(mdiobus_c45_modify_changed); -/** - * mdio_bus_match - determine if given MDIO driver supports the given - * MDIO device - * @dev: target MDIO device - * @drv: given MDIO driver - * - * Return: 1 if the driver supports the device, 0 otherwise - * - * Description: This may require calling the devices own match function, - * since different classes of MDIO devices have different match criteria. 
- */ -static int mdio_bus_match(struct device *dev, const struct device_driver *drv) -{ - const struct mdio_driver *mdiodrv = to_mdio_driver(drv); - struct mdio_device *mdio = to_mdio_device(dev); - - /* Both the driver and device must type-match */ - if (!(mdiodrv->mdiodrv.flags & MDIO_DEVICE_IS_PHY) != - !(mdio->flags & MDIO_DEVICE_FLAG_PHY)) - return 0; - - if (of_driver_match_device(dev, drv)) - return 1; - - if (mdio->bus_match) - return mdio->bus_match(dev, drv); - - return 0; -} - -static int mdio_uevent(const struct device *dev, struct kobj_uevent_env *env) -{ - int rc; - - /* Some devices have extra OF data and an OF-style MODALIAS */ - rc = of_device_uevent_modalias(dev, env); - if (rc != -ENODEV) - return rc; - - return 0; -} - -static const struct attribute *const mdio_bus_device_statistics_attrs[] = { - &dev_attr_mdio_bus_device_transfers.attr.attr, - &dev_attr_mdio_bus_device_errors.attr.attr, - &dev_attr_mdio_bus_device_writes.attr.attr, - &dev_attr_mdio_bus_device_reads.attr.attr, - NULL, -}; - -static const struct attribute_group mdio_bus_device_statistics_group = { - .name = "statistics", - .attrs_const = mdio_bus_device_statistics_attrs, -}; -__ATTRIBUTE_GROUPS(mdio_bus_device_statistics); - -const struct bus_type mdio_bus_type = { - .name = "mdio_bus", - .dev_groups = mdio_bus_device_statistics_groups, - .match = mdio_bus_match, - .uevent = mdio_uevent, -}; -EXPORT_SYMBOL(mdio_bus_type); - MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MDIO bus/device layer"); diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c index d50fe6eb4b02..041576eba47a 100644 --- a/drivers/net/phy/mdio_bus_provider.c +++ b/drivers/net/phy/mdio_bus_provider.c @@ -28,6 +28,281 @@ #include #include #include +#include "phylib-internal.h" + +/** + * mdiobus_release - mii_bus device release callback + * @d: the target struct device that contains the mii_bus + * + * Description: called when the last reference to an mii_bus is + * dropped, to free 
the underlying memory. + */ +static void mdiobus_release(struct device *d) +{ + struct mii_bus *bus = to_mii_bus(d); + + WARN(bus->state != MDIOBUS_RELEASED && + /* for compatibility with error handling in drivers */ + bus->state != MDIOBUS_ALLOCATED, + "%s: not in RELEASED or ALLOCATED state\n", + bus->id); + + if (bus->state == MDIOBUS_RELEASED) + fwnode_handle_put(dev_fwnode(d)); + + kfree(bus); +} + +struct mdio_bus_stat_attr { + struct device_attribute attr; + int address; + unsigned int field_offset; +}; + +static struct mdio_bus_stat_attr *to_sattr(struct device_attribute *attr) +{ + return container_of(attr, struct mdio_bus_stat_attr, attr); +} + +static u64 mdio_bus_get_stat(struct mdio_bus_stats *s, unsigned int offset) +{ + const u64_stats_t *stats = (const void *)s + offset; + unsigned int start; + u64 val = 0; + + do { + start = u64_stats_fetch_begin(&s->syncp); + val = u64_stats_read(stats); + } while (u64_stats_fetch_retry(&s->syncp, start)); + + return val; +} + +static ssize_t mdio_bus_stat_field_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mdio_bus_stat_attr *sattr = to_sattr(attr); + struct mii_bus *bus = to_mii_bus(dev); + u64 val = 0; + + if (sattr->address < 0) { + /* get global stats */ + for (int i = 0; i < PHY_MAX_ADDR; i++) + val += mdio_bus_get_stat(&bus->stats[i], + sattr->field_offset); + } else { + val = mdio_bus_get_stat(&bus->stats[sattr->address], + sattr->field_offset); + } + + return sysfs_emit(buf, "%llu\n", val); +} + +static ssize_t mdio_bus_device_stat_field_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mdio_bus_stat_attr *sattr = to_sattr(attr); + struct mdio_device *mdiodev = to_mdio_device(dev); + struct mii_bus *bus = mdiodev->bus; + int addr = mdiodev->addr; + u64 val; + + val = mdio_bus_get_stat(&bus->stats[addr], sattr->field_offset); + + return sysfs_emit(buf, "%llu\n", val); +} + +#define MDIO_BUS_STATS_ATTR(field) \ +static const struct 
mdio_bus_stat_attr dev_attr_mdio_bus_##field = { \ + .attr = __ATTR(field, 0444, mdio_bus_stat_field_show, NULL), \ + .address = -1, \ + .field_offset = offsetof(struct mdio_bus_stats, field), \ +}; \ +static const struct mdio_bus_stat_attr dev_attr_mdio_bus_device_##field = { \ + .attr = __ATTR(field, 0444, mdio_bus_device_stat_field_show, NULL), \ + .field_offset = offsetof(struct mdio_bus_stats, field), \ +} + +MDIO_BUS_STATS_ATTR(transfers); +MDIO_BUS_STATS_ATTR(errors); +MDIO_BUS_STATS_ATTR(writes); +MDIO_BUS_STATS_ATTR(reads); + +#define MDIO_BUS_STATS_ADDR_ATTR_DECL(field, addr, file) \ +static const struct mdio_bus_stat_attr \ +dev_attr_mdio_bus_addr_##field##_##addr = { \ + .attr = { .attr = { .name = file, .mode = 0444 }, \ + .show = mdio_bus_stat_field_show, \ + }, \ + .address = addr, \ + .field_offset = offsetof(struct mdio_bus_stats, field), \ +} + +#define MDIO_BUS_STATS_ADDR_ATTR(field, addr) \ + MDIO_BUS_STATS_ADDR_ATTR_DECL(field, addr, \ + __stringify(field) "_" __stringify(addr)) + +#define MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(addr) \ + MDIO_BUS_STATS_ADDR_ATTR(transfers, addr); \ + MDIO_BUS_STATS_ADDR_ATTR(errors, addr); \ + MDIO_BUS_STATS_ADDR_ATTR(writes, addr); \ + MDIO_BUS_STATS_ADDR_ATTR(reads, addr) \ + +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(0); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(1); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(2); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(3); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(4); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(5); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(6); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(7); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(8); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(9); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(10); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(11); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(12); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(13); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(14); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(15); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(16); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(17); 
+MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(18); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(19); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(20); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(21); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(22); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(23); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(24); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(25); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(26); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(27); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(28); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(29); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(30); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(31); + +#define MDIO_BUS_STATS_ADDR_ATTR_GROUP(addr) \ + &(dev_attr_mdio_bus_addr_transfers_##addr).attr.attr, \ + &(dev_attr_mdio_bus_addr_errors_##addr).attr.attr, \ + &(dev_attr_mdio_bus_addr_writes_##addr).attr.attr, \ + &(dev_attr_mdio_bus_addr_reads_##addr).attr.attr \ + +static const struct attribute *const mdio_bus_statistics_attrs[] = { + &dev_attr_mdio_bus_transfers.attr.attr, + &dev_attr_mdio_bus_errors.attr.attr, + &dev_attr_mdio_bus_writes.attr.attr, + &dev_attr_mdio_bus_reads.attr.attr, + MDIO_BUS_STATS_ADDR_ATTR_GROUP(0), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(1), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(2), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(3), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(4), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(5), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(6), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(7), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(8), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(9), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(10), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(11), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(12), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(13), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(14), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(15), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(16), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(17), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(18), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(19), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(20), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(21), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(22), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(23), + 
MDIO_BUS_STATS_ADDR_ATTR_GROUP(24), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(25), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(26), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(27), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(28), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(29), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(30), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(31), + NULL, +}; + +static const struct attribute_group mdio_bus_statistics_group = { + .name = "statistics", + .attrs_const = mdio_bus_statistics_attrs, +}; +__ATTRIBUTE_GROUPS(mdio_bus_statistics); + +const struct class mdio_bus_class = { + .name = "mdio_bus", + .dev_release = mdiobus_release, + .dev_groups = mdio_bus_statistics_groups, +}; + +/** + * mdio_bus_match - determine if given MDIO driver supports the given + * MDIO device + * @dev: target MDIO device + * @drv: given MDIO driver + * + * Return: 1 if the driver supports the device, 0 otherwise + * + * Description: This may require calling the devices own match function, + * since different classes of MDIO devices have different match criteria. 
+ */ +static int mdio_bus_match(struct device *dev, const struct device_driver *drv) +{ + const struct mdio_driver *mdiodrv = to_mdio_driver(drv); + struct mdio_device *mdio = to_mdio_device(dev); + + /* Both the driver and device must type-match */ + if (!(mdiodrv->mdiodrv.flags & MDIO_DEVICE_IS_PHY) != + !(mdio->flags & MDIO_DEVICE_FLAG_PHY)) + return 0; + + if (of_driver_match_device(dev, drv)) + return 1; + + if (mdio->bus_match) + return mdio->bus_match(dev, drv); + + return 0; +} + +static int mdio_uevent(const struct device *dev, struct kobj_uevent_env *env) +{ + int rc; + + /* Some devices have extra OF data and an OF-style MODALIAS */ + rc = of_device_uevent_modalias(dev, env); + if (rc != -ENODEV) + return rc; + + return 0; +} + +static const struct attribute *const mdio_bus_device_statistics_attrs[] = { + &dev_attr_mdio_bus_device_transfers.attr.attr, + &dev_attr_mdio_bus_device_errors.attr.attr, + &dev_attr_mdio_bus_device_writes.attr.attr, + &dev_attr_mdio_bus_device_reads.attr.attr, + NULL, +}; + +static const struct attribute_group mdio_bus_device_statistics_group = { + .name = "statistics", + .attrs_const = mdio_bus_device_statistics_attrs, +}; +__ATTRIBUTE_GROUPS(mdio_bus_device_statistics); + +const struct bus_type mdio_bus_type = { + .name = "mdio_bus", + .dev_groups = mdio_bus_device_statistics_groups, + .match = mdio_bus_match, + .uevent = mdio_uevent, +}; /** * mdiobus_alloc_size - allocate a mii_bus structure diff --git a/drivers/net/phy/phylib-internal.h b/drivers/net/phy/phylib-internal.h index bfb1aa823868..664ed7faa518 100644 --- a/drivers/net/phy/phylib-internal.h +++ b/drivers/net/phy/phylib-internal.h @@ -9,6 +9,9 @@ struct mdio_device; struct phy_device; +extern const struct bus_type mdio_bus_type; +extern const struct class mdio_bus_class; + /* * phy_supported_speeds - return all speeds currently supported by a PHY device */ diff --git a/include/linux/phy.h b/include/linux/phy.h index e9b0d7427b0e..5de4b172cd0b 100644 --- 
a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2446,9 +2446,6 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct phy_port *phy_get_sfp_port(struct phy_device *phydev); -extern const struct bus_type mdio_bus_type; -extern const struct class mdio_bus_class; - /** * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register -- cgit v1.2.3 From 68deca0f0f4bba8c5278340c7c142500171d5f9b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 12 Mar 2026 11:03:55 +0100 Subject: devlink: expose devlink instance index over netlink Each devlink instance has an internally assigned index used for xarray storage. Expose it as a new DEVLINK_ATTR_INDEX uint attribute alongside the existing bus_name and dev_name handle. Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20260312100407.551173-2-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 5 +++++ include/uapi/linux/devlink.h | 2 ++ net/devlink/devl_internal.h | 2 ++ net/devlink/port.c | 1 + 4 files changed, 10 insertions(+) (limited to 'include') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index 837112da6738..1bed67a0eefb 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -867,6 +867,10 @@ attribute-sets: type: flag doc: Request restoring parameter to its default value. value: 183 + - + name: index + type: uint + doc: Unique devlink instance index. 
- name: dl-dev-stats subset-of: devlink @@ -1311,6 +1315,7 @@ operations: attributes: - bus-name - dev-name + - index - reload-failed - dev-stats dump: diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index e7d6b6d13470..1ba3436db4ae 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -642,6 +642,8 @@ enum devlink_attr { DEVLINK_ATTR_PARAM_VALUE_DEFAULT, /* dynamic */ DEVLINK_ATTR_PARAM_RESET_DEFAULT, /* flag */ + DEVLINK_ATTR_INDEX, /* uint */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 1377864383bc..31fa98af418e 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -178,6 +178,8 @@ devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return -EMSGSIZE; if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) return -EMSGSIZE; + if (nla_put_uint(msg, DEVLINK_ATTR_INDEX, devlink->index)) + return -EMSGSIZE; return 0; } diff --git a/net/devlink/port.c b/net/devlink/port.c index 93d8a25bb920..1ff609571ea4 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -222,6 +222,7 @@ size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + + nla_total_size(8) /* DEVLINK_ATTR_INDEX */ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ } -- cgit v1.2.3 From 0f5531879afbf904f19a15b39f687a9ec47a82cc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 12 Mar 2026 11:03:56 +0100 Subject: devlink: add helpers to get bus_name/dev_name Introduce devlink_bus_name() and devlink_dev_name() helpers and convert all direct accesses to devlink->dev->bus->name and dev_name(devlink->dev) to use them. 
This prepares for dev-less devlink instances where these helpers will be extended to handle the missing device. Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20260312100407.551173-3-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 2 ++ include/trace/events/devlink.h | 24 ++++++++++++------------ net/devlink/core.c | 12 ++++++++++++ net/devlink/devl_internal.h | 8 ++++---- net/devlink/netlink.c | 4 ++-- net/devlink/port.c | 4 ++-- 6 files changed, 34 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index cb839e0435a1..0afb0958b910 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1611,6 +1611,8 @@ struct devlink_ops { void *devlink_priv(struct devlink *devlink); struct devlink *priv_to_devlink(void *priv); struct device *devlink_to_dev(const struct devlink *devlink); +const char *devlink_bus_name(const struct devlink *devlink); +const char *devlink_dev_name(const struct devlink *devlink); /* Devlink instance explicit locking */ void devl_lock(struct devlink *devlink); diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h index f241e204fe6b..32304ce9ad15 100644 --- a/include/trace/events/devlink.h +++ b/include/trace/events/devlink.h @@ -21,8 +21,8 @@ TRACE_EVENT(devlink_hwmsg, TP_ARGS(devlink, incoming, type, buf, len), TP_STRUCT__entry( - __string(bus_name, devlink_to_dev(devlink)->bus->name) - __string(dev_name, dev_name(devlink_to_dev(devlink))) + __string(bus_name, devlink_bus_name(devlink)) + __string(dev_name, devlink_dev_name(devlink)) __string(driver_name, devlink_to_dev(devlink)->driver->name) __field(bool, incoming) __field(unsigned long, type) @@ -55,8 +55,8 @@ TRACE_EVENT(devlink_hwerr, TP_ARGS(devlink, err, msg), TP_STRUCT__entry( - __string(bus_name, devlink_to_dev(devlink)->bus->name) - __string(dev_name, dev_name(devlink_to_dev(devlink))) + __string(bus_name, devlink_bus_name(devlink)) + __string(dev_name, 
devlink_dev_name(devlink)) __string(driver_name, devlink_to_dev(devlink)->driver->name) __field(int, err) __string(msg, msg) @@ -85,8 +85,8 @@ TRACE_EVENT(devlink_health_report, TP_ARGS(devlink, reporter_name, msg), TP_STRUCT__entry( - __string(bus_name, devlink_to_dev(devlink)->bus->name) - __string(dev_name, dev_name(devlink_to_dev(devlink))) + __string(bus_name, devlink_bus_name(devlink)) + __string(dev_name, devlink_dev_name(devlink)) __string(driver_name, devlink_to_dev(devlink)->driver->name) __string(reporter_name, reporter_name) __string(msg, msg) @@ -116,8 +116,8 @@ TRACE_EVENT(devlink_health_recover_aborted, TP_ARGS(devlink, reporter_name, health_state, time_since_last_recover), TP_STRUCT__entry( - __string(bus_name, devlink_to_dev(devlink)->bus->name) - __string(dev_name, dev_name(devlink_to_dev(devlink))) + __string(bus_name, devlink_bus_name(devlink)) + __string(dev_name, devlink_dev_name(devlink)) __string(driver_name, devlink_to_dev(devlink)->driver->name) __string(reporter_name, reporter_name) __field(bool, health_state) @@ -150,8 +150,8 @@ TRACE_EVENT(devlink_health_reporter_state_update, TP_ARGS(devlink, reporter_name, new_state), TP_STRUCT__entry( - __string(bus_name, devlink_to_dev(devlink)->bus->name) - __string(dev_name, dev_name(devlink_to_dev(devlink))) + __string(bus_name, devlink_bus_name(devlink)) + __string(dev_name, devlink_dev_name(devlink)) __string(driver_name, devlink_to_dev(devlink)->driver->name) __string(reporter_name, reporter_name) __field(u8, new_state) @@ -181,8 +181,8 @@ TRACE_EVENT(devlink_trap_report, TP_ARGS(devlink, skb, metadata), TP_STRUCT__entry( - __string(bus_name, devlink_to_dev(devlink)->bus->name) - __string(dev_name, dev_name(devlink_to_dev(devlink))) + __string(bus_name, devlink_bus_name(devlink)) + __string(dev_name, devlink_dev_name(devlink)) __string(driver_name, devlink_to_dev(devlink)->driver->name) __string(trap_name, metadata->trap_name) __string(trap_group_name, metadata->trap_group_name) diff --git 
a/net/devlink/core.c b/net/devlink/core.c index d8e509a669bf..63709c132a7c 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -248,6 +248,18 @@ struct device *devlink_to_dev(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_to_dev); +const char *devlink_bus_name(const struct devlink *devlink) +{ + return devlink->dev->bus->name; +} +EXPORT_SYMBOL_GPL(devlink_bus_name); + +const char *devlink_dev_name(const struct devlink *devlink) +{ + return dev_name(devlink->dev); +} +EXPORT_SYMBOL_GPL(devlink_dev_name); + struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 31fa98af418e..1b770de0313e 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -174,9 +174,9 @@ devlink_dump_state(struct netlink_callback *cb) static inline int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) { - if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) + if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink_bus_name(devlink))) return -EMSGSIZE; - if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) + if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, devlink_dev_name(devlink))) return -EMSGSIZE; if (nla_put_uint(msg, DEVLINK_ATTR_INDEX, devlink->index)) return -EMSGSIZE; @@ -211,8 +211,8 @@ static inline void devlink_nl_obj_desc_init(struct devlink_obj_desc *desc, struct devlink *devlink) { memset(desc, 0, sizeof(*desc)); - desc->bus_name = devlink->dev->bus->name; - desc->dev_name = dev_name(devlink->dev); + desc->bus_name = devlink_bus_name(devlink); + desc->dev_name = devlink_dev_name(devlink); } static inline void devlink_nl_obj_desc_port_set(struct devlink_obj_desc *desc, diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 593605c1b1ef..56817b85a3f9 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -193,8 +193,8 @@ devlink_get_from_attrs_lock(struct 
net *net, struct nlattr **attrs, devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); devlinks_xa_for_each_registered_get(net, index, devlink) { - if (strcmp(devlink->dev->bus->name, busname) == 0 && - strcmp(dev_name(devlink->dev), devname) == 0) { + if (strcmp(devlink_bus_name(devlink), busname) == 0 && + strcmp(devlink_dev_name(devlink), devname) == 0) { devl_dev_lock(devlink, dev_lock); if (devl_is_registered(devlink)) return devlink; diff --git a/net/devlink/port.c b/net/devlink/port.c index 1ff609571ea4..fa3e1597711b 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -220,8 +220,8 @@ size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; - return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ - + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + return nla_total_size(strlen(devlink_bus_name(devlink)) + 1) /* DEVLINK_ATTR_BUS_NAME */ + + nla_total_size(strlen(devlink_dev_name(devlink)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + nla_total_size(8) /* DEVLINK_ATTR_INDEX */ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ } -- cgit v1.2.3 From 725d5fdb7b9c01d9e7079682acf998703762475b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 12 Mar 2026 11:03:59 +0100 Subject: devlink: support index-based lookup via bus_name/dev_name handle Devlink instances without a backing device use bus_name "devlink_index" and dev_name set to the decimal index string. When user space sends this handle, detect the pattern and perform a direct xarray lookup by index instead of iterating all instances. 
Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20260312100407.551173-6-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/uapi/linux/devlink.h | 2 ++ net/devlink/netlink.c | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1ba3436db4ae..7de2d8cc862f 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -19,6 +19,8 @@ #define DEVLINK_GENL_VERSION 0x1 #define DEVLINK_GENL_MCGRP_CONFIG_NAME "config" +#define DEVLINK_INDEX_BUS_NAME "devlink_index" + enum devlink_command { /* don't change the order or add anything between, this is ABI! */ DEVLINK_CMD_UNSPEC, diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 9cba40285de4..fa38fca22fe4 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -203,6 +203,15 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs, busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); + if (!strcmp(busname, DEVLINK_INDEX_BUS_NAME)) { + if (kstrtoul(devname, 10, &index)) + return ERR_PTR(-ENODEV); + devlink = devlinks_xa_lookup_get(net, index); + if (!devlink) + return ERR_PTR(-ENODEV); + goto found; + } + devlinks_xa_for_each_registered_get(net, index, devlink) { if (strcmp(devlink_bus_name(devlink), busname) == 0 && strcmp(devlink_dev_name(devlink), devname) == 0) -- cgit v1.2.3 From 20b0f383aae7d26990a769d52b4d5c0e570e659c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 12 Mar 2026 11:04:02 +0100 Subject: devlink: add devlink_dev_driver_name() helper and use it in trace events In preparation to dev-less devlinks, add devlink_dev_driver_name() that returns the driver name stored in devlink struct, and use it in all trace events. 
Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20260312100407.551173-9-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 1 + include/trace/events/devlink.h | 12 ++++++------ net/devlink/core.c | 6 ++++++ 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 0afb0958b910..45dec7067a8e 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1613,6 +1613,7 @@ struct devlink *priv_to_devlink(void *priv); struct device *devlink_to_dev(const struct devlink *devlink); const char *devlink_bus_name(const struct devlink *devlink); const char *devlink_dev_name(const struct devlink *devlink); +const char *devlink_dev_driver_name(const struct devlink *devlink); /* Devlink instance explicit locking */ void devl_lock(struct devlink *devlink); diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h index 32304ce9ad15..4f8edf77dfbe 100644 --- a/include/trace/events/devlink.h +++ b/include/trace/events/devlink.h @@ -23,7 +23,7 @@ TRACE_EVENT(devlink_hwmsg, TP_STRUCT__entry( __string(bus_name, devlink_bus_name(devlink)) __string(dev_name, devlink_dev_name(devlink)) - __string(driver_name, devlink_to_dev(devlink)->driver->name) + __string(driver_name, devlink_dev_driver_name(devlink)) __field(bool, incoming) __field(unsigned long, type) __dynamic_array(u8, buf, len) @@ -57,7 +57,7 @@ TRACE_EVENT(devlink_hwerr, TP_STRUCT__entry( __string(bus_name, devlink_bus_name(devlink)) __string(dev_name, devlink_dev_name(devlink)) - __string(driver_name, devlink_to_dev(devlink)->driver->name) + __string(driver_name, devlink_dev_driver_name(devlink)) __field(int, err) __string(msg, msg) ), @@ -87,7 +87,7 @@ TRACE_EVENT(devlink_health_report, TP_STRUCT__entry( __string(bus_name, devlink_bus_name(devlink)) __string(dev_name, devlink_dev_name(devlink)) - __string(driver_name, devlink_to_dev(devlink)->driver->name) + __string(driver_name, 
devlink_dev_driver_name(devlink)) __string(reporter_name, reporter_name) __string(msg, msg) ), @@ -118,7 +118,7 @@ TRACE_EVENT(devlink_health_recover_aborted, TP_STRUCT__entry( __string(bus_name, devlink_bus_name(devlink)) __string(dev_name, devlink_dev_name(devlink)) - __string(driver_name, devlink_to_dev(devlink)->driver->name) + __string(driver_name, devlink_dev_driver_name(devlink)) __string(reporter_name, reporter_name) __field(bool, health_state) __field(u64, time_since_last_recover) @@ -152,7 +152,7 @@ TRACE_EVENT(devlink_health_reporter_state_update, TP_STRUCT__entry( __string(bus_name, devlink_bus_name(devlink)) __string(dev_name, devlink_dev_name(devlink)) - __string(driver_name, devlink_to_dev(devlink)->driver->name) + __string(driver_name, devlink_dev_driver_name(devlink)) __string(reporter_name, reporter_name) __field(u8, new_state) ), @@ -183,7 +183,7 @@ TRACE_EVENT(devlink_trap_report, TP_STRUCT__entry( __string(bus_name, devlink_bus_name(devlink)) __string(dev_name, devlink_dev_name(devlink)) - __string(driver_name, devlink_to_dev(devlink)->driver->name) + __string(driver_name, devlink_dev_driver_name(devlink)) __string(trap_name, metadata->trap_name) __string(trap_group_name, metadata->trap_group_name) __array(char, input_dev_name, IFNAMSIZ) diff --git a/net/devlink/core.c b/net/devlink/core.c index fcb73d3e56aa..34eb06d88544 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -260,6 +260,12 @@ const char *devlink_dev_name(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_dev_name); +const char *devlink_dev_driver_name(const struct devlink *devlink) +{ + return devlink->dev_driver->name; +} +EXPORT_SYMBOL_GPL(devlink_dev_driver_name); + struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); -- cgit v1.2.3 From 1850e76b38049548ecb03c62bb10d40b94eecaac Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 12 Mar 2026 11:04:05 +0100 Subject: devlink: introduce shared devlink instance for PFs on same 
chip Multiple PFs may reside on the same physical chip, running a single firmware. Some of the resources and configurations may be shared among these PFs. Currently, there is no good object to pin the configuration knobs on. Introduce a shared devlink instance, instantiated upon probe of the first PF and removed during remove of the last PF. The shared devlink instance is not backed by any device, as there is no PCI device related to it. The implementation uses reference counting to manage the lifecycle: each PF that probes calls devlink_shd_get() to get or create the shared instance, and calls devlink_shd_put() when it is removed. The shared instance is automatically destroyed when the last PF is removed. Example: pci/0000:08:00.0: index 0 nested_devlink: auxiliary/mlx5_core.eth.0 devlink_index/1: index 1 nested_devlink: pci/0000:08:00.0 pci/0000:08:00.1 auxiliary/mlx5_core.eth.0: index 2 pci/0000:08:00.1: index 3 nested_devlink: auxiliary/mlx5_core.eth.1 auxiliary/mlx5_core.eth.1: index 4 Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20260312100407.551173-12-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 7 +++ net/devlink/Makefile | 2 +- net/devlink/sh_dev.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 net/devlink/sh_dev.c (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 45dec7067a8e..3038af6ec017 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1647,6 +1647,13 @@ void devlink_register(struct devlink *devlink); void devlink_unregister(struct devlink *devlink); void devlink_free(struct devlink *devlink); +struct devlink *devlink_shd_get(const char *id, + const struct devlink_ops *ops, + size_t priv_size, + const struct device_driver *driver); +void devlink_shd_put(struct devlink *devlink); +void *devlink_shd_get_priv(struct devlink *devlink); + /** * struct devlink_port_ops - Port 
operations * @port_split: Callback used to split the port into multiple ones. diff --git a/net/devlink/Makefile b/net/devlink/Makefile index 000da622116a..8f2adb5e5836 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \ - resource.o param.o region.o health.o trap.o rate.o linecard.o + resource.o param.o region.o health.o trap.o rate.o linecard.o sh_dev.o diff --git a/net/devlink/sh_dev.c b/net/devlink/sh_dev.c new file mode 100644 index 000000000000..85acce97e788 --- /dev/null +++ b/net/devlink/sh_dev.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include + +#include "devl_internal.h" + +static LIST_HEAD(shd_list); +static DEFINE_MUTEX(shd_mutex); /* Protects shd_list and shd->list */ + +/* This structure represents a shared devlink instance, + * there is one created per identifier (e.g., serial number). 
+ */ +struct devlink_shd { + struct list_head list; /* Node in shd list */ + const char *id; /* Identifier string (e.g., serial number) */ + refcount_t refcount; /* Reference count */ + size_t priv_size; /* Size of driver private data */ + char priv[] __aligned(NETDEV_ALIGN) __counted_by(priv_size); +}; + +static struct devlink_shd *devlink_shd_lookup(const char *id) +{ + struct devlink_shd *shd; + + list_for_each_entry(shd, &shd_list, list) { + if (!strcmp(shd->id, id)) + return shd; + } + + return NULL; +} + +static struct devlink_shd *devlink_shd_create(const char *id, + const struct devlink_ops *ops, + size_t priv_size, + const struct device_driver *driver) +{ + struct devlink_shd *shd; + struct devlink *devlink; + + devlink = __devlink_alloc(ops, sizeof(struct devlink_shd) + priv_size, + &init_net, NULL, driver); + if (!devlink) + return NULL; + shd = devlink_priv(devlink); + + shd->id = kstrdup(id, GFP_KERNEL); + if (!shd->id) + goto err_devlink_free; + shd->priv_size = priv_size; + refcount_set(&shd->refcount, 1); + + devl_lock(devlink); + devl_register(devlink); + devl_unlock(devlink); + + list_add_tail(&shd->list, &shd_list); + + return shd; + +err_devlink_free: + devlink_free(devlink); + return NULL; +} + +static void devlink_shd_destroy(struct devlink_shd *shd) +{ + struct devlink *devlink = priv_to_devlink(shd); + + list_del(&shd->list); + devl_lock(devlink); + devl_unregister(devlink); + devl_unlock(devlink); + kfree(shd->id); + devlink_free(devlink); +} + +/** + * devlink_shd_get - Get or create a shared devlink instance + * @id: Identifier string (e.g., serial number) for the shared instance + * @ops: Devlink operations structure + * @priv_size: Size of private data structure + * @driver: Driver associated with the shared devlink instance + * + * Get an existing shared devlink instance identified by @id, or create + * a new one if it doesn't exist. Return the devlink instance with a + * reference held. 
The caller must call devlink_shd_put() when done. + * + * All callers sharing the same @id must pass identical @ops, @priv_size + * and @driver. A mismatch triggers a warning and returns NULL. + * + * Return: Pointer to the shared devlink instance on success, + * NULL on failure + */ +struct devlink *devlink_shd_get(const char *id, + const struct devlink_ops *ops, + size_t priv_size, + const struct device_driver *driver) +{ + struct devlink *devlink; + struct devlink_shd *shd; + + mutex_lock(&shd_mutex); + + shd = devlink_shd_lookup(id); + if (!shd) { + shd = devlink_shd_create(id, ops, priv_size, driver); + goto unlock; + } + + devlink = priv_to_devlink(shd); + if (WARN_ON_ONCE(devlink->ops != ops || + shd->priv_size != priv_size || + devlink->dev_driver != driver)) { + shd = NULL; + goto unlock; + } + refcount_inc(&shd->refcount); + +unlock: + mutex_unlock(&shd_mutex); + return shd ? priv_to_devlink(shd) : NULL; +} +EXPORT_SYMBOL_GPL(devlink_shd_get); + +/** + * devlink_shd_put - Release a reference on a shared devlink instance + * @devlink: Shared devlink instance + * + * Release a reference on a shared devlink instance obtained via + * devlink_shd_get(). + */ +void devlink_shd_put(struct devlink *devlink) +{ + struct devlink_shd *shd; + + mutex_lock(&shd_mutex); + shd = devlink_priv(devlink); + if (refcount_dec_and_test(&shd->refcount)) + devlink_shd_destroy(shd); + mutex_unlock(&shd_mutex); +} +EXPORT_SYMBOL_GPL(devlink_shd_put); + +/** + * devlink_shd_get_priv - Get private data from shared devlink instance + * @devlink: Devlink instance + * + * Returns a pointer to the driver's private data structure within + * the shared devlink instance. 
+ * + * Return: Pointer to private data + */ +void *devlink_shd_get_priv(struct devlink *devlink) +{ + struct devlink_shd *shd = devlink_priv(devlink); + + return shd->priv; +} +EXPORT_SYMBOL_GPL(devlink_shd_get_priv); -- cgit v1.2.3 From 2a8c8a03f306e21a0ea74c93d4332119557f4575 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 12 Mar 2026 11:04:07 +0100 Subject: net/mlx5: Add a shared devlink instance for PFs on same chip Use the previously introduced shared devlink infrastructure to create a shared devlink instance for mlx5 PFs that reside on the same physical chip. The shared instance is identified by the chip's serial number extracted from PCI VPD (V3 keyword, with fallback to serial number for older devices). Each PF that probes calls mlx5_shd_init() which extracts the chip serial number and uses devlink_shd_get() to get or create the shared instance. When a PF is removed, mlx5_shd_uninit() calls devlink_shd_put() to release the reference. The shared instance is automatically destroyed when the last PF is removed. Make the PF devlink instances nested in this shared devlink instance, allowing userspace to identify which PFs belong to the same physical chip. 
Example: pci/0000:08:00.0: index 0 nested_devlink: auxiliary/mlx5_core.eth.0 devlink_index/1: index 1 nested_devlink: pci/0000:08:00.0 pci/0000:08:00.1 auxiliary/mlx5_core.eth.0: index 2 pci/0000:08:00.1: index 3 nested_devlink: auxiliary/mlx5_core.eth.1 auxiliary/mlx5_core.eth.1: index 4 Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20260312100407.551173-14-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 5 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 17 ++++++ .../net/ethernet/mellanox/mlx5/core/sh_devlink.c | 61 ++++++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/sh_devlink.h | 12 +++++ include/linux/mlx5/driver.h | 1 + 5 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 8ffa286a18f5..d39fe9c4a87c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -16,8 +16,9 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \ fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \ - diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \ - fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o lib/nv_param.o + diag/fw_tracer.o diag/crdump.o devlink.o sh_devlink.o diag/rsc_dump.o \ + diag/reporter_vnic.o fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o \ + lib/nv_param.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index fdc3ba20912e..1c35c3fc3bb3 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -74,6 +74,7 @@ #include "mlx5_irq.h" #include "hwmon.h" #include "lag/lag.h" +#include "sh_devlink.h" MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver"); @@ -1520,10 +1521,16 @@ int mlx5_init_one(struct mlx5_core_dev *dev) int err; devl_lock(devlink); + if (dev->shd) { + err = devl_nested_devlink_set(dev->shd, devlink); + if (err) + goto unlock; + } devl_register(devlink); err = mlx5_init_one_devl_locked(dev); if (err) devl_unregister(devlink); +unlock: devl_unlock(devlink); return err; } @@ -2005,6 +2012,13 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) goto pci_init_err; } + err = mlx5_shd_init(dev); + if (err) { + mlx5_core_err(dev, "mlx5_shd_init failed with error code %d\n", + err); + goto shd_init_err; + } + err = mlx5_init_one(dev); if (err) { mlx5_core_err(dev, "mlx5_init_one failed with error code %d\n", @@ -2018,6 +2032,8 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) return 0; err_init_one: + mlx5_shd_uninit(dev); +shd_init_err: mlx5_pci_close(dev); pci_init_err: mlx5_mdev_uninit(dev); @@ -2039,6 +2055,7 @@ static void remove_one(struct pci_dev *pdev) mlx5_drain_health_wq(dev); mlx5_sriov_disable(pdev, false); mlx5_uninit_one(dev); + mlx5_shd_uninit(dev); mlx5_pci_close(dev); mlx5_mdev_uninit(dev); mlx5_adev_idx_free(dev->priv.adev_idx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c new file mode 100644 index 000000000000..bc33f95302df --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include +#include + +#include "sh_devlink.h" + +static const struct devlink_ops mlx5_shd_ops = { +}; + +int mlx5_shd_init(struct mlx5_core_dev *dev) +{ + u8 *vpd_data __free(kfree) = NULL; + struct pci_dev *pdev = dev->pdev; + unsigned int vpd_size, kw_len; + struct devlink *devlink; + char *sn, *end; + int start; + int err; + + if (!mlx5_core_is_pf(dev)) + return 0; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) { + err = PTR_ERR(vpd_data); + return err == -ENODEV ? 0 : err; + } + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "V3", &kw_len); + if (start < 0) { + /* Fall-back to SN for older devices. */ + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, + PCI_VPD_RO_KEYWORD_SERIALNO, &kw_len); + if (start < 0) + return -ENOENT; + } + sn = kstrndup(vpd_data + start, kw_len, GFP_KERNEL); + if (!sn) + return -ENOMEM; + /* Firmware may return spaces at the end of the string, strip it. */ + end = strchrnul(sn, ' '); + *end = '\0'; + + /* Get or create shared devlink instance */ + devlink = devlink_shd_get(sn, &mlx5_shd_ops, 0, pdev->dev.driver); + kfree(sn); + if (!devlink) + return -ENOMEM; + + dev->shd = devlink; + return 0; +} + +void mlx5_shd_uninit(struct mlx5_core_dev *dev) +{ + if (!dev->shd) + return; + + devlink_shd_put(dev->shd); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h new file mode 100644 index 000000000000..8ab8d6940227 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#ifndef __MLX5_SH_DEVLINK_H__ +#define __MLX5_SH_DEVLINK_H__ + +#include + +int mlx5_shd_init(struct mlx5_core_dev *dev); +void mlx5_shd_uninit(struct mlx5_core_dev *dev); + +#endif /* __MLX5_SH_DEVLINK_H__ */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 04dcd09f7517..1268fcf35ec7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -798,6 +798,7 @@ struct mlx5_core_dev { enum mlx5_wc_state wc_state; /* sync write combining state */ struct mutex wc_state_lock; + struct devlink *shd; }; struct mlx5_db { -- cgit v1.2.3 From 0834d6f4abd0ca35b5706d267a6e4b78303a95de Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 12 Mar 2026 14:02:29 +0100 Subject: PCI: endpoint: Do not mark the BAR succeeding a 64-bit BAR as BAR_RESERVED A BAR that can only be configured as a 64-bit BAR by an EPC driver is marked as such using the "only_64bit" flag. Currently, the documentation says that an EPC driver should explicitly mark the BAR succeeding an "only_64bit" BAR as BAR_RESERVED. However, a 64-bit BAR will always take up two BARs. It is thus redundant to mark both BARs. pci_epc_get_next_free_bar() already skips the BAR succeeding a "only_64bit" BAR, regardless if the succeeding BAR is marked as BAR_RESERVED or not. Thus, drop the BAR_RESERVED for a BAR succeeding a "only_64bit" BAR. No functional changes. 
Suggested-by: Manivannan Sadhasivam Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20260312130229.2282001-13-cassel@kernel.org --- drivers/pci/controller/dwc/pci-layerscape-ep.c | 2 -- drivers/pci/controller/dwc/pcie-keembay.c | 3 --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 2 -- drivers/pci/controller/dwc/pcie-tegra194.c | 1 - drivers/pci/controller/dwc/pcie-uniphier-ep.c | 5 ----- drivers/pci/controller/pcie-rcar-ep.c | 3 --- include/linux/pci-epc.h | 3 +-- 7 files changed, 1 insertion(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index a4a800699f89..79d226e0cc80 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -251,9 +251,7 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) pci->ops = pcie->drvdata->dw_pcie_ops; ls_epc->bar[BAR_2].only_64bit = true; - ls_epc->bar[BAR_3].type = BAR_RESERVED; ls_epc->bar[BAR_4].only_64bit = true; - ls_epc->bar[BAR_5].type = BAR_RESERVED; ls_epc->linkup_notifier = true; pcie->pci = pci; diff --git a/drivers/pci/controller/dwc/pcie-keembay.c b/drivers/pci/controller/dwc/pcie-keembay.c index 2666a9c3d67e..7cf2c312ecec 100644 --- a/drivers/pci/controller/dwc/pcie-keembay.c +++ b/drivers/pci/controller/dwc/pcie-keembay.c @@ -313,11 +313,8 @@ static const struct pci_epc_features keembay_pcie_epc_features = { .msi_capable = true, .msix_capable = true, .bar[BAR_0] = { .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .only_64bit = true, }, - .bar[BAR_3] = { .type = BAR_RESERVED, }, .bar[BAR_4] = { .only_64bit = true, }, - .bar[BAR_5] = { .type = BAR_RESERVED, }, .align = SZ_16K, }; diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index 18460f01b2c6..ffb4409c0468 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ 
b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -850,9 +850,7 @@ static const struct pci_epc_features qcom_pcie_epc_features = { .msi_capable = true, .align = SZ_4K, .bar[BAR_0] = { .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .only_64bit = true, }, - .bar[BAR_3] = { .type = BAR_RESERVED, }, }; static const struct pci_epc_features * diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 06571d806ab3..f1f70fb824b2 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -1993,7 +1993,6 @@ static const struct pci_epc_features tegra_pcie_epc_features = { .msi_capable = true, .bar[BAR_0] = { .type = BAR_FIXED, .fixed_size = SZ_1M, .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .type = BAR_RESERVED, }, .bar[BAR_3] = { .type = BAR_RESERVED, }, .bar[BAR_4] = { .type = BAR_RESERVED, }, diff --git a/drivers/pci/controller/dwc/pcie-uniphier-ep.c b/drivers/pci/controller/dwc/pcie-uniphier-ep.c index d52753060970..b7020131f626 100644 --- a/drivers/pci/controller/dwc/pcie-uniphier-ep.c +++ b/drivers/pci/controller/dwc/pcie-uniphier-ep.c @@ -426,9 +426,7 @@ static const struct uniphier_pcie_ep_soc_data uniphier_pro5_data = { .msix_capable = false, .align = 1 << 16, .bar[BAR_0] = { .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .only_64bit = true, }, - .bar[BAR_3] = { .type = BAR_RESERVED, }, .bar[BAR_4] = { .type = BAR_RESERVED, }, .bar[BAR_5] = { .type = BAR_RESERVED, }, }, @@ -445,11 +443,8 @@ static const struct uniphier_pcie_ep_soc_data uniphier_nx1_data = { .msix_capable = false, .align = 1 << 12, .bar[BAR_0] = { .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .only_64bit = true, }, - .bar[BAR_3] = { .type = BAR_RESERVED, }, .bar[BAR_4] = { .only_64bit = true, }, - .bar[BAR_5] = { .type = BAR_RESERVED, }, }, }; diff --git 
a/drivers/pci/controller/pcie-rcar-ep.c b/drivers/pci/controller/pcie-rcar-ep.c index 657875ef4657..c2da8ac1f2e8 100644 --- a/drivers/pci/controller/pcie-rcar-ep.c +++ b/drivers/pci/controller/pcie-rcar-ep.c @@ -440,13 +440,10 @@ static const struct pci_epc_features rcar_pcie_epc_features = { /* use 64-bit BARs so mark BAR[1,3,5] as reserved */ .bar[BAR_0] = { .type = BAR_FIXED, .fixed_size = 128, .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .type = BAR_FIXED, .fixed_size = 256, .only_64bit = true, }, - .bar[BAR_3] = { .type = BAR_RESERVED, }, .bar[BAR_4] = { .type = BAR_FIXED, .fixed_size = 256, .only_64bit = true, }, - .bar[BAR_5] = { .type = BAR_RESERVED, }, }; static const struct pci_epc_features* diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index c021c7af175f..c981ea7d52c0 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -206,8 +206,7 @@ enum pci_epc_bar_type { * @fixed_size: the fixed size, only applicable if type is BAR_FIXED_MASK. * @only_64bit: if true, an EPF driver is not allowed to choose if this BAR * should be configured as 32-bit or 64-bit, the EPF driver must - * configure this BAR as 64-bit. Additionally, the BAR succeeding - * this BAR must be set to type BAR_RESERVED. + * configure this BAR as 64-bit. * * only_64bit should not be set on a BAR of type BAR_RESERVED. * (If BARx is a 64-bit BAR that an EPF driver is not allowed to -- cgit v1.2.3 From 27ce1d8ecb9b9ae025b9e9e199845624bc950998 Mon Sep 17 00:00:00 2001 From: Manikanta Maddireddy Date: Thu, 12 Mar 2026 14:02:30 +0100 Subject: PCI: endpoint: Allow only_64bit on BAR_RESERVED Remove the documentation that forbids setting only_64bit on a BAR of type BAR_RESERVED. When a reserved BAR is 64-bit by default, setting only_64bit is the most accurate description. If we later add support to disable a reserved BAR (e.g. 
disable_bar() for BARs that were never set via set_bar()), the implementation will need to clear the adjacent BAR (upper 32 bits) as well; having only_64bit set documents that requirement. Signed-off-by: Manikanta Maddireddy Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20260312130229.2282001-14-cassel@kernel.org --- include/linux/pci-epc.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index c981ea7d52c0..5c59f5606869 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -207,11 +207,6 @@ enum pci_epc_bar_type { * @only_64bit: if true, an EPF driver is not allowed to choose if this BAR * should be configured as 32-bit or 64-bit, the EPF driver must * configure this BAR as 64-bit. - * - * only_64bit should not be set on a BAR of type BAR_RESERVED. - * (If BARx is a 64-bit BAR that an EPF driver is not allowed to - * touch, then both BARx and BARx+1 must be set to type - * BAR_RESERVED.) */ struct pci_epc_bar_desc { enum pci_epc_bar_type type; -- cgit v1.2.3 From f51644eb40a73677fcd0c92d8174eddde5d0be0e Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Thu, 12 Mar 2026 14:02:31 +0100 Subject: PCI: endpoint: Describe reserved subregions within BARs Some endpoint controllers expose platform-owned, fixed register windows within a BAR that EPF drivers must not reprogram (e.g. a BAR marked BAR_RESERVED). Even in that case, EPF drivers may need to reference a well-defined subset of that BAR, e.g. to reuse an integrated DMA controller MMIO window as a doorbell target. Introduce struct pci_epc_bar_rsvd_region and extend struct pci_epc_bar_desc so EPC drivers can advertise such fixed subregions in a controller-agnostic way. No functional change for existing users. 
Signed-off-by: Koichiro Den Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Tested-by: Manikanta Maddireddy Tested-by: Koichiro Den Reviewed-by: Frank Li Link: https://patch.msgid.link/20260312130229.2282001-15-cassel@kernel.org --- include/linux/pci-epc.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 5c59f5606869..ebcdf70aa9b9 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -200,6 +200,30 @@ enum pci_epc_bar_type { BAR_RESERVED, }; +/** + * enum pci_epc_bar_rsvd_region_type - type of a fixed subregion behind a BAR + * @PCI_EPC_BAR_RSVD_DMA_CTRL_MMIO: Integrated DMA controller MMIO window + * + * BARs marked BAR_RESERVED are owned by the SoC/EPC hardware and must not be + * reprogrammed by EPF drivers. Some of them still expose fixed subregions that + * EPFs may want to reference (e.g. embedded doorbell fallback). + */ +enum pci_epc_bar_rsvd_region_type { + PCI_EPC_BAR_RSVD_DMA_CTRL_MMIO = 0, +}; + +/** + * struct pci_epc_bar_rsvd_region - fixed subregion behind a BAR + * @type: reserved region type + * @offset: offset within the BAR aperture + * @size: size of the reserved region + */ +struct pci_epc_bar_rsvd_region { + enum pci_epc_bar_rsvd_region_type type; + resource_size_t offset; + resource_size_t size; +}; + /** * struct pci_epc_bar_desc - hardware description for a BAR * @type: the type of the BAR @@ -207,11 +231,15 @@ enum pci_epc_bar_type { * @only_64bit: if true, an EPF driver is not allowed to choose if this BAR * should be configured as 32-bit or 64-bit, the EPF driver must * configure this BAR as 64-bit. 
+ * @nr_rsvd_regions: number of fixed subregions described for BAR_RESERVED + * @rsvd_regions: fixed subregions behind BAR_RESERVED */ struct pci_epc_bar_desc { enum pci_epc_bar_type type; u64 fixed_size; bool only_64bit; + u8 nr_rsvd_regions; + const struct pci_epc_bar_rsvd_region *rsvd_regions; }; /** -- cgit v1.2.3 From 33642e9e36dc084e4fc9245a266c9843bc8303b9 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 12 Mar 2026 14:02:33 +0100 Subject: PCI: endpoint: Introduce pci_epc_bar_type BAR_DISABLED Add a pci_epc_bar_type BAR_DISABLED to more clearly differentiate from BAR_RESERVED. This BAR type will only be used to describe a BAR that the EPC driver should disable, and will thus never be available to an EPF driver. (Unlike BAR_RESERVED, which will never be disabled by default by an EPC driver.) Co-developed-by: Manikanta Maddireddy Signed-off-by: Manikanta Maddireddy Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Tested-by: Koichiro Den Tested-by: Manikanta Maddireddy Reviewed-by: Frank Li Reviewed-by: Manikanta Maddireddy Link: https://patch.msgid.link/20260312130229.2282001-17-cassel@kernel.org --- drivers/pci/endpoint/pci-epc-core.c | 5 +++-- include/linux/pci-epc.h | 10 +++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c index e546b3dbb240..6c3c58185fc5 100644 --- a/drivers/pci/endpoint/pci-epc-core.c +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -103,8 +103,9 @@ enum pci_barno pci_epc_get_next_free_bar(const struct pci_epc_features bar++; for (i = bar; i < PCI_STD_NUM_BARS; i++) { - /* If the BAR is not reserved, return it. */ - if (epc_features->bar[i].type != BAR_RESERVED) + /* If the BAR is not reserved or disabled, return it. 
*/ + if (epc_features->bar[i].type != BAR_RESERVED && + epc_features->bar[i].type != BAR_DISABLED) return i; } diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index ebcdf70aa9b9..334c2b7578d0 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -191,13 +191,21 @@ struct pci_epc { * @BAR_RESIZABLE: The BAR implements the PCI-SIG Resizable BAR Capability. * NOTE: An EPC driver can currently only set a single supported * size. - * @BAR_RESERVED: The BAR should not be touched by an EPF driver. + * @BAR_RESERVED: Used for HW-backed BARs (e.g. MSI-X table, DMA regs). The BAR + * should not be disabled by an EPC driver. The BAR should not be + * reprogrammed by an EPF driver. An EPF driver is allowed to + * disable the BAR if absolutely necessary. (However, right now + * there is no EPC operation to disable a BAR that has not been + * programmed using pci_epc_set_bar().) + * @BAR_DISABLED: The BAR should be disabled by an EPC driver. The BAR will be + * unavailable to an EPF driver. */ enum pci_epc_bar_type { BAR_PROGRAMMABLE = 0, BAR_FIXED, BAR_RESIZABLE, BAR_RESERVED, + BAR_DISABLED, }; /** -- cgit v1.2.3 From 7d8632f1ef6c8ed0b53771c16f130f18d636931e Mon Sep 17 00:00:00 2001 From: James Calligeros Date: Sun, 1 Mar 2026 18:05:23 +1000 Subject: ASoC: soc-dai: define possible idle TDM slot modes Some audio devices, such as certain Texas Instruments codecs, include configurable bus keepers. We currently don't have a standardised way to configure such hardware, and instead rely on the hardware initialising setting itself up into a sane state. There are situations where this is insufficient, however, and some platforms require more concrete guarantees as to the state of the bus, and being able to explicitly configure bus keepers enables this. 
For example, some Apple Silicon machines have an odd bus topology where the SDOUT pins of all codecs are split across two data lines, which are summed via an OR gate in front of the receiving port on the SoC's I2S peripheral. Each line must transmit 0 while a codec on the other line is actively transmitting data, or the SoC will receive garbage data. To do this, one codec on each line must be configured to transmit zeroes during the other line's active TDM slots. Thus, we define seven possible bus-keeping modes that a device can be in: NONE (UB/as initialised), OFF (explicitly disabled), ZERO (actively transmit a 0), PULLDOWN, HIZ (floating), PULLUP, and DRIVE_HIGH. These will be consumed by CODEC/CPU drivers via a common DAI op, enabling the explicit configuration of bus keepers where required. Signed-off-by: James Calligeros Link: https://patch.msgid.link/20260301-tdm-idle-slots-v3-4-c6ac5351489a@gmail.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 224396927aef..44dd06add52e 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -52,6 +52,21 @@ struct snd_compr_stream; #define SND_SOC_POSSIBLE_DAIFMT_AC97 (1 << SND_SOC_DAI_FORMAT_AC97) #define SND_SOC_POSSIBLE_DAIFMT_PDM (1 << SND_SOC_DAI_FORMAT_PDM) +/* + * DAI TDM slot idle modes + * + * Describes a CODEC/CPU's behaviour when not actively receiving or + * transmitting on a given TDM slot. NONE is undefined behaviour. + * Add new modes to the end. + */ +#define SND_SOC_DAI_TDM_IDLE_NONE 0 +#define SND_SOC_DAI_TDM_IDLE_OFF 1 +#define SND_SOC_DAI_TDM_IDLE_ZERO 2 +#define SND_SOC_DAI_TDM_IDLE_PULLDOWN 3 +#define SND_SOC_DAI_TDM_IDLE_HIZ 4 +#define SND_SOC_DAI_TDM_IDLE_PULLUP 5 +#define SND_SOC_DAI_TDM_IDLE_DRIVE_HIGH 6 + /* * DAI Clock gating. 
* -- cgit v1.2.3 From b758d3574e88537f9089bd757a51b35cf9675179 Mon Sep 17 00:00:00 2001 From: James Calligeros Date: Sun, 1 Mar 2026 18:05:24 +1000 Subject: ASoC: soc-dai: add common operation to set TDM idle mode Some audio devices, like certain Texas Instruments codecs, integrate configurable bus keepers that dictate the codec's behaviour during idle TDM slots. Now that we have definitions for various idle modes, add a snd_soc_dai_set_tdm_idle() operation to control this in a standardised way. This is useful on Apple Silicon laptops, where a single I2S bus is comprised of two physical lines which are ORed just before the receiving port. When a codec on one line is transmitting, we must guarantee that the other line is low. We can achieve this by configuring one codec on each line to use its bus keeper to fill its line with zeroes during the active slots of the other line. Signed-off-by: James Calligeros Link: https://patch.msgid.link/20260301-tdm-idle-slots-v3-5-c6ac5351489a@gmail.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 7 +++++++ sound/soc/soc-dai.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index 44dd06add52e..6a42812bba8c 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -196,6 +196,10 @@ int snd_soc_dai_set_fmt(struct snd_soc_dai *dai, unsigned int fmt); int snd_soc_dai_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask, unsigned int rx_mask, int slots, int slot_width); +int snd_soc_dai_set_tdm_idle(struct snd_soc_dai *dai, + unsigned int tx_mask, unsigned int rx_mask, + int tx_mode, int rx_mode); + int snd_soc_dai_set_channel_map(struct snd_soc_dai *dai, unsigned int tx_num, const unsigned int *tx_slot, unsigned int rx_num, const unsigned int *rx_slot); @@ -312,6 +316,9 @@ struct snd_soc_dai_ops { int (*set_tdm_slot)(struct snd_soc_dai *dai, unsigned int tx_mask, unsigned int rx_mask, int 
slots, int slot_width); + int (*set_tdm_idle)(struct snd_soc_dai *dai, + unsigned int tx_mask, unsigned int rx_mask, + int tx_mode, int rx_mode); int (*set_channel_map)(struct snd_soc_dai *dai, unsigned int tx_num, const unsigned int *tx_slot, unsigned int rx_num, const unsigned int *rx_slot); diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index a1e05307067d..2f370fda1266 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -282,6 +282,46 @@ err: } EXPORT_SYMBOL_GPL(snd_soc_dai_set_tdm_slot); +/** + * snd_soc_dai_set_tdm_idle() - Configure a DAI's TDM idle mode + * @dai: The DAI to configure + * @tx_mask: bitmask representing idle TX slots. + * @rx_mask: bitmask representing idle RX slots. + * @tx_mode: idle mode to set for TX slots. + * @rx_mode: idle mode to set for RX slots. + * + * This function configures the DAI to handle idle TDM slots in the + * specified manner. @tx_mode and @rx_mode can be one of + * SND_SOC_DAI_TDM_IDLE_NONE, SND_SOC_DAI_TDM_IDLE_ZERO, + * SND_SOC_DAI_TDM_IDLE_PULLDOWN, or SND_SOC_DAI_TDM_IDLE_HIZ. + * SND_SOC_TDM_IDLE_NONE represents the DAI's default/unset idle slot + * handling state and could be any of the other modes depending on the + * hardware behind the DAI. It is therefore undefined behaviour when set + * explicitly. + * + * Mode and mask can be set independently for both the TX and RX direction. + * Some hardware may ignore both TX and RX masks depending on its + * capabilities. 
+ */ +int snd_soc_dai_set_tdm_idle(struct snd_soc_dai *dai, + unsigned int tx_mask, unsigned int rx_mask, + int tx_mode, int rx_mode) +{ + int ret = -EOPNOTSUPP; + + /* You can't write to the RX line */ + if (rx_mode == SND_SOC_DAI_TDM_IDLE_ZERO) + return soc_dai_ret(dai, -EINVAL); + + if (dai->driver->ops && + dai->driver->ops->set_tdm_idle) + ret = dai->driver->ops->set_tdm_idle(dai, tx_mask, rx_mask, + tx_mode, rx_mode); + + return soc_dai_ret(dai, ret); +} +EXPORT_SYMBOL_GPL(snd_soc_dai_set_tdm_idle); + /** * snd_soc_dai_set_channel_map - configure DAI audio channel map * @dai: DAI -- cgit v1.2.3 From 583157bee545e5c2da6ae094bcac7f68dbc5d265 Mon Sep 17 00:00:00 2001 From: Aelin Reidel Date: Mon, 2 Mar 2026 14:10:14 +0100 Subject: dt-bindings: arm: qcom,ids: Add SoC IDs for SM7450 and SM7450P SM7450 and SM7450P are two SoCs of the 'fillmore' family. Signed-off-by: Aelin Reidel Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20260302-fillmore-socids-v2-1-e6c5ad167ec4@mainlining.org Signed-off-by: Bjorn Andersson --- include/dt-bindings/arm/qcom,ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h index febb69b0438f..4985f6afa204 100644 --- a/include/dt-bindings/arm/qcom,ids.h +++ b/include/dt-bindings/arm/qcom,ids.h @@ -245,6 +245,7 @@ #define QCOM_ID_IPQ5000 503 #define QCOM_ID_IPQ0509 504 #define QCOM_ID_IPQ0518 505 +#define QCOM_ID_SM7450 506 #define QCOM_ID_SM6375 507 #define QCOM_ID_IPQ9514 510 #define QCOM_ID_IPQ9550 511 @@ -264,6 +265,7 @@ #define QCOM_ID_QRU1000 539 #define QCOM_ID_SM8475_2 540 #define QCOM_ID_QDU1000 545 +#define QCOM_ID_SM7450P 547 #define QCOM_ID_X1E80100 555 #define QCOM_ID_SM8650 557 #define QCOM_ID_SM4450 568 -- cgit v1.2.3 From 45c2a55d13c698ba6a281315676934c44225b034 Mon Sep 17 00:00:00 2001 From: Unnathi Chalicheemala Date: Thu, 5 Mar 2026 19:12:05 -0800 Subject: soc: qcom: llcc: Add per-slice counter and common llcc 
slice descriptor Fix incorrect slice activation/deactivation accounting by replacing the bitmap-based activation tracking with per-slice atomic reference counters. This resolves mismatches that occur when multiple client drivers vote for the same slice or when llcc_slice_getd() is called multiple times. As part of this fix, simplify slice descriptor handling by eliminating dynamic allocation. llcc_slice_getd() now returns a pointer to a preallocated descriptor, removing the need for repeated allocation/free cycles and ensuring consistent reference tracking across all users. Signed-off-by: Unnathi Chalicheemala Signed-off-by: Francisco Munoz Ruiz Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20260305-external_llcc_changes1set-v1-1-6347e52e648e@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/llcc-qcom.c | 57 +++++++++++++++++++------------------- include/linux/soc/qcom/llcc-qcom.h | 8 +++--- 2 files changed, 32 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c index e221e3c4982b..8fdca7658393 100644 --- a/drivers/soc/qcom/llcc-qcom.c +++ b/drivers/soc/qcom/llcc-qcom.c @@ -5,7 +5,6 @@ */ #include -#include #include #include #include @@ -4535,8 +4534,7 @@ static struct llcc_drv_data *drv_data = (void *) -EPROBE_DEFER; struct llcc_slice_desc *llcc_slice_getd(u32 uid) { const struct llcc_slice_config *cfg; - struct llcc_slice_desc *desc; - u32 sz, count; + u32 sz, i; if (IS_ERR(drv_data)) return ERR_CAST(drv_data); @@ -4544,21 +4542,14 @@ struct llcc_slice_desc *llcc_slice_getd(u32 uid) cfg = drv_data->cfg; sz = drv_data->cfg_size; - for (count = 0; cfg && count < sz; count++, cfg++) + for (i = 0; cfg && i < sz; i++, cfg++) if (cfg->usecase_id == uid) break; - if (count == sz || !cfg) + if (i == sz) return ERR_PTR(-ENODEV); - desc = kzalloc_obj(*desc); - if (!desc) - return ERR_PTR(-ENOMEM); - - desc->slice_id = cfg->slice_id; - desc->slice_size = cfg->max_cap; 
- - return desc; + return &drv_data->desc[i]; } EXPORT_SYMBOL_GPL(llcc_slice_getd); @@ -4569,7 +4560,7 @@ EXPORT_SYMBOL_GPL(llcc_slice_getd); void llcc_slice_putd(struct llcc_slice_desc *desc) { if (!IS_ERR_OR_NULL(desc)) - kfree(desc); + return; } EXPORT_SYMBOL_GPL(llcc_slice_putd); @@ -4645,7 +4636,8 @@ int llcc_slice_activate(struct llcc_slice_desc *desc) return -EINVAL; mutex_lock(&drv_data->lock); - if (test_bit(desc->slice_id, drv_data->bitmap)) { + /* Already active; try to take another reference. */ + if (refcount_inc_not_zero(&desc->refcount)) { mutex_unlock(&drv_data->lock); return 0; } @@ -4659,7 +4651,8 @@ int llcc_slice_activate(struct llcc_slice_desc *desc) return ret; } - __set_bit(desc->slice_id, drv_data->bitmap); + /* Set first reference */ + refcount_set(&desc->refcount, 1); mutex_unlock(&drv_data->lock); return ret; @@ -4685,10 +4678,12 @@ int llcc_slice_deactivate(struct llcc_slice_desc *desc) return -EINVAL; mutex_lock(&drv_data->lock); - if (!test_bit(desc->slice_id, drv_data->bitmap)) { + /* refcount > 1, drop one ref and we’re done. 
*/ + if (refcount_dec_not_one(&desc->refcount)) { mutex_unlock(&drv_data->lock); return 0; } + act_ctrl_val = ACT_CTRL_OPCODE_DEACTIVATE << ACT_CTRL_OPCODE_SHIFT; ret = llcc_update_act_ctrl(desc->slice_id, act_ctrl_val, @@ -4698,7 +4693,8 @@ int llcc_slice_deactivate(struct llcc_slice_desc *desc) return ret; } - __clear_bit(desc->slice_id, drv_data->bitmap); + /* Finalize: atomically transition 1 -> 0 */ + WARN_ON_ONCE(!refcount_dec_if_one(&desc->refcount)); mutex_unlock(&drv_data->lock); return ret; @@ -4742,7 +4738,7 @@ static int _qcom_llcc_cfg_program(const struct llcc_slice_config *config, u32 attr1_val; u32 attr0_val; u32 max_cap_cacheline; - struct llcc_slice_desc desc; + struct llcc_slice_desc *desc; attr1_val = config->cache_mode; attr1_val |= config->probe_target_ways << ATTR1_PROBE_TARGET_WAYS_SHIFT; @@ -4891,8 +4887,11 @@ static int _qcom_llcc_cfg_program(const struct llcc_slice_config *config, } if (config->activate_on_init) { - desc.slice_id = config->slice_id; - ret = llcc_slice_activate(&desc); + desc = llcc_slice_getd(config->usecase_id); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + ret = llcc_slice_activate(desc); } return ret; @@ -5205,18 +5204,18 @@ static int qcom_llcc_probe(struct platform_device *pdev) llcc_cfg = cfg->sct_data; sz = cfg->size; - - for (i = 0; i < sz; i++) - if (llcc_cfg[i].slice_id > drv_data->max_slices) - drv_data->max_slices = llcc_cfg[i].slice_id; - - drv_data->bitmap = devm_bitmap_zalloc(dev, drv_data->max_slices, - GFP_KERNEL); - if (!drv_data->bitmap) { + drv_data->desc = devm_kcalloc(dev, sz, sizeof(struct llcc_slice_desc), GFP_KERNEL); + if (!drv_data->desc) { ret = -ENOMEM; goto err; } + for (i = 0; i < sz; i++) { + drv_data->desc[i].slice_id = llcc_cfg[i].slice_id; + drv_data->desc[i].slice_size = llcc_cfg[i].max_cap; + refcount_set(&drv_data->desc[i].refcount, 0); + } + drv_data->cfg = llcc_cfg; drv_data->cfg_size = sz; drv_data->edac_reg_offset = cfg->edac_reg_offset; diff --git 
a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 8243ab3a12a8..227125d84318 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -91,10 +91,12 @@ * struct llcc_slice_desc - Cache slice descriptor * @slice_id: llcc slice id * @slice_size: Size allocated for the llcc slice + * @refcount: Atomic counter to track activate/deactivate calls */ struct llcc_slice_desc { u32 slice_id; size_t slice_size; + refcount_t refcount; }; /** @@ -152,11 +154,10 @@ struct llcc_edac_reg_offset { * @edac_reg_offset: Offset of the LLCC EDAC registers * @lock: mutex associated with each slice * @cfg_size: size of the config data table - * @max_slices: max slices as read from device tree * @num_banks: Number of llcc banks - * @bitmap: Bit map to track the active slice ids * @ecc_irq: interrupt for llcc cache error detection and reporting * @ecc_irq_configured: 'True' if firmware has already configured the irq propagation + * @desc: Array pointer of pre-allocated LLCC slice descriptors * @version: Indicates the LLCC version */ struct llcc_drv_data { @@ -167,12 +168,11 @@ struct llcc_drv_data { const struct llcc_edac_reg_offset *edac_reg_offset; struct mutex lock; u32 cfg_size; - u32 max_slices; u32 num_banks; - unsigned long *bitmap; int ecc_irq; bool ecc_irq_configured; u32 version; + struct llcc_slice_desc *desc; }; #if IS_ENABLED(CONFIG_QCOM_LLCC) -- cgit v1.2.3 From e4ee7621d732162ea2ec714ae76dac2f70519417 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 10 Mar 2026 00:03:30 +0100 Subject: soc: qcom: qmi: Enumerate the service IDs of QMI The QMI framework proposes a set of services which are defined by an integer identifier. The different QMI client lookup for the services via this identifier. Moreover, the function qmi_add_lookup() and qmi_add_server() must match the service ID but the code in different places set the same value but with a different macro name. 
These macros are spread across the different subsystems implementing the protocols associated with a service. It would make more sense to define them in the QMI header for the sake of consistency and clarity. This change uses a unified naming for the services and enumerates the ones implemented in the Linux kernel. More services can come later and put the service ID in this same header. Signed-off-by: Daniel Lezcano Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20260309230346.3584252-2-daniel.lezcano@oss.qualcomm.com [bjorn: Lower case hex constants] Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/qmi.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/soc/qcom/qmi.h b/include/linux/soc/qcom/qmi.h index 291cdc7ef49c..b9dcb437a0be 100644 --- a/include/linux/soc/qcom/qmi.h +++ b/include/linux/soc/qcom/qmi.h @@ -92,6 +92,18 @@ struct qmi_elem_info { #define QMI_ERR_INCOMPATIBLE_STATE_V01 90 #define QMI_ERR_NOT_SUPPORTED_V01 94 +/* + * Enumerate the IDs of the QMI services + */ +#define QMI_SERVICE_ID_TEST 0x0f /* 15 */ +#define QMI_SERVICE_ID_SSCTL 0x2b /* 43 */ +#define QMI_SERVICE_ID_IPA 0x31 /* 49 */ +#define QMI_SERVICE_ID_SERVREG_LOC 0x40 /* 64 */ +#define QMI_SERVICE_ID_SERVREG_NOTIF 0x42 /* 66 */ +#define QMI_SERVICE_ID_WLFW 0x45 /* 69 */ +#define QMI_SERVICE_ID_SLIMBUS 0x301 /* 769 */ +#define QMI_SERVICE_ID_USB_AUDIO_STREAM 0x41d /* 1053 */ + /** * struct qmi_response_type_v01 - common response header (decoded) * @result: result of the transaction -- cgit v1.2.3 From 508e58ac65eec4e272f89e39d9b64588f7fe21cc Mon Sep 17 00:00:00 2001 From: Kathiravan Thirumoorthy Date: Fri, 13 Mar 2026 21:32:56 +0530 Subject: dt-bindings: arm: qcom,ids: add SOC IDs for IPQ5210 family SoCs based on IPQ5210 are shipped under two different naming schemes namely IPQ52xx and QCF2xxx/QCF3xxx. 
In the latter variants the Passive Optical Network (PON) interface acts as the backhaul, whereas in the former it is Ethernet backhaul. Document the same. Signed-off-by: Kathiravan Thirumoorthy Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20260313-b4-ipq5210_soc_ids-v1-1-97faae3fef95@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/arm/qcom,ids.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h index 4985f6afa204..0316b85747d9 100644 --- a/include/dt-bindings/arm/qcom,ids.h +++ b/include/dt-bindings/arm/qcom,ids.h @@ -298,6 +298,11 @@ #define QCOM_ID_QCS615 680 #define QCOM_ID_CQ7790M 731 #define QCOM_ID_CQ7790S 732 +#define QCOM_ID_IPQ5200 765 +#define QCOM_ID_IPQ5210 766 +#define QCOM_ID_QCF2200 767 +#define QCOM_ID_QCF3200 768 +#define QCOM_ID_QCF3210 769 /* * The board type and revision information, used by Qualcomm bootloaders and -- cgit v1.2.3 From dea046e7f46f2357124a465e058c92cac3e351c5 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 9 Mar 2026 13:42:41 +0100 Subject: gpio: remove machine hogs With no more users, remove legacy machine hog API from the kernel. Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20260309-gpio-hog-fwnode-v2-5-4e61f3dbf06a@oss.qualcomm.com Signed-off-by: Bartosz Golaszewski --- Documentation/driver-api/gpio/board.rst | 16 -------- drivers/gpio/gpiolib.c | 71 --------------------------------- include/linux/gpio/machine.h | 33 --------------- 3 files changed, 120 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst index 069b54d8591b..0993cac891fb 100644 --- a/Documentation/driver-api/gpio/board.rst +++ b/Documentation/driver-api/gpio/board.rst @@ -239,22 +239,6 @@ mapping and is thus transparent to GPIO consumers. 
A set of functions such as gpiod_set_value() is available to work with the new descriptor-oriented interface. -Boards using platform data can also hog GPIO lines by defining GPIO hog tables. - -.. code-block:: c - - struct gpiod_hog gpio_hog_table[] = { - GPIO_HOG("gpio.0", 10, "foo", GPIO_ACTIVE_LOW, GPIOD_OUT_HIGH), - { } - }; - -And the table can be added to the board code as follows:: - - gpiod_add_hogs(gpio_hog_table); - -The line will be hogged as soon as the gpiochip is created or - in case the -chip was created earlier - when the hog table is registered. - Arrays of pins -------------- In addition to requesting pins belonging to a function one by one, a device may diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 4a57d9882600..56fda7891d55 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -103,9 +103,6 @@ static DEFINE_MUTEX(gpio_devices_lock); /* Ensures coherence during read-only accesses to the list of GPIO devices. */ DEFINE_STATIC_SRCU(gpio_devices_srcu); -static DEFINE_MUTEX(gpio_machine_hogs_mutex); -static LIST_HEAD(gpio_machine_hogs); - const char *const gpio_suffixes[] = { "gpios", "gpio", NULL }; static void gpiochip_free_hogs(struct gpio_chip *gc); @@ -930,36 +927,6 @@ err_remove_device: return ret; } -static void gpiochip_machine_hog(struct gpio_chip *gc, struct gpiod_hog *hog) -{ - struct gpio_desc *desc; - int rv; - - desc = gpiochip_get_desc(gc, hog->chip_hwnum); - if (IS_ERR(desc)) { - gpiochip_err(gc, "%s: unable to get GPIO desc: %ld\n", - __func__, PTR_ERR(desc)); - return; - } - - rv = gpiod_hog(desc, hog->line_name, hog->lflags, hog->dflags); - if (rv) - gpiod_err(desc, "%s: unable to hog GPIO line (%s:%u): %d\n", - __func__, gc->label, hog->chip_hwnum, rv); -} - -static void gpiochip_machine_hog_lines(struct gpio_chip *gc) -{ - struct gpiod_hog *hog; - - guard(mutex)(&gpio_machine_hogs_mutex); - - list_for_each_entry(hog, &gpio_machine_hogs, list) { - if (!strcmp(gc->label, hog->chip_label)) - 
gpiochip_machine_hog(gc, hog); - } -} - int gpiochip_add_hog(struct gpio_chip *gc, struct fwnode_handle *fwnode) { struct fwnode_handle *gc_node = dev_fwnode(&gc->gpiodev->dev); @@ -1047,8 +1014,6 @@ static int gpiochip_hog_lines(struct gpio_chip *gc) return ret; } - gpiochip_machine_hog_lines(gc); - return 0; } @@ -4578,42 +4543,6 @@ void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) } EXPORT_SYMBOL_GPL(gpiod_remove_lookup_table); -/** - * gpiod_add_hogs() - register a set of GPIO hogs from machine code - * @hogs: table of gpio hog entries with a zeroed sentinel at the end - */ -void gpiod_add_hogs(struct gpiod_hog *hogs) -{ - struct gpiod_hog *hog; - - guard(mutex)(&gpio_machine_hogs_mutex); - - for (hog = &hogs[0]; hog->chip_label; hog++) { - list_add_tail(&hog->list, &gpio_machine_hogs); - - /* - * The chip may have been registered earlier, so check if it - * exists and, if so, try to hog the line now. - */ - struct gpio_device *gdev __free(gpio_device_put) = - gpio_device_find_by_label(hog->chip_label); - if (gdev) - gpiochip_machine_hog(gpio_device_get_chip(gdev), hog); - } -} -EXPORT_SYMBOL_GPL(gpiod_add_hogs); - -void gpiod_remove_hogs(struct gpiod_hog *hogs) -{ - struct gpiod_hog *hog; - - guard(mutex)(&gpio_machine_hogs_mutex); - - for (hog = &hogs[0]; hog->chip_label; hog++) - list_del(&hog->list); -} -EXPORT_SYMBOL_GPL(gpiod_remove_hogs); - static bool gpiod_match_lookup_table(struct device *dev, const struct gpiod_lookup_table *table) { diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h index 44e5f162973e..5eb88f5d0630 100644 --- a/include/linux/gpio/machine.h +++ b/include/linux/gpio/machine.h @@ -46,23 +46,6 @@ struct gpiod_lookup_table { struct gpiod_lookup table[]; }; -/** - * struct gpiod_hog - GPIO line hog table - * @chip_label: name of the chip the GPIO belongs to - * @chip_hwnum: hardware number (i.e. 
relative to the chip) of the GPIO - * @line_name: consumer name for the hogged line - * @lflags: bitmask of gpio_lookup_flags GPIO_* values - * @dflags: GPIO flags used to specify the direction and value - */ -struct gpiod_hog { - struct list_head list; - const char *chip_label; - u16 chip_hwnum; - const char *line_name; - unsigned long lflags; - int dflags; -}; - /* * Helper for lookup tables with just one single lookup for a device. */ @@ -95,24 +78,10 @@ static struct gpiod_lookup_table _name = { \ .flags = _flags, \ } -/* - * Simple definition of a single GPIO hog in an array. - */ -#define GPIO_HOG(_chip_label, _chip_hwnum, _line_name, _lflags, _dflags) \ -(struct gpiod_hog) { \ - .chip_label = _chip_label, \ - .chip_hwnum = _chip_hwnum, \ - .line_name = _line_name, \ - .lflags = _lflags, \ - .dflags = _dflags, \ -} - #ifdef CONFIG_GPIOLIB void gpiod_add_lookup_table(struct gpiod_lookup_table *table); void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n); void gpiod_remove_lookup_table(struct gpiod_lookup_table *table); -void gpiod_add_hogs(struct gpiod_hog *hogs); -void gpiod_remove_hogs(struct gpiod_hog *hogs); #else /* ! CONFIG_GPIOLIB */ static inline void gpiod_add_lookup_table(struct gpiod_lookup_table *table) {} @@ -120,8 +89,6 @@ static inline void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) {} static inline void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) {} -static inline void gpiod_add_hogs(struct gpiod_hog *hogs) {} -static inline void gpiod_remove_hogs(struct gpiod_hog *hogs) {} #endif /* CONFIG_GPIOLIB */ #endif /* __LINUX_GPIO_MACHINE_H */ -- cgit v1.2.3 From 37a6ed2c284b594470e5512df3528abb50b9815e Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 11 Mar 2026 16:18:16 +0200 Subject: drm/{i915, xe}/bo: move display bo calls to parent interface Continue i915 and xe separation from display by moving the bo calls to the display parent interface. 
Instead of adding all these functions to intel_parent.[ch], reuse the now vacated intel_bo.[ch], and avoid mass renames to calls of these functions. This is similar to intel_display_rpm.[ch]. Make many of the hooks optional to avoid having to implement dummy functions in xe. Indeed now we can remove many of the existing dummy functions. Reviewed-by: Suraj Kandpal Link: https://patch.msgid.link/7899eef2ccf0cd603df69099df065226a0df917b.1773238670.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/Makefile | 1 + drivers/gpu/drm/i915/display/intel_bo.c | 66 ++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_bo.c | 32 +++++++++----- drivers/gpu/drm/i915/i915_bo.h | 9 ++++ drivers/gpu/drm/i915/i915_driver.c | 2 + drivers/gpu/drm/xe/Makefile | 1 + drivers/gpu/drm/xe/display/xe_display.c | 2 + drivers/gpu/drm/xe/display/xe_display_bo.c | 45 +++++-------------- drivers/gpu/drm/xe/display/xe_display_bo.h | 9 ++++ include/drm/intel/display_parent_interface.h | 16 +++++++ 10 files changed, 138 insertions(+), 45 deletions(-) create mode 100644 drivers/gpu/drm/i915/display/intel_bo.c create mode 100644 drivers/gpu/drm/i915/i915_bo.h create mode 100644 drivers/gpu/drm/xe/display/xe_display_bo.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 52a82608b8b1..425933fb26a5 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -240,6 +240,7 @@ i915-y += \ display/intel_atomic.o \ display/intel_audio.o \ display/intel_bios.o \ + display/intel_bo.o \ display/intel_bw.o \ display/intel_casf.o \ display/intel_cdclk.o \ diff --git a/drivers/gpu/drm/i915/display/intel_bo.c b/drivers/gpu/drm/i915/display/intel_bo.c new file mode 100644 index 000000000000..e356ab4e0640 --- /dev/null +++ b/drivers/gpu/drm/i915/display/intel_bo.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: MIT +/* Copyright © 2026 Intel Corporation */ + +#include +#include + +#include "intel_bo.h" +#include 
"intel_display_core.h" +#include "intel_display_types.h" + +bool intel_bo_is_tiled(struct drm_gem_object *obj) +{ + struct intel_display *display = to_intel_display(obj->dev); + + return display->parent->bo->is_tiled && display->parent->bo->is_tiled(obj); +} + +bool intel_bo_is_userptr(struct drm_gem_object *obj) +{ + struct intel_display *display = to_intel_display(obj->dev); + + return display->parent->bo->is_userptr && display->parent->bo->is_userptr(obj); +} + +bool intel_bo_is_shmem(struct drm_gem_object *obj) +{ + struct intel_display *display = to_intel_display(obj->dev); + + return display->parent->bo->is_shmem && display->parent->bo->is_shmem(obj); +} + +bool intel_bo_is_protected(struct drm_gem_object *obj) +{ + struct intel_display *display = to_intel_display(obj->dev); + + return display->parent->bo->is_protected(obj); +} + +int intel_bo_key_check(struct drm_gem_object *obj) +{ + struct intel_display *display = to_intel_display(obj->dev); + + return display->parent->bo->key_check(obj); +} + +int intel_bo_fb_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) +{ + struct intel_display *display = to_intel_display(obj->dev); + + return display->parent->bo->fb_mmap(obj, vma); +} + +int intel_bo_read_from_page(struct drm_gem_object *obj, u64 offset, void *dst, int size) +{ + struct intel_display *display = to_intel_display(obj->dev); + + return display->parent->bo->read_from_page(obj, offset, dst, size); +} + +void intel_bo_describe(struct seq_file *m, struct drm_gem_object *obj) +{ + struct intel_display *display = to_intel_display(obj->dev); + + if (display->parent->bo->describe) + display->parent->bo->describe(m, obj); +} diff --git a/drivers/gpu/drm/i915/i915_bo.c b/drivers/gpu/drm/i915/i915_bo.c index 21a4533ba341..04fc0e3b7ef6 100644 --- a/drivers/gpu/drm/i915/i915_bo.c +++ b/drivers/gpu/drm/i915/i915_bo.c @@ -2,51 +2,63 @@ /* Copyright © 2024 Intel Corporation */ #include - -#include "display/intel_bo.h" +#include #include 
"gem/i915_gem_mman.h" #include "gem/i915_gem_object.h" #include "gem/i915_gem_object_frontbuffer.h" #include "pxp/intel_pxp.h" + +#include "i915_bo.h" #include "i915_debugfs.h" -bool intel_bo_is_tiled(struct drm_gem_object *obj) +static bool i915_bo_is_tiled(struct drm_gem_object *obj) { return i915_gem_object_is_tiled(to_intel_bo(obj)); } -bool intel_bo_is_userptr(struct drm_gem_object *obj) +static bool i915_bo_is_userptr(struct drm_gem_object *obj) { return i915_gem_object_is_userptr(to_intel_bo(obj)); } -bool intel_bo_is_shmem(struct drm_gem_object *obj) +static bool i915_bo_is_shmem(struct drm_gem_object *obj) { return i915_gem_object_is_shmem(to_intel_bo(obj)); } -bool intel_bo_is_protected(struct drm_gem_object *obj) +static bool i915_bo_is_protected(struct drm_gem_object *obj) { return i915_gem_object_is_protected(to_intel_bo(obj)); } -int intel_bo_key_check(struct drm_gem_object *obj) +static int i915_bo_key_check(struct drm_gem_object *obj) { return intel_pxp_key_check(obj, false); } -int intel_bo_fb_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) +static int i915_bo_fb_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) { return i915_gem_fb_mmap(to_intel_bo(obj), vma); } -int intel_bo_read_from_page(struct drm_gem_object *obj, u64 offset, void *dst, int size) +static int i915_bo_read_from_page(struct drm_gem_object *obj, u64 offset, void *dst, int size) { return i915_gem_object_read_from_page(to_intel_bo(obj), offset, dst, size); } -void intel_bo_describe(struct seq_file *m, struct drm_gem_object *obj) +static void i915_bo_describe(struct seq_file *m, struct drm_gem_object *obj) { i915_debugfs_describe_obj(m, to_intel_bo(obj)); } + +const struct intel_display_bo_interface i915_display_bo_interface = { + .is_tiled = i915_bo_is_tiled, + .is_userptr = i915_bo_is_userptr, + .is_shmem = i915_bo_is_shmem, + .is_protected = i915_bo_is_protected, + .key_check = i915_bo_key_check, + .fb_mmap = i915_bo_fb_mmap, + .read_from_page = 
i915_bo_read_from_page, + .describe = i915_bo_describe, +}; diff --git a/drivers/gpu/drm/i915/i915_bo.h b/drivers/gpu/drm/i915/i915_bo.h new file mode 100644 index 000000000000..57255d052dd9 --- /dev/null +++ b/drivers/gpu/drm/i915/i915_bo.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef __I915_BO_H__ +#define __I915_BO_H__ + +extern const struct intel_display_bo_interface i915_display_bo_interface; + +#endif /* __I915_BO_H__ */ diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index 7a8c59a8c865..385a634c3ed0 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -90,6 +90,7 @@ #include "pxp/intel_pxp_debugfs.h" #include "pxp/intel_pxp_pm.h" +#include "i915_bo.h" #include "i915_debugfs.h" #include "i915_display_pc8.h" #include "i915_dpt.h" @@ -765,6 +766,7 @@ static bool vgpu_active(struct drm_device *drm) } static const struct intel_display_parent_interface parent = { + .bo = &i915_display_bo_interface, .dpt = &i915_display_dpt_interface, .dsb = &i915_display_dsb_interface, .frontbuffer = &i915_display_frontbuffer_interface, diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index ac8c1f2cb7f9..10b4ed30f843 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -233,6 +233,7 @@ xe-$(CONFIG_DRM_XE_DISPLAY) += \ i915-display/intel_audio.o \ i915-display/intel_backlight.o \ i915-display/intel_bios.o \ + i915-display/intel_bo.o \ i915-display/intel_bw.o \ i915-display/intel_casf.o \ i915-display/intel_cdclk.o \ diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index f1e1889a52d3..49b6f98e7391 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -35,6 +35,7 @@ #include "intel_hotplug.h" #include "intel_opregion.h" #include "skl_watermark.h" +#include "xe_display_bo.h" #include "xe_display_pcode.h" #include 
"xe_display_rpm.h" #include "xe_dsb_buffer.h" @@ -541,6 +542,7 @@ static const struct intel_display_irq_interface xe_display_irq_interface = { }; static const struct intel_display_parent_interface parent = { + .bo = &xe_display_bo_interface, .dsb = &xe_display_dsb_interface, .frontbuffer = &xe_display_frontbuffer_interface, .hdcp = &xe_display_hdcp_interface, diff --git a/drivers/gpu/drm/xe/display/xe_display_bo.c b/drivers/gpu/drm/xe/display/xe_display_bo.c index fa1f2c796b81..a53ba3f247ec 100644 --- a/drivers/gpu/drm/xe/display/xe_display_bo.c +++ b/drivers/gpu/drm/xe/display/xe_display_bo.c @@ -2,52 +2,27 @@ /* Copyright © 2024 Intel Corporation */ #include +#include -#include "intel_bo.h" -#include "intel_frontbuffer.h" #include "xe_bo.h" +#include "xe_display_bo.h" #include "xe_pxp.h" -bool intel_bo_is_tiled(struct drm_gem_object *obj) -{ - /* legacy tiling is unused */ - return false; -} - -bool intel_bo_is_userptr(struct drm_gem_object *obj) -{ - /* xe does not have userptr bos */ - return false; -} - -bool intel_bo_is_shmem(struct drm_gem_object *obj) -{ - return false; -} - -bool intel_bo_is_protected(struct drm_gem_object *obj) +static bool xe_display_bo_is_protected(struct drm_gem_object *obj) { return xe_bo_is_protected(gem_to_xe_bo(obj)); } -int intel_bo_key_check(struct drm_gem_object *obj) -{ - return xe_pxp_obj_key_check(obj); -} - -int intel_bo_fb_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) -{ - return drm_gem_prime_mmap(obj, vma); -} - -int intel_bo_read_from_page(struct drm_gem_object *obj, u64 offset, void *dst, int size) +static int xe_display_bo_read_from_page(struct drm_gem_object *obj, u64 offset, void *dst, int size) { struct xe_bo *bo = gem_to_xe_bo(obj); return xe_bo_read(bo, offset, dst, size); } -void intel_bo_describe(struct seq_file *m, struct drm_gem_object *obj) -{ - /* FIXME */ -} +const struct intel_display_bo_interface xe_display_bo_interface = { + .is_protected = xe_display_bo_is_protected, + .key_check = 
xe_pxp_obj_key_check, + .fb_mmap = drm_gem_prime_mmap, + .read_from_page = xe_display_bo_read_from_page, +}; diff --git a/drivers/gpu/drm/xe/display/xe_display_bo.h b/drivers/gpu/drm/xe/display/xe_display_bo.h new file mode 100644 index 000000000000..6879c104b0b1 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_display_bo.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef __XE_DISPLAY_BO_H__ +#define __XE_DISPLAY_BO_H__ + +extern const struct intel_display_bo_interface xe_display_bo_interface; + +#endif diff --git a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index c044472b9400..2b53d12b0e0a 100644 --- a/include/drm/intel/display_parent_interface.h +++ b/include/drm/intel/display_parent_interface.h @@ -23,9 +23,22 @@ struct intel_initial_plane_config; struct intel_panic; struct intel_stolen_node; struct ref_tracker; +struct seq_file; +struct vm_area_struct; /* Keep struct definitions sorted */ +struct intel_display_bo_interface { + bool (*is_tiled)(struct drm_gem_object *obj); /* Optional */ + bool (*is_userptr)(struct drm_gem_object *obj); /* Optional */ + bool (*is_shmem)(struct drm_gem_object *obj); /* Optional */ + bool (*is_protected)(struct drm_gem_object *obj); + int (*key_check)(struct drm_gem_object *obj); + int (*fb_mmap)(struct drm_gem_object *obj, struct vm_area_struct *vma); + int (*read_from_page)(struct drm_gem_object *obj, u64 offset, void *dst, int size); + void (*describe)(struct seq_file *m, struct drm_gem_object *obj); /* Optional */ +}; + struct intel_display_dpt_interface { struct intel_dpt *(*create)(struct drm_gem_object *obj, size_t size); void (*destroy)(struct intel_dpt *dpt); @@ -174,6 +187,9 @@ struct intel_display_vma_interface { * check the optional pointers. */ struct intel_display_parent_interface { + /** @bo: BO interface */ + const struct intel_display_bo_interface *bo; + /** @dpt: DPT interface. Optional. 
*/ const struct intel_display_dpt_interface *dpt; -- cgit v1.2.3 From 9876394f64a7c166964e003585806473ad6f532b Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 11 Mar 2026 16:18:18 +0200 Subject: drm/{i915,xe}: move framebuffer bo to parent interface Add .framebuffer_init, .framebuffer_fini and .framebuffer_lookup to the bo parent interface. While they're about framebuffers, they're specifically about framebuffer objects, so the bo interface is a good enough fit, and there's no need to add another interface struct. Reviewed-by: Suraj Kandpal Link: https://patch.msgid.link/848d32a44bf844cba3d66e44ba9f20bea4a8352d.1773238670.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/Makefile | 1 - drivers/gpu/drm/i915/display/intel_bo.c | 21 ++++++ drivers/gpu/drm/i915/display/intel_bo.h | 9 +++ drivers/gpu/drm/i915/display/intel_fb.c | 12 ++-- drivers/gpu/drm/i915/display/intel_fb_bo.c | 99 ---------------------------- drivers/gpu/drm/i915/display/intel_fb_bo.h | 25 ------- drivers/gpu/drm/i915/i915_bo.c | 92 ++++++++++++++++++++++++++ drivers/gpu/drm/xe/Makefile | 1 - drivers/gpu/drm/xe/display/intel_fb_bo.c | 91 ------------------------- drivers/gpu/drm/xe/display/xe_display_bo.c | 84 +++++++++++++++++++++++ include/drm/intel/display_parent_interface.h | 6 ++ 11 files changed, 218 insertions(+), 223 deletions(-) delete mode 100644 drivers/gpu/drm/i915/display/intel_fb_bo.c delete mode 100644 drivers/gpu/drm/i915/display/intel_fb_bo.h delete mode 100644 drivers/gpu/drm/xe/display/intel_fb_bo.c (limited to 'include') diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 425933fb26a5..be976a90c5a6 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -278,7 +278,6 @@ i915-y += \ display/intel_drrs.o \ display/intel_dsb.o \ display/intel_fb.o \ - display/intel_fb_bo.o \ display/intel_fb_pin.o \ display/intel_fbc.o \ display/intel_fdi.o \ diff --git a/drivers/gpu/drm/i915/display/intel_bo.c 
b/drivers/gpu/drm/i915/display/intel_bo.c index e356ab4e0640..3b82d38a0504 100644 --- a/drivers/gpu/drm/i915/display/intel_bo.c +++ b/drivers/gpu/drm/i915/display/intel_bo.c @@ -64,3 +64,24 @@ void intel_bo_describe(struct seq_file *m, struct drm_gem_object *obj) if (display->parent->bo->describe) display->parent->bo->describe(m, obj); } + +int intel_bo_framebuffer_init(struct drm_gem_object *obj, struct drm_mode_fb_cmd2 *mode_cmd) +{ + struct intel_display *display = to_intel_display(obj->dev); + + return display->parent->bo->framebuffer_init(obj, mode_cmd); +} + +void intel_bo_framebuffer_fini(struct drm_gem_object *obj) +{ + struct intel_display *display = to_intel_display(obj->dev); + + display->parent->bo->framebuffer_fini(obj); +} + +struct drm_gem_object *intel_bo_framebuffer_lookup(struct intel_display *display, + struct drm_file *filp, + const struct drm_mode_fb_cmd2 *user_mode_cmd) +{ + return display->parent->bo->framebuffer_lookup(display->drm, filp, user_mode_cmd); +} diff --git a/drivers/gpu/drm/i915/display/intel_bo.h b/drivers/gpu/drm/i915/display/intel_bo.h index 40390ed92ceb..aec188c706c2 100644 --- a/drivers/gpu/drm/i915/display/intel_bo.h +++ b/drivers/gpu/drm/i915/display/intel_bo.h @@ -6,8 +6,11 @@ #include +struct drm_file; struct drm_gem_object; +struct drm_mode_fb_cmd2; struct drm_scanout_buffer; +struct intel_display; struct intel_framebuffer; struct seq_file; struct vm_area_struct; @@ -22,4 +25,10 @@ int intel_bo_read_from_page(struct drm_gem_object *obj, u64 offset, void *dst, i void intel_bo_describe(struct seq_file *m, struct drm_gem_object *obj); +void intel_bo_framebuffer_fini(struct drm_gem_object *obj); +int intel_bo_framebuffer_init(struct drm_gem_object *obj, struct drm_mode_fb_cmd2 *mode_cmd); +struct drm_gem_object *intel_bo_framebuffer_lookup(struct intel_display *display, + struct drm_file *filp, + const struct drm_mode_fb_cmd2 *user_mode_cmd); + #endif /* __INTEL_BO__ */ diff --git a/drivers/gpu/drm/i915/display/intel_fb.c 
b/drivers/gpu/drm/i915/display/intel_fb.c index 49c6ca9d94c6..5768619f840f 100644 --- a/drivers/gpu/drm/i915/display/intel_fb.c +++ b/drivers/gpu/drm/i915/display/intel_fb.c @@ -17,7 +17,6 @@ #include "intel_display_types.h" #include "intel_display_utils.h" #include "intel_fb.h" -#include "intel_fb_bo.h" #include "intel_frontbuffer.h" #include "intel_parent.h" #include "intel_plane.h" @@ -2111,7 +2110,7 @@ static void intel_user_framebuffer_destroy(struct drm_framebuffer *fb) if (intel_fb_uses_dpt(fb)) intel_parent_dpt_destroy(display, intel_fb->dpt); - intel_fb_bo_framebuffer_fini(intel_fb_bo(fb)); + intel_bo_framebuffer_fini(intel_fb_bo(fb)); intel_parent_frontbuffer_put(display, intel_fb->frontbuffer); @@ -2222,7 +2221,7 @@ int intel_framebuffer_init(struct intel_framebuffer *intel_fb, /* * intel_parent_frontbuffer_get() must be done before - * intel_fb_bo_framebuffer_init() to avoid set_tiling vs. addfb race. + * intel_bo_framebuffer_init() to avoid set_tiling vs. addfb race. */ intel_fb->frontbuffer = intel_parent_frontbuffer_get(display, obj); if (!intel_fb->frontbuffer) { @@ -2230,7 +2229,7 @@ int intel_framebuffer_init(struct intel_framebuffer *intel_fb, goto err_free_panic; } - ret = intel_fb_bo_framebuffer_init(obj, mode_cmd); + ret = intel_bo_framebuffer_init(obj, mode_cmd); if (ret) goto err_frontbuffer_put; @@ -2333,7 +2332,7 @@ err_free_dpt: if (intel_fb_uses_dpt(fb)) intel_parent_dpt_destroy(display, intel_fb->dpt); err_bo_framebuffer_fini: - intel_fb_bo_framebuffer_fini(obj); + intel_bo_framebuffer_fini(obj); err_frontbuffer_put: intel_parent_frontbuffer_put(display, intel_fb->frontbuffer); err_free_panic: @@ -2348,11 +2347,12 @@ intel_user_framebuffer_create(struct drm_device *dev, const struct drm_format_info *info, const struct drm_mode_fb_cmd2 *user_mode_cmd) { + struct intel_display *display = to_intel_display(dev); struct drm_framebuffer *fb; struct drm_gem_object *obj; struct drm_mode_fb_cmd2 mode_cmd = *user_mode_cmd; - obj = 
intel_fb_bo_lookup_valid_bo(dev, filp, &mode_cmd); + obj = intel_bo_framebuffer_lookup(display, filp, &mode_cmd); if (IS_ERR(obj)) return ERR_CAST(obj); diff --git a/drivers/gpu/drm/i915/display/intel_fb_bo.c b/drivers/gpu/drm/i915/display/intel_fb_bo.c deleted file mode 100644 index a4d49ef450d9..000000000000 --- a/drivers/gpu/drm/i915/display/intel_fb_bo.c +++ /dev/null @@ -1,99 +0,0 @@ -/* SPDX-License-Identifier: MIT */ -/* - * Copyright © 2021 Intel Corporation - */ - -#include -#include - -#include "gem/i915_gem_object.h" - -#include "i915_drv.h" -#include "intel_fb.h" -#include "intel_fb_bo.h" - -void intel_fb_bo_framebuffer_fini(struct drm_gem_object *obj) -{ - /* Nothing to do for i915 */ -} - -int intel_fb_bo_framebuffer_init(struct drm_gem_object *_obj, - struct drm_mode_fb_cmd2 *mode_cmd) -{ - struct drm_i915_gem_object *obj = to_intel_bo(_obj); - struct drm_i915_private *i915 = to_i915(obj->base.dev); - unsigned int tiling, stride; - - i915_gem_object_lock(obj, NULL); - tiling = i915_gem_object_get_tiling(obj); - stride = i915_gem_object_get_stride(obj); - i915_gem_object_unlock(obj); - - if (mode_cmd->flags & DRM_MODE_FB_MODIFIERS) { - /* - * If there's a fence, enforce that - * the fb modifier and tiling mode match. - */ - if (tiling != I915_TILING_NONE && - tiling != intel_fb_modifier_to_tiling(mode_cmd->modifier[0])) { - drm_dbg_kms(&i915->drm, - "tiling_mode doesn't match fb modifier\n"); - return -EINVAL; - } - } else { - if (tiling == I915_TILING_X) { - mode_cmd->modifier[0] = I915_FORMAT_MOD_X_TILED; - } else if (tiling == I915_TILING_Y) { - drm_dbg_kms(&i915->drm, - "No Y tiling for legacy addfb\n"); - return -EINVAL; - } - } - - /* - * gen2/3 display engine uses the fence if present, - * so the tiling mode must match the fb modifier exactly. 
- */ - if (GRAPHICS_VER(i915) < 4 && - tiling != intel_fb_modifier_to_tiling(mode_cmd->modifier[0])) { - drm_dbg_kms(&i915->drm, - "tiling_mode must match fb modifier exactly on gen2/3\n"); - return -EINVAL; - } - - /* - * If there's a fence, enforce that - * the fb pitch and fence stride match. - */ - if (tiling != I915_TILING_NONE && mode_cmd->pitches[0] != stride) { - drm_dbg_kms(&i915->drm, - "pitch (%d) must match tiling stride (%d)\n", - mode_cmd->pitches[0], stride); - return -EINVAL; - } - - return 0; -} - -struct drm_gem_object * -intel_fb_bo_lookup_valid_bo(struct drm_device *drm, - struct drm_file *filp, - const struct drm_mode_fb_cmd2 *mode_cmd) -{ - struct drm_i915_private *i915 = to_i915(drm); - struct drm_i915_gem_object *obj; - - obj = i915_gem_object_lookup(filp, mode_cmd->handles[0]); - if (!obj) - return ERR_PTR(-ENOENT); - - /* object is backed with LMEM for discrete */ - if (HAS_LMEM(i915) && !i915_gem_object_can_migrate(obj, INTEL_REGION_LMEM_0)) { - /* object is "remote", not in local memory */ - i915_gem_object_put(obj); - drm_dbg_kms(&i915->drm, "framebuffer must reside in local memory\n"); - return ERR_PTR(-EREMOTE); - } - - return intel_bo_to_drm_bo(obj); -} diff --git a/drivers/gpu/drm/i915/display/intel_fb_bo.h b/drivers/gpu/drm/i915/display/intel_fb_bo.h deleted file mode 100644 index d775773c6c03..000000000000 --- a/drivers/gpu/drm/i915/display/intel_fb_bo.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: MIT */ -/* - * Copyright © 2021 Intel Corporation - */ - -#ifndef __INTEL_FB_BO_H__ -#define __INTEL_FB_BO_H__ - -struct drm_device; -struct drm_file; -struct drm_framebuffer; -struct drm_gem_object; -struct drm_mode_fb_cmd2; - -void intel_fb_bo_framebuffer_fini(struct drm_gem_object *obj); - -int intel_fb_bo_framebuffer_init(struct drm_gem_object *obj, - struct drm_mode_fb_cmd2 *mode_cmd); - -struct drm_gem_object * -intel_fb_bo_lookup_valid_bo(struct drm_device *drm, - struct drm_file *filp, - const struct 
drm_mode_fb_cmd2 *user_mode_cmd); - -#endif diff --git a/drivers/gpu/drm/i915/i915_bo.c b/drivers/gpu/drm/i915/i915_bo.c index 04fc0e3b7ef6..1789f7cab05c 100644 --- a/drivers/gpu/drm/i915/i915_bo.c +++ b/drivers/gpu/drm/i915/i915_bo.c @@ -2,8 +2,10 @@ /* Copyright © 2024 Intel Corporation */ #include +#include #include +#include "display/intel_fb.h" #include "gem/i915_gem_mman.h" #include "gem/i915_gem_object.h" #include "gem/i915_gem_object_frontbuffer.h" @@ -11,6 +13,7 @@ #include "i915_bo.h" #include "i915_debugfs.h" +#include "i915_drv.h" static bool i915_bo_is_tiled(struct drm_gem_object *obj) { @@ -52,6 +55,92 @@ static void i915_bo_describe(struct seq_file *m, struct drm_gem_object *obj) i915_debugfs_describe_obj(m, to_intel_bo(obj)); } +static int i915_bo_framebuffer_init(struct drm_gem_object *_obj, + struct drm_mode_fb_cmd2 *mode_cmd) +{ + struct drm_i915_gem_object *obj = to_intel_bo(_obj); + struct drm_i915_private *i915 = to_i915(obj->base.dev); + unsigned int tiling, stride; + + i915_gem_object_lock(obj, NULL); + tiling = i915_gem_object_get_tiling(obj); + stride = i915_gem_object_get_stride(obj); + i915_gem_object_unlock(obj); + + if (mode_cmd->flags & DRM_MODE_FB_MODIFIERS) { + /* + * If there's a fence, enforce that + * the fb modifier and tiling mode match. + */ + if (tiling != I915_TILING_NONE && + tiling != intel_fb_modifier_to_tiling(mode_cmd->modifier[0])) { + drm_dbg_kms(&i915->drm, + "tiling_mode doesn't match fb modifier\n"); + return -EINVAL; + } + } else { + if (tiling == I915_TILING_X) { + mode_cmd->modifier[0] = I915_FORMAT_MOD_X_TILED; + } else if (tiling == I915_TILING_Y) { + drm_dbg_kms(&i915->drm, + "No Y tiling for legacy addfb\n"); + return -EINVAL; + } + } + + /* + * gen2/3 display engine uses the fence if present, + * so the tiling mode must match the fb modifier exactly. 
+ */ + if (GRAPHICS_VER(i915) < 4 && + tiling != intel_fb_modifier_to_tiling(mode_cmd->modifier[0])) { + drm_dbg_kms(&i915->drm, + "tiling_mode must match fb modifier exactly on gen2/3\n"); + return -EINVAL; + } + + /* + * If there's a fence, enforce that + * the fb pitch and fence stride match. + */ + if (tiling != I915_TILING_NONE && mode_cmd->pitches[0] != stride) { + drm_dbg_kms(&i915->drm, + "pitch (%d) must match tiling stride (%d)\n", + mode_cmd->pitches[0], stride); + return -EINVAL; + } + + return 0; +} + +static void i915_bo_framebuffer_fini(struct drm_gem_object *obj) +{ + /* Nothing to do for i915 */ +} + +static struct drm_gem_object * +i915_bo_framebuffer_lookup(struct drm_device *drm, + struct drm_file *filp, + const struct drm_mode_fb_cmd2 *mode_cmd) +{ + struct drm_i915_private *i915 = to_i915(drm); + struct drm_i915_gem_object *obj; + + obj = i915_gem_object_lookup(filp, mode_cmd->handles[0]); + if (!obj) + return ERR_PTR(-ENOENT); + + /* object is backed with LMEM for discrete */ + if (HAS_LMEM(i915) && !i915_gem_object_can_migrate(obj, INTEL_REGION_LMEM_0)) { + /* object is "remote", not in local memory */ + i915_gem_object_put(obj); + drm_dbg_kms(&i915->drm, "framebuffer must reside in local memory\n"); + return ERR_PTR(-EREMOTE); + } + + return intel_bo_to_drm_bo(obj); +} + const struct intel_display_bo_interface i915_display_bo_interface = { .is_tiled = i915_bo_is_tiled, .is_userptr = i915_bo_is_userptr, @@ -61,4 +150,7 @@ const struct intel_display_bo_interface i915_display_bo_interface = { .fb_mmap = i915_bo_fb_mmap, .read_from_page = i915_bo_read_from_page, .describe = i915_bo_describe, + .framebuffer_init = i915_bo_framebuffer_init, + .framebuffer_fini = i915_bo_framebuffer_fini, + .framebuffer_lookup = i915_bo_framebuffer_lookup, }; diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 10b4ed30f843..468599492af1 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -209,7 +209,6 @@ 
$(obj)/i915-display/%.o: $(srctree)/drivers/gpu/drm/i915/display/%.c FORCE # Display code specific to xe xe-$(CONFIG_DRM_XE_DISPLAY) += \ - display/intel_fb_bo.o \ display/intel_fbdev_fb.o \ display/xe_display.o \ display/xe_display_bo.o \ diff --git a/drivers/gpu/drm/xe/display/intel_fb_bo.c b/drivers/gpu/drm/xe/display/intel_fb_bo.c deleted file mode 100644 index db8b1a27b4de..000000000000 --- a/drivers/gpu/drm/xe/display/intel_fb_bo.c +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: MIT */ -/* - * Copyright © 2021 Intel Corporation - */ - -#include -#include - -#include "intel_display_types.h" -#include "intel_fb.h" -#include "intel_fb_bo.h" -#include "xe_bo.h" - -void intel_fb_bo_framebuffer_fini(struct drm_gem_object *obj) -{ - struct xe_bo *bo = gem_to_xe_bo(obj); - - if (bo->flags & XE_BO_FLAG_PINNED) { - /* Unpin our kernel fb first */ - xe_bo_lock(bo, false); - xe_bo_unpin(bo); - xe_bo_unlock(bo); - } - xe_bo_put(bo); -} - -int intel_fb_bo_framebuffer_init(struct drm_gem_object *obj, - struct drm_mode_fb_cmd2 *mode_cmd) -{ - struct xe_bo *bo = gem_to_xe_bo(obj); - struct xe_device *xe = to_xe_device(bo->ttm.base.dev); - int ret; - - /* - * Some modifiers require physical alignment of 64KiB VRAM pages; - * require that the BO in those cases is created correctly. - */ - if (XE_IOCTL_DBG(xe, intel_fb_needs_64k_phys(mode_cmd->modifier[0]) && - !(bo->flags & XE_BO_FLAG_NEEDS_64K))) - return -EINVAL; - - xe_bo_get(bo); - - ret = ttm_bo_reserve(&bo->ttm, true, false, NULL); - if (ret) - goto err; - - if (!(bo->flags & XE_BO_FLAG_SCANOUT)) { - /* - * XE_BO_FLAG_SCANOUT should ideally be set at creation, or is - * automatically set when creating FB. We cannot change caching - * mode when the bo is VM_BINDed, so we can only set - * coherency with display when unbound. 
- */ - if (XE_IOCTL_DBG(xe, xe_bo_is_vm_bound(bo))) { - ttm_bo_unreserve(&bo->ttm); - ret = -EINVAL; - goto err; - } - bo->flags |= XE_BO_FLAG_SCANOUT; - } - ttm_bo_unreserve(&bo->ttm); - return 0; - -err: - xe_bo_put(bo); - return ret; -} - -struct drm_gem_object *intel_fb_bo_lookup_valid_bo(struct drm_device *drm, - struct drm_file *filp, - const struct drm_mode_fb_cmd2 *mode_cmd) -{ - struct xe_device *xe = to_xe_device(drm); - struct xe_bo *bo; - struct drm_gem_object *gem = drm_gem_object_lookup(filp, mode_cmd->handles[0]); - - if (!gem) - return ERR_PTR(-ENOENT); - - bo = gem_to_xe_bo(gem); - /* Require vram placement or dma-buf import */ - if (IS_DGFX(xe) && - !xe_bo_can_migrate(bo, XE_PL_VRAM0) && - bo->ttm.type != ttm_bo_type_sg) { - drm_gem_object_put(gem); - return ERR_PTR(-EREMOTE); - } - - return gem; -} diff --git a/drivers/gpu/drm/xe/display/xe_display_bo.c b/drivers/gpu/drm/xe/display/xe_display_bo.c index a53ba3f247ec..a689f71e7b14 100644 --- a/drivers/gpu/drm/xe/display/xe_display_bo.c +++ b/drivers/gpu/drm/xe/display/xe_display_bo.c @@ -4,6 +4,7 @@ #include #include +#include "intel_fb.h" #include "xe_bo.h" #include "xe_display_bo.h" #include "xe_pxp.h" @@ -20,9 +21,92 @@ static int xe_display_bo_read_from_page(struct drm_gem_object *obj, u64 offset, return xe_bo_read(bo, offset, dst, size); } +static int xe_display_bo_framebuffer_init(struct drm_gem_object *obj, + struct drm_mode_fb_cmd2 *mode_cmd) +{ + struct xe_bo *bo = gem_to_xe_bo(obj); + struct xe_device *xe = to_xe_device(bo->ttm.base.dev); + int ret; + + /* + * Some modifiers require physical alignment of 64KiB VRAM pages; + * require that the BO in those cases is created correctly. 
+ */ + if (XE_IOCTL_DBG(xe, intel_fb_needs_64k_phys(mode_cmd->modifier[0]) && + !(bo->flags & XE_BO_FLAG_NEEDS_64K))) + return -EINVAL; + + xe_bo_get(bo); + + ret = ttm_bo_reserve(&bo->ttm, true, false, NULL); + if (ret) + goto err; + + if (!(bo->flags & XE_BO_FLAG_SCANOUT)) { + /* + * XE_BO_FLAG_SCANOUT should ideally be set at creation, or is + * automatically set when creating FB. We cannot change caching + * mode when the bo is VM_BINDed, so we can only set + * coherency with display when unbound. + */ + if (XE_IOCTL_DBG(xe, xe_bo_is_vm_bound(bo))) { + ttm_bo_unreserve(&bo->ttm); + ret = -EINVAL; + goto err; + } + bo->flags |= XE_BO_FLAG_SCANOUT; + } + ttm_bo_unreserve(&bo->ttm); + return 0; + +err: + xe_bo_put(bo); + return ret; +} + +static void xe_display_bo_framebuffer_fini(struct drm_gem_object *obj) +{ + struct xe_bo *bo = gem_to_xe_bo(obj); + + if (bo->flags & XE_BO_FLAG_PINNED) { + /* Unpin our kernel fb first */ + xe_bo_lock(bo, false); + xe_bo_unpin(bo); + xe_bo_unlock(bo); + } + xe_bo_put(bo); +} + +static struct drm_gem_object * +xe_display_bo_framebuffer_lookup(struct drm_device *drm, + struct drm_file *filp, + const struct drm_mode_fb_cmd2 *mode_cmd) +{ + struct xe_device *xe = to_xe_device(drm); + struct xe_bo *bo; + struct drm_gem_object *gem = drm_gem_object_lookup(filp, mode_cmd->handles[0]); + + if (!gem) + return ERR_PTR(-ENOENT); + + bo = gem_to_xe_bo(gem); + /* Require vram placement or dma-buf import */ + if (IS_DGFX(xe) && + !xe_bo_can_migrate(bo, XE_PL_VRAM0) && + bo->ttm.type != ttm_bo_type_sg) { + drm_gem_object_put(gem); + return ERR_PTR(-EREMOTE); + } + + return gem; +} + const struct intel_display_bo_interface xe_display_bo_interface = { .is_protected = xe_display_bo_is_protected, .key_check = xe_pxp_obj_key_check, .fb_mmap = drm_gem_prime_mmap, .read_from_page = xe_display_bo_read_from_page, + .framebuffer_init = xe_display_bo_framebuffer_init, + .framebuffer_fini = xe_display_bo_framebuffer_fini, + .framebuffer_lookup = 
xe_display_bo_framebuffer_lookup, }; diff --git a/include/drm/intel/display_parent_interface.h b/include/drm/intel/display_parent_interface.h index 2b53d12b0e0a..97ec94a2e749 100644 --- a/include/drm/intel/display_parent_interface.h +++ b/include/drm/intel/display_parent_interface.h @@ -12,6 +12,7 @@ struct drm_device; struct drm_file; struct drm_framebuffer; struct drm_gem_object; +struct drm_mode_fb_cmd2; struct drm_plane_state; struct drm_scanout_buffer; struct i915_vma; @@ -37,6 +38,11 @@ struct intel_display_bo_interface { int (*fb_mmap)(struct drm_gem_object *obj, struct vm_area_struct *vma); int (*read_from_page)(struct drm_gem_object *obj, u64 offset, void *dst, int size); void (*describe)(struct seq_file *m, struct drm_gem_object *obj); /* Optional */ + int (*framebuffer_init)(struct drm_gem_object *obj, struct drm_mode_fb_cmd2 *mode_cmd); + void (*framebuffer_fini)(struct drm_gem_object *obj); + struct drm_gem_object *(*framebuffer_lookup)(struct drm_device *drm, + struct drm_file *filp, + const struct drm_mode_fb_cmd2 *user_mode_cmd); }; struct intel_display_dpt_interface { -- cgit v1.2.3 From a25f48fd920b557e6ad02f692f690520c82f5914 Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Wed, 11 Mar 2026 15:31:20 +0100 Subject: gpio: kempld: Implement the interrupt controller Add a GPIO IRQ chip implementation for the kempld GPIO controller. Of note is only how the parent IRQ is obtained. The IRQ for the GPIO controller can be configured in the BIOS, along with the IRQ for the I2C controller. These IRQ are returned by ACPI but this information is only usable if both IRQ are configured. When only one is configured, only one is returned making it impossible to know which one it is. Luckily the BIOS will set the configured IRQ in the PLD registers, so it can be read from there instead, and that also work on platforms without ACPI. 
The vendor driver allowed to override the IRQ using a module parameter, so there are boards in the field which used this parameter instead of properly configuring the BIOS. This implementation provides this as well for compatibility. Signed-off-by: Alban Bedel Link: https://patch.msgid.link/20260311143120.2179347-5-alban.bedel@lht.dlh.de Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 1 + drivers/gpio/gpio-kempld.c | 192 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/kempld.h | 1 + 3 files changed, 194 insertions(+) (limited to 'include') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index b45fb799e36c..d665afe19709 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1440,6 +1440,7 @@ config GPIO_JANZ_TTL config GPIO_KEMPLD tristate "Kontron ETX / COMexpress GPIO" depends on MFD_KEMPLD + select GPIOLIB_IRQCHIP help This enables support for the PLD GPIO interface on some Kontron ETX and COMexpress (ETXexpress) modules. diff --git a/drivers/gpio/gpio-kempld.c b/drivers/gpio/gpio-kempld.c index 7dd94ff6f2df..5a63df3ea5fa 100644 --- a/drivers/gpio/gpio-kempld.c +++ b/drivers/gpio/gpio-kempld.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -19,13 +20,26 @@ #define KEMPLD_GPIO_MASK(x) (BIT((x) % 8)) #define KEMPLD_GPIO_DIR 0x40 #define KEMPLD_GPIO_LVL 0x42 +#define KEMPLD_GPIO_STS 0x44 #define KEMPLD_GPIO_EVT_LVL_EDGE 0x46 +#define KEMPLD_GPIO_EVT_LOW_HIGH 0x48 #define KEMPLD_GPIO_IEN 0x4A +#define KEMPLD_GPIO_OUT_LVL 0x4E + +/* The IRQ to use if none was configured in the BIOS */ +static unsigned int gpio_irq; +module_param_hw(gpio_irq, uint, irq, 0444); +MODULE_PARM_DESC(gpio_irq, "Set legacy GPIO IRQ (1-15)"); struct kempld_gpio_data { struct gpio_chip chip; struct kempld_device_data *pld; u8 out_lvl_reg; + + struct mutex irq_lock; + u16 ien; + u16 evt_low_high; + u16 evt_lvl_edge; }; /* @@ -193,6 +207,180 @@ static int kempld_gpio_pincount(struct kempld_device_data *pld) return 
evt ? __ffs(evt) : 16; } +static void kempld_irq_mask(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct kempld_gpio_data *gpio = gpiochip_get_data(chip); + + gpio->ien &= ~BIT(irqd_to_hwirq(data)); + gpiochip_disable_irq(chip, irqd_to_hwirq(data)); +} + +static void kempld_irq_unmask(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct kempld_gpio_data *gpio = gpiochip_get_data(chip); + + gpiochip_enable_irq(chip, irqd_to_hwirq(data)); + gpio->ien |= BIT(irqd_to_hwirq(data)); +} + +static int kempld_irq_set_type(struct irq_data *data, unsigned int type) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct kempld_gpio_data *gpio = gpiochip_get_data(chip); + + switch (type) { + case IRQ_TYPE_EDGE_RISING: + gpio->evt_low_high |= BIT(data->hwirq); + gpio->evt_lvl_edge |= BIT(data->hwirq); + break; + case IRQ_TYPE_EDGE_FALLING: + gpio->evt_low_high &= ~BIT(data->hwirq); + gpio->evt_lvl_edge |= BIT(data->hwirq); + break; + case IRQ_TYPE_LEVEL_HIGH: + gpio->evt_low_high |= BIT(data->hwirq); + gpio->evt_lvl_edge &= ~BIT(data->hwirq); + break; + case IRQ_TYPE_LEVEL_LOW: + gpio->evt_low_high &= ~BIT(data->hwirq); + gpio->evt_lvl_edge &= ~BIT(data->hwirq); + break; + default: + return -EINVAL; + } + + return 0; +} + +static void kempld_irq_bus_lock(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct kempld_gpio_data *gpio = gpiochip_get_data(chip); + + mutex_lock(&gpio->irq_lock); +} + +static void kempld_irq_bus_sync_unlock(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct kempld_gpio_data *gpio = gpiochip_get_data(chip); + struct kempld_device_data *pld = gpio->pld; + + kempld_get_mutex(pld); + kempld_write16(pld, KEMPLD_GPIO_EVT_LVL_EDGE, gpio->evt_lvl_edge); + kempld_write16(pld, KEMPLD_GPIO_EVT_LOW_HIGH, gpio->evt_low_high); + kempld_write16(pld, KEMPLD_GPIO_IEN, gpio->ien); 
+ kempld_release_mutex(pld); + + mutex_unlock(&gpio->irq_lock); +} + +static const struct irq_chip kempld_irqchip = { + .name = "kempld-gpio", + .irq_mask = kempld_irq_mask, + .irq_unmask = kempld_irq_unmask, + .irq_set_type = kempld_irq_set_type, + .irq_bus_lock = kempld_irq_bus_lock, + .irq_bus_sync_unlock = kempld_irq_bus_sync_unlock, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + +static irqreturn_t kempld_gpio_irq_handler(int irq, void *data) +{ + struct kempld_gpio_data *gpio = data; + struct gpio_chip *chip = &gpio->chip; + unsigned int pin, child_irq; + unsigned long status; + + kempld_get_mutex(gpio->pld); + + status = kempld_read16(gpio->pld, KEMPLD_GPIO_STS); + if (status) + kempld_write16(gpio->pld, KEMPLD_GPIO_STS, status); + + kempld_release_mutex(gpio->pld); + + status &= gpio->ien; + if (!status) + return IRQ_NONE; + + for_each_set_bit(pin, &status, chip->ngpio) { + child_irq = irq_find_mapping(chip->irq.domain, pin); + handle_nested_irq(child_irq); + } + + return IRQ_HANDLED; +} + +static int kempld_gpio_irq_init(struct device *dev, + struct kempld_gpio_data *gpio) +{ + struct kempld_device_data *pld = gpio->pld; + struct gpio_chip *chip = &gpio->chip; + struct gpio_irq_chip *girq; + unsigned int irq; + int ret; + + /* Get the IRQ configured by the BIOS in the PLD */ + kempld_get_mutex(pld); + irq = kempld_read8(pld, KEMPLD_IRQ_GPIO); + kempld_release_mutex(pld); + + if (irq == 0xff) { + dev_info(dev, "GPIO controller has no IRQ support\n"); + return 0; + } + + /* Allow overriding the IRQ with the module parameter */ + if (gpio_irq > 0) { + dev_warn(dev, "Forcing IRQ to %d\n", gpio_irq); + irq &= ~KEMPLD_IRQ_GPIO_MASK; + irq |= gpio_irq & KEMPLD_IRQ_GPIO_MASK; + } + + if (!(irq & KEMPLD_IRQ_GPIO_MASK)) { + dev_warn(dev, "No IRQ configured\n"); + return 0; + } + + /* Get the current config, disable all child interrupts, clear them + * and set the parent IRQ + */ + kempld_get_mutex(pld); + gpio->evt_low_high = kempld_read16(pld, 
KEMPLD_GPIO_EVT_LOW_HIGH); + gpio->evt_lvl_edge = kempld_read16(pld, KEMPLD_GPIO_EVT_LVL_EDGE); + kempld_write16(pld, KEMPLD_GPIO_IEN, 0); + kempld_write16(pld, KEMPLD_GPIO_STS, 0xFFFF); + kempld_write16(pld, KEMPLD_IRQ_GPIO, irq); + kempld_release_mutex(pld); + + girq = &chip->irq; + gpio_irq_chip_set_chip(girq, &kempld_irqchip); + + girq->parent_handler = NULL; + girq->num_parents = 0; + girq->parents = NULL; + girq->default_type = IRQ_TYPE_NONE; + girq->handler = handle_simple_irq; + girq->threaded = true; + + mutex_init(&gpio->irq_lock); + + ret = devm_request_threaded_irq(dev, irq & KEMPLD_IRQ_GPIO_MASK, + NULL, kempld_gpio_irq_handler, + IRQF_ONESHOT, chip->label, + gpio); + if (ret) { + dev_err(dev, "failed to request irq %d\n", irq); + return ret; + } + + return 0; +} + static int kempld_gpio_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -247,6 +435,10 @@ static int kempld_gpio_probe(struct platform_device *pdev) return -ENODEV; } + ret = kempld_gpio_irq_init(dev, gpio); + if (ret) + return ret; + ret = devm_gpiochip_add_data(dev, chip, gpio); if (ret) { dev_err(dev, "Could not register GPIO chip\n"); diff --git a/include/linux/mfd/kempld.h b/include/linux/mfd/kempld.h index 643c096b93ac..2dbd80abfd1d 100644 --- a/include/linux/mfd/kempld.h +++ b/include/linux/mfd/kempld.h @@ -37,6 +37,7 @@ #define KEMPLD_SPEC_GET_MINOR(x) (x & 0x0f) #define KEMPLD_SPEC_GET_MAJOR(x) ((x >> 4) & 0x0f) #define KEMPLD_IRQ_GPIO 0x35 +#define KEMPLD_IRQ_GPIO_MASK 0x0f #define KEMPLD_IRQ_I2C 0x36 #define KEMPLD_CFG 0x37 #define KEMPLD_CFG_GPIO_I2C_MUX (1 << 0) -- cgit v1.2.3 From 428c56525bf5dbc3bd5e30014df1f5213f8bd7c8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 13 Mar 2026 09:22:18 +0100 Subject: jump_label: use ATOMIC_INIT() for initialization of .enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently ATOMIC_INIT() is not used because in the past that macro was provided by 
linux/atomic.h which is not usable from linux/jump_label.h. However since commit 7ca8cf5347f7 ("locking/atomic: Move ATOMIC_INIT into linux/types.h") the macro only requires linux/types.h. Remove the now unnecessary workaround and the associated assertions. Signed-off-by: Thomas Weißschuh Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313-jump_label-cleanup-v2-1-35d3c0bde549@linutronix.de --- include/linux/jump_label.h | 11 ++--------- kernel/jump_label.c | 9 --------- 2 files changed, 2 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index fdb79dd1ebd8..e494b360d36d 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -238,18 +238,11 @@ extern void static_key_enable_cpuslocked(struct static_key *key); extern void static_key_disable_cpuslocked(struct static_key *key); extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); -/* - * We should be using ATOMIC_INIT() for initializing .enabled, but - * the inclusion of atomic.h is problematic for inclusion of jump_label.h - * in 'low-level' headers. Thus, we are initializing .enabled with a - * raw value, but have added a BUILD_BUG_ON() to catch any issues in - * jump_label_init() see: kernel/jump_label.c. 
- */ #define STATIC_KEY_INIT_TRUE \ - { .enabled = { 1 }, \ + { .enabled = ATOMIC_INIT(1), \ { .type = JUMP_TYPE_TRUE } } #define STATIC_KEY_INIT_FALSE \ - { .enabled = { 0 }, \ + { .enabled = ATOMIC_INIT(0), \ { .type = JUMP_TYPE_FALSE } } #else /* !CONFIG_JUMP_LABEL */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 7cb19e601426..e851e4b37d0e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -529,15 +529,6 @@ void __init jump_label_init(void) struct static_key *key = NULL; struct jump_entry *iter; - /* - * Since we are initializing the static_key.enabled field with - * with the 'raw' int values (to avoid pulling in atomic.h) in - * jump_label.h, let's make sure that is safe. There are only two - * cases to check since we initialize to 0 or 1. - */ - BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); - BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); - if (static_key_initialized) return; -- cgit v1.2.3 From acb38872d4cbec5b6825345d9d757e21d2d9d953 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 13 Mar 2026 09:22:19 +0100 Subject: jump_label: remove workaround for old compilers in initializations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The extra braces for the initialization of the anonymous union members were added in commit cd8d860dcce9 ("jump_label: Fix anonymous union initialization") to compensate for limitations in gcc < 4.6. Versions of gcc this old are not supported anymore, so drop the workaround. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313-jump_label-cleanup-v2-2-35d3c0bde549@linutronix.de --- include/linux/jump_label.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index e494b360d36d..b9c7b0ebf7b9 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -87,13 +87,6 @@ struct static_key { atomic_t enabled; #ifdef CONFIG_JUMP_LABEL /* - * Note: - * To make anonymous unions work with old compilers, the static - * initialization of them requires brackets. This creates a dependency - * on the order of the struct with the initializers. If any fields - * are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need - * to be modified. - * * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod @@ -240,10 +233,10 @@ extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); #define STATIC_KEY_INIT_TRUE \ { .enabled = ATOMIC_INIT(1), \ - { .type = JUMP_TYPE_TRUE } } + .type = JUMP_TYPE_TRUE } #define STATIC_KEY_INIT_FALSE \ { .enabled = ATOMIC_INIT(0), \ - { .type = JUMP_TYPE_FALSE } } + .type = JUMP_TYPE_FALSE } #else /* !CONFIG_JUMP_LABEL */ -- cgit v1.2.3 From 2deccd5c862a0337a691bcfaa87919b4216e6103 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Mar 2026 17:40:42 +0100 Subject: cleanup: Optimize guards Andrew reported that a guard() conversion of zone_lock increased the code size unnecessarily. It turns out the unconditional __GUARD_IS_ERR() is to blame. As explored earlier [1], __GUARD_IS_ERR(), similar to IS_ERR_OR_NULL(), generates somewhat sub-optimal code. However, looking at things again, it is possible to avoid doing the __GUARD_IS_ERR() unconditionally. Revert the normal destructors to a simple NULL test and only add the IS_ERR bit to COND guards. 
This cures the reported overhead; as compiled by GCC-16: page_alloc.o: pre: Total: Before=45299, After=45371, chg +0.16% post: Total: Before=45299, After=45026, chg -0.60% [1] https://lkml.kernel.org/r/20250513085001.GC25891@noisy.programming.kicks-ass.net Reported-by: Andrew Morton Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dan Williams Link: https://patch.msgid.link/20260309164516.GE606826@noisy.programming.kicks-ass.net --- include/linux/cleanup.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index dbc4162921e9..ea95ca4bc11c 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -286,15 +286,18 @@ static __always_inline _type class_##_name##_constructor(_init_args) \ __no_context_analysis \ { _type t = _init; return t; } -#define EXTEND_CLASS(_name, ext, _init, _init_args...) \ -typedef lock_##_name##_t lock_##_name##ext##_t; \ +#define EXTEND_CLASS_COND(_name, ext, _cond, _init, _init_args...) \ +typedef lock_##_name##_t lock_##_name##ext##_t; \ typedef class_##_name##_t class_##_name##ext##_t; \ -static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \ -{ class_##_name##_destructor(p); } \ +static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *_T) \ +{ if (_cond) return; class_##_name##_destructor(_T); } \ static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ __no_context_analysis \ { class_##_name##_t t = _init; return t; } +#define EXTEND_CLASS(_name, ext, _init, _init_args...) 
\ + EXTEND_CLASS_COND(_name, ext, 0, _init, _init_args) + #define CLASS(_name, var) \ class_##_name##_t var __cleanup(class_##_name##_destructor) = \ class_##_name##_constructor @@ -394,12 +397,12 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond __DEFINE_GUARD_LOCK_PTR(_name, _T) #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ - DEFINE_CLASS(_name, _type, if (!__GUARD_IS_ERR(_T)) { _unlock; }, ({ _lock; _T; }), _type _T); \ + DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ DEFINE_CLASS_IS_GUARD(_name) #define DEFINE_GUARD_COND_4(_name, _ext, _lock, _cond) \ __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ - EXTEND_CLASS(_name, _ext, \ + EXTEND_CLASS_COND(_name, _ext, __GUARD_IS_ERR(*_T), \ ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \ class_##_name##_t _T) \ static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ @@ -488,7 +491,7 @@ typedef struct { \ static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \ __no_context_analysis \ { \ - if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \ + if (_T->lock) { _unlock; } \ } \ \ __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) @@ -565,7 +568,7 @@ __DEFINE_LOCK_GUARD_0(_name, _lock) #define DEFINE_LOCK_GUARD_1_COND_4(_name, _ext, _lock, _cond) \ __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ - EXTEND_CLASS(_name, _ext, \ + EXTEND_CLASS_COND(_name, _ext, __GUARD_IS_ERR(_T->lock), \ ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\ int _RET = (_lock); \ if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\ -- cgit v1.2.3 From 756a0e011cfca0b45a48464aa25b05d9a9c2fb0b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 13 Mar 2026 10:15:07 -0700 Subject: locking: Fix rwlock support in Architecture support for rwlocks must be available whether or not CONFIG_DEBUG_SPINLOCK has been defined. 
Move the definitions of the arch_{read,write}_{lock,trylock,unlock}() macros such that these become visible if CONFIG_DEBUG_SPINLOCK=n. This patch prepares for converting do_raw_{read,write}_trylock() into inline functions. Without this patch that conversion triggers a build failure for UP architectures, e.g. arm-ep93xx. I used the following kernel configuration to build the kernel for that architecture: CONFIG_ARCH_MULTIPLATFORM=y CONFIG_ARCH_MULTI_V7=n CONFIG_ATAGS=y CONFIG_MMU=y CONFIG_ARCH_MULTI_V4T=y CONFIG_CPU_LITTLE_ENDIAN=y CONFIG_ARCH_EP93XX=y Fixes: fb1c8f93d869 ("[PATCH] spinlock consolidation") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313171510.230998-2-bvanassche@acm.org --- include/linux/spinlock_up.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 1e84e71ca495..3a50976471d7 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -48,16 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->slock = 1; } -/* - * Read-write spinlocks. No debug version. - */ -#define arch_read_lock(lock) do { barrier(); (void)(lock); } while (0) -#define arch_write_lock(lock) do { barrier(); (void)(lock); } while (0) -#define arch_read_trylock(lock) ({ barrier(); (void)(lock); 1; }) -#define arch_write_trylock(lock) ({ barrier(); (void)(lock); 1; }) -#define arch_read_unlock(lock) do { barrier(); (void)(lock); } while (0) -#define arch_write_unlock(lock) do { barrier(); (void)(lock); } while (0) - #else /* DEBUG_SPINLOCK */ #define arch_spin_is_locked(lock) ((void)(lock), 0) /* for sched/core.c and kernel_lock.c: */ @@ -68,4 +58,14 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #define arch_spin_is_contended(lock) (((void)(lock), 0)) +/* + * Read-write spinlocks. No debug version. 
+ */ +#define arch_read_lock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_write_lock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_read_trylock(lock) ({ barrier(); (void)(lock); 1; }) +#define arch_write_trylock(lock) ({ barrier(); (void)(lock); 1; }) +#define arch_read_unlock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_write_unlock(lock) do { barrier(); (void)(lock); } while (0) + #endif /* __LINUX_SPINLOCK_UP_H */ -- cgit v1.2.3 From c4d3b8c77d85082d32250c505beb1d9e46ee47ee Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 13 Mar 2026 10:15:08 -0700 Subject: locking: Add lock context support in do_raw_{read,write}_trylock() Convert do_raw_{read,write}_trylock() from macros into inline functions and annotate these inline functions with __cond_acquires_shared() or __cond_acquires() as appropriate. This change is necessary to build kernel drivers or subsystems that use rwlock synchronization objects with lock context analysis enabled. The return type 'int' matches the return type for CONFIG_DEBUG_SPINLOCK=y. 
Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313171510.230998-3-bvanassche@acm.org --- include/linux/rwlock.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 21ceefc4a49f..4e67cd934d8f 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -37,10 +37,20 @@ extern int do_raw_write_trylock(rwlock_t *lock) __cond_acquires(true, lock); extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock); #else # define do_raw_read_lock(rwlock) do {__acquire_shared(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0) -# define do_raw_read_trylock(rwlock) arch_read_trylock(&(rwlock)->raw_lock) +static inline int do_raw_read_trylock(rwlock_t *rwlock) + __cond_acquires_shared(true, rwlock) + __no_context_analysis +{ + return arch_read_trylock(&(rwlock)->raw_lock); +} # define do_raw_read_unlock(rwlock) do {arch_read_unlock(&(rwlock)->raw_lock); __release_shared(lock); } while (0) # define do_raw_write_lock(rwlock) do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0) -# define do_raw_write_trylock(rwlock) arch_write_trylock(&(rwlock)->raw_lock) +static inline int do_raw_write_trylock(rwlock_t *rwlock) + __cond_acquires(true, rwlock) + __no_context_analysis +{ + return arch_write_trylock(&(rwlock)->raw_lock); +} # define do_raw_write_unlock(rwlock) do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0) #endif -- cgit v1.2.3 From cb15d8e6cbe8d085ac585016deb2e1e0107b99e5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 14 Mar 2026 23:56:49 +0100 Subject: ASoC: codec: arizona: Convert to use GPIO descriptors This converts the Arizona driver to use GPIO descriptors exclusively, deletes the legacy code path and updates the in-tree user of legacy GPIO. The GPIO lines for mic detect polarity and headphone ID detection are made exclusively descriptor-oriented. 
The headphone ID detection could actually only be used by the legacy GPIO code, but I converted it to use a descriptor if someone would actually need it so we don't just drop useful code. The compatible "wlf,hpdet-id-gpio" is not in the device tree bindings and only intended to be used by software nodes if any. If someone insists I can try to add a binding for it, but I doubt there is any real user so it seems pointless. Signed-off-by: Linus Walleij Reviewed-by: Charles Keepax Reviewed-by: Bartosz Golaszewski Acked-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20260314-asoc-arizona-v1-1-ecc9a165307c@kernel.org Signed-off-by: Mark Brown --- arch/arm/mach-s3c/mach-crag6410-module.c | 6 +- include/linux/mfd/arizona/pdata.h | 10 ---- sound/soc/codecs/arizona-jack.c | 95 ++++++++++---------------------- sound/soc/codecs/arizona.h | 1 + 4 files changed, 34 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-s3c/mach-crag6410-module.c b/arch/arm/mach-s3c/mach-crag6410-module.c index 4ffcf024b09d..14b0f9cc103e 100644 --- a/arch/arm/mach-s3c/mach-crag6410-module.c +++ b/arch/arm/mach-s3c/mach-crag6410-module.c @@ -239,7 +239,6 @@ static struct gpiod_lookup_table wm8994_gpiod_table = { static struct arizona_pdata wm5102_reva_pdata = { .gpio_base = CODEC_GPIO_BASE, .irq_flags = IRQF_TRIGGER_HIGH, - .micd_pol_gpio = CODEC_GPIO_BASE + 4, .micd_rate = 6, .gpio_defaults = { [2] = 0x10000, /* AIF3TXLRCLK */ @@ -265,6 +264,8 @@ static struct gpiod_lookup_table wm5102_reva_gpiod_table = { .table = { GPIO_LOOKUP("GPION", 7, "wlf,ldoena", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("arizona", 4, + "wlf,micd-pol", GPIO_ACTIVE_HIGH), { }, }, }; @@ -272,7 +273,6 @@ static struct gpiod_lookup_table wm5102_reva_gpiod_table = { static struct arizona_pdata wm5102_pdata = { .gpio_base = CODEC_GPIO_BASE, .irq_flags = IRQF_TRIGGER_HIGH, - .micd_pol_gpio = CODEC_GPIO_BASE + 2, .gpio_defaults = { [2] = 0x10000, /* AIF3TXLRCLK */ [3] = 0x4, /* OPCLK */ @@ -297,6 +297,8 
@@ static struct gpiod_lookup_table wm5102_gpiod_table = { .table = { GPIO_LOOKUP("GPION", 7, "wlf,ldo1ena", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("arizona", 2, + "wlf,micd-pol", GPIO_ACTIVE_HIGH), { }, }, }; diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index f72e6d4b14a7..d465dcd8c90a 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -117,11 +117,6 @@ struct arizona_pdata { /** Check for line output with HPDET method */ bool hpdet_acc_id_line; -#ifdef CONFIG_GPIOLIB_LEGACY - /** GPIO used for mic isolation with HPDET */ - int hpdet_id_gpio; -#endif - /** Channel to use for headphone detection */ unsigned int hpdet_channel; @@ -131,11 +126,6 @@ struct arizona_pdata { /** Extra debounce timeout used during initial mic detection (ms) */ unsigned int micd_detect_debounce; -#ifdef CONFIG_GPIOLIB_LEGACY - /** GPIO for mic detection polarity */ - int micd_pol_gpio; -#endif - /** Mic detect ramp rate */ unsigned int micd_bias_start_time; diff --git a/sound/soc/codecs/arizona-jack.c b/sound/soc/codecs/arizona-jack.c index 303c1d44ebd8..a9063bac2752 100644 --- a/sound/soc/codecs/arizona-jack.c +++ b/sound/soc/codecs/arizona-jack.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -459,11 +458,6 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, bool *mic) { struct arizona *arizona = info->arizona; -#ifdef CONFIG_GPIOLIB_LEGACY - int id_gpio = arizona->pdata.hpdet_id_gpio; -#else - int id_gpio = 0; -#endif if (!arizona->pdata.hpdet_acc_id) return 0; @@ -474,9 +468,8 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, */ info->hpdet_res[info->num_hpdet_res++] = *reading; -#ifdef CONFIG_GPIOLIB_LEGACY /* Only check the mic directly if we didn't already ID it */ - if (id_gpio && info->num_hpdet_res == 1) { + if (info->hpdet_id_gpio && info->num_hpdet_res == 1) { dev_dbg(arizona->dev, "Measuring mic\n"); 
regmap_update_bits(arizona->regmap, @@ -486,13 +479,12 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, ARIZONA_ACCDET_MODE_HPR | info->micd_modes[0].src); - gpio_set_value_cansleep(id_gpio, 1); + gpiod_set_value_cansleep(info->hpdet_id_gpio, 1); regmap_update_bits(arizona->regmap, ARIZONA_HEADPHONE_DETECT_1, ARIZONA_HP_POLL, ARIZONA_HP_POLL); return -EAGAIN; } -#endif /* OK, got both. Now, compare... */ dev_dbg(arizona->dev, "HPDET measured %d %d\n", @@ -514,7 +506,7 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, /* * If we measure the mic as high impedance */ - if (!id_gpio || info->hpdet_res[1] > 50) { + if (!info->hpdet_id_gpio || info->hpdet_res[1] > 50) { dev_dbg(arizona->dev, "Detected mic\n"); *mic = true; info->detecting = true; @@ -533,9 +525,6 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data) { struct arizona_priv *info = data; struct arizona *arizona = info->arizona; -#ifdef CONFIG_GPIOLIB_LEGACY - int id_gpio = arizona->pdata.hpdet_id_gpio; -#endif int ret, reading, state, report; bool mic = false; @@ -591,10 +580,8 @@ done: arizona_extcon_hp_clamp(info, false); -#ifdef CONFIG_GPIOLIB_LEGACY - if (id_gpio) - gpio_set_value_cansleep(id_gpio, 0); -#endif + if (info->hpdet_id_gpio) + gpiod_set_value_cansleep(info->hpdet_id_gpio, 0); /* If we have a mic then reenable MICDET */ if (state && (mic || info->mic)) @@ -1325,58 +1312,33 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) regmap_update_bits(arizona->regmap, ARIZONA_GP_SWITCH_1, ARIZONA_SW1_MODE_MASK, arizona->pdata.gpsw); -#ifdef CONFIG_GPIOLIB_LEGACY - if (pdata->micd_pol_gpio > 0) { - if (info->micd_modes[0].gpio) - mode = GPIOF_OUT_INIT_HIGH; - else - mode = GPIOF_OUT_INIT_LOW; - - ret = devm_gpio_request_one(dev, pdata->micd_pol_gpio, - mode, "MICD polarity"); - if (ret != 0) { - dev_err(arizona->dev, "Failed to request GPIO%d: %d\n", - pdata->micd_pol_gpio, ret); - return ret; - } - - 
info->micd_pol_gpio = gpio_to_desc(pdata->micd_pol_gpio); - } else -#endif - { - if (info->micd_modes[0].gpio) - mode = GPIOD_OUT_HIGH; - else - mode = GPIOD_OUT_LOW; + if (info->micd_modes[0].gpio) + mode = GPIOD_OUT_HIGH; + else + mode = GPIOD_OUT_LOW; - /* We can't use devm here because we need to do the get - * against the MFD device, as that is where the of_node - * will reside, but if we devm against that the GPIO - * will not be freed if the extcon driver is unloaded. - */ - info->micd_pol_gpio = gpiod_get_optional(arizona->dev, - "wlf,micd-pol", - mode); - if (IS_ERR(info->micd_pol_gpio)) { - ret = PTR_ERR(info->micd_pol_gpio); - dev_err_probe(arizona->dev, ret, "getting microphone polarity GPIO\n"); - return ret; - } + /* We can't use devm here because we need to do the get + * against the MFD device, as that is where the of_node + * will reside, but if we devm against that the GPIO + * will not be freed if the extcon driver is unloaded. + */ + info->micd_pol_gpio = gpiod_get_optional(arizona->dev, + "wlf,micd-pol", + mode); + if (IS_ERR(info->micd_pol_gpio)) { + ret = PTR_ERR(info->micd_pol_gpio); + dev_err_probe(arizona->dev, ret, "getting microphone polarity GPIO\n"); + return ret; } -#ifdef CONFIG_GPIOLIB_LEGACY - if (arizona->pdata.hpdet_id_gpio > 0) { - ret = devm_gpio_request_one(dev, arizona->pdata.hpdet_id_gpio, - GPIOF_OUT_INIT_LOW, - "HPDET"); - if (ret != 0) { - dev_err(arizona->dev, "Failed to request GPIO%d: %d\n", - arizona->pdata.hpdet_id_gpio, ret); - gpiod_put(info->micd_pol_gpio); - return ret; - } + info->hpdet_id_gpio = gpiod_get_optional(arizona->dev, + "wlf,hpdet-id-gpio", + mode); + if (IS_ERR(info->hpdet_id_gpio)) { + ret = PTR_ERR(info->hpdet_id_gpio); + dev_err_probe(arizona->dev, ret, "getting headphone detect ID GPIO\n"); + return ret; } -#endif return 0; } @@ -1385,6 +1347,7 @@ EXPORT_SYMBOL_GPL(arizona_jack_codec_dev_probe); int arizona_jack_codec_dev_remove(struct arizona_priv *info) { gpiod_put(info->micd_pol_gpio); + 
gpiod_put(info->hpdet_id_gpio); return 0; } EXPORT_SYMBOL_GPL(arizona_jack_codec_dev_remove); diff --git a/sound/soc/codecs/arizona.h b/sound/soc/codecs/arizona.h index ecd8890eefc1..0703182d87b3 100644 --- a/sound/soc/codecs/arizona.h +++ b/sound/soc/codecs/arizona.h @@ -100,6 +100,7 @@ struct arizona_priv { struct snd_soc_jack *jack; struct regulator *micvdd; struct gpio_desc *micd_pol_gpio; + struct gpio_desc *hpdet_id_gpio; u16 last_jackdet; -- cgit v1.2.3 From 68130eef1e0d3c1770952e738f7f8d9f340bd42d Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 16 Mar 2026 02:24:43 +0000 Subject: ASoC: soc-component: re-add pcm_new()/pcm_free() Because the old pcm_new()/pcm_free() didn't care about the parameter component, to avoid name collisions, we have added pcm_construct()/pcm_destruct() by commit c64bfc9066007 ("ASoC: soc-core: add new pcm_construct/pcm_destruct") Because all drivers switched to the new pcm_construct()/pcm_destruct(), the old pcm_new()/pcm_free() were removed by commit e9067bb502787 ("ASoC: soc-component: remove snd_pcm_ops from component driver") But the naming of pcm_construct()/pcm_destruct() is not good. re-add pcm_new()/pcm_free(), and switch to use it, again. 
Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87a4w8lde4.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 4 ++++ sound/soc/generic/audio-graph-card.c | 1 + sound/soc/soc-component.c | 10 +++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 2a2b74b24a60..0435ba376369 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -90,6 +90,10 @@ struct snd_soc_component_driver { struct snd_soc_pcm_runtime *rtd); void (*pcm_destruct)(struct snd_soc_component *component, struct snd_pcm *pcm); + int (*pcm_new)(struct snd_soc_component *component, + struct snd_soc_pcm_runtime *rtd); + void (*pcm_free)(struct snd_soc_component *component, + struct snd_pcm *pcm); /* component wide operations */ int (*set_sysclk)(struct snd_soc_component *component, diff --git a/sound/soc/generic/audio-graph-card.c b/sound/soc/generic/audio-graph-card.c index 8a5f41704739..74e8f2ab7ffc 100644 --- a/sound/soc/generic/audio-graph-card.c +++ b/sound/soc/generic/audio-graph-card.c @@ -77,6 +77,7 @@ static bool soc_component_is_pcm(struct snd_soc_dai_link_component *dlc) struct snd_soc_dai *dai = snd_soc_find_dai_with_mutex(dlc); if (dai && (dai->component->driver->pcm_construct || + dai->component->driver->pcm_new || (dai->driver->ops && dai->driver->ops->pcm_new))) return true; diff --git a/sound/soc/soc-component.c b/sound/soc/soc-component.c index 89f236ab3034..77ad33383974 100644 --- a/sound/soc/soc-component.c +++ b/sound/soc/soc-component.c @@ -1042,6 +1042,11 @@ int snd_soc_pcm_component_new(struct snd_soc_pcm_runtime *rtd) if (ret < 0) return soc_component_ret(component, ret); } + if (component->driver->pcm_new) { + ret = component->driver->pcm_new(component, rtd); + if (ret < 0) + return soc_component_ret(component, ret); + } } return 0; @@ -1055,9 +1060,12 @@ void 
snd_soc_pcm_component_free(struct snd_soc_pcm_runtime *rtd) if (!rtd->pcm) return; - for_each_rtd_components(rtd, i, component) + for_each_rtd_components(rtd, i, component) { if (component->driver->pcm_destruct) component->driver->pcm_destruct(component, rtd->pcm); + if (component->driver->pcm_free) + component->driver->pcm_free(component, rtd->pcm); + } } int snd_soc_pcm_component_prepare(struct snd_pcm_substream *substream) -- cgit v1.2.3 From 175f733325ac2ce875cafd051980be2d2c06dec9 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 16 Mar 2026 02:27:57 +0000 Subject: ASoC: soc-component: remove pcm_construct()/pcm_destruct() All driver have switched to use pcm_new()/pcm_free(), let's remove pcm_construct()/pcm_destruct(). Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/875x6wjyoa.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc-component.h | 4 ---- sound/soc/generic/audio-graph-card.c | 3 +-- sound/soc/soc-component.c | 10 +--------- 3 files changed, 2 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 0435ba376369..60f73c4b0bbb 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -86,10 +86,6 @@ struct snd_soc_component_driver { unsigned int reg, unsigned int val); /* pcm creation and destruction */ - int (*pcm_construct)(struct snd_soc_component *component, - struct snd_soc_pcm_runtime *rtd); - void (*pcm_destruct)(struct snd_soc_component *component, - struct snd_pcm *pcm); int (*pcm_new)(struct snd_soc_component *component, struct snd_soc_pcm_runtime *rtd); void (*pcm_free)(struct snd_soc_component *component, diff --git a/sound/soc/generic/audio-graph-card.c b/sound/soc/generic/audio-graph-card.c index 74e8f2ab7ffc..18ce4ee06350 100644 --- a/sound/soc/generic/audio-graph-card.c +++ b/sound/soc/generic/audio-graph-card.c @@ -76,8 +76,7 @@ static bool soc_component_is_pcm(struct 
snd_soc_dai_link_component *dlc) { struct snd_soc_dai *dai = snd_soc_find_dai_with_mutex(dlc); - if (dai && (dai->component->driver->pcm_construct || - dai->component->driver->pcm_new || + if (dai && (dai->component->driver->pcm_new || (dai->driver->ops && dai->driver->ops->pcm_new))) return true; diff --git a/sound/soc/soc-component.c b/sound/soc/soc-component.c index 77ad33383974..0f5e120d32b7 100644 --- a/sound/soc/soc-component.c +++ b/sound/soc/soc-component.c @@ -1037,11 +1037,6 @@ int snd_soc_pcm_component_new(struct snd_soc_pcm_runtime *rtd) int i; for_each_rtd_components(rtd, i, component) { - if (component->driver->pcm_construct) { - ret = component->driver->pcm_construct(component, rtd); - if (ret < 0) - return soc_component_ret(component, ret); - } if (component->driver->pcm_new) { ret = component->driver->pcm_new(component, rtd); if (ret < 0) @@ -1060,12 +1055,9 @@ void snd_soc_pcm_component_free(struct snd_soc_pcm_runtime *rtd) if (!rtd->pcm) return; - for_each_rtd_components(rtd, i, component) { - if (component->driver->pcm_destruct) - component->driver->pcm_destruct(component, rtd->pcm); + for_each_rtd_components(rtd, i, component) if (component->driver->pcm_free) component->driver->pcm_free(component, rtd->pcm); - } } int snd_soc_pcm_component_prepare(struct snd_pcm_substream *substream) -- cgit v1.2.3 From 032322b44c02f5e8a127d1dca6798f91cc72eb1d Mon Sep 17 00:00:00 2001 From: Cen Zhang Date: Mon, 16 Mar 2026 16:50:47 +0800 Subject: ALSA: pcm: oss: use proper stream lock for runtime->state access __snd_pcm_set_state() writes runtime->state under the PCM stream lock. However, the OSS I/O functions snd_pcm_oss_write3(), snd_pcm_oss_read3(), snd_pcm_oss_writev3() and snd_pcm_oss_readv3() read runtime->state without holding the stream lock, only holding oss.params_lock (a different mutex that does not synchronize with the stream lock). 
Since __snd_pcm_set_state() is called from IRQ context (e.g., snd_pcm_period_elapsed -> snd_pcm_update_state -> __snd_pcm_xrun -> snd_pcm_stop -> snd_pcm_post_stop) while the OSS read/write paths run in process context, these are concurrent accesses that constitute a data race. Rather than using READ_ONCE()/WRITE_ONCE() barriers, introduce a snd_pcm_get_state() helper that reads runtime->state under the stream lock, matching the locking discipline used elsewhere in the PCM layer. Also export snd_pcm_set_state() for completeness. Use snd_pcm_get_state() in all four OSS I/O functions, caching the result in a local variable where the same snapshot is used for multiple comparisons to avoid taking the lock repeatedly. Signed-off-by: Cen Zhang Link: https://patch.msgid.link/20260316085047.2876451-1-zzzccc427@gmail.com Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 4 ++++ sound/core/oss/pcm_oss.c | 44 +++++++++++++++++++++++++------------------- sound/core/pcm_native.c | 23 +++++++++++++++++++++-- 3 files changed, 50 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index a7860c047503..76fc33dce537 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -729,6 +729,10 @@ static inline void __snd_pcm_set_state(struct snd_pcm_runtime *runtime, runtime->status->state = state; /* copy for mmap */ } +void snd_pcm_set_state(struct snd_pcm_substream *substream, + snd_pcm_state_t state); +snd_pcm_state_t snd_pcm_get_state(struct snd_pcm_substream *substream); + /** * bytes_to_samples - Unit conversion of the size from bytes to samples * @runtime: PCM runtime instance diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index d4fd4dfc7fc3..a140a0d9abb8 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1227,14 +1227,16 @@ static int snd_pcm_oss_capture_position_fixup(struct snd_pcm_substream *substrea snd_pcm_sframes_t snd_pcm_oss_write3(struct snd_pcm_substream *substream, 
const char *ptr, snd_pcm_uframes_t frames, int in_kernel) { struct snd_pcm_runtime *runtime = substream->runtime; + snd_pcm_state_t state; int ret; while (1) { - if (runtime->state == SNDRV_PCM_STATE_XRUN || - runtime->state == SNDRV_PCM_STATE_SUSPENDED) { + state = snd_pcm_get_state(substream); + if (state == SNDRV_PCM_STATE_XRUN || + state == SNDRV_PCM_STATE_SUSPENDED) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: write: recovering from %s\n", - runtime->state == SNDRV_PCM_STATE_XRUN ? + state == SNDRV_PCM_STATE_XRUN ? "XRUN" : "SUSPEND"); #endif ret = snd_pcm_oss_prepare(substream); @@ -1249,7 +1251,7 @@ snd_pcm_sframes_t snd_pcm_oss_write3(struct snd_pcm_substream *substream, const break; /* test, if we can't store new data, because the stream */ /* has not been started */ - if (runtime->state == SNDRV_PCM_STATE_PREPARED) + if (snd_pcm_get_state(substream) == SNDRV_PCM_STATE_PREPARED) return -EAGAIN; } return ret; @@ -1259,20 +1261,22 @@ snd_pcm_sframes_t snd_pcm_oss_read3(struct snd_pcm_substream *substream, char *p { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t delay; + snd_pcm_state_t state; int ret; while (1) { - if (runtime->state == SNDRV_PCM_STATE_XRUN || - runtime->state == SNDRV_PCM_STATE_SUSPENDED) { + state = snd_pcm_get_state(substream); + if (state == SNDRV_PCM_STATE_XRUN || + state == SNDRV_PCM_STATE_SUSPENDED) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: read: recovering from %s\n", - runtime->state == SNDRV_PCM_STATE_XRUN ? + state == SNDRV_PCM_STATE_XRUN ? 
"XRUN" : "SUSPEND"); #endif ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL); if (ret < 0) break; - } else if (runtime->state == SNDRV_PCM_STATE_SETUP) { + } else if (state == SNDRV_PCM_STATE_SETUP) { ret = snd_pcm_oss_prepare(substream); if (ret < 0) break; @@ -1285,7 +1289,7 @@ snd_pcm_sframes_t snd_pcm_oss_read3(struct snd_pcm_substream *substream, char *p frames, in_kernel); mutex_lock(&runtime->oss.params_lock); if (ret == -EPIPE) { - if (runtime->state == SNDRV_PCM_STATE_DRAINING) { + if (snd_pcm_get_state(substream) == SNDRV_PCM_STATE_DRAINING) { ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL); if (ret < 0) break; @@ -1301,15 +1305,16 @@ snd_pcm_sframes_t snd_pcm_oss_read3(struct snd_pcm_substream *substream, char *p #ifdef CONFIG_SND_PCM_OSS_PLUGINS snd_pcm_sframes_t snd_pcm_oss_writev3(struct snd_pcm_substream *substream, void **bufs, snd_pcm_uframes_t frames) { - struct snd_pcm_runtime *runtime = substream->runtime; + snd_pcm_state_t state; int ret; while (1) { - if (runtime->state == SNDRV_PCM_STATE_XRUN || - runtime->state == SNDRV_PCM_STATE_SUSPENDED) { + state = snd_pcm_get_state(substream); + if (state == SNDRV_PCM_STATE_XRUN || + state == SNDRV_PCM_STATE_SUSPENDED) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: writev: recovering from %s\n", - runtime->state == SNDRV_PCM_STATE_XRUN ? + state == SNDRV_PCM_STATE_XRUN ? 
"XRUN" : "SUSPEND"); #endif ret = snd_pcm_oss_prepare(substream); @@ -1322,7 +1327,7 @@ snd_pcm_sframes_t snd_pcm_oss_writev3(struct snd_pcm_substream *substream, void /* test, if we can't store new data, because the stream */ /* has not been started */ - if (runtime->state == SNDRV_PCM_STATE_PREPARED) + if (snd_pcm_get_state(substream) == SNDRV_PCM_STATE_PREPARED) return -EAGAIN; } return ret; @@ -1330,21 +1335,22 @@ snd_pcm_sframes_t snd_pcm_oss_writev3(struct snd_pcm_substream *substream, void snd_pcm_sframes_t snd_pcm_oss_readv3(struct snd_pcm_substream *substream, void **bufs, snd_pcm_uframes_t frames) { - struct snd_pcm_runtime *runtime = substream->runtime; + snd_pcm_state_t state; int ret; while (1) { - if (runtime->state == SNDRV_PCM_STATE_XRUN || - runtime->state == SNDRV_PCM_STATE_SUSPENDED) { + state = snd_pcm_get_state(substream); + if (state == SNDRV_PCM_STATE_XRUN || + state == SNDRV_PCM_STATE_SUSPENDED) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: readv: recovering from %s\n", - runtime->state == SNDRV_PCM_STATE_XRUN ? + state == SNDRV_PCM_STATE_XRUN ? 
"XRUN" : "SUSPEND"); #endif ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL); if (ret < 0) break; - } else if (runtime->state == SNDRV_PCM_STATE_SETUP) { + } else if (state == SNDRV_PCM_STATE_SETUP) { ret = snd_pcm_oss_prepare(substream); if (ret < 0) break; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 67cf6a0e17ba..394f86bc4d29 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -618,13 +618,32 @@ static int period_to_usecs(struct snd_pcm_runtime *runtime) return usecs; } -static void snd_pcm_set_state(struct snd_pcm_substream *substream, - snd_pcm_state_t state) +/** + * snd_pcm_set_state - Set the PCM runtime state with stream lock + * @substream: PCM substream + * @state: state to set + */ +void snd_pcm_set_state(struct snd_pcm_substream *substream, + snd_pcm_state_t state) { guard(pcm_stream_lock_irq)(substream); if (substream->runtime->state != SNDRV_PCM_STATE_DISCONNECTED) __snd_pcm_set_state(substream->runtime, state); } +EXPORT_SYMBOL_GPL(snd_pcm_set_state); + +/** + * snd_pcm_get_state - Read the PCM runtime state with stream lock + * @substream: PCM substream + * + * Return: the current PCM state + */ +snd_pcm_state_t snd_pcm_get_state(struct snd_pcm_substream *substream) +{ + guard(pcm_stream_lock_irqsave)(substream); + return substream->runtime->state; +} +EXPORT_SYMBOL_GPL(snd_pcm_get_state); static inline void snd_pcm_timer_notify(struct snd_pcm_substream *substream, int event) -- cgit v1.2.3 From e29d097ead33d0172f028b5b23f10812fe8e8335 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 3 Mar 2026 15:53:07 +0000 Subject: ASoC: dapm: Add a named controls variant of a mux widget There is already a version of the mixer widget that forces use of the specified control name, rather than factoring in the widget name. Add the same feature for mux widgets. 
Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260303155308.138989-2-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 1 + sound/soc/soc-dapm.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 49f0fe05db01..4f8fb7622a13 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -424,6 +424,7 @@ enum snd_soc_dapm_type { snd_soc_dapm_input = 0, /* input pin */ snd_soc_dapm_output, /* output pin */ snd_soc_dapm_mux, /* selects 1 analog signal from many inputs */ + snd_soc_dapm_mux_named_ctl, /* mux with named controls */ snd_soc_dapm_demux, /* connects the input to one of multiple outputs */ snd_soc_dapm_mixer, /* mixes several analog signals together */ snd_soc_dapm_mixer_named_ctl, /* mixer with named controls */ diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 2768ba5bfc9f..d6192204e613 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -89,6 +89,7 @@ static int dapm_up_seq[] = { [snd_soc_dapm_input] = 6, [snd_soc_dapm_output] = 6, [snd_soc_dapm_mux] = 7, + [snd_soc_dapm_mux_named_ctl] = 7, [snd_soc_dapm_demux] = 7, [snd_soc_dapm_dac] = 8, [snd_soc_dapm_switch] = 9, @@ -140,6 +141,7 @@ static int dapm_down_seq[] = { [snd_soc_dapm_micbias] = 10, [snd_soc_dapm_vmid] = 10, [snd_soc_dapm_mux] = 11, + [snd_soc_dapm_mux_named_ctl] = 11, [snd_soc_dapm_demux] = 11, [snd_soc_dapm_aif_in] = 12, [snd_soc_dapm_aif_out] = 12, @@ -577,6 +579,7 @@ static int dapm_check_dynamic_path( switch (sink->id) { case snd_soc_dapm_mux: + case snd_soc_dapm_mux_named_ctl: case snd_soc_dapm_switch: case snd_soc_dapm_mixer: case snd_soc_dapm_mixer_named_ctl: @@ -668,6 +671,7 @@ static int dapm_add_path( switch (wsink->id) { case snd_soc_dapm_mux: + case snd_soc_dapm_mux_named_ctl: ret = dapm_connect_mux(dapm, path, control, wsink); if (ret != 0) goto err; @@ -766,6 +770,7 @@ static int 
dapm_kcontrol_data_alloc(struct snd_soc_dapm_widget *widget, break; case snd_soc_dapm_demux: case snd_soc_dapm_mux: + case snd_soc_dapm_mux_named_ctl: e = (struct soc_enum *)kcontrol->private_value; if (e->autodisable) { @@ -915,6 +920,7 @@ static bool dapm_kcontrol_set_value(const struct snd_kcontrol *kcontrol, break; case snd_soc_dapm_demux: case snd_soc_dapm_mux: + case snd_soc_dapm_mux_named_ctl: data->widget->on_val = value >> data->widget->shift; break; default: @@ -1198,6 +1204,7 @@ static int dapm_create_or_share_kcontrol(struct snd_soc_dapm_widget *w, wname_in_long_name = true; kcname_in_long_name = true; break; + case snd_soc_dapm_mux_named_ctl: case snd_soc_dapm_mixer_named_ctl: wname_in_long_name = false; kcname_in_long_name = true; @@ -1317,6 +1324,7 @@ static int dapm_new_mux(struct snd_soc_dapm_widget *w) switch (w->id) { case snd_soc_dapm_mux: + case snd_soc_dapm_mux_named_ctl: dir = SND_SOC_DAPM_DIR_OUT; type = "mux"; break; @@ -2399,6 +2407,7 @@ static const char * const dapm_type_name[] = { [snd_soc_dapm_input] = "input", [snd_soc_dapm_output] = "output", [snd_soc_dapm_mux] = "mux", + [snd_soc_dapm_mux_named_ctl] = "mux_named_ctl", [snd_soc_dapm_demux] = "demux", [snd_soc_dapm_mixer] = "mixer", [snd_soc_dapm_mixer_named_ctl] = "mixer_named_ctl", @@ -3347,6 +3356,7 @@ int snd_soc_dapm_new_widgets(struct snd_soc_card *card) dapm_new_mixer(w); break; case snd_soc_dapm_mux: + case snd_soc_dapm_mux_named_ctl: case snd_soc_dapm_demux: dapm_new_mux(w); break; @@ -3834,6 +3844,7 @@ snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm, break; case snd_soc_dapm_mux: + case snd_soc_dapm_mux_named_ctl: case snd_soc_dapm_demux: case snd_soc_dapm_switch: case snd_soc_dapm_mixer: -- cgit v1.2.3 From f8e761655997cc0ee434fb5f35570d2e93d3a707 Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Mon, 9 Mar 2026 11:34:27 +0200 Subject: net/mlx5: Add IFC bits for shared headroom pool PBMC support Add hardware interface definitions for shared headroom pool 
(SHP) in port buffer management: - shp_pbmc_pbsr_support: capability bit in PCAM enhanced features indicating device support for shared headroom pool in PBMC/PBSR. - shared_headroom_pool: buffer entry in PBMC register (pbmc_reg_bits) for the shared headroom pool configuration, reusing the bufferx layout; reduce trailing reserved region accordingly. Signed-off-by: Alexei Lazar Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a3948b36820d..a76c54bf1927 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10845,7 +10845,9 @@ struct mlx5_ifc_pcam_enhanced_features_bits { u8 fec_200G_per_lane_in_pplm[0x1]; u8 reserved_at_1e[0x2a]; u8 fec_100G_per_lane_in_pplm[0x1]; - u8 reserved_at_49[0xa]; + u8 reserved_at_49[0x2]; + u8 shp_pbmc_pbsr_support[0x1]; + u8 reserved_at_4c[0x7]; u8 buffer_ownership[0x1]; u8 resereved_at_54[0x14]; u8 fec_50G_per_lane_in_pplm[0x1]; @@ -12090,8 +12092,9 @@ struct mlx5_ifc_pbmc_reg_bits { u8 port_buffer_size[0x10]; struct mlx5_ifc_bufferx_reg_bits buffer[10]; + struct mlx5_ifc_bufferx_reg_bits shared_headroom_pool; - u8 reserved_at_2e0[0x80]; + u8 reserved_at_320[0x40]; }; struct mlx5_ifc_sbpr_reg_bits { -- cgit v1.2.3 From 691dffc7255e740bc3df1c68b50b36786aadeb3a Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:28 +0200 Subject: net/mlx5: Add silent mode set/query and VHCA RX IFC bits Update the mlx5 IFC headers with newly defined capability and command-layout bits: - Add silent_mode_query and rename silent_mode to silent_mode_set cap fields. - Add forward_vhca_rx and MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX. - Expose silent mode fields in the L2 table query command structures. 
Update the SD support check to use the new capability name (silent_mode_set) to match the updated IFC definition. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-3-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 19 ++++++++++++++----- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index c348ee62cd3a..16b28028609d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -1183,7 +1183,7 @@ int mlx5_fs_cmd_set_l2table_entry_silent(struct mlx5_core_dev *dev, u8 silent_mo { u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)] = {}; - if (silent_mode && !MLX5_CAP_GEN(dev, silent_mode)) + if (silent_mode && !MLX5_CAP_GEN(dev, silent_mode_set)) return -EOPNOTSUPP; MLX5_SET(set_l2_table_entry_in, in, opcode, MLX5_CMD_OP_SET_L2_TABLE_ENTRY); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 954942ad93c5..762c783156b4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -107,7 +107,7 @@ static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses) /* Disconnect secondaries from the network */ if (!MLX5_CAP_GEN(dev, eswitch_manager)) return false; - if (!MLX5_CAP_GEN(dev, silent_mode)) + if (!MLX5_CAP_GEN(dev, silent_mode_set)) return false; /* RX steering from primary to secondaries */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a76c54bf1927..8fa4fb3d36cf 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -469,7 +469,8 @@ struct 
mlx5_ifc_flow_table_prop_layout_bits { u8 table_miss_action_domain[0x1]; u8 termination_table[0x1]; u8 reformat_and_fwd_to_table[0x1]; - u8 reserved_at_1a[0x2]; + u8 forward_vhca_rx[0x1]; + u8 reserved_at_1b[0x1]; u8 ipsec_encrypt[0x1]; u8 ipsec_decrypt[0x1]; u8 sw_owner_v2[0x1]; @@ -2012,12 +2013,14 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 disable_local_lb_mc[0x1]; u8 log_min_hairpin_wq_data_sz[0x5]; u8 reserved_at_3e8[0x1]; - u8 silent_mode[0x1]; + u8 silent_mode_set[0x1]; u8 vhca_state[0x1]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; - u8 reserved_at_3f8[0x3]; + u8 reserved_at_3f8[0x1]; + u8 silent_mode_query[0x1]; + u8 reserved_at_3fa[0x1]; u8 log_max_current_uc_list[0x5]; u8 general_obj_types[0x40]; @@ -2279,6 +2282,7 @@ enum mlx5_ifc_flow_destination_type { MLX5_IFC_FLOW_DESTINATION_TYPE_VPORT = 0x0, MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, MLX5_IFC_FLOW_DESTINATION_TYPE_TIR = 0x2, + MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX = 0x4, MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6, MLX5_IFC_FLOW_DESTINATION_TYPE_UPLINK = 0x8, MLX5_IFC_FLOW_DESTINATION_TYPE_TABLE_TYPE = 0xA, @@ -6265,7 +6269,9 @@ struct mlx5_ifc_query_l2_table_entry_out_bits { u8 reserved_at_40[0xa0]; - u8 reserved_at_e0[0x13]; + u8 reserved_at_e0[0x11]; + u8 silent_mode[0x1]; + u8 reserved_at_f2[0x1]; u8 vlan_valid[0x1]; u8 vlan[0xc]; @@ -6281,7 +6287,10 @@ struct mlx5_ifc_query_l2_table_entry_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x60]; + u8 reserved_at_40[0x40]; + + u8 silent_mode_query[0x1]; + u8 reserved_at_81[0x1f]; u8 reserved_at_a0[0x8]; u8 table_index[0x18]; -- cgit v1.2.3 From 971b28accc09436fe6a6d5afd667dcbfb3ed7e03 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:32 +0200 Subject: net/mlx5: LAG, replace mlx5_get_dev_index with LAG sequence number Introduce mlx5_lag_get_dev_seq() which returns a device's sequence number within the LAG: master is always 0, remaining devices numbered 
sequentially. This provides a stable index for peer flow tracking and vport ordering without depending on native_port_num. Replace mlx5_get_dev_index() usage in en_tc.c (peer flow array indexing) and ib_rep.c (vport index ordering) with the new API. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-7-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/ib_rep.c | 4 ++- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 9 +++--- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 34 +++++++++++++++++++++++ include/linux/mlx5/lag.h | 11 ++++++++ 4 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 include/linux/mlx5/lag.h (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 621834d75205..df8f049c5806 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -3,6 +3,7 @@ * Copyright (c) 2018 Mellanox Technologies. All rights reserved. 
*/ +#include #include #include "ib_rep.h" #include "srq.h" @@ -134,7 +135,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) /* Only 1 ib port is the representor for all uplinks */ peer_n_ports--; - if (mlx5_get_dev_index(peer_dev) < mlx5_get_dev_index(dev)) + if (mlx5_lag_get_dev_seq(peer_dev) < + mlx5_lag_get_dev_seq(dev)) vport_index += peer_n_ports; } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 1434b65d4746..397a93584fd6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -2131,7 +2132,7 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow, mutex_unlock(&esw->offloads.peer_mutex); list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) { - if (peer_index != mlx5_get_dev_index(peer_flow->priv->mdev)) + if (peer_index != mlx5_lag_get_dev_seq(peer_flow->priv->mdev)) continue; list_del(&peer_flow->peer_flows); @@ -2154,7 +2155,7 @@ static void mlx5e_tc_del_fdb_peers_flow(struct mlx5e_tc_flow *flow) devcom = flow->priv->mdev->priv.eswitch->devcom; mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) { - i = mlx5_get_dev_index(peer_esw->dev); + i = mlx5_lag_get_dev_seq(peer_esw->dev); mlx5e_tc_del_fdb_peer_flow(flow, i); } } @@ -4584,7 +4585,7 @@ static int mlx5e_tc_add_fdb_peer_flow(struct flow_cls_offload *f, struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr; struct mlx5e_tc_flow_parse_attr *parse_attr; - int i = mlx5_get_dev_index(peer_esw->dev); + int i = mlx5_lag_get_dev_seq(peer_esw->dev); struct mlx5e_rep_priv *peer_urpriv; struct mlx5e_tc_flow *peer_flow; struct mlx5_core_dev *in_mdev; @@ -5525,7 +5526,7 @@ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw) devcom = esw->devcom; 
mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) { - i = mlx5_get_dev_index(peer_esw->dev); + i = mlx5_lag_get_dev_seq(peer_esw->dev); list_for_each_entry_safe(flow, tmp, &esw->offloads.peer_flows[i], peer[i]) mlx5e_tc_del_fdb_peers_flow(flow); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 4beee64c937a..51ec8f61ecbb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "lib/mlx5.h" #include "lib/devcom.h" #include "mlx5_core.h" @@ -369,6 +370,39 @@ int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq) return -ENOENT; } +/* Reverse of mlx5_lag_get_dev_index_by_seq: given a device, return its + * sequence number in the LAG. Master is always 0, others numbered + * sequentially starting from 1. + */ +int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev = mlx5_lag_dev(dev); + int master_idx, i, num = 1; + struct lag_func *pf; + + if (!ldev) + return -ENOENT; + + master_idx = mlx5_lag_get_master_idx(ldev); + if (master_idx < 0) + return -ENOENT; + + pf = mlx5_lag_pf(ldev, master_idx); + if (pf && pf->dev == dev) + return 0; + + mlx5_ldev_for_each(i, 0, ldev) { + if (i == master_idx) + continue; + pf = mlx5_lag_pf(ldev, i); + if (pf->dev == dev) + return num; + num++; + } + return -ENOENT; +} +EXPORT_SYMBOL(mlx5_lag_get_dev_seq); + /* Devcom events for LAG master marking */ #define LAG_DEVCOM_PAIR (0) #define LAG_DEVCOM_UNPAIR (1) diff --git a/include/linux/mlx5/lag.h b/include/linux/mlx5/lag.h new file mode 100644 index 000000000000..d370dfd19055 --- /dev/null +++ b/include/linux/mlx5/lag.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#ifndef __MLX5_LAG_API_H__ +#define __MLX5_LAG_API_H__ + +struct mlx5_core_dev; + +int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev); + +#endif /* __MLX5_LAG_API_H__ */ -- cgit v1.2.3 From 0bc9059fab6365feaf95cc9a796a3d381915a70f Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:33 +0200 Subject: net/mlx5: Add VHCA RX flow destination support for FW steering Introduce MLX5_FLOW_DESTINATION_TYPE_VHCA_RX as a new flow steering destination type. Wire the new destination through flow steering command setup by mapping it to MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX and passing the vhca id, extend forward-destination validation to accept it, and teach the flow steering tracepoint formatter to print rx_vhca_id. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-8-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 4 ++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 7 +++++-- include/linux/mlx5/fs.h | 4 ++++ 4 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c index 6d73127b7217..2cf1d3825def 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c @@ -282,6 +282,9 @@ const char *parse_fs_dst(struct trace_seq *p, case MLX5_FLOW_DESTINATION_TYPE_NONE: trace_seq_printf(p, "none\n"); break; + case MLX5_FLOW_DESTINATION_TYPE_VHCA_RX: + trace_seq_printf(p, "rx_vhca_id=%u\n", dst->vhca.id); + break; } trace_seq_putc(p, 0); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 16b28028609d..1cd4cd898ec2 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -716,6 +716,10 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, id = dst->dest_attr.ft->id; ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_TABLE_TYPE; break; + case MLX5_FLOW_DESTINATION_TYPE_VHCA_RX: + id = dst->dest_attr.vhca.id; + ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX; + break; default: id = dst->dest_attr.tir_num; ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_TIR; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2c3544880a30..003d211306a7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -503,7 +503,8 @@ static bool is_fwd_dest_type(enum mlx5_flow_destination_type type) type == MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER || type == MLX5_FLOW_DESTINATION_TYPE_TIR || type == MLX5_FLOW_DESTINATION_TYPE_RANGE || - type == MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE; + type == MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE || + type == MLX5_FLOW_DESTINATION_TYPE_VHCA_RX; } static bool check_valid_spec(const struct mlx5_flow_spec *spec) @@ -1890,7 +1891,9 @@ static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, d1->range.hit_ft == d2->range.hit_ft && d1->range.miss_ft == d2->range.miss_ft && d1->range.min == d2->range.min && - d1->range.max == d2->range.max)) + d1->range.max == d2->range.max) || + (d1->type == MLX5_FLOW_DESTINATION_TYPE_VHCA_RX && + d1->vhca.id == d2->vhca.id)) return true; } diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 9cadb1d5e6df..02064424e868 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -55,6 +55,7 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM, MLX5_FLOW_DESTINATION_TYPE_RANGE, MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE, + MLX5_FLOW_DESTINATION_TYPE_VHCA_RX, }; enum { @@ -189,6 +190,9 @@ struct mlx5_flow_destination { 
u32 ft_num; struct mlx5_flow_table *ft; struct mlx5_fc *counter; + struct { + u16 id; + } vhca; struct { u16 num; u16 vhca_id; -- cgit v1.2.3 From d6c9b4de8109a3b4ca9c6c6b7c5fbc42cfeff9ae Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:34 +0200 Subject: {net/RDMA}/mlx5: Add LAG demux table API and vport demux rules Downstream patches will introduce SW-only LAG (e.g. shared_fdb without HW LAG). In this mode the firmware cannot create the LAG demux table, but vport demuxing is still required. Move LAG demux flow-table ownership to the LAG layer and introduce APIs to init/cleanup the demux table and add/delete per-vport rules. Adjust the RDMA driver to use the new APIs. In this mode, the LAG layer will create a flow group that matches vport metadata. Vports that are not native to the LAG master eswitch add the demux rule during IB representor load and remove it on unload. The demux rule forwards traffic from said vports to their native eswitch manager via a new dest type - MLX5_FLOW_DESTINATION_TYPE_VHCA_RX. 
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-9-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/ib_rep.c | 20 ++- drivers/infiniband/hw/mlx5/main.c | 21 +-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 12 ++ .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 83 ++++++++++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 10 +- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 152 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 12 ++ include/linux/mlx5/fs.h | 6 +- include/linux/mlx5/lag.h | 10 ++ 10 files changed, 300 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index df8f049c5806..1709b628702e 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -10,11 +10,13 @@ static int mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, + struct mlx5_core_dev *rep_dev, struct mlx5_eswitch_rep *rep, int vport_index) { struct mlx5_ib_dev *ibdev; struct net_device *ndev; + int ret; ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB); if (!ibdev) @@ -24,7 +26,17 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, rep->rep_data[REP_IB].priv = ibdev; ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1); + ret = ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1); + if (ret) + return ret; + + /* Only Vports that are not native to the LAG master eswitch need to add + * demux rule. 
+ */ + if (mlx5_eswitch_get_total_vports(dev) > vport_index) + return 0; + + return mlx5_lag_demux_rule_add(rep_dev, rep->vport, vport_index); } static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev); @@ -131,7 +143,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) if (mlx5_lag_is_master(peer_dev)) lag_master = peer_dev; - else if (!mlx5_lag_is_mpesw(dev)) + else if (!mlx5_lag_is_mpesw(peer_dev)) /* Only 1 ib port is the representor for all uplinks */ peer_n_ports--; @@ -145,7 +157,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) if (rep->vport == MLX5_VPORT_UPLINK && !new_uplink) profile = &raw_eth_profile; else - return mlx5_ib_set_vport_rep(lag_master, rep, vport_index); + return mlx5_ib_set_vport_rep(lag_master, dev, rep, vport_index); if (mlx5_lag_is_shared_fdb(dev)) { ret = mlx5_ib_take_transport(lag_master); @@ -233,6 +245,8 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) vport_index = i; } + mlx5_lag_demux_rule_del(mdev, vport_index); + port = &dev->port[vport_index]; ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 635002e684a5..9fb0629978bd 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -3678,12 +3679,12 @@ static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev) static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) { + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev, - MLX5_FLOW_NAMESPACE_LAG); - struct mlx5_flow_table *ft; + struct mlx5_flow_namespace *ns; int err; + ns = mlx5_get_flow_namespace(mdev, MLX5_FLOW_NAMESPACE_LAG); if (!ns || !mlx5_lag_is_active(mdev)) return 0; @@ -3691,14 +3692,15 @@ static int mlx5_eth_lag_init(struct 
mlx5_ib_dev *dev) if (err) return err; - ft = mlx5_create_lag_demux_flow_table(ns, 0, 0); - if (IS_ERR(ft)) { - err = PTR_ERR(ft); + ft_attr.level = 0; + ft_attr.prio = 0; + ft_attr.max_fte = dev->num_ports; + + err = mlx5_lag_demux_init(mdev, &ft_attr); + if (err) goto err_destroy_vport_lag; - } mlx5e_lag_event_register(dev); - dev->flow_db->lag_demux_ft = ft; dev->lag_ports = mlx5_lag_get_num_ports(mdev); dev->lag_active = true; return 0; @@ -3716,8 +3718,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) dev->lag_active = false; mlx5e_lag_event_unregister(dev); - mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); - dev->flow_db->lag_demux_ft = NULL; + mlx5_lag_demux_cleanup(mdev); mlx5_cmd_destroy_vport_lag(mdev); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 4f4114d95130..3fc31415e107 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -306,7 +306,6 @@ struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio rdma_rx[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio opfcs[MLX5_IB_OPCOUNTER_MAX]; - struct mlx5_flow_table *lag_demux_ft; struct mlx5_ib_flow_prio *rdma_transport_rx[MLX5_RDMA_TRANSPORT_BYPASS_PRIO]; struct mlx5_ib_flow_prio *rdma_transport_tx[MLX5_RDMA_TRANSPORT_BYPASS_PRIO]; /* Protect flow steering bypass flow tables diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 96309a732d50..9b729789537f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -940,6 +940,12 @@ int mlx5_esw_ipsec_vf_packet_offload_supported(struct mlx5_core_dev *dev, u16 vport_num); bool mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev); void mlx5_eswitch_safe_aux_devs_remove(struct mlx5_core_dev *dev); +struct mlx5_flow_group * +mlx5_esw_lag_demux_fg_create(struct mlx5_eswitch 
*esw, + struct mlx5_flow_table *ft); +struct mlx5_flow_handle * +mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num, + struct mlx5_flow_table *lag_ft); #else /* CONFIG_MLX5_ESWITCH */ /* eswitch API stubs */ static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; } @@ -1025,6 +1031,12 @@ mlx5_esw_vport_vhca_id(struct mlx5_eswitch *esw, u16 vportn, u16 *vhca_id) static inline void mlx5_eswitch_safe_aux_devs_remove(struct mlx5_core_dev *dev) {} +static inline struct mlx5_flow_handle * +mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num, + struct mlx5_flow_table *lag_ft) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_MLX5_ESWITCH */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 90e6f97bdf4a..f98837470f39 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1459,6 +1459,83 @@ esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag) return flow_rule; } +struct mlx5_flow_group * +mlx5_esw_lag_demux_fg_create(struct mlx5_eswitch *esw, + struct mlx5_flow_table *ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + void *match_criteria; + void *flow_group_in; + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) + return ERR_PTR(-EOPNOTSUPP); + + if (IS_ERR(ft)) + return ERR_CAST(ft); + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return ERR_PTR(-ENOMEM); + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + ft->max_fte - 1); + + MLX5_SET(fte_match_param, match_criteria, + 
misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + + fg = mlx5_create_flow_group(ft, flow_group_in); + kvfree(flow_group_in); + if (IS_ERR(fg)) + esw_warn(esw->dev, "Can't create LAG demux flow group\n"); + + return fg; +} + +struct mlx5_flow_handle * +mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num, + struct mlx5_flow_table *lag_ft) +{ + struct mlx5_flow_spec *spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *ret; + void *misc; + + if (!spec) + return ERR_PTR(-ENOMEM); + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) { + kvfree(spec); + return ERR_PTR(-EOPNOTSUPP); + } + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(esw, vport_num)); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest.type = MLX5_FLOW_DESTINATION_TYPE_VHCA_RX; + dest.vhca.id = MLX5_CAP_GEN(esw->dev, vhca_id); + + ret = mlx5_add_flow_rules(lag_ft, spec, &flow_act, &dest, 1); + kvfree(spec); + return ret; +} + #define MAX_PF_SQ 256 #define MAX_SQ_NVPORTS 32 @@ -2047,7 +2124,8 @@ static int esw_create_vport_rx_group(struct mlx5_eswitch *esw) if (IS_ERR(g)) { err = PTR_ERR(g); - mlx5_core_warn(esw->dev, "Failed to create vport rx group err %d\n", err); + esw_warn(esw->dev, "Failed to create vport rx group err %d\n", + err); goto out; } @@ -2092,7 +2170,8 @@ static int esw_create_vport_rx_drop_group(struct mlx5_eswitch *esw) if (IS_ERR(g)) { err = PTR_ERR(g); - mlx5_core_warn(esw->dev, "Failed to create vport rx drop group err %d\n", err); + 
esw_warn(esw->dev, + "Failed to create vport rx drop group err %d\n", err); goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 003d211306a7..61a6ba1e49dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1438,15 +1438,9 @@ mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table* mlx5_create_lag_demux_flow_table(struct mlx5_flow_namespace *ns, - int prio, u32 level) + struct mlx5_flow_table_attr *ft_attr) { - struct mlx5_flow_table_attr ft_attr = {}; - - ft_attr.level = level; - ft_attr.prio = prio; - ft_attr.max_fte = 1; - - return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_LAG_DEMUX, 0); + return __mlx5_create_flow_table(ns, ft_attr, FS_FT_OP_MOD_LAG_DEMUX, 0); } EXPORT_SYMBOL(mlx5_create_lag_demux_flow_table); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 51ec8f61ecbb..449e4bd86c06 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -1471,6 +1471,158 @@ struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev) return devcom; } +static int mlx5_lag_demux_ft_fg_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr, + struct mlx5_lag *ldev) +{ +#ifdef CONFIG_MLX5_ESWITCH + struct mlx5_flow_namespace *ns; + struct mlx5_flow_group *fg; + int err; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_LAG); + if (!ns) + return 0; + + ldev->lag_demux_ft = mlx5_create_flow_table(ns, ft_attr); + if (IS_ERR(ldev->lag_demux_ft)) + return PTR_ERR(ldev->lag_demux_ft); + + fg = mlx5_esw_lag_demux_fg_create(dev->priv.eswitch, + ldev->lag_demux_ft); + if (IS_ERR(fg)) { + err = PTR_ERR(fg); + mlx5_destroy_flow_table(ldev->lag_demux_ft); + ldev->lag_demux_ft = NULL; + return err; + } + + ldev->lag_demux_fg = fg; 
+ return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int mlx5_lag_demux_fw_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr, + struct mlx5_lag *ldev) +{ + struct mlx5_flow_namespace *ns; + int err; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_LAG); + if (!ns) + return 0; + + ldev->lag_demux_fg = NULL; + ft_attr->max_fte = 1; + ldev->lag_demux_ft = mlx5_create_lag_demux_flow_table(ns, ft_attr); + if (IS_ERR(ldev->lag_demux_ft)) { + err = PTR_ERR(ldev->lag_demux_ft); + ldev->lag_demux_ft = NULL; + return err; + } + + return 0; +} + +int mlx5_lag_demux_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr) +{ + struct mlx5_lag *ldev; + + if (!ft_attr) + return -EINVAL; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return -ENODEV; + + xa_init(&ldev->lag_demux_rules); + + if (mlx5_get_sd(dev)) + return mlx5_lag_demux_ft_fg_init(dev, ft_attr, ldev); + + return mlx5_lag_demux_fw_init(dev, ft_attr, ldev); +} +EXPORT_SYMBOL(mlx5_lag_demux_init); + +void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_handle *rule; + struct mlx5_lag *ldev; + unsigned long vport_num; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return; + + xa_for_each(&ldev->lag_demux_rules, vport_num, rule) + mlx5_del_flow_rules(rule); + xa_destroy(&ldev->lag_demux_rules); + + if (ldev->lag_demux_fg) + mlx5_destroy_flow_group(ldev->lag_demux_fg); + if (ldev->lag_demux_ft) + mlx5_destroy_flow_table(ldev->lag_demux_ft); + ldev->lag_demux_fg = NULL; + ldev->lag_demux_ft = NULL; +} +EXPORT_SYMBOL(mlx5_lag_demux_cleanup); + +int mlx5_lag_demux_rule_add(struct mlx5_core_dev *vport_dev, u16 vport_num, + int index) +{ + struct mlx5_flow_handle *rule; + struct mlx5_lag *ldev; + int err; + + ldev = mlx5_lag_dev(vport_dev); + if (!ldev || !ldev->lag_demux_fg) + return 0; + + if (xa_load(&ldev->lag_demux_rules, index)) + return 0; + + rule = mlx5_esw_lag_demux_rule_create(vport_dev->priv.eswitch, + vport_num, ldev->lag_demux_ft); + if 
(IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_warn(vport_dev, + "Failed to create LAG demux rule for vport %u, err %d\n", + vport_num, err); + return err; + } + + err = xa_err(xa_store(&ldev->lag_demux_rules, index, rule, + GFP_KERNEL)); + if (err) { + mlx5_del_flow_rules(rule); + mlx5_core_warn(vport_dev, + "Failed to store LAG demux rule for vport %u, err %d\n", + vport_num, err); + } + + return err; +} +EXPORT_SYMBOL(mlx5_lag_demux_rule_add); + +void mlx5_lag_demux_rule_del(struct mlx5_core_dev *dev, int index) +{ + struct mlx5_flow_handle *rule; + struct mlx5_lag *ldev; + + ldev = mlx5_lag_dev(dev); + if (!ldev || !ldev->lag_demux_fg) + return; + + rule = xa_erase(&ldev->lag_demux_rules, index); + if (rule) + mlx5_del_flow_rules(rule); +} +EXPORT_SYMBOL(mlx5_lag_demux_rule_del); + static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay) { queue_delayed_work(ldev->wq, &ldev->bond_work, delay); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 30cbd61768f8..6c911374f409 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -5,6 +5,9 @@ #define __MLX5_LAG_H__ #include +#include +#include +#include #define MLX5_LAG_MAX_HASH_BUCKETS 16 /* XArray mark for the LAG master device @@ -83,6 +86,9 @@ struct mlx5_lag { /* Protect lag fields/state changes */ struct mutex lock; struct lag_mpesw lag_mpesw; + struct mlx5_flow_table *lag_demux_ft; + struct mlx5_flow_group *lag_demux_fg; + struct xarray lag_demux_rules; }; static inline struct mlx5_lag * @@ -133,6 +139,12 @@ mlx5_lag_is_ready(struct mlx5_lag *ldev) bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev); bool mlx5_lag_check_prereq(struct mlx5_lag *ldev); +int mlx5_lag_demux_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr); +void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev); +int mlx5_lag_demux_rule_add(struct 
mlx5_core_dev *dev, u16 vport_num, + int vport_index); +void mlx5_lag_demux_rule_del(struct mlx5_core_dev *dev, int vport_index); void mlx5_modify_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker); int mlx5_activate_lag(struct mlx5_lag *ldev, diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 02064424e868..d8f3b7ef319e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -252,9 +252,9 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table * mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table_attr *ft_attr, u16 vport); -struct mlx5_flow_table *mlx5_create_lag_demux_flow_table( - struct mlx5_flow_namespace *ns, - int prio, u32 level); +struct mlx5_flow_table * +mlx5_create_lag_demux_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr); int mlx5_destroy_flow_table(struct mlx5_flow_table *ft); /* inbox should be set with the following values: diff --git a/include/linux/mlx5/lag.h b/include/linux/mlx5/lag.h index d370dfd19055..ab9f754664e5 100644 --- a/include/linux/mlx5/lag.h +++ b/include/linux/mlx5/lag.h @@ -4,8 +4,18 @@ #ifndef __MLX5_LAG_API_H__ #define __MLX5_LAG_API_H__ +#include + struct mlx5_core_dev; +struct mlx5_flow_table; +struct mlx5_flow_table_attr; +int mlx5_lag_demux_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr); +void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev); +int mlx5_lag_demux_rule_add(struct mlx5_core_dev *dev, u16 vport_num, + int vport_index); +void mlx5_lag_demux_rule_del(struct mlx5_core_dev *dev, int vport_index); int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev); #endif /* __MLX5_LAG_API_H__ */ -- cgit v1.2.3 From 4dd2115f43594da5271a1aa34fde6719b4259047 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 9 Mar 2026 11:34:35 +0200 Subject: net/mlx5: Expose MLX5_UMR_ALIGN definition Expose HW constant value in a shared header, to be used by core/EN drivers. 
Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-10-tariqt@nvidia.com Reviewed-by: Dragos Tatulea Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 1 - include/linux/mlx5/device.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 665323b90b64..ff56948597dd 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -51,7 +51,6 @@ enum { }; #define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4 -#define MLX5_UMR_ALIGN 2048 static void create_mkey_callback(int status, struct mlx5_async_work *context); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 25c6b42140b2..07a25f264292 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -293,6 +293,7 @@ enum { MLX5_UMR_INLINE = (1 << 7), }; +#define MLX5_UMR_ALIGN (2048) #define MLX5_UMR_FLEX_ALIGNMENT 0x40 #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_klm)) -- cgit v1.2.3 From f1a424e21c15993db0f9594cda17ef5d516ab3e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 14 Mar 2026 08:41:04 -0600 Subject: io_uring: switch struct io_ring_ctx internal bitfields to flags Bitfields cannot be set and checked atomically, and this makes it more clear that these are indeed in shared storage and must be checked and set in a sane fashion. This is in preparation for annotating a few of the known racy, but harmless, flags checking. No intended functional changes in this patch. 
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 34 ++++++++++-------- io_uring/eventfd.c | 4 +-- io_uring/io_uring.c | 82 +++++++++++++++++++++--------------------- io_uring/io_uring.h | 9 ++--- io_uring/msg_ring.c | 2 +- io_uring/register.c | 8 ++--- io_uring/rsrc.c | 8 ++--- io_uring/tctx.c | 2 +- io_uring/timeout.c | 4 +-- io_uring/tw.c | 2 +- 10 files changed, 82 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index dd1420bfcb73..0b3f08adc217 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -268,24 +268,30 @@ struct io_alloc_cache { unsigned int init_clear; }; +enum { + IO_RING_F_DRAIN_NEXT = BIT(0), + IO_RING_F_OP_RESTRICTED = BIT(1), + IO_RING_F_REG_RESTRICTED = BIT(2), + IO_RING_F_OFF_TIMEOUT_USED = BIT(3), + IO_RING_F_DRAIN_ACTIVE = BIT(4), + IO_RING_F_HAS_EVFD = BIT(5), + /* all CQEs should be posted only by the submitter task */ + IO_RING_F_TASK_COMPLETE = BIT(6), + IO_RING_F_LOCKLESS_CQ = BIT(7), + IO_RING_F_SYSCALL_IOPOLL = BIT(8), + IO_RING_F_POLL_ACTIVATED = BIT(9), + IO_RING_F_DRAIN_DISABLED = BIT(10), + IO_RING_F_COMPAT = BIT(11), + IO_RING_F_IOWQ_LIMITS_SET = BIT(12), +}; + struct io_ring_ctx { /* const or read-mostly hot data */ struct { + /* ring setup flags */ unsigned int flags; - unsigned int drain_next: 1; - unsigned int op_restricted: 1; - unsigned int reg_restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int has_evfd: 1; - /* all CQEs should be posted only by the submitter task */ - unsigned int task_complete: 1; - unsigned int lockless_cq: 1; - unsigned int syscall_iopoll: 1; - unsigned int poll_activated: 1; - unsigned int drain_disabled: 1; - unsigned int compat: 1; - unsigned int iowq_limits_set : 1; + /* internal state flags IO_RING_F_* flags , mostly read-only */ + unsigned int int_flags; struct task_struct *submitter_task; 
struct io_rings *rings; diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 7482a7dc6b38..3da028500f76 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -148,7 +148,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, spin_unlock(&ctx->completion_lock); ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; + ctx->int_flags |= IO_RING_F_HAS_EVFD; refcount_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); rcu_assign_pointer(ctx->io_ev_fd, ev_fd); @@ -162,7 +162,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); if (ev_fd) { - ctx->has_evfd = false; + ctx->int_flags &= ~IO_RING_F_HAS_EVFD; rcu_assign_pointer(ctx->io_ev_fd, NULL); io_eventfd_put(ev_fd); return 0; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9a37035e76c0..bfeb3bc3849d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -477,17 +477,17 @@ static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (ctx->poll_activated) + if (ctx->int_flags & IO_RING_F_POLL_ACTIVATED) io_poll_wq_wake(ctx); - if (ctx->off_timeout_used) + if (ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED) io_flush_timeouts(ctx); - if (ctx->has_evfd) + if (ctx->int_flags & IO_RING_F_HAS_EVFD) io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) { - if (!ctx->lockless_cq) + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) spin_lock(&ctx->completion_lock); } @@ -500,11 +500,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); - if (!ctx->task_complete) { - if (!ctx->lockless_cq) + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) spin_unlock(&ctx->completion_lock); /* IOPOLL rings only need to wake up if it's also SQPOLL */ - if 
(!ctx->syscall_iopoll) + if (!(ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL)) io_cqring_wake(ctx); } io_commit_cqring_flush(ctx); @@ -830,7 +830,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { lockdep_assert_held(&ctx->uring_lock); - lockdep_assert(ctx->lockless_cq); + lockdep_assert(ctx->int_flags & IO_RING_F_LOCKLESS_CQ); if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { struct io_cqe cqe = io_init_cqe(user_data, res, cflags); @@ -860,7 +860,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - if (!ctx->lockless_cq) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { spin_lock(&ctx->completion_lock); posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); spin_unlock(&ctx->completion_lock); @@ -885,7 +885,7 @@ bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2]) lockdep_assert_held(&ctx->uring_lock); cqe[0].user_data = req->cqe.user_data; - if (!ctx->lockless_cq) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { spin_lock(&ctx->completion_lock); posted = io_fill_cqe_aux32(ctx, cqe); spin_unlock(&ctx->completion_lock); @@ -913,7 +913,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. 
*/ - if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { + if ((ctx->int_flags & IO_RING_F_LOCKLESS_CQ) || (req->flags & REQ_F_REISSUE)) { defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); @@ -1135,7 +1135,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - if (ctx->lockless_cq) + if (ctx->int_flags & IO_RING_F_LOCKLESS_CQ) io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); else io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); @@ -1148,7 +1148,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) INIT_WQ_LIST(&state->compl_reqs); } - if (unlikely(ctx->drain_active)) + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) io_queue_deferred(ctx); ctx->submit_state.cq_flush = false; @@ -1344,7 +1344,7 @@ static __cold void io_drain_req(struct io_kiocb *req) list_add_tail(&de->list, &ctx->defer_list); io_queue_deferred(ctx); if (!drain && list_empty(&ctx->defer_list)) - ctx->drain_active = false; + ctx->int_flags &= ~IO_RING_F_DRAIN_ACTIVE; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -1655,7 +1655,7 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) } else { /* can't fail with IO_URING_F_INLINE */ io_req_sqe_copy(req, IO_URING_F_INLINE); - if (unlikely(req->ctx->drain_active)) + if (unlikely(req->ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) io_drain_req(req); else io_queue_iowq(req); @@ -1671,7 +1671,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (!ctx->op_restricted) + if (!(ctx->int_flags & IO_RING_F_OP_RESTRICTED)) return true; if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -1691,7 +1691,7 @@ static void io_init_drain(struct io_ring_ctx *ctx) { struct io_kiocb *head = ctx->submit_state.link.head; - ctx->drain_active = true; + ctx->int_flags |= 
IO_RING_F_DRAIN_ACTIVE; if (head) { /* * If we need to drain a request in the middle of a link, drain @@ -1701,7 +1701,7 @@ static void io_init_drain(struct io_ring_ctx *ctx) * link. */ head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; - ctx->drain_next = true; + ctx->int_flags |= IO_RING_F_DRAIN_NEXT; } } @@ -1767,23 +1767,23 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) - ctx->drain_disabled = true; + ctx->int_flags |= IO_RING_F_DRAIN_DISABLED; if (sqe_flags & IOSQE_IO_DRAIN) { - if (ctx->drain_disabled) + if (ctx->int_flags & IO_RING_F_DRAIN_DISABLED) return io_init_fail_req(req, -EOPNOTSUPP); io_init_drain(ctx); } } - if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) { + if (unlikely(ctx->int_flags & (IO_RING_F_OP_RESTRICTED | IO_RING_F_DRAIN_ACTIVE | IO_RING_F_DRAIN_NEXT))) { if (!io_check_restriction(ctx, req, sqe_flags)) return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ - if (ctx->drain_active) + if (ctx->int_flags & IO_RING_F_DRAIN_ACTIVE) req->flags |= REQ_F_FORCE_ASYNC; /* if there is no link, we're at "next" request and need to drain */ - if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { - ctx->drain_next = false; - ctx->drain_active = true; + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_NEXT) && !ctx->submit_state.link.head) { + ctx->int_flags &= ~IO_RING_F_DRAIN_NEXT; + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; } } @@ -2204,7 +2204,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb) poll_wq_task_work); mutex_lock(&ctx->uring_lock); - ctx->poll_activated = true; + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; mutex_unlock(&ctx->uring_lock); /* @@ -2219,9 +2219,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx) { spin_lock(&ctx->completion_lock); /* already activated or in 
progress */ - if (ctx->poll_activated || ctx->poll_wq_task_work.func) + if ((ctx->int_flags & IO_RING_F_POLL_ACTIVATED) || ctx->poll_wq_task_work.func) goto out; - if (WARN_ON_ONCE(!ctx->task_complete)) + if (WARN_ON_ONCE(!(ctx->int_flags & IO_RING_F_TASK_COMPLETE))) goto out; if (!ctx->submitter_task) goto out; @@ -2242,7 +2242,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - if (unlikely(!ctx->poll_activated)) + if (unlikely(!(ctx->int_flags & IO_RING_F_POLL_ACTIVATED))) io_activate_pollwq(ctx); /* * provides mb() which pairs with barrier from wq_has_sleeper @@ -2607,7 +2607,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { - if (ctx->syscall_iopoll) + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) goto iopoll_locked; /* * Ignore errors, we'll soon call io_cqring_wait() and @@ -2622,7 +2622,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (flags & IORING_ENTER_GETEVENTS) { int ret2; - if (ctx->syscall_iopoll) { + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) { /* * We disallow the app entering submit/complete with * polling, but we still need to lock the ring to @@ -2923,9 +2923,9 @@ static void io_ctx_restriction_clone(struct io_ring_ctx *ctx, if (dst->bpf_filters) WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters); if (dst->op_registered) - ctx->op_restricted = 1; + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; if (dst->reg_registered) - ctx->reg_restricted = 1; + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; } static __cold int io_uring_create(struct io_ctx_config *config) @@ -2952,17 +2952,18 @@ static __cold int io_uring_create(struct io_ctx_config *config) if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL)) - ctx->task_complete = true; + ctx->int_flags |= IO_RING_F_TASK_COMPLETE; - if (ctx->task_complete || (ctx->flags & 
IORING_SETUP_IOPOLL)) - ctx->lockless_cq = true; + if ((ctx->int_flags & IO_RING_F_TASK_COMPLETE) || + (ctx->flags & IORING_SETUP_IOPOLL)) + ctx->int_flags |= IO_RING_F_LOCKLESS_CQ; /* * lazy poll_wq activation relies on ->task_complete for synchronisation * purposes, see io_activate_pollwq() */ - if (!ctx->task_complete) - ctx->poll_activated = true; + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user @@ -2972,9 +2973,10 @@ static __cold int io_uring_create(struct io_ctx_config *config) */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) - ctx->syscall_iopoll = 1; + ctx->int_flags |= IO_RING_F_SYSCALL_IOPOLL; - ctx->compat = in_compat_syscall(); + if (in_compat_syscall()) + ctx->int_flags |= IO_RING_F_COMPAT; if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 0fa844faf287..5cb1983043cd 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -211,7 +211,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) if (ctx->flags & IORING_SETUP_IOPOLL) { lockdep_assert_held(&ctx->uring_lock); - } else if (!ctx->task_complete) { + } else if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { lockdep_assert_held(&ctx->completion_lock); } else if (ctx->submitter_task) { /* @@ -228,7 +228,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) static inline bool io_is_compat(struct io_ring_ctx *ctx) { - return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->int_flags & IO_RING_F_COMPAT); } static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) @@ -472,8 +472,9 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used 
|| - ctx->has_evfd || ctx->poll_activated)) + if (unlikely(ctx->int_flags & (IO_RING_F_OFF_TIMEOUT_USED | + IO_RING_F_HAS_EVFD | + IO_RING_F_POLL_ACTIVATED))) __io_commit_cqring_flush(ctx); } diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 57ad0085869a..3ff9098573db 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -67,7 +67,7 @@ void io_msg_ring_cleanup(struct io_kiocb *req) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { - return target_ctx->task_complete; + return target_ctx->int_flags & IO_RING_F_TASK_COMPLETE; } static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) diff --git a/io_uring/register.c b/io_uring/register.c index 0148735f7711..489a6feaf228 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -184,9 +184,9 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, return ret; } if (ctx->restrictions.op_registered) - ctx->op_restricted = 1; + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; if (ctx->restrictions.reg_registered) - ctx->reg_restricted = 1; + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; return 0; } @@ -384,7 +384,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i]) ctx->iowq_limits[i] = new_count[i]; - ctx->iowq_limits_set = true; + ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET; if (tctx && tctx->io_wq) { ret = io_wq_max_workers(tctx->io_wq, new_count); @@ -725,7 +725,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; - if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { + if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 
4fa59bf89bba..52554ed89b11 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -295,7 +295,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, u64 tag = 0; uvec = u64_to_user_ptr(user_data); - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); if (IS_ERR(iov)) { err = PTR_ERR(iov); break; @@ -319,7 +319,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = array_index_nospec(up->offset + done, ctx->buf_table.nr); io_reset_rsrc_node(ctx, &ctx->buf_table, i); ctx->buf_table.nodes[i] = node; - if (ctx->compat) + if (io_is_compat(ctx)) user_data += sizeof(struct compat_iovec); else user_data += sizeof(struct iovec); @@ -883,12 +883,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, if (arg) { uvec = (struct iovec __user *) arg; - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); if (IS_ERR(iov)) { ret = PTR_ERR(iov); break; } - if (ctx->compat) + if (io_is_compat(ctx)) arg += sizeof(struct compat_iovec); else arg += sizeof(struct iovec); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 7cbcb82aedfb..143de8e990eb 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -121,7 +121,7 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) return ret; tctx = current->io_uring; - if (ctx->iowq_limits_set) { + if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) { unsigned int limits[2] = { ctx->iowq_limits[0], ctx->iowq_limits[1], }; diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 8eddf8add7a2..579fdddac71a 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -566,8 +566,8 @@ static int __io_timeout_prep(struct io_kiocb *req, INIT_LIST_HEAD(&timeout->list); timeout->off = off; - if (unlikely(off && !req->ctx->off_timeout_used)) - req->ctx->off_timeout_used = true; + if (unlikely(off && !(req->ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED))) + 
req->ctx->int_flags |= IO_RING_F_OFF_TIMEOUT_USED; /* * for multishot reqs w/ fixed nr of repeats, repeats tracks the * remaining nr diff --git a/io_uring/tw.c b/io_uring/tw.c index 2f2b4ac4b126..022fe8753c19 100644 --- a/io_uring/tw.c +++ b/io_uring/tw.c @@ -222,7 +222,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (!head) { io_ctx_mark_taskrun(ctx); - if (ctx->has_evfd) + if (ctx->int_flags & IO_RING_F_HAS_EVFD) io_eventfd_signal(ctx, false); } -- cgit v1.2.3 From 9165dc4fa969b64c2d4396ee4e1546a719978dd1 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 2 Mar 2026 10:29:10 -0700 Subject: io_uring: add REQ_F_IOPOLL A subsequent commit will allow uring_cmds to files that don't implement ->uring_cmd_iopoll() to be issued to IORING_SETUP_IOPOLL io_urings. This means the ctx's IORING_SETUP_IOPOLL flag isn't sufficient to determine whether a given request needs to be iopolled. Introduce a request flag REQ_F_IOPOLL set in ->issue() if a request needs to be iopolled to completion. Set the flag in io_rw_init_file() and io_uring_cmd() for requests issued to IORING_SETUP_IOPOLL ctxs. Use the request flag instead of IORING_SETUP_IOPOLL in places dealing with a specific request. A future possibility would be to add an option to enable/disable iopoll in the io_uring SQE instead of determining it from IORING_SETUP_IOPOLL. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Kanchan Joshi Reviewed-by: Anuj Gupta Link: https://patch.msgid.link/20260302172914.2488599-2-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ io_uring/io_uring.c | 9 ++++----- io_uring/rw.c | 11 ++++++----- io_uring/uring_cmd.c | 5 +++-- 4 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0b3f08adc217..4dbd7083dd54 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -550,6 +550,7 @@ enum { REQ_F_HAS_METADATA_BIT, REQ_F_IMPORT_BUFFER_BIT, REQ_F_SQE_COPIED_BIT, + REQ_F_IOPOLL_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -641,6 +642,8 @@ enum { REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT), /* ->sqe_copy() has been called, if necessary */ REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), + /* request must be iopolled to completion (set in ->issue()) */ + REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT), }; struct io_tw_req { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fb5a263706be..a610eaa5fd7c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -356,7 +356,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { req->flags |= REQ_F_CREDS; @@ -378,7 +377,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (should_hash && (req->file->f_flags & O_DIRECT) && (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) should_hash = false; - if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) + if (should_hash || (req->flags & REQ_F_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || 
!S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) @@ -1419,7 +1418,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ret = 0; /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) + if ((req->flags & REQ_F_IOPOLL) && def->iopoll_queue) io_iopoll_req_issued(req, issue_flags); } return ret; @@ -1435,7 +1434,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) io_tw_lock(req->ctx, tw); WARN_ON_ONCE(!req->file); - if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL)) return -EFAULT; ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); @@ -1533,7 +1532,7 @@ fail: * wait for request slots on the block side. */ if (!needs_poll) { - if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (!(req->flags & REQ_F_IOPOLL)) break; if (io_wq_worker_stopped()) break; diff --git a/io_uring/rw.c b/io_uring/rw.c index 1a5f262734e8..3bdb9914e673 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -504,7 +504,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req) if (!S_ISBLK(mode) && !S_ISREG(mode)) return false; if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && - !(ctx->flags & IORING_SETUP_IOPOLL))) + !(req->flags & REQ_F_IOPOLL))) return false; /* * If ref is dying, we might be running poll reap from the exit work. 
@@ -640,7 +640,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) } } - if (req->ctx->flags & IORING_SETUP_IOPOLL) + if (req->flags & REQ_F_IOPOLL) io_complete_rw_iopoll(&rw->kiocb, ret); else io_complete_rw(&rw->kiocb, ret); @@ -654,7 +654,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) { u32 cflags = 0; __io_complete_rw_common(req, ret); @@ -876,6 +876,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; + req->flags |= REQ_F_IOPOLL; kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; req->iopoll_completed = 0; @@ -963,7 +964,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, if (io_file_can_poll(req)) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (!force_nonblock && !(req->flags & REQ_F_IOPOLL)) goto done; /* no retry on NONBLOCK nor RWF_NOWAIT */ if (req->flags & REQ_F_NOWAIT) @@ -1188,7 +1189,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) + if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL)) goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index ee7b49f47cb5..b651c63f6e20 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -110,7 +110,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, * because iopoll completion data overlaps with the hash_node used * for tracking. 
*/ - if (ctx->flags & IORING_SETUP_IOPOLL) + if (req->flags & REQ_F_IOPOLL) return; if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { @@ -167,7 +167,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, io_req_set_cqe32_extra(req, res2, 0); } io_req_uring_cleanup(req, issue_flags); - if (req->ctx->flags & IORING_SETUP_IOPOLL) { + if (req->flags & REQ_F_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { @@ -260,6 +260,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!file->f_op->uring_cmd_iopoll) return -EOPNOTSUPP; + req->flags |= REQ_F_IOPOLL; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { -- cgit v1.2.3 From 033af2b3eb19c5ed96825572105bca3611635ada Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Feb 2026 12:48:38 +0000 Subject: io_uring: introduce callback driven main loop The io_uring_enter() has a fixed order of execution: it submits requests, waits for completions, and returns to the user. Allow to optionally replace it with a custom loop driven by a callback called loop_step. The basic requirements to the callback is that it should be able to submit requests, wait for completions, parse them and repeat. Most of the communication including parameter passing can be implemented via shared memory. The callback should return IOU_LOOP_CONTINUE to continue execution or IOU_LOOP_STOP to return to the user space. Note that the kernel may decide to prematurely terminate it as well, e.g. in case the process was signalled or killed. The hook takes a structure with parameters. It can be used to ask the kernel to wait for CQEs by setting cq_wait_idx to the CQE index it wants to wait for. Spurious wake ups are possible and even likely, the callback is expected to handle it. 
There will be more parameters in the future like timeout. It can be used with kernel callbacks, for example, as a slow path deprecation mechanism overwriting SQEs and emulating the wanted behaviour, however it's more useful together with BPF programs implemented in following patches. Note that keeping it separately from the normal io_uring wait loop makes things much simpler and cleaner. It keeps it in one place instead of spreading a bunch of checks in different places including disabling the submission path. It holds the lock by default, which is a better fit for BPF synchronisation and the loop execution model. It nicely avoids existing quirks like forced wake ups on timeout request completion. And it should be easier to implement new features. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/a2d369aa1c9dd23ad7edac9220cffc563abcaed6.1772109579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++ io_uring/Makefile | 2 +- io_uring/io_uring.c | 11 +++++ io_uring/loop.c | 91 ++++++++++++++++++++++++++++++++++++++++++ io_uring/loop.h | 27 +++++++++++++ io_uring/wait.h | 1 + 6 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 io_uring/loop.c create mode 100644 io_uring/loop.h (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4dbd7083dd54..344b634b8989 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -41,6 +41,8 @@ enum io_uring_cmd_flags { IO_URING_F_COMPAT = (1 << 12), }; +struct iou_loop_params; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -361,6 +363,9 @@ struct io_ring_ctx { struct io_alloc_cache rw_cache; struct io_alloc_cache cmd_cache; + int (*loop_step)(struct io_ring_ctx *ctx, + struct iou_loop_params *); + /* * Any cancelable uring_cmd is added to this list in * ->uring_cmd() by io_uring_cmd_insert_cancelable() diff --git a/io_uring/Makefile b/io_uring/Makefile index 
931f9156132a..1c1f47de32a4 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ advise.o openclose.o statx.o timeout.o \ cancel.o waitid.o register.o \ truncate.o memmap.o alloc_cache.o \ - query.o + query.o loop.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 74cd62b44d94..960d36c49ffe 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "eventfd.h" #include "wait.h" #include "bpf_filter.h" +#include "loop.h" #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -588,6 +589,11 @@ void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } +void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx) +{ + __io_cqring_overflow_flush(ctx, false); +} + /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct io_kiocb *req) { @@ -2571,6 +2577,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED)) goto out; + if (io_has_loop_ops(ctx)) { + ret = io_run_loop(ctx); + goto out; + } + /* * For SQ polling, the thread will do all submissions and completions. 
* Just return the requested submit count, and wake the thread if diff --git a/io_uring/loop.c b/io_uring/loop.c new file mode 100644 index 000000000000..31843cc3e451 --- /dev/null +++ b/io_uring/loop.c @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include "io_uring.h" +#include "wait.h" +#include "loop.h" + +static inline int io_loop_nr_cqes(const struct io_ring_ctx *ctx, + const struct iou_loop_params *lp) +{ + return lp->cq_wait_idx - READ_ONCE(ctx->rings->cq.tail); +} + +static inline void io_loop_wait_start(struct io_ring_ctx *ctx, unsigned nr_wait) +{ + atomic_set(&ctx->cq_wait_nr, nr_wait); + set_current_state(TASK_INTERRUPTIBLE); +} + +static inline void io_loop_wait_finish(struct io_ring_ctx *ctx) +{ + __set_current_state(TASK_RUNNING); + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); +} + +static void io_loop_wait(struct io_ring_ctx *ctx, struct iou_loop_params *lp, + unsigned nr_wait) +{ + io_loop_wait_start(ctx, nr_wait); + + if (unlikely(io_local_work_pending(ctx) || + io_loop_nr_cqes(ctx, lp) <= 0) || + READ_ONCE(ctx->check_cq)) { + io_loop_wait_finish(ctx); + return; + } + + mutex_unlock(&ctx->uring_lock); + schedule(); + io_loop_wait_finish(ctx); + mutex_lock(&ctx->uring_lock); +} + +static int __io_run_loop(struct io_ring_ctx *ctx) +{ + struct iou_loop_params lp = {}; + + while (true) { + int nr_wait, step_res; + + if (unlikely(!ctx->loop_step)) + return -EFAULT; + + step_res = ctx->loop_step(ctx, &lp); + if (step_res == IOU_LOOP_STOP) + break; + if (step_res != IOU_LOOP_CONTINUE) + return -EINVAL; + + nr_wait = io_loop_nr_cqes(ctx, &lp); + if (nr_wait > 0) + io_loop_wait(ctx, &lp, nr_wait); + else + nr_wait = 0; + + if (task_work_pending(current)) { + mutex_unlock(&ctx->uring_lock); + io_run_task_work(); + mutex_lock(&ctx->uring_lock); + } + if (unlikely(task_sigpending(current))) + return -EINTR; + io_run_local_work_locked(ctx, nr_wait); + + if (READ_ONCE(ctx->check_cq) & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + 
io_cqring_overflow_flush_locked(ctx); + } + + return 0; +} + +int io_run_loop(struct io_ring_ctx *ctx) +{ + int ret; + + if (!io_allowed_run_tw(ctx)) + return -EEXIST; + + mutex_lock(&ctx->uring_lock); + ret = __io_run_loop(ctx); + mutex_unlock(&ctx->uring_lock); + return ret; +} diff --git a/io_uring/loop.h b/io_uring/loop.h new file mode 100644 index 000000000000..d7718b9ce61e --- /dev/null +++ b/io_uring/loop.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_LOOP_H +#define IOU_LOOP_H + +#include + +struct iou_loop_params { + /* + * The CQE index to wait for. Only serves as a hint and can still be + * woken up earlier. + */ + __u32 cq_wait_idx; +}; + +enum { + IOU_LOOP_CONTINUE = 0, + IOU_LOOP_STOP, +}; + +static inline bool io_has_loop_ops(struct io_ring_ctx *ctx) +{ + return data_race(ctx->loop_step); +} + +int io_run_loop(struct io_ring_ctx *ctx); + +#endif diff --git a/io_uring/wait.h b/io_uring/wait.h index 5e236f74e1af..037e512dd80c 100644 --- a/io_uring/wait.h +++ b/io_uring/wait.h @@ -25,6 +25,7 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, struct ext_arg *ext_arg); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx); +void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx); static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { -- cgit v1.2.3 From 98f37634b12b17ad5c56db8fb63cf9d7dc55d74c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Feb 2026 12:48:41 +0000 Subject: io_uring/bpf-ops: implement bpf ops registration Implement BPF struct ops registration. It's registered off the BPF path, and can be removed by BPF as well as io_uring. To protect it, introduce a global lock synchronising registration. ctx->uring_lock can be nested under it. ctx->bpf_ops is write protected by both locks and so it's safe to read it under either of them. 
Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/1f46bffd76008de49cbafa2ad77d348810a4f69e.1772109579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++ io_uring/bpf-ops.c | 92 +++++++++++++++++++++++++++++++++++++++++- io_uring/bpf-ops.h | 8 ++++ io_uring/io_uring.c | 1 + 4 files changed, 104 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 344b634b8989..28e5dbdac55b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -8,6 +8,9 @@ #include #include +struct iou_loop_params; +struct io_uring_bpf_ops; + enum { /* * A hint to not wake right away but delay until there are enough of @@ -488,6 +491,8 @@ struct io_ring_ctx { DECLARE_HASHTABLE(napi_ht, 4); #endif + struct io_uring_bpf_ops *bpf_ops; + /* * Protection for resize vs mmap races - both the mmap and resize * side will need to grab this lock, to prevent either side from diff --git a/io_uring/bpf-ops.c b/io_uring/bpf-ops.c index 17518f4ecca9..e4b244337aa9 100644 --- a/io_uring/bpf-ops.c +++ b/io_uring/bpf-ops.c @@ -5,10 +5,11 @@ #include "io_uring.h" #include "register.h" +#include "loop.h" #include "memmap.h" #include "bpf-ops.h" -#include "loop.h" +static DEFINE_MUTEX(io_bpf_ctrl_mutex); static const struct btf_type *loop_params_type; __bpf_kfunc_start_defs(); @@ -143,16 +144,103 @@ static int bpf_io_init_member(const struct btf_type *t, const struct btf_member *member, void *kdata, const void *udata) { + u32 moff = __btf_member_bit_offset(t, member) / 8; + const struct io_uring_bpf_ops *uops = udata; + struct io_uring_bpf_ops *ops = kdata; + + switch (moff) { + case offsetof(struct io_uring_bpf_ops, ring_fd): + ops->ring_fd = uops->ring_fd; + return 1; + } + return 0; +} + +static int io_install_bpf(struct io_ring_ctx *ctx, struct io_uring_bpf_ops *ops) +{ + if (ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL)) + return 
-EOPNOTSUPP; + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EOPNOTSUPP; + + if (ctx->bpf_ops) + return -EBUSY; + if (WARN_ON_ONCE(!ops->loop_step)) + return -EINVAL; + + ops->priv = ctx; + ctx->bpf_ops = ops; + ctx->loop_step = ops->loop_step; return 0; } static int bpf_io_reg(void *kdata, struct bpf_link *link) { - return -EOPNOTSUPP; + struct io_uring_bpf_ops *ops = kdata; + struct io_ring_ctx *ctx; + struct file *file; + int ret = -EBUSY; + + file = io_uring_register_get_file(ops->ring_fd, false); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + scoped_guard(mutex, &io_bpf_ctrl_mutex) { + guard(mutex)(&ctx->uring_lock); + ret = io_install_bpf(ctx, ops); + } + + fput(file); + return ret; +} + +static void io_eject_bpf(struct io_ring_ctx *ctx) +{ + struct io_uring_bpf_ops *ops = ctx->bpf_ops; + + if (WARN_ON_ONCE(!ops)) + return; + if (WARN_ON_ONCE(ops->priv != ctx)) + return; + + ops->priv = NULL; + ctx->bpf_ops = NULL; + ctx->loop_step = NULL; } static void bpf_io_unreg(void *kdata, struct bpf_link *link) { + struct io_uring_bpf_ops *ops = kdata; + struct io_ring_ctx *ctx; + + guard(mutex)(&io_bpf_ctrl_mutex); + ctx = ops->priv; + if (ctx) { + guard(mutex)(&ctx->uring_lock); + if (WARN_ON_ONCE(ctx->bpf_ops != ops)) + return; + + io_eject_bpf(ctx); + } +} + +void io_unregister_bpf_ops(struct io_ring_ctx *ctx) +{ + /* + * ->bpf_ops is write protected by io_bpf_ctrl_mutex and uring_lock, + * and read protected by either. Try to avoid taking the global lock + * for rings that never had any bpf installed. 
+ */ + scoped_guard(mutex, &ctx->uring_lock) { + if (!ctx->bpf_ops) + return; + } + + guard(mutex)(&io_bpf_ctrl_mutex); + guard(mutex)(&ctx->uring_lock); + if (ctx->bpf_ops) + io_eject_bpf(ctx); } static struct bpf_struct_ops bpf_ring_ops = { diff --git a/io_uring/bpf-ops.h b/io_uring/bpf-ops.h index b9e589ad519a..b39b3fd3acda 100644 --- a/io_uring/bpf-ops.h +++ b/io_uring/bpf-ops.h @@ -17,4 +17,12 @@ struct io_uring_bpf_ops { void *priv; }; +#ifdef CONFIG_IO_URING_BPF_OPS +void io_unregister_bpf_ops(struct io_ring_ctx *ctx); +#else +static inline void io_unregister_bpf_ops(struct io_ring_ctx *ctx) +{ +} +#endif + #endif /* IOU_BPF_OPS_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0a80c8e6e633..d703f0a8b315 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2148,6 +2148,7 @@ static __cold void io_req_caches_free(struct io_ring_ctx *ctx) static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { + io_unregister_bpf_ops(ctx); io_sq_thread_finish(ctx); mutex_lock(&ctx->uring_lock); -- cgit v1.2.3 From 005869886d1d370afb6c10cd40709d956960e9c2 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Fri, 6 Mar 2026 16:47:39 +0000 Subject: cxl: export internal structs for external Type2 drivers In preparation for type2 support, move structs and functions a type2 driver will need to access to into a new shared header file. Differentiate between public and private data to be preserved by type2 drivers. 
Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Tested-by: Alison Schofield Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260306164741.3796372-3-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/cxl.h | 97 +--------------------- drivers/cxl/cxlmem.h | 114 -------------------------- include/cxl/cxl.h | 226 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+), 210 deletions(-) create mode 100644 include/cxl/cxl.h (limited to 'include') diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 9b947286eb9b..1d94217729f7 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -12,6 +12,7 @@ #include #include #include +#include extern const struct nvdimm_security_ops *cxl_security_ops; @@ -201,97 +202,6 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw) #define CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK GENMASK_ULL(63, 48) #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20 -/* - * Using struct_group() allows for per register-block-type helper routines, - * without requiring block-type agnostic code to include the prefix. 
- */ -struct cxl_regs { - /* - * Common set of CXL Component register block base pointers - * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure - * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure - */ - struct_group_tagged(cxl_component_regs, component, - void __iomem *hdm_decoder; - void __iomem *ras; - ); - /* - * Common set of CXL Device register block base pointers - * @status: CXL 2.0 8.2.8.3 Device Status Registers - * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers - * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers - */ - struct_group_tagged(cxl_device_regs, device_regs, - void __iomem *status, *mbox, *memdev; - ); - - struct_group_tagged(cxl_pmu_regs, pmu_regs, - void __iomem *pmu; - ); - - /* - * RCH downstream port specific RAS register - * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB - */ - struct_group_tagged(cxl_rch_regs, rch_regs, - void __iomem *dport_aer; - ); - - /* - * RCD upstream port specific PCIe cap register - * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB - */ - struct_group_tagged(cxl_rcd_regs, rcd_regs, - void __iomem *rcd_pcie_cap; - ); -}; - -struct cxl_reg_map { - bool valid; - int id; - unsigned long offset; - unsigned long size; -}; - -struct cxl_component_reg_map { - struct cxl_reg_map hdm_decoder; - struct cxl_reg_map ras; -}; - -struct cxl_device_reg_map { - struct cxl_reg_map status; - struct cxl_reg_map mbox; - struct cxl_reg_map memdev; -}; - -struct cxl_pmu_reg_map { - struct cxl_reg_map pmu; -}; - -/** - * struct cxl_register_map - DVSEC harvested register block mapping parameters - * @host: device for devm operations and logging - * @base: virtual base of the register-block-BAR + @block_offset - * @resource: physical resource base of the register block - * @max_size: maximum mapping size to perform register search - * @reg_type: see enum cxl_regloc_type - * @component_map: cxl_reg_map for component registers - * @device_map: cxl_reg_maps for device registers - * @pmu_map: cxl_reg_maps for CXL 
Performance Monitoring Units - */ -struct cxl_register_map { - struct device *host; - void __iomem *base; - resource_size_t resource; - resource_size_t max_size; - u8 reg_type; - union { - struct cxl_component_reg_map component_map; - struct cxl_device_reg_map device_map; - struct cxl_pmu_reg_map pmu_map; - }; -}; - void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map); void cxl_probe_device_regs(struct device *dev, void __iomem *base, @@ -497,11 +407,6 @@ struct cxl_region_params { resource_size_t cache_size; }; -enum cxl_partition_mode { - CXL_PARTMODE_RAM, - CXL_PARTMODE_PMEM, -}; - /* * Indicate whether this region has been assembled by autodetection or * userspace assembly. Prevent endpoint decoders outside of automatic diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 71367cb5178c..281546de426e 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -113,8 +113,6 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped); -#define CXL_NR_PARTITIONS_MAX 2 - struct cxl_dpa_info { u64 size; struct cxl_dpa_part_info { @@ -373,87 +371,6 @@ struct cxl_security_state { struct kernfs_node *sanitize_node; }; -/* - * enum cxl_devtype - delineate type-2 from a generic type-3 device - * @CXL_DEVTYPE_DEVMEM - Vendor specific CXL Type-2 device implementing HDM-D or - * HDM-DB, no requirement that this device implements a - * mailbox, or other memory-device-standard manageability - * flows. - * @CXL_DEVTYPE_CLASSMEM - Common class definition of a CXL Type-3 device with - * HDM-H and class-mandatory memory device registers - */ -enum cxl_devtype { - CXL_DEVTYPE_DEVMEM, - CXL_DEVTYPE_CLASSMEM, -}; - -/** - * struct cxl_dpa_perf - DPA performance property entry - * @dpa_range: range for DPA address - * @coord: QoS performance data (i.e. 
latency, bandwidth) - * @cdat_coord: raw QoS performance data from CDAT - * @qos_class: QoS Class cookies - */ -struct cxl_dpa_perf { - struct range dpa_range; - struct access_coordinate coord[ACCESS_COORDINATE_MAX]; - struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; - int qos_class; -}; - -/** - * struct cxl_dpa_partition - DPA partition descriptor - * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) - * @perf: performance attributes of the partition from CDAT - * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... - */ -struct cxl_dpa_partition { - struct resource res; - struct cxl_dpa_perf perf; - enum cxl_partition_mode mode; -}; - -/** - * struct cxl_dev_state - The driver device state - * - * cxl_dev_state represents the CXL driver/device state. It provides an - * interface to mailbox commands as well as some cached data about the device. - * Currently only memory devices are represented. - * - * @dev: The device associated with this CXL state - * @cxlmd: The device representing the CXL.mem capabilities of @dev - * @reg_map: component and ras register mapping parameters - * @regs: Class device "Device" registers - * @cxl_dvsec: Offset to the PCIe device DVSEC - * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) - * @media_ready: Indicate whether the device media is usable - * @dpa_res: Overall DPA resource tree for the device - * @part: DPA partition array - * @nr_partitions: Number of DPA partitions - * @serial: PCIe Device Serial Number - * @type: Generic Memory Class device or Vendor Specific Memory device - * @cxl_mbox: CXL mailbox context - * @cxlfs: CXL features context - */ -struct cxl_dev_state { - struct device *dev; - struct cxl_memdev *cxlmd; - struct cxl_register_map reg_map; - struct cxl_device_regs regs; - int cxl_dvsec; - bool rcd; - bool media_ready; - struct resource dpa_res; - struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; - unsigned int nr_partitions; - 
u64 serial; - enum cxl_devtype type; - struct cxl_mailbox cxl_mbox; -#ifdef CONFIG_CXL_FEATURES - struct cxl_features_state *cxlfs; -#endif -}; - static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds) { /* @@ -523,37 +440,6 @@ to_cxl_memdev_state(struct cxl_dev_state *cxlds) return container_of(cxlds, struct cxl_memdev_state, cxlds); } -struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, - enum cxl_devtype type, - u64 serial, u16 dvsec, - size_t size, bool has_mbox); - -/** - * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a - * driver specific struct. - * - * @parent: device behind the request - * @type: CXL device type - * @serial: device identification - * @dvsec: dvsec capability offset - * @drv_struct: driver struct embedding a cxl_dev_state struct - * @member: name of the struct cxl_dev_state member in drv_struct - * @mbox: true if mailbox supported - * - * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state - * struct initialized. - * - * Introduced for Type2 driver support. - */ -#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ - ({ \ - static_assert(__same_type(struct cxl_dev_state, \ - ((drv_struct *)NULL)->member)); \ - static_assert(offsetof(drv_struct, member) == 0); \ - (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ - sizeof(drv_struct), mbox); \ - }) - enum cxl_opcode { CXL_MBOX_OP_INVALID = 0x0000, CXL_MBOX_OP_RAW = CXL_MBOX_OP_INVALID, diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h new file mode 100644 index 000000000000..fa7269154620 --- /dev/null +++ b/include/cxl/cxl.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ +/* Copyright(c) 2026 Advanced Micro Devices, Inc. 
*/ + +#ifndef __CXL_CXL_H__ +#define __CXL_CXL_H__ + +#include +#include +#include + +/** + * enum cxl_devtype - delineate type-2 from a generic type-3 device + * @CXL_DEVTYPE_DEVMEM: Vendor specific CXL Type-2 device implementing HDM-D or + * HDM-DB, no requirement that this device implements a + * mailbox, or other memory-device-standard manageability + * flows. + * @CXL_DEVTYPE_CLASSMEM: Common class definition of a CXL Type-3 device with + * HDM-H and class-mandatory memory device registers + */ +enum cxl_devtype { + CXL_DEVTYPE_DEVMEM, + CXL_DEVTYPE_CLASSMEM, +}; + +struct device; + +/* + * Using struct_group() allows for per register-block-type helper routines, + * without requiring block-type agnostic code to include the prefix. + */ +struct cxl_regs { + /* + * Common set of CXL Component register block base pointers + * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure + * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure + */ + struct_group_tagged(cxl_component_regs, component, + void __iomem *hdm_decoder; + void __iomem *ras; + ); + /* + * Common set of CXL Device register block base pointers + * @status: CXL 2.0 8.2.8.3 Device Status Registers + * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers + * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers + */ + struct_group_tagged(cxl_device_regs, device_regs, + void __iomem *status, *mbox, *memdev; + ); + + struct_group_tagged(cxl_pmu_regs, pmu_regs, + void __iomem *pmu; + ); + + /* + * RCH downstream port specific RAS register + * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB + */ + struct_group_tagged(cxl_rch_regs, rch_regs, + void __iomem *dport_aer; + ); + + /* + * RCD upstream port specific PCIe cap register + * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB + */ + struct_group_tagged(cxl_rcd_regs, rcd_regs, + void __iomem *rcd_pcie_cap; + ); +}; + +struct cxl_reg_map { + bool valid; + int id; + unsigned long offset; + unsigned long size; +}; + +struct cxl_component_reg_map { + struct 
cxl_reg_map hdm_decoder; + struct cxl_reg_map ras; +}; + +struct cxl_device_reg_map { + struct cxl_reg_map status; + struct cxl_reg_map mbox; + struct cxl_reg_map memdev; +}; + +struct cxl_pmu_reg_map { + struct cxl_reg_map pmu; +}; + +/** + * struct cxl_register_map - DVSEC harvested register block mapping parameters + * @host: device for devm operations and logging + * @base: virtual base of the register-block-BAR + @block_offset + * @resource: physical resource base of the register block + * @max_size: maximum mapping size to perform register search + * @reg_type: see enum cxl_regloc_type + * @component_map: cxl_reg_map for component registers + * @device_map: cxl_reg_maps for device registers + * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units + */ +struct cxl_register_map { + struct device *host; + void __iomem *base; + resource_size_t resource; + resource_size_t max_size; + u8 reg_type; + union { + struct cxl_component_reg_map component_map; + struct cxl_device_reg_map device_map; + struct cxl_pmu_reg_map pmu_map; + }; +}; + +/** + * struct cxl_dpa_perf - DPA performance property entry + * @dpa_range: range for DPA address + * @coord: QoS performance data (i.e. latency, bandwidth) + * @cdat_coord: raw QoS performance data from CDAT + * @qos_class: QoS Class cookies + */ +struct cxl_dpa_perf { + struct range dpa_range; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; + int qos_class; +}; + +enum cxl_partition_mode { + CXL_PARTMODE_RAM, + CXL_PARTMODE_PMEM, +}; + +/** + * struct cxl_dpa_partition - DPA partition descriptor + * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) + * @perf: performance attributes of the partition from CDAT + * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... 
+ */ +struct cxl_dpa_partition { + struct resource res; + struct cxl_dpa_perf perf; + enum cxl_partition_mode mode; +}; + +#define CXL_NR_PARTITIONS_MAX 2 + +/** + * struct cxl_dev_state - The driver device state + * + * cxl_dev_state represents the CXL driver/device state. It provides an + * interface to mailbox commands as well as some cached data about the device. + * Currently only memory devices are represented. + * + * @dev: The device associated with this CXL state + * @cxlmd: The device representing the CXL.mem capabilities of @dev + * @reg_map: component and ras register mapping parameters + * @regs: Parsed register blocks + * @cxl_dvsec: Offset to the PCIe device DVSEC + * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) + * @media_ready: Indicate whether the device media is usable + * @dpa_res: Overall DPA resource tree for the device + * @part: DPA partition array + * @nr_partitions: Number of DPA partitions + * @serial: PCIe Device Serial Number + * @type: Generic Memory Class device or Vendor Specific Memory device + * @cxl_mbox: CXL mailbox context + * @cxlfs: CXL features context + */ +struct cxl_dev_state { + /* public for Type2 drivers */ + struct device *dev; + struct cxl_memdev *cxlmd; + + /* private for Type2 drivers */ + struct cxl_register_map reg_map; + struct cxl_device_regs regs; + int cxl_dvsec; + bool rcd; + bool media_ready; + struct resource dpa_res; + struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; + unsigned int nr_partitions; + u64 serial; + enum cxl_devtype type; + struct cxl_mailbox cxl_mbox; +#ifdef CONFIG_CXL_FEATURES + struct cxl_features_state *cxlfs; +#endif +}; + +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox); + +/** + * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a + * driver specific struct. 
+ * + * @parent: device behind the request + * @type: CXL device type + * @serial: device identification + * @dvsec: dvsec capability offset + * @drv_struct: driver struct embedding a cxl_dev_state struct + * @member: name of the struct cxl_dev_state member in drv_struct + * @mbox: true if mailbox supported + * + * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state + * struct initialized. + * + * Introduced for Type2 driver support. + */ +#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ + ({ \ + static_assert(__same_type(struct cxl_dev_state, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ + sizeof(drv_struct), mbox); \ + }) +#endif /* __CXL_CXL_H__ */ -- cgit v1.2.3 From cc6421acd97f2a386516a16129d00254588bd9ad Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 13 Mar 2026 12:19:31 +0100 Subject: xsk: remove repeated defines Seems we have been carrying around repeated defines for unaligned mode logic. Remove redundant ones. Signed-off-by: Maciej Fijalkowski Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20260313111931.438911-1-maciej.fijalkowski@intel.com Signed-off-by: Jakub Kicinski --- include/net/xsk_buff_pool.h | 7 ------- net/xdp/xsk.h | 7 ------- 2 files changed, 14 deletions(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 0b1abdb99c9e..ccb3b350001f 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -174,13 +174,6 @@ static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_sync_single_for_device(pool->dev, dma, size, DMA_BIDIRECTIONAL); } -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. 
- */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) - static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, u64 addr, u32 len) { diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index a4bc4749faac..7c811b5cce76 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -4,13 +4,6 @@ #ifndef XSK_H_ #define XSK_H_ -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. - */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) - struct xdp_ring_offset_v1 { __u64 producer; __u64 consumer; -- cgit v1.2.3 From 785639b5bf2a87eaf0cda14baaa068b3728c7be2 Mon Sep 17 00:00:00 2001 From: Cássio Gabriel Date: Mon, 16 Mar 2026 10:39:38 -0300 Subject: ALSA: timer: keep a list of open masters for slave lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit snd_timer_check_slave() still walks all registered timers and all open timer instances to find a matching master for a newly opened slave. Maintain a global list of open master instances that can accept slave links and use it for the slave lookup path instead. This keeps the existing matching semantics while avoiding the nested walk over snd_timer_list and each timer open_list_head. The reverse path in snd_timer_check_master() already scans only the pending slave list, so this makes both lookup paths closer in shape without changing the master/slave linking logic. 
Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260316-alsa-timer-master-list-v1-1-fb95e547110a@gmail.com Signed-off-by: Takashi Iwai --- include/sound/timer.h | 1 + sound/core/timer.c | 29 ++++++++++++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/sound/timer.h b/include/sound/timer.h index 760e132cc0cd..83bafe70cf33 100644 --- a/include/sound/timer.h +++ b/include/sound/timer.h @@ -102,6 +102,7 @@ struct snd_timer_instance { unsigned int slave_id; struct list_head open_list; struct list_head active_list; + struct list_head master_list; struct list_head ack_list; struct list_head slave_list_head; struct list_head slave_active_head; diff --git a/sound/core/timer.c b/sound/core/timer.c index 6a70df7ae019..820901d503af 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -129,6 +129,9 @@ static LIST_HEAD(snd_timer_list); /* list of slave instances */ static LIST_HEAD(snd_timer_slave_list); +/* list of open master instances that can accept slave links */ +static LIST_HEAD(snd_timer_master_list); + /* lock for slave active lists */ static DEFINE_SPINLOCK(slave_active_lock); @@ -161,6 +164,7 @@ struct snd_timer_instance *snd_timer_instance_new(const char *owner) } INIT_LIST_HEAD(&timeri->open_list); INIT_LIST_HEAD(&timeri->active_list); + INIT_LIST_HEAD(&timeri->master_list); INIT_LIST_HEAD(&timeri->ack_list); INIT_LIST_HEAD(&timeri->slave_list_head); INIT_LIST_HEAD(&timeri->slave_active_head); @@ -245,6 +249,12 @@ static int check_matching_master_slave(struct snd_timer_instance *master, return 1; } +static bool snd_timer_has_slave_key(const struct snd_timer_instance *timeri) +{ + return !(timeri->flags & SNDRV_TIMER_IFLG_SLAVE) && + timeri->slave_class > SNDRV_TIMER_SCLASS_NONE; +} + /* * look for a master instance matching with the slave id of the given slave. * when found, relink the open_link of the slave. 
@@ -253,19 +263,15 @@ static int check_matching_master_slave(struct snd_timer_instance *master, */ static int snd_timer_check_slave(struct snd_timer_instance *slave) { - struct snd_timer *timer; struct snd_timer_instance *master; int err = 0; - /* FIXME: it's really dumb to look up all entries.. */ - list_for_each_entry(timer, &snd_timer_list, device_list) { - list_for_each_entry(master, &timer->open_list_head, open_list) { - err = check_matching_master_slave(master, slave); - if (err != 0) /* match found or error */ - goto out; - } + list_for_each_entry(master, &snd_timer_master_list, master_list) { + err = check_matching_master_slave(master, slave); + if (err != 0) /* match found or error */ + goto out; } - out: +out: return err < 0 ? err : 0; } @@ -377,6 +383,8 @@ int snd_timer_open(struct snd_timer_instance *timeri, timeri->slave_id = slave_id; list_add_tail(&timeri->open_list, &timer->open_list_head); + if (snd_timer_has_slave_key(timeri)) + list_add_tail(&timeri->master_list, &snd_timer_master_list); timer->num_instances++; err = snd_timer_check_master(timeri); list_added: @@ -431,6 +439,9 @@ static void snd_timer_close_locked(struct snd_timer_instance *timeri, num_slaves--; } + if (!list_empty(&timeri->master_list)) + list_del_init(&timeri->master_list); + /* force to stop the timer */ snd_timer_stop(timeri); -- cgit v1.2.3 From 37a23d6f11938cd59927e3307b9b301624df8e8f Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Wed, 11 Mar 2026 21:59:21 -0700 Subject: bus: mhi: host: Use kzalloc_flex Change kzalloc + kzalloc to just kzalloc with a flexible array member. Add __counted_by for extra runtime analysis when requested. Move counting assignment immediately after allocation as required by __counted_by. Move mhi_buf definition as a complete definition as needed for flex arrays. It's not a pointer anymore. 
Signed-off-by: Rosen Penev [mani: squashed https://lore.kernel.org/mhi/20260317-mhi-invalid-free-mhi-buffers-v1-1-8418a3ad604f@oss.qualcomm.com] Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20260312045921.7663-1-rosenp@gmail.com --- drivers/bus/mhi/host/boot.c | 22 +++------------------- include/linux/mhi.h | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/bus/mhi/host/boot.c b/drivers/bus/mhi/host/boot.c index f16a1e67a667..19c84913cfb9 100644 --- a/drivers/bus/mhi/host/boot.c +++ b/drivers/bus/mhi/host/boot.c @@ -308,7 +308,6 @@ static void mhi_free_bhi_buffer(struct mhi_controller *mhi_cntrl, struct mhi_buf *mhi_buf = image_info->mhi_buf; dma_free_coherent(mhi_cntrl->cntrl_dev, mhi_buf->len, mhi_buf->buf, mhi_buf->dma_addr); - kfree(image_info->mhi_buf); kfree(image_info); } @@ -322,7 +321,6 @@ void mhi_free_bhie_table(struct mhi_controller *mhi_cntrl, dma_free_coherent(mhi_cntrl->cntrl_dev, mhi_buf->len, mhi_buf->buf, mhi_buf->dma_addr); - kfree(image_info->mhi_buf); kfree(image_info); } @@ -333,15 +331,10 @@ static int mhi_alloc_bhi_buffer(struct mhi_controller *mhi_cntrl, struct image_info *img_info; struct mhi_buf *mhi_buf; - img_info = kzalloc_obj(*img_info); + img_info = kzalloc_flex(*img_info, mhi_buf, 1); if (!img_info) return -ENOMEM; - /* Allocate memory for entry */ - img_info->mhi_buf = kzalloc_obj(*img_info->mhi_buf); - if (!img_info->mhi_buf) - goto error_alloc_mhi_buf; - /* Allocate and populate vector table */ mhi_buf = img_info->mhi_buf; @@ -358,8 +351,6 @@ static int mhi_alloc_bhi_buffer(struct mhi_controller *mhi_cntrl, return 0; error_alloc_segment: - kfree(mhi_buf); -error_alloc_mhi_buf: kfree(img_info); return -ENOMEM; @@ -375,14 +366,11 @@ int mhi_alloc_bhie_table(struct mhi_controller *mhi_cntrl, struct image_info *img_info; struct mhi_buf *mhi_buf; - img_info = kzalloc_obj(*img_info); + img_info = kzalloc_flex(*img_info, mhi_buf, 
segments); if (!img_info) return -ENOMEM; - /* Allocate memory for entries */ - img_info->mhi_buf = kzalloc_objs(*img_info->mhi_buf, segments); - if (!img_info->mhi_buf) - goto error_alloc_mhi_buf; + img_info->entries = segments; /* Allocate and populate vector table */ mhi_buf = img_info->mhi_buf; @@ -402,7 +390,6 @@ int mhi_alloc_bhie_table(struct mhi_controller *mhi_cntrl, } img_info->bhi_vec = img_info->mhi_buf[segments - 1].buf; - img_info->entries = segments; *image_info = img_info; return 0; @@ -411,9 +398,6 @@ error_alloc_segment: for (--i, --mhi_buf; i >= 0; i--, mhi_buf--) dma_free_coherent(mhi_cntrl->cntrl_dev, mhi_buf->len, mhi_buf->buf, mhi_buf->dma_addr); - kfree(img_info->mhi_buf); - -error_alloc_mhi_buf: kfree(img_info); return -ENOMEM; diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 88ccb3e14f48..fb3ba639f4f8 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -85,17 +85,33 @@ enum mhi_ch_type { MHI_CH_TYPE_INBOUND_COALESCED = 3, }; +/** + * struct mhi_buf - MHI Buffer description + * @buf: Virtual address of the buffer + * @name: Buffer label. For offload channel, configurations name must be: + * ECA - Event context array data + * CCA - Channel context array data + * @dma_addr: IOMMU address of the buffer + * @len: # of bytes + */ +struct mhi_buf { + void *buf; + const char *name; + dma_addr_t dma_addr; + size_t len; +}; + /** * struct image_info - Firmware and RDDM table * @mhi_buf: Buffer for firmware and RDDM table * @entries: # of entries in table */ struct image_info { - struct mhi_buf *mhi_buf; /* private: from internal.h */ struct bhi_vec_entry *bhi_vec; /* public: */ u32 entries; + struct mhi_buf mhi_buf[] __counted_by(entries); }; /** @@ -488,22 +504,6 @@ struct mhi_result { int transaction_status; }; -/** - * struct mhi_buf - MHI Buffer description - * @buf: Virtual address of the buffer - * @name: Buffer label. 
For offload channel, configurations name must be: - * ECA - Event context array data - * CCA - Channel context array data - * @dma_addr: IOMMU address of the buffer - * @len: # of bytes - */ -struct mhi_buf { - void *buf; - const char *name; - dma_addr_t dma_addr; - size_t len; -}; - /** * struct mhi_driver - Structure representing a MHI client driver * @probe: CB function for client driver probe function -- cgit v1.2.3 From c841b676da98638f5ed8d3f2f449ddd02d9921aa Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Fri, 14 Nov 2025 11:39:40 +0100 Subject: ovpn: notify userspace on client float event Send a netlink notification when a client updates its remote UDP endpoint. The notification includes the new IP address, port, and scope ID (for IPv6). Cc: linux-kselftest@vger.kernel.org Cc: horms@kernel.org Cc: shuah@kernel.org Cc: donald.hunter@gmail.com Signed-off-by: Ralf Lici Signed-off-by: Antonio Quartulli Reviewed-by: Sabrina Dubroca --- Documentation/netlink/specs/ovpn.yaml | 6 +++ drivers/net/ovpn/netlink.c | 82 +++++++++++++++++++++++++++++ drivers/net/ovpn/netlink.h | 2 + drivers/net/ovpn/peer.c | 2 + include/uapi/linux/ovpn.h | 1 + tools/testing/selftests/net/ovpn/ovpn-cli.c | 3 ++ 6 files changed, 96 insertions(+) (limited to 'include') diff --git a/Documentation/netlink/specs/ovpn.yaml b/Documentation/netlink/specs/ovpn.yaml index 1b91045cee2e..0d0c028bf96f 100644 --- a/Documentation/netlink/specs/ovpn.yaml +++ b/Documentation/netlink/specs/ovpn.yaml @@ -502,6 +502,12 @@ operations: - ifindex - keyconf + - + name: peer-float-ntf + doc: Notification about a peer floating (changing its remote UDP endpoint) + notify: peer-get + mcgrp: peers + mcast-groups: list: - diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c index fed0e46b32a3..e10d7f9a28f5 100644 --- a/drivers/net/ovpn/netlink.c +++ b/drivers/net/ovpn/netlink.c @@ -1203,6 +1203,88 @@ err_free_msg: return ret; } +/** + * ovpn_nl_peer_float_notify - notify userspace about peer floating + 
* @peer: the floated peer + * @ss: sockaddr representing the new remote endpoint + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_nl_peer_float_notify(struct ovpn_peer *peer, + const struct sockaddr_storage *ss) +{ + struct ovpn_socket *sock; + struct sockaddr_in6 *sa6; + struct sockaddr_in *sa; + struct sk_buff *msg; + struct nlattr *attr; + int ret = -EMSGSIZE; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &ovpn_nl_family, 0, + OVPN_CMD_PEER_FLOAT_NTF); + if (!hdr) { + ret = -ENOBUFS; + goto err_free_msg; + } + + if (nla_put_u32(msg, OVPN_A_IFINDEX, peer->ovpn->dev->ifindex)) + goto err_cancel_msg; + + attr = nla_nest_start(msg, OVPN_A_PEER); + if (!attr) + goto err_cancel_msg; + + if (nla_put_u32(msg, OVPN_A_PEER_ID, peer->id)) + goto err_cancel_msg; + + if (ss->ss_family == AF_INET) { + sa = (struct sockaddr_in *)ss; + if (nla_put_in_addr(msg, OVPN_A_PEER_REMOTE_IPV4, + sa->sin_addr.s_addr) || + nla_put_net16(msg, OVPN_A_PEER_REMOTE_PORT, sa->sin_port)) + goto err_cancel_msg; + } else if (ss->ss_family == AF_INET6) { + sa6 = (struct sockaddr_in6 *)ss; + if (nla_put_in6_addr(msg, OVPN_A_PEER_REMOTE_IPV6, + &sa6->sin6_addr) || + nla_put_u32(msg, OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, + sa6->sin6_scope_id) || + nla_put_net16(msg, OVPN_A_PEER_REMOTE_PORT, sa6->sin6_port)) + goto err_cancel_msg; + } else { + ret = -EAFNOSUPPORT; + goto err_cancel_msg; + } + + nla_nest_end(msg, attr); + genlmsg_end(msg, hdr); + + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (!sock) { + ret = -EINVAL; + goto err_unlock; + } + genlmsg_multicast_netns(&ovpn_nl_family, sock_net(sock->sk), msg, + 0, OVPN_NLGRP_PEERS, GFP_ATOMIC); + rcu_read_unlock(); + + return 0; + +err_unlock: + rcu_read_unlock(); +err_cancel_msg: + genlmsg_cancel(msg, hdr); +err_free_msg: + nlmsg_free(msg); + return ret; +} + /** * ovpn_nl_key_swap_notify - notify userspace peer's key must be 
renewed * @peer: the peer whose key needs to be renewed diff --git a/drivers/net/ovpn/netlink.h b/drivers/net/ovpn/netlink.h index 8615dfc3c472..11ee7c681885 100644 --- a/drivers/net/ovpn/netlink.h +++ b/drivers/net/ovpn/netlink.h @@ -13,6 +13,8 @@ int ovpn_nl_register(void); void ovpn_nl_unregister(void); int ovpn_nl_peer_del_notify(struct ovpn_peer *peer); +int ovpn_nl_peer_float_notify(struct ovpn_peer *peer, + const struct sockaddr_storage *ss); int ovpn_nl_key_swap_notify(struct ovpn_peer *peer, u8 key_id); #endif /* _NET_OVPN_NETLINK_H_ */ diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 3716a1d82801..4e145b4497e6 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -287,6 +287,8 @@ void ovpn_peer_endpoints_update(struct ovpn_peer *peer, struct sk_buff *skb) spin_unlock_bh(&peer->lock); + ovpn_nl_peer_float_notify(peer, &ss); + /* rehashing is required only in MP mode as P2P has one peer * only and thus there is no hashtable */ diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h index 959b41def61f..0cce0d58b830 100644 --- a/include/uapi/linux/ovpn.h +++ b/include/uapi/linux/ovpn.h @@ -100,6 +100,7 @@ enum { OVPN_CMD_KEY_SWAP, OVPN_CMD_KEY_SWAP_NTF, OVPN_CMD_KEY_DEL, + OVPN_CMD_PEER_FLOAT_NTF, __OVPN_CMD_MAX, OVPN_CMD_MAX = (__OVPN_CMD_MAX - 1) diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c index 0f3babf19fd0..7178abae1b2f 100644 --- a/tools/testing/selftests/net/ovpn/ovpn-cli.c +++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c @@ -1516,6 +1516,9 @@ static int ovpn_handle_msg(struct nl_msg *msg, void *arg) case OVPN_CMD_PEER_DEL_NTF: fprintf(stdout, "received CMD_PEER_DEL_NTF\n"); break; + case OVPN_CMD_PEER_FLOAT_NTF: + fprintf(stdout, "received CMD_PEER_FLOAT_NTF\n"); + break; case OVPN_CMD_KEY_SWAP_NTF: fprintf(stdout, "received CMD_KEY_SWAP_NTF\n"); break; -- cgit v1.2.3 From 2e570a51408839b2079f3cb7e3944bf9b1184ee0 Mon Sep 17 00:00:00 2001 From: 
Ralf Lici Date: Wed, 9 Jul 2025 17:21:25 +0200 Subject: ovpn: add support for asymmetric peer IDs In order to support the multipeer architecture, upon connection setup each side of a tunnel advertises a unique ID that the other side must include in packets sent to them. Therefore when transmitting a packet, a peer inserts the recipient's advertised ID for that specific tunnel into the peer ID field. When receiving a packet, a peer expects to find its own unique receive ID for that specific tunnel in the peer ID field. Add support for the TX peer ID and embed it into transmitting packets. If no TX peer ID is specified, fallback to using the same peer ID both for RX and TX in order to be compatible with the non-multipeer compliant peers. Cc: horms@kernel.org Cc: donald.hunter@gmail.com Signed-off-by: Ralf Lici Signed-off-by: Antonio Quartulli Reviewed-by: Sabrina Dubroca --- Documentation/netlink/specs/ovpn.yaml | 17 ++++++++++++++++- drivers/net/ovpn/crypto_aead.c | 2 +- drivers/net/ovpn/netlink-gen.c | 13 ++++++++++--- drivers/net/ovpn/netlink-gen.h | 6 +++--- drivers/net/ovpn/netlink.c | 14 ++++++++++++-- drivers/net/ovpn/peer.c | 4 ++++ drivers/net/ovpn/peer.h | 4 +++- include/uapi/linux/ovpn.h | 1 + 8 files changed, 50 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/ovpn.yaml b/Documentation/netlink/specs/ovpn.yaml index 0d0c028bf96f..b0c782e59a32 100644 --- a/Documentation/netlink/specs/ovpn.yaml +++ b/Documentation/netlink/specs/ovpn.yaml @@ -43,7 +43,8 @@ attribute-sets: type: u32 doc: >- The unique ID of the peer in the device context. To be used to - identify peers during operations for a specific device + identify peers during operations for a specific device. + Also used to match packets received from this peer. 
checks: max: 0xFFFFFF - @@ -160,6 +161,16 @@ attribute-sets: name: link-tx-packets type: uint doc: Number of packets transmitted at the transport level + - + name: tx-id + type: u32 + doc: >- + The ID value used when transmitting packets to this peer. This + way outgoing packets can have a different ID than incoming ones. + Useful in multipeer-to-multipeer connections, where each peer + will advertise the tx-id to be used on the link. + checks: + max: 0xFFFFFF - name: peer-new-input subset-of: peer @@ -188,6 +199,8 @@ attribute-sets: name: keepalive-interval - name: keepalive-timeout + - + name: tx-id - name: peer-set-input subset-of: peer @@ -214,6 +227,8 @@ attribute-sets: name: keepalive-interval - name: keepalive-timeout + - + name: tx-id - name: peer-del-input subset-of: peer diff --git a/drivers/net/ovpn/crypto_aead.c b/drivers/net/ovpn/crypto_aead.c index 77be0942a269..59848c41b7b2 100644 --- a/drivers/net/ovpn/crypto_aead.c +++ b/drivers/net/ovpn/crypto_aead.c @@ -122,7 +122,7 @@ int ovpn_aead_encrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, memcpy(skb->data, iv, OVPN_NONCE_WIRE_SIZE); /* add packet op as head of additional data */ - op = ovpn_opcode_compose(OVPN_DATA_V2, ks->key_id, peer->id); + op = ovpn_opcode_compose(OVPN_DATA_V2, ks->key_id, peer->tx_id); __skb_push(skb, OVPN_OPCODE_SIZE); BUILD_BUG_ON(sizeof(op) != OVPN_OPCODE_SIZE); *((__force __be32 *)skb->data) = htonl(op); diff --git a/drivers/net/ovpn/netlink-gen.c b/drivers/net/ovpn/netlink-gen.c index ecbe9dcf4f7d..2147cec7c2c5 100644 --- a/drivers/net/ovpn/netlink-gen.c +++ b/drivers/net/ovpn/netlink-gen.c @@ -16,6 +16,10 @@ static const struct netlink_range_validation ovpn_a_peer_id_range = { .max = 16777215ULL, }; +static const struct netlink_range_validation ovpn_a_peer_tx_id_range = { + .max = 16777215ULL, +}; + static const struct netlink_range_validation ovpn_a_keyconf_peer_id_range = { .max = 16777215ULL, }; @@ -51,7 +55,7 @@ const struct nla_policy 
ovpn_keydir_nl_policy[OVPN_A_KEYDIR_NONCE_TAIL + 1] = { [OVPN_A_KEYDIR_NONCE_TAIL] = NLA_POLICY_EXACT_LEN(OVPN_NONCE_TAIL_SIZE), }; -const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1] = { +const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_TX_ID + 1] = { [OVPN_A_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_id_range), [OVPN_A_PEER_REMOTE_IPV4] = { .type = NLA_BE32, }, [OVPN_A_PEER_REMOTE_IPV6] = NLA_POLICY_EXACT_LEN(16), @@ -75,13 +79,14 @@ const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1] = { [OVPN_A_PEER_LINK_TX_BYTES] = { .type = NLA_UINT, }, [OVPN_A_PEER_LINK_RX_PACKETS] = { .type = NLA_UINT, }, [OVPN_A_PEER_LINK_TX_PACKETS] = { .type = NLA_UINT, }, + [OVPN_A_PEER_TX_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_tx_id_range), }; const struct nla_policy ovpn_peer_del_input_nl_policy[OVPN_A_PEER_ID + 1] = { [OVPN_A_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_id_range), }; -const struct nla_policy ovpn_peer_new_input_nl_policy[OVPN_A_PEER_KEEPALIVE_TIMEOUT + 1] = { +const struct nla_policy ovpn_peer_new_input_nl_policy[OVPN_A_PEER_TX_ID + 1] = { [OVPN_A_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_id_range), [OVPN_A_PEER_REMOTE_IPV4] = { .type = NLA_BE32, }, [OVPN_A_PEER_REMOTE_IPV6] = NLA_POLICY_EXACT_LEN(16), @@ -94,9 +99,10 @@ const struct nla_policy ovpn_peer_new_input_nl_policy[OVPN_A_PEER_KEEPALIVE_TIME [OVPN_A_PEER_LOCAL_IPV6] = NLA_POLICY_EXACT_LEN(16), [OVPN_A_PEER_KEEPALIVE_INTERVAL] = { .type = NLA_U32, }, [OVPN_A_PEER_KEEPALIVE_TIMEOUT] = { .type = NLA_U32, }, + [OVPN_A_PEER_TX_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_tx_id_range), }; -const struct nla_policy ovpn_peer_set_input_nl_policy[OVPN_A_PEER_KEEPALIVE_TIMEOUT + 1] = { +const struct nla_policy ovpn_peer_set_input_nl_policy[OVPN_A_PEER_TX_ID + 1] = { [OVPN_A_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_id_range), [OVPN_A_PEER_REMOTE_IPV4] = { .type = NLA_BE32, }, 
[OVPN_A_PEER_REMOTE_IPV6] = NLA_POLICY_EXACT_LEN(16), @@ -108,6 +114,7 @@ const struct nla_policy ovpn_peer_set_input_nl_policy[OVPN_A_PEER_KEEPALIVE_TIME [OVPN_A_PEER_LOCAL_IPV6] = NLA_POLICY_EXACT_LEN(16), [OVPN_A_PEER_KEEPALIVE_INTERVAL] = { .type = NLA_U32, }, [OVPN_A_PEER_KEEPALIVE_TIMEOUT] = { .type = NLA_U32, }, + [OVPN_A_PEER_TX_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_tx_id_range), }; /* OVPN_CMD_PEER_NEW - do */ diff --git a/drivers/net/ovpn/netlink-gen.h b/drivers/net/ovpn/netlink-gen.h index b2301580770f..67cd85f86173 100644 --- a/drivers/net/ovpn/netlink-gen.h +++ b/drivers/net/ovpn/netlink-gen.h @@ -18,10 +18,10 @@ extern const struct nla_policy ovpn_keyconf_del_input_nl_policy[OVPN_A_KEYCONF_S extern const struct nla_policy ovpn_keyconf_get_nl_policy[OVPN_A_KEYCONF_CIPHER_ALG + 1]; extern const struct nla_policy ovpn_keyconf_swap_input_nl_policy[OVPN_A_KEYCONF_PEER_ID + 1]; extern const struct nla_policy ovpn_keydir_nl_policy[OVPN_A_KEYDIR_NONCE_TAIL + 1]; -extern const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1]; +extern const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_TX_ID + 1]; extern const struct nla_policy ovpn_peer_del_input_nl_policy[OVPN_A_PEER_ID + 1]; -extern const struct nla_policy ovpn_peer_new_input_nl_policy[OVPN_A_PEER_KEEPALIVE_TIMEOUT + 1]; -extern const struct nla_policy ovpn_peer_set_input_nl_policy[OVPN_A_PEER_KEEPALIVE_TIMEOUT + 1]; +extern const struct nla_policy ovpn_peer_new_input_nl_policy[OVPN_A_PEER_TX_ID + 1]; +extern const struct nla_policy ovpn_peer_set_input_nl_policy[OVPN_A_PEER_TX_ID + 1]; int ovpn_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c index e10d7f9a28f5..291e2e5bb450 100644 --- a/drivers/net/ovpn/netlink.c +++ b/drivers/net/ovpn/netlink.c @@ -305,6 +305,12 @@ static int ovpn_nl_peer_modify(struct ovpn_peer *peer, struct genl_info *info, 
dst_cache_reset(&peer->dst_cache); } + /* In a multipeer-to-multipeer setup we may have asymmetric peer IDs, + * that is peer->id might be different from peer->tx_id. + */ + if (attrs[OVPN_A_PEER_TX_ID]) + peer->tx_id = nla_get_u32(attrs[OVPN_A_PEER_TX_ID]); + if (attrs[OVPN_A_PEER_VPN_IPV4]) { rehash = true; peer->vpn_addrs.ipv4.s_addr = @@ -326,8 +332,8 @@ static int ovpn_nl_peer_modify(struct ovpn_peer *peer, struct genl_info *info, } netdev_dbg(peer->ovpn->dev, - "modify peer id=%u endpoint=%pIScp VPN-IPv4=%pI4 VPN-IPv6=%pI6c\n", - peer->id, &ss, + "modify peer id=%u tx_id=%u endpoint=%pIScp VPN-IPv4=%pI4 VPN-IPv6=%pI6c\n", + peer->id, peer->tx_id, &ss, &peer->vpn_addrs.ipv4.s_addr, &peer->vpn_addrs.ipv6); spin_unlock_bh(&peer->lock); @@ -373,6 +379,7 @@ int ovpn_nl_peer_new_doit(struct sk_buff *skb, struct genl_info *info) } peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); + peer = ovpn_peer_new(ovpn, peer_id); if (IS_ERR(peer)) { NL_SET_ERR_MSG_FMT_MOD(info->extack, @@ -572,6 +579,9 @@ static int ovpn_nl_send_peer(struct sk_buff *skb, const struct genl_info *info, if (nla_put_u32(skb, OVPN_A_PEER_ID, peer->id)) goto err; + if (nla_put_u32(skb, OVPN_A_PEER_TX_ID, peer->tx_id)) + goto err; + if (peer->vpn_addrs.ipv4.s_addr != htonl(INADDR_ANY)) if (nla_put_in_addr(skb, OVPN_A_PEER_VPN_IPV4, peer->vpn_addrs.ipv4.s_addr)) diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 4e145b4497e6..26b55d813f0e 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -99,7 +99,11 @@ struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id) if (!peer) return ERR_PTR(-ENOMEM); + /* in the default case TX and RX IDs are the same. 
+ * the user may set a different TX ID via netlink + */ peer->id = id; + peer->tx_id = id; peer->ovpn = ovpn; peer->vpn_addrs.ipv4.s_addr = htonl(INADDR_ANY); diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h index a1423f2b09e0..328401570cba 100644 --- a/drivers/net/ovpn/peer.h +++ b/drivers/net/ovpn/peer.h @@ -21,7 +21,8 @@ * struct ovpn_peer - the main remote peer object * @ovpn: main openvpn instance this peer belongs to * @dev_tracker: reference tracker for associated dev - * @id: unique identifier + * @id: unique identifier, used to match incoming packets + * @tx_id: identifier to be used in TX packets * @vpn_addrs: IP addresses assigned over the tunnel * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel @@ -64,6 +65,7 @@ struct ovpn_peer { struct ovpn_priv *ovpn; netdevice_tracker dev_tracker; u32 id; + u32 tx_id; struct { struct in_addr ipv4; struct in6_addr ipv6; diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h index 0cce0d58b830..06690090a1a9 100644 --- a/include/uapi/linux/ovpn.h +++ b/include/uapi/linux/ovpn.h @@ -55,6 +55,7 @@ enum { OVPN_A_PEER_LINK_TX_BYTES, OVPN_A_PEER_LINK_RX_PACKETS, OVPN_A_PEER_LINK_TX_PACKETS, + OVPN_A_PEER_TX_ID, __OVPN_A_PEER_MAX, OVPN_A_PEER_MAX = (__OVPN_A_PEER_MAX - 1) -- cgit v1.2.3 From 5aeb6e039972312ecfdf7e54573e2729a5974df2 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Mon, 16 Mar 2026 18:08:46 +0000 Subject: RDMA/efa: Rename alloc_ucontext comp_mask to supported_caps Following discussion [1], rename the comp_mask field in efa_ibv_alloc_ucontext_cmd to supported_caps to reflect its actual usage as a capabilities handshake mechanism rather than a standard comp_mask. Rename related constants and align function and macro names. 
[1] https://lore.kernel.org/linux-rdma/20260312120858.GH1448102@nvidia.com/ Signed-off-by: Michael Margolin Link: https://patch.msgid.link/20260316180846.30273-1-mrgolin@amazon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa_verbs.c | 17 +++++++++-------- include/uapi/rdma/efa-abi.h | 6 +++--- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index fc498663cd37..283c62d9cb3d 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1917,22 +1917,23 @@ static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn) return efa_com_dealloc_uar(&dev->edev, ¶ms); } -#define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \ - (_attr_str = (!(_dev)->dev_attr._attr || ((_comp_mask) & (_mask))) ? \ +#define EFA_CHECK_USER_SUPP(_dev, _supported_caps, _attr, _mask, _attr_str) \ + (_attr_str = (!(_dev)->dev_attr._attr || ((_supported_caps) & (_mask))) ? 
\ NULL : #_attr) -static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext, +static int efa_user_supp_handshake(const struct ib_ucontext *ibucontext, const struct efa_ibv_alloc_ucontext_cmd *cmd) { struct efa_dev *dev = to_edev(ibucontext->device); char *attr_str; - if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch, - EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str)) + if (EFA_CHECK_USER_SUPP(dev, cmd->supported_caps, max_tx_batch, + EFA_ALLOC_UCONTEXT_CMD_SUPP_CAPS_TX_BATCH, + attr_str)) goto err; - if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth, - EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR, + if (EFA_CHECK_USER_SUPP(dev, cmd->supported_caps, min_sq_depth, + EFA_ALLOC_UCONTEXT_CMD_SUPP_CAPS_MIN_SQ_WR, attr_str)) goto err; @@ -1966,7 +1967,7 @@ int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata) goto err_out; } - err = efa_user_comp_handshake(ibucontext, &cmd); + err = efa_user_supp_handshake(ibucontext, &cmd); if (err) goto err_out; diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 13225b038124..d5c18f8de182 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -22,12 +22,12 @@ */ enum { - EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH = 1 << 0, - EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR = 1 << 1, + EFA_ALLOC_UCONTEXT_CMD_SUPP_CAPS_TX_BATCH = 1 << 0, + EFA_ALLOC_UCONTEXT_CMD_SUPP_CAPS_MIN_SQ_WR = 1 << 1, }; struct efa_ibv_alloc_ucontext_cmd { - __u32 comp_mask; + __u32 supported_caps; __u8 reserved_20[4]; }; -- cgit v1.2.3 From e71e00127110dedc6a9e746178282b4dac97ed96 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 11:25:36 -0400 Subject: iommupt: Add the RISC-V page table format The RISC-V format is a fairly simple 5 level page table not unlike the x86 one. It has optional support for a single contiguous page size of 64k (16 x 4k). 
The specification describes a 32-bit format, the general code can support it via a #define but the iommu side implementation has been left off until a user comes. Tested-by: Vincent Chen Acked-by: Paul Walmsley # arch/riscv Reviewed-by: Tomasz Jeznach Tested-by: Tomasz Jeznach Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/.kunitconfig | 1 + drivers/iommu/generic_pt/Kconfig | 11 + drivers/iommu/generic_pt/fmt/Makefile | 2 + drivers/iommu/generic_pt/fmt/defs_riscv.h | 29 +++ drivers/iommu/generic_pt/fmt/iommu_riscv64.c | 11 + drivers/iommu/generic_pt/fmt/riscv.h | 313 +++++++++++++++++++++++++++ include/linux/generic_pt/common.h | 16 ++ include/linux/generic_pt/iommu.h | 11 + 8 files changed, 394 insertions(+) create mode 100644 drivers/iommu/generic_pt/fmt/defs_riscv.h create mode 100644 drivers/iommu/generic_pt/fmt/iommu_riscv64.c create mode 100644 drivers/iommu/generic_pt/fmt/riscv.h (limited to 'include') diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig index a78b295f264d..0bb98fe581fe 100644 --- a/drivers/iommu/generic_pt/.kunitconfig +++ b/drivers/iommu/generic_pt/.kunitconfig @@ -5,6 +5,7 @@ CONFIG_DEBUG_GENERIC_PT=y CONFIG_IOMMU_PT=y CONFIG_IOMMU_PT_AMDV1=y CONFIG_IOMMU_PT_VTDSS=y +CONFIG_IOMMU_PT_RISCV64=y CONFIG_IOMMU_PT_X86_64=y CONFIG_IOMMU_PT_KUNIT_TEST=y diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index ce4fb4786914..f4ed1add58b7 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -52,6 +52,16 @@ config IOMMU_PT_VTDSS Selected automatically by an IOMMU driver that uses this format. +config IOMMU_PT_RISCV64 + tristate "IOMMU page table for RISC-V 64 bit Sv57/Sv48/Sv39" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for RISC-V 64 bit 3/4/5 level page table. 
+ It supports 4K/2M/1G/512G/256T page sizes and can decode a sign + extended portion of the 64 bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. + config IOMMU_PT_X86_64 tristate "IOMMU page table for x86 64-bit, 4/5 levels" depends on !GENERIC_ATOMIC64 # for cmpxchg64 @@ -66,6 +76,7 @@ config IOMMU_PT_KUNIT_TEST tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS depends on KUNIT depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 + depends on IOMMU_PT_RISCV64 || !IOMMU_PT_RISCV64 depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS default KUNIT_ALL_TESTS diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile index 976b49ec97dc..ea024d582594 100644 --- a/drivers/iommu/generic_pt/fmt/Makefile +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -5,6 +5,8 @@ iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss +iommu_pt_fmt-$(CONFIG_IOMMU_PT_RISCV64) += riscv64 + iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 IOMMU_PT_KUNIT_TEST := diff --git a/drivers/iommu/generic_pt/fmt/defs_riscv.h b/drivers/iommu/generic_pt/fmt/defs_riscv.h new file mode 100644 index 000000000000..cf67474d5eba --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_riscv.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_RISCV_H +#define __GENERIC_PT_FMT_DEFS_RISCV_H + +#include +#include + +#ifdef PT_RISCV_32BIT +typedef u32 pt_riscv_entry_t; +#define riscvpt_write_attrs riscv32pt_write_attrs +#else +typedef u64 pt_riscv_entry_t; +#define riscvpt_write_attrs riscv64pt_write_attrs +#endif + +typedef pt_riscv_entry_t pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct riscvpt_write_attrs { + pt_riscv_entry_t descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs riscvpt_write_attrs + +#endif diff --git 
a/drivers/iommu/generic_pt/fmt/iommu_riscv64.c b/drivers/iommu/generic_pt/fmt/iommu_riscv64.c new file mode 100644 index 000000000000..cbf60fffa9bf --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_riscv64.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT riscv +#define PT_FMT_VARIANT 64 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \ + BIT(PT_FEAT_RISCV_SVNAPOT_64K)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/riscv.h b/drivers/iommu/generic_pt/fmt/riscv.h new file mode 100644 index 000000000000..a7fef6266a36 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/riscv.h @@ -0,0 +1,313 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES + * + * RISC-V page table + * + * This is described in Sections: + * 12.3. Sv32: Page-Based 32-bit Virtual-Memory Systems + * 12.4. Sv39: Page-Based 39-bit Virtual-Memory System + * 12.5. Sv48: Page-Based 48-bit Virtual-Memory System + * 12.6. Sv57: Page-Based 57-bit Virtual-Memory System + * of the "The RISC-V Instruction Set Manual: Volume II" + * + * This includes the contiguous page extension from: + * Chapter 13. "Svnapot" Extension for NAPOT Translation Contiguity, + * Version 1.0 + * + * The table format is sign extended and supports leafs in every level. The spec + * doesn't talk a lot about levels, but level here is the same as i=LEVELS-1 in + * the spec. 
+ */ +#ifndef __GENERIC_PT_FMT_RISCV_H +#define __GENERIC_PT_FMT_RISCV_H + +#include "defs_riscv.h" +#include "../pt_defs.h" + +#include +#include +#include +#include + +enum { + PT_ITEM_WORD_SIZE = sizeof(pt_riscv_entry_t), +#ifdef PT_RISCV_32BIT + PT_MAX_VA_ADDRESS_LG2 = 32, + PT_MAX_OUTPUT_ADDRESS_LG2 = 34, + PT_MAX_TOP_LEVEL = 1, +#else + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_MAX_OUTPUT_ADDRESS_LG2 = 56, + PT_MAX_TOP_LEVEL = 4, +#endif + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* fsc.PPN is 44 bits wide, all PPNs are 4k aligned */ + PT_TOP_PHYS_MASK = GENMASK_ULL(55, 12), +}; + +/* PTE bits */ +enum { + RISCVPT_V = BIT(0), + RISCVPT_R = BIT(1), + RISCVPT_W = BIT(2), + RISCVPT_X = BIT(3), + RISCVPT_U = BIT(4), + RISCVPT_G = BIT(5), + RISCVPT_A = BIT(6), + RISCVPT_D = BIT(7), + RISCVPT_RSW = GENMASK(9, 8), + RISCVPT_PPN32 = GENMASK(31, 10), + + RISCVPT_PPN64 = GENMASK_ULL(53, 10), + RISCVPT_PPN64_64K = GENMASK_ULL(53, 14), + RISCVPT_PBMT = GENMASK_ULL(62, 61), + RISCVPT_N = BIT_ULL(63), + + /* Svnapot encodings for ppn[0] */ + RISCVPT_PPN64_64K_SZ = BIT(13), +}; + +#ifdef PT_RISCV_32BIT +#define RISCVPT_PPN RISCVPT_PPN32 +#define pt_riscv pt_riscv_32 +#else +#define RISCVPT_PPN RISCVPT_PPN64 +#define pt_riscv pt_riscv_64 +#endif + +#define common_to_riscvpt(common_ptr) \ + container_of_const(common_ptr, struct pt_riscv, common) +#define to_riscvpt(pts) common_to_riscvpt((pts)->range->common) + +static inline pt_oaddr_t riscvpt_table_pa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(RISCVPT_PPN, pts->entry), PT_GRANULE_LG2SZ); +} +#define pt_table_pa riscvpt_table_pa + +static inline pt_oaddr_t riscvpt_entry_oa(const struct pt_state *pts) +{ + if (pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K) && + pts->entry & RISCVPT_N) { + PT_WARN_ON(pts->level != 0); + return oalog2_mul(FIELD_GET(RISCVPT_PPN64_64K, pts->entry), + ilog2(SZ_64K)); + } + return oalog2_mul(FIELD_GET(RISCVPT_PPN, pts->entry), PT_GRANULE_LG2SZ); +} +#define pt_entry_oa 
riscvpt_entry_oa + +static inline bool riscvpt_can_have_leaf(const struct pt_state *pts) +{ + return true; +} +#define pt_can_have_leaf riscvpt_can_have_leaf + +/* Body in pt_fmt_defaults.h */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +static inline unsigned int +riscvpt_entry_num_contig_lg2(const struct pt_state *pts) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_RISCV_SVNAPOT_64K) && + pts->entry & RISCVPT_N) { + PT_WARN_ON(!pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K)); + PT_WARN_ON(pts->level); + return ilog2(16); + } + return ilog2(1); +} +#define pt_entry_num_contig_lg2 riscvpt_entry_num_contig_lg2 + +static inline unsigned int riscvpt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 riscvpt_num_items_lg2 + +static inline unsigned short +riscvpt_contig_count_lg2(const struct pt_state *pts) +{ + if (pts->level == 0 && pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K)) + return ilog2(16); + return ilog2(1); +} +#define pt_contig_count_lg2 riscvpt_contig_count_lg2 + +static inline enum pt_entry_type riscvpt_load_entry_raw(struct pt_state *pts) +{ + const pt_riscv_entry_t *tablep = pt_cur_table(pts, pt_riscv_entry_t); + pt_riscv_entry_t entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!(entry & RISCVPT_V)) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + ((entry & (RISCVPT_X | RISCVPT_W | RISCVPT_R)) != 0)) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw riscvpt_load_entry_raw + +static inline void +riscvpt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + pt_riscv_entry_t *tablep = pt_cur_table(pts, pt_riscv_entry_t); + pt_riscv_entry_t entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = RISCVPT_V | + FIELD_PREP(RISCVPT_PPN, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + + if (pts_feature(pts, 
PT_FEAT_RISCV_SVNAPOT_64K) && pts->level == 0 && + oasz_lg2 != PT_GRANULE_LG2SZ) { + u64 *end; + + entry |= RISCVPT_N | RISCVPT_PPN64_64K_SZ; + tablep += pts->index; + end = tablep + log2_div(SZ_64K, PT_GRANULE_LG2SZ); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, entry); + } else { + /* FIXME does riscv need this to be cmpxchg? */ + WRITE_ONCE(tablep[pts->index], entry); + } + pts->entry = entry; +} +#define pt_install_leaf_entry riscvpt_install_leaf_entry + +static inline bool riscvpt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + pt_riscv_entry_t entry; + + entry = RISCVPT_V | + FIELD_PREP(RISCVPT_PPN, log2_div(table_pa, PT_GRANULE_LG2SZ)); + return pt_table_install64(pts, entry); +} +#define pt_install_table riscvpt_install_table + +static inline void riscvpt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = + pts->entry & (RISCVPT_R | RISCVPT_W | RISCVPT_X | RISCVPT_U | + RISCVPT_G | RISCVPT_A | RISCVPT_D); +} +#define pt_attr_from_entry riscvpt_attr_from_entry + +/* --- iommu */ +#include +#include + +#define pt_iommu_table pt_iommu_riscv_64 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->riscv_64pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, riscv_64pt.common) + ->iommu; +} + +static inline int riscvpt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte; + + pte = RISCVPT_A | RISCVPT_U; + if (iommu_prot & IOMMU_WRITE) + pte |= RISCVPT_W | RISCVPT_R | RISCVPT_D; + if (iommu_prot & IOMMU_READ) + pte |= RISCVPT_R; + if (!(iommu_prot & IOMMU_NOEXEC)) + pte |= RISCVPT_X; + + /* Caller must specify a supported 
combination of flags */ + if (unlikely((pte & (RISCVPT_X | RISCVPT_W | RISCVPT_R)) == 0)) + return -EOPNOTSUPP; + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot riscvpt_iommu_set_prot + +static inline int +riscvpt_iommu_fmt_init(struct pt_iommu_riscv_64 *iommu_table, + const struct pt_iommu_riscv_64_cfg *cfg) +{ + struct pt_riscv *table = &iommu_table->riscv_64pt; + + switch (cfg->common.hw_max_vasz_lg2) { + case 39: + pt_top_set_level(&table->common, 2); + break; + case 48: + pt_top_set_level(&table->common, 3); + break; + case 57: + pt_top_set_level(&table->common, 4); + break; + default: + return -EINVAL; + } + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + return 0; +} +#define pt_iommu_fmt_init riscvpt_iommu_fmt_init + +static inline void +riscvpt_iommu_fmt_hw_info(struct pt_iommu_riscv_64 *table, + const struct pt_range *top_range, + struct pt_iommu_riscv_64_hw_info *info) +{ + phys_addr_t top_phys = virt_to_phys(top_range->top_table); + + info->ppn = oalog2_div(top_phys, PT_GRANULE_LG2SZ); + PT_WARN_ON(top_phys & ~PT_TOP_PHYS_MASK); + + /* + * See Table 3. 
Encodings of iosatp.MODE field" for DC.tx.SXL = 0: + * 8 = Sv39 = top level 2 + * 9 = Sv48 = top level 3 + * 10 = Sv57 = top level 4 + */ + info->fsc_iosatp_mode = top_range->top_level + 6; +} +#define pt_iommu_fmt_hw_info riscvpt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_riscv_64_cfg riscv_64_kunit_fmt_cfgs[] = { + [0] = { .common.features = BIT(PT_FEAT_RISCV_SVNAPOT_64K), + .common.hw_max_oasz_lg2 = 56, + .common.hw_max_vasz_lg2 = 39 }, + [1] = { .common.features = 0, + .common.hw_max_oasz_lg2 = 56, + .common.hw_max_vasz_lg2 = 48 }, + [2] = { .common.features = BIT(PT_FEAT_RISCV_SVNAPOT_64K), + .common.hw_max_oasz_lg2 = 56, + .common.hw_max_vasz_lg2 = 57 }, +}; +#define kunit_fmt_cfgs riscv_64_kunit_fmt_cfgs +enum { + KUNIT_FMT_FEATURES = BIT(PT_FEAT_RISCV_SVNAPOT_64K), +}; +#endif + +#endif diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 6a9a1acb5aad..fc5d0b5edadc 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -175,6 +175,22 @@ enum { PT_FEAT_VTDSS_FORCE_WRITEABLE, }; +struct pt_riscv_32 { + struct pt_common common; +}; + +struct pt_riscv_64 { + struct pt_common common; +}; + +enum { + /* + * Support the 64k contiguous page size following the Svnapot extension. 
+ */ + PT_FEAT_RISCV_SVNAPOT_64K = PT_FEAT_FMT_START, + +}; + struct pt_x86_64 { struct pt_common common; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 9eefbb74efd0..49d9addb98c5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -275,6 +275,17 @@ struct pt_iommu_vtdss_hw_info { IOMMU_FORMAT(vtdss, vtdss_pt); +struct pt_iommu_riscv_64_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_riscv_64_hw_info { + u64 ppn; + u8 fsc_iosatp_mode; +}; + +IOMMU_FORMAT(riscv_64, riscv_64pt); + struct pt_iommu_x86_64_cfg { struct pt_iommu_cfg common; /* 4 is a 57 bit 5 level table */ -- cgit v1.2.3 From 99fb8afa16add85ed016baee9735231bca0c32b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 15:30:10 -0400 Subject: iommupt: Directly call iommupt's unmap_range() The common algorithm in iommupt does not require the iommu_pgsize() calculations, it can directly unmap any arbitrary range. Add a new function pointer to directly call an iommupt unmap_range op and make __iommu_unmap() call it directly. Gives about a 5% gain on single page unmappings. The function pointer is run through pt_iommu_ops instead of iommu_domain_ops to discourage using it outside iommupt. All drivers with their own page tables should continue to use the simplified map/unmap_pages() style interfaces. 
Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 29 ++++------------------------- drivers/iommu/iommu.c | 27 +++++++++++++++++++++------ include/linux/generic_pt/iommu.h | 37 +++++++++++++++++++++++++++++++------ include/linux/iommu.h | 1 + 4 files changed, 57 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 9c08bb594e41..a627c26fa62d 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -1031,34 +1031,12 @@ start_oa: return ret; } -/** - * unmap_pages() - Make a range of IOVA empty/not present - * @domain: Domain to manipulate - * @iova: IO virtual address to start - * @pgsize: Length of each page - * @pgcount: Length of the range in pgsize units starting from @iova - * @iotlb_gather: Gather struct that must be flushed on return - * - * unmap_pages() will remove a translation created by map_pages(). It cannot - * subdivide a mapping created by map_pages(), so it should be called with IOVA - * ranges that match those passed to map_pages(). The IOVA range can aggregate - * contiguous map_pages() calls so long as no individual range is split. - * - * Context: The caller must hold a write range lock that includes - * the whole range. - * - * Returns: Number of bytes of VA unmapped. iova + res will be the point - * unmapping stopped. 
- */ -size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, - size_t pgsize, size_t pgcount, +static size_t NS(unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + dma_addr_t len, struct iommu_iotlb_gather *iotlb_gather) { - struct pt_iommu *iommu_table = - container_of(domain, struct pt_iommu, domain); struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( unmap.free_list) }; - pt_vaddr_t len = pgsize * pgcount; struct pt_range range; int ret; @@ -1073,7 +1051,6 @@ size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, return unmap.unmapped; } -EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); static void NS(get_info)(struct pt_iommu *iommu_table, struct pt_iommu_info *info) @@ -1121,6 +1098,7 @@ static void NS(deinit)(struct pt_iommu *iommu_table) } static const struct pt_iommu_ops NS(ops) = { + .unmap_range = NS(unmap_range), #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) .set_dirty = NS(set_dirty), @@ -1183,6 +1161,7 @@ static int pt_iommu_init_domain(struct pt_iommu *iommu_table, domain->type = __IOMMU_DOMAIN_PAGING; domain->pgsize_bitmap = info.pgsize_bitmap; + domain->is_iommupt = true; if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) range = _pt_top_range(common, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 35db51780954..f68269707101 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "dma-iommu.h" #include "iommu-priv.h" @@ -2666,13 +2667,12 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, } EXPORT_SYMBOL_GPL(iommu_map); -static size_t __iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *iotlb_gather) +static size_t +__iommu_unmap_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, + size_t size, struct 
iommu_iotlb_gather *iotlb_gather) { const struct iommu_domain_ops *ops = domain->ops; size_t unmapped_page, unmapped = 0; - unsigned long orig_iova = iova; unsigned int min_pagesz; if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) @@ -2718,8 +2718,23 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unmapped += unmapped_page; } - trace_unmap(orig_iova, size, unmapped); - iommu_debug_unmap_end(domain, orig_iova, size, unmapped); + return unmapped; +} + +static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, + size_t size, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct pt_iommu *pt = iommupt_from_domain(domain); + size_t unmapped; + + if (pt) + unmapped = pt->ops->unmap_range(pt, iova, size, iotlb_gather); + else + unmapped = __iommu_unmap_domain_pgtbl(domain, iova, size, + iotlb_gather); + trace_unmap(iova, size, unmapped); + iommu_debug_unmap_end(domain, iova, size, unmapped); return unmapped; } diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 49d9addb98c5..0da971134a37 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -66,6 +66,13 @@ struct pt_iommu { struct device *iommu_device; }; +static inline struct pt_iommu *iommupt_from_domain(struct iommu_domain *domain) +{ + if (!IS_ENABLED(CONFIG_IOMMU_PT) || !domain->is_iommupt) + return NULL; + return container_of(domain, struct pt_iommu, domain); +} + /** * struct pt_iommu_info - Details about the IOMMU page table * @@ -80,6 +87,29 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @unmap_range: Make a range of IOVA empty/not present + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * @len: Length of the range starting from @iova + * @iotlb_gather: Gather struct that must be flushed on return + * + * unmap_range() will remove a translation created by map_range(). 
It + * cannot subdivide a mapping created by map_range(), so it should be + * called with IOVA ranges that match those passed to map_pages. The + * IOVA range can aggregate contiguous map_range() calls so long as no + * individual range is split. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: Number of bytes of VA unmapped. iova + res will be the + * point unmapping stopped. + */ + size_t (*unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + dma_addr_t len, + struct iommu_iotlb_gather *iotlb_gather); + /** * @set_dirty: Make the iova write dirty * @iommu_table: Table to manipulate @@ -198,10 +228,6 @@ struct pt_iommu_cfg { unsigned long iova, phys_addr_t paddr, \ size_t pgsize, size_t pgcount, \ int prot, gfp_t gfp, size_t *mapped); \ - size_t pt_iommu_##fmt##_unmap_pages( \ - struct iommu_domain *domain, unsigned long iova, \ - size_t pgsize, size_t pgcount, \ - struct iommu_iotlb_gather *iotlb_gather); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -223,8 +249,7 @@ struct pt_iommu_cfg { */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ - .map_pages = &pt_iommu_##fmt##_map_pages, \ - .unmap_pages = &pt_iommu_##fmt##_unmap_pages + .map_pages = &pt_iommu_##fmt##_map_pages #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54b8b48c762e..7ca648c01336 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,6 +223,7 @@ enum iommu_domain_cookie_type { struct iommu_domain { unsigned type; enum iommu_domain_cookie_type cookie_type; + bool is_iommupt; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; const struct iommu_ops *owner; /* Whose domain_alloc we came from */ -- cgit v1.2.3 From 
d6c65b0fd6218bd21ed0be7a8d3218e8f6dc91de Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 15:30:11 -0400 Subject: iommupt: Avoid rewalking during map Currently the core code provides a simplified interface to drivers where it fragments a requested multi-page map into single page size steps after doing all the calculations to figure out what page size is appropriate. Each step rewalks the page tables from the start. Since iommupt has a single implementation of the mapping algorithm it can internally compute each step as it goes while retaining its current position in the walk. Add a new function pt_pgsz_count() which computes the same page size fragment of a large mapping operation. Compute the next fragment when all the leaf entries of the current fragment have been written, then continue walking from the current point. The function pointer is run through pt_iommu_ops instead of iommu_domain_ops to discourage using it outside iommupt. All drivers with their own page tables should continue to use the simplified map_pages() style interfaces. 
Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 133 +++++++++++++++++----------- drivers/iommu/generic_pt/kunit_generic_pt.h | 12 +++ drivers/iommu/generic_pt/pt_iter.h | 22 +++++ drivers/iommu/iommu.c | 39 ++++++-- include/linux/generic_pt/iommu.h | 34 +++++-- 5 files changed, 175 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index a627c26fa62d..17b72dbd7d51 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -477,6 +477,7 @@ struct pt_iommu_map_args { pt_oaddr_t oa; unsigned int leaf_pgsize_lg2; unsigned int leaf_level; + pt_vaddr_t num_leaves; }; /* @@ -529,11 +530,15 @@ static int clear_contig(const struct pt_state *start_pts, static int __map_range_leaf(struct pt_range *range, void *arg, unsigned int level, struct pt_table_p *table) { + struct pt_iommu *iommu_table = iommu_from_common(range->common); struct pt_state pts = pt_init(range, level, table); struct pt_iommu_map_args *map = arg; unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; unsigned int start_index; pt_oaddr_t oa = map->oa; + unsigned int num_leaves; + unsigned int orig_end; + pt_vaddr_t last_va; unsigned int step; bool need_contig; int ret = 0; @@ -547,6 +552,15 @@ static int __map_range_leaf(struct pt_range *range, void *arg, _pt_iter_first(&pts); start_index = pts.index; + orig_end = pts.end_index; + if (pts.index + map->num_leaves < pts.end_index) { + /* Need to stop in the middle of the table to change sizes */ + pts.end_index = pts.index + map->num_leaves; + num_leaves = 0; + } else { + num_leaves = map->num_leaves - (pts.end_index - pts.index); + } + do { pts.type = pt_load_entry_raw(&pts); if (pts.type != PT_ENTRY_EMPTY || need_contig) { @@ -572,7 +586,40 @@ static int __map_range_leaf(struct pt_range *range, void *arg, 
flush_writes_range(&pts, start_index, pts.index); map->oa = oa; - return ret; + map->num_leaves = num_leaves; + if (ret || num_leaves) + return ret; + + /* range->va is not valid if we reached the end of the table */ + pts.index -= step; + pt_index_to_va(&pts); + pts.index += step; + last_va = range->va + log2_to_int(leaf_pgsize_lg2); + + if (last_va - 1 == range->last_va) { + PT_WARN_ON(pts.index != orig_end); + return 0; + } + + /* + * Reached a point where the page size changed, compute the new + * parameters. + */ + map->leaf_pgsize_lg2 = pt_compute_best_pgsize( + iommu_table->domain.pgsize_bitmap, last_va, range->last_va, oa); + map->leaf_level = + pt_pgsz_lg2_to_level(range->common, map->leaf_pgsize_lg2); + map->num_leaves = pt_pgsz_count(iommu_table->domain.pgsize_bitmap, + last_va, range->last_va, oa, + map->leaf_pgsize_lg2); + + /* Didn't finish this table level, caller will repeat it */ + if (pts.index != orig_end) { + if (pts.index != start_index) + pt_index_to_va(&pts); + return -EAGAIN; + } + return 0; } static int __map_range(struct pt_range *range, void *arg, unsigned int level, @@ -595,14 +642,9 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level, if (pts.type != PT_ENTRY_EMPTY) return -EADDRINUSE; ret = pt_iommu_new_table(&pts, &map->attrs); - if (ret) { - /* - * Racing with another thread installing a table - */ - if (ret == -EAGAIN) - continue; + /* EAGAIN on a race will loop again */ + if (ret) return ret; - } } else { pts.table_lower = pt_table_ptr(&pts); /* @@ -626,10 +668,12 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level, * The already present table can possibly be shared with another * concurrent map. 
*/ - if (map->leaf_level == level - 1) - ret = pt_descend(&pts, arg, __map_range_leaf); - else - ret = pt_descend(&pts, arg, __map_range); + do { + if (map->leaf_level == level - 1) + ret = pt_descend(&pts, arg, __map_range_leaf); + else + ret = pt_descend(&pts, arg, __map_range); + } while (ret == -EAGAIN); if (ret) return ret; @@ -637,6 +681,14 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level, pt_index_to_va(&pts); if (pts.index >= pts.end_index) break; + + /* + * This level is currently running __map_range_leaf() which is + * not correct if the target level has been updated to this + * level. Have the caller invoke __map_range_leaf. + */ + if (map->leaf_level == level) + return -EAGAIN; } while (true); return 0; } @@ -808,12 +860,13 @@ static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range, static int do_map(struct pt_range *range, struct pt_common *common, bool single_page, struct pt_iommu_map_args *map) { + int ret; + /* * The __map_single_page() fast path does not support DMA_INCOHERENT * flushing to keep its .text small. 
*/ if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { - int ret; ret = pt_walk_range(range, __map_single_page, map); if (ret != -EAGAIN) @@ -821,50 +874,25 @@ static int do_map(struct pt_range *range, struct pt_common *common, /* EAGAIN falls through to the full path */ } - if (map->leaf_level == range->top_level) - return pt_walk_range(range, __map_range_leaf, map); - return pt_walk_range(range, __map_range, map); + do { + if (map->leaf_level == range->top_level) + ret = pt_walk_range(range, __map_range_leaf, map); + else + ret = pt_walk_range(range, __map_range, map); + } while (ret == -EAGAIN); + return ret; } -/** - * map_pages() - Install translation for an IOVA range - * @domain: Domain to manipulate - * @iova: IO virtual address to start - * @paddr: Physical/Output address to start - * @pgsize: Length of each page - * @pgcount: Length of the range in pgsize units starting from @iova - * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO - * @gfp: GFP flags for any memory allocations - * @mapped: Total bytes successfully mapped - * - * The range starting at IOVA will have paddr installed into it. The caller - * must specify a valid pgsize and pgcount to segment the range into compatible - * blocks. - * - * On error the caller will probably want to invoke unmap on the range from iova - * up to the amount indicated by @mapped to return the table back to an - * unchanged state. - * - * Context: The caller must hold a write range lock that includes the whole - * range. - * - * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were - * mapped are added to @mapped, @mapped is not zerod first. 
- */ -int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) +static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + phys_addr_t paddr, dma_addr_t len, unsigned int prot, + gfp_t gfp, size_t *mapped) { - struct pt_iommu *iommu_table = - container_of(domain, struct pt_iommu, domain); pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap; struct pt_common *common = common_from_iommu(iommu_table); struct iommu_iotlb_gather iotlb_gather; - pt_vaddr_t len = pgsize * pgcount; struct pt_iommu_map_args map = { .iotlb_gather = &iotlb_gather, .oa = paddr, - .leaf_pgsize_lg2 = vaffs(pgsize), }; bool single_page = false; struct pt_range range; @@ -892,13 +920,13 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, return ret; /* Calculate target page size and level for the leaves */ - if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE && - pgcount == 1) { + if (pt_has_system_page_size(common) && len == PAGE_SIZE) { PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); if (log2_mod(iova | paddr, PAGE_SHIFT)) return -ENXIO; map.leaf_pgsize_lg2 = PAGE_SHIFT; map.leaf_level = 0; + map.num_leaves = 1; single_page = true; } else { map.leaf_pgsize_lg2 = pt_compute_best_pgsize( @@ -907,6 +935,9 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, return -ENXIO; map.leaf_level = pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2); + map.num_leaves = pt_pgsz_count(pgsize_bitmap, range.va, + range.last_va, paddr, + map.leaf_pgsize_lg2); } ret = check_map_range(iommu_table, &range, &map); @@ -929,7 +960,6 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, *mapped += map.oa - paddr; return ret; } -EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU"); struct pt_unmap_args { struct iommu_pages_list free_list; @@ -1098,6 +1128,7 @@ static void NS(deinit)(struct pt_iommu 
*iommu_table) } static const struct pt_iommu_ops NS(ops) = { + .map_range = NS(map_range), .unmap_range = NS(unmap_range), #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h index 68278bf15cfe..374e475f591e 100644 --- a/drivers/iommu/generic_pt/kunit_generic_pt.h +++ b/drivers/iommu/generic_pt/kunit_generic_pt.h @@ -312,6 +312,17 @@ static void test_best_pgsize(struct kunit *test) } } +static void test_pgsz_count(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, + pt_pgsz_count(SZ_4K, 0, SZ_1G - 1, 0, ilog2(SZ_4K)), + SZ_1G / SZ_4K); + KUNIT_EXPECT_EQ(test, + pt_pgsz_count(SZ_2M | SZ_4K, SZ_4K, SZ_1G - 1, SZ_4K, + ilog2(SZ_4K)), + (SZ_2M - SZ_4K) / SZ_4K); +} + /* * Check that pt_install_table() and pt_table_pa() match */ @@ -770,6 +781,7 @@ static struct kunit_case generic_pt_test_cases[] = { KUNIT_CASE_FMT(test_init), KUNIT_CASE_FMT(test_bitops), KUNIT_CASE_FMT(test_best_pgsize), + KUNIT_CASE_FMT(test_pgsz_count), KUNIT_CASE_FMT(test_table_ptr), KUNIT_CASE_FMT(test_max_va), KUNIT_CASE_FMT(test_table_radix), diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h index c0d8617cce29..3e45dbde6b83 100644 --- a/drivers/iommu/generic_pt/pt_iter.h +++ b/drivers/iommu/generic_pt/pt_iter.h @@ -569,6 +569,28 @@ static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap, return pgsz_lg2; } +/* + * Return the number of pgsize_lg2 leaf entries that can be mapped for + * va to oa. This accounts for any requirement to reduce or increase the page + * size across the VA range. 
+ */ +static inline pt_vaddr_t pt_pgsz_count(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va, + pt_vaddr_t last_va, pt_oaddr_t oa, + unsigned int pgsize_lg2) +{ + pt_vaddr_t len = last_va - va + 1; + pt_vaddr_t next_pgsizes = log2_set_mod(pgsz_bitmap, 0, pgsize_lg2 + 1); + + if (next_pgsizes) { + unsigned int next_pgsize_lg2 = vaffs(next_pgsizes); + + if (log2_mod(va ^ oa, next_pgsize_lg2) == 0) + len = min(len, log2_set_mod_max(va, next_pgsize_lg2) - + va + 1); + } + return log2_div(len, pgsize_lg2); +} + #define _PT_MAKE_CALL_LEVEL(fn) \ static __always_inline int fn(struct pt_range *range, void *arg, \ unsigned int level, \ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f68269707101..33cee64686e3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2569,14 +2569,14 @@ out_set_count: return pgsize; } -int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size, int prot, gfp_t gfp) { const struct iommu_domain_ops *ops = domain->ops; unsigned long orig_iova = iova; unsigned int min_pagesz; size_t orig_size = size; - phys_addr_t orig_paddr = paddr; int ret = 0; might_sleep_if(gfpflags_allow_blocking(gfp)); @@ -2633,12 +2633,9 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, /* unroll mapping in case something went wrong */ if (ret) { iommu_unmap(domain, orig_iova, orig_size - size); - } else { - trace_map(orig_iova, orig_paddr, orig_size); - iommu_debug_map(domain, orig_paddr, orig_size); + return ret; } - - return ret; + return 0; } int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) @@ -2650,6 +2647,32 @@ int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) return ops->iotlb_sync_map(domain, iova, size); } +int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, + 
phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +{ + struct pt_iommu *pt = iommupt_from_domain(domain); + int ret; + + if (pt) { + size_t mapped = 0; + + ret = pt->ops->map_range(pt, iova, paddr, size, prot, gfp, + &mapped); + if (ret) { + iommu_unmap(domain, iova, mapped); + return ret; + } + return 0; + } + ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); + if (ret) + return ret; + + trace_map(iova, paddr, size); + iommu_debug_map(domain, paddr, size); + return 0; +} + int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 0da971134a37..dd0edd02a48a 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -87,6 +87,33 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @map_range: Install translation for an IOVA range + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * @paddr: Physical/Output address to start + * @len: Length of the range starting from @iova + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO + * @gfp: GFP flags for any memory allocations + * + * The range starting at IOVA will have paddr installed into it. The + * range is automatically segmented into optimally sized table entries, + * and can have any valid alignment. + * + * On error the caller will probably want to invoke unmap on the range + * from iova up to the amount indicated by @mapped to return the table + * back to an unchanged state. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA + * that were mapped are added to @mapped, @mapped is not zeroed first. 
+ */ + int (*map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + phys_addr_t paddr, dma_addr_t len, unsigned int prot, + gfp_t gfp, size_t *mapped); + /** * @unmap_range: Make a range of IOVA empty/not present * @iommu_table: Table to manipulate @@ -224,10 +251,6 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ - int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ - unsigned long iova, phys_addr_t paddr, \ - size_t pgsize, size_t pgcount, \ - int prot, gfp_t gfp, size_t *mapped); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -248,8 +271,7 @@ struct pt_iommu_cfg { * iommu_pt */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ - .map_pages = &pt_iommu_##fmt##_map_pages + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty -- cgit v1.2.3 From a82efb8747d1b8a7c0a377dc79c2aac204eae788 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 17 Mar 2026 11:16:02 +0000 Subject: iommu: Add device ATS supported capability PCIe ATS may be disabled by platform firmware, root complex limitations, or kernel policy even when a device advertises the ATS capability in its PCI configuration space. Add a new IOMMU_CAP_PCI_ATS_SUPPORTED capability to allow IOMMU drivers to report the effective ATS decision for a device. When this capability is true for a device, ATS may be enabled for that device, but it does not imply that ATS is currently enabled. A subsequent patch will extend iommufd to expose the effective ATS status to userspace. 
Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Shameer Kolothum Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 6 ++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 +++ drivers/iommu/intel/iommu.c | 2 ++ include/linux/iommu.h | 2 ++ 4 files changed, 13 insertions(+) (limited to 'include') diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 81c4d7733872..f1814fee5182 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2985,6 +2985,12 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return amd_iommu_hd_support(iommu); } + case IOMMU_CAP_PCI_ATS_SUPPORTED: { + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); + + return amd_iommu_iotlb_sup && + (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP); + } default: break; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 4d00d796f078..dec5cac98f7c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -107,6 +107,7 @@ static const char * const event_class_str[] = { }; static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); +static bool arm_smmu_ats_supported(struct arm_smmu_master *master); static void parse_driver_options(struct arm_smmu_device *smmu) { @@ -2494,6 +2495,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) return true; case IOMMU_CAP_DIRTY_TRACKING: return arm_smmu_dbm_capable(master->smmu); + case IOMMU_CAP_PCI_ATS_SUPPORTED: + return arm_smmu_ats_supported(master); default: return false; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ef7613b177b9..5dca8e525c73 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3220,6 +3220,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) return ecap_sc_support(info->iommu->ecap); case IOMMU_CAP_DIRTY_TRACKING: return 
ssads_supported(info->iommu); + case IOMMU_CAP_PCI_ATS_SUPPORTED: + return info->ats_supported; default: return false; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ca648c01336..a904821ed169 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -272,6 +272,8 @@ enum iommu_cap { */ IOMMU_CAP_DEFERRED_FLUSH, IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */ + /* ATS is supported and may be enabled for this device */ + IOMMU_CAP_PCI_ATS_SUPPORTED, }; /* These are the possible reserved region types */ -- cgit v1.2.3 From a11661a58c06f7fdfef03a368ef20d05a4ea4ed0 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 17 Mar 2026 11:16:03 +0000 Subject: iommufd: Report ATS not supported status via IOMMU_GET_HW_INFO If the IOMMU driver reports that ATS is not supported for a device, set the IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED flag in the returned hardware capabilities. This uses a negative flag for UAPI compatibility. Existing userspace assumes ATS is supported if no flag is present. This also ensures that new userspace works correctly on both old and new kernels, where a zero value implies ATS support. When this flag is set, ATS cannot be used for the device. When it is clear, ATS may be enabled when an appropriate HWPT is attached. 
Reviewed-by: Samiullah Khawaja Reviewed-by: Jason Gunthorpe Signed-off-by: Shameer Kolothum Signed-off-by: Joerg Roedel --- drivers/iommu/iommufd/device.c | 4 ++++ include/uapi/linux/iommufd.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 344d620cdecc..92c5d5ef8d00 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1624,6 +1624,10 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; + /* Report when ATS cannot be used for this device */ + if (!device_iommu_capable(idev->dev, IOMMU_CAP_PCI_ATS_SUPPORTED)) + cmd->out_capabilities |= IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED; + cmd->out_max_pasid_log2 = 0; /* * Currently, all iommu drivers enable PASID in the probe_device() diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 1dafbc552d37..507ee9bcba01 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -695,11 +695,15 @@ enum iommu_hw_info_type { * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it * when the struct * iommu_hw_info::out_max_pasid_log2 is zero. + * @IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED: ATS is not supported or cannot be used + * on this device (absence implies ATS + * may be enabled) */ enum iommufd_hw_capabilities { IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, + IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED = 1 << 3, }; /** -- cgit v1.2.3 From dccc0c3ddf8f16071736f98a7d6dd46a2d43e037 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 20 Dec 2025 10:33:26 +0000 Subject: media: rc: fix race between unregister and urb/irq callbacks Some rc device drivers have a race condition between rc_unregister_device() and irq or urb callbacks. 
This is because rc_unregister_device() does two things, it marks the device as unregistered so no new commands can be issued and then it calls rc_free_device(). This means the driver has no chance to cancel any pending urb callbacks or interrupts after the device has been marked as unregistered. Those callbacks may access struct rc_dev or its members (e.g. struct ir_raw_event_ctrl), which have been freed by rc_free_device(). This change removes the implicit call to rc_free_device() from rc_unregister_device(). This means that device drivers can call rc_unregister_device() in their remove or disconnect function, then cancel all the urbs and interrupts before explicitly calling rc_free_device(). Note this is an alternative fix for an issue found by Haotian Zhang, see the Closes: tags. Reported-by: Haotian Zhang Closes: https://lore.kernel.org/linux-media/20251114101432.2566-1-vulab@iscas.ac.cn/ Closes: https://lore.kernel.org/linux-media/20251114101418.2548-1-vulab@iscas.ac.cn/ Closes: https://lore.kernel.org/linux-media/20251114101346.2530-1-vulab@iscas.ac.cn/ Closes: https://lore.kernel.org/linux-media/20251114090605.2413-1-vulab@iscas.ac.cn/ Reviewed-by: Patrice Chotard Signed-off-by: Sean Young Signed-off-by: Hans Verkuil --- drivers/gpu/drm/bridge/sil-sii8620.c | 1 + drivers/hid/hid-picolcd_cir.c | 1 + drivers/media/cec/core/cec-core.c | 2 +- drivers/media/common/siano/smsir.c | 1 + drivers/media/i2c/ir-kbd-i2c.c | 2 ++ drivers/media/pci/bt8xx/bttv-input.c | 3 ++- drivers/media/pci/cx23885/cx23885-input.c | 1 + drivers/media/pci/cx88/cx88-input.c | 3 ++- drivers/media/pci/dm1105/dm1105.c | 1 + drivers/media/pci/mantis/mantis_input.c | 1 + drivers/media/pci/saa7134/saa7134-input.c | 1 + drivers/media/pci/smipcie/smipcie-ir.c | 1 + drivers/media/pci/ttpci/budget-ci.c | 1 + drivers/media/rc/ati_remote.c | 6 +++--- drivers/media/rc/ene_ir.c | 2 +- drivers/media/rc/fintek-cir.c | 3 ++- drivers/media/rc/igorplugusb.c | 1 + drivers/media/rc/iguanair.c | 1 + 
drivers/media/rc/img-ir/img-ir-hw.c | 3 ++- drivers/media/rc/img-ir/img-ir-raw.c | 3 ++- drivers/media/rc/imon.c | 3 ++- drivers/media/rc/ir-hix5hd2.c | 2 +- drivers/media/rc/ir_toy.c | 1 + drivers/media/rc/ite-cir.c | 2 +- drivers/media/rc/mceusb.c | 1 + drivers/media/rc/rc-ir-raw.c | 5 ----- drivers/media/rc/rc-loopback.c | 1 + drivers/media/rc/rc-main.c | 6 +----- drivers/media/rc/redrat3.c | 4 +++- drivers/media/rc/st_rc.c | 2 +- drivers/media/rc/streamzap.c | 7 ++++--- drivers/media/rc/sunxi-cir.c | 1 + drivers/media/rc/ttusbir.c | 2 +- drivers/media/rc/winbond-cir.c | 2 +- drivers/media/rc/xbox_remote.c | 5 +++-- drivers/media/usb/au0828/au0828-input.c | 1 + drivers/media/usb/dvb-usb-v2/dvb_usb_core.c | 1 + drivers/media/usb/dvb-usb/dvb-usb-remote.c | 6 ++++-- drivers/media/usb/em28xx/em28xx-input.c | 1 + drivers/staging/media/av7110/av7110_ir.c | 1 + include/media/rc-core.h | 2 -- 41 files changed, 58 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c index d3f238b1f2a9..982306eb4f0a 100644 --- a/drivers/gpu/drm/bridge/sil-sii8620.c +++ b/drivers/gpu/drm/bridge/sil-sii8620.c @@ -2221,6 +2221,7 @@ static void sii8620_detach(struct drm_bridge *bridge) return; rc_unregister_device(ctx->rc_dev); + rc_free_device(ctx->rc_dev); } static int sii8620_is_packing_required(struct sii8620 *ctx, diff --git a/drivers/hid/hid-picolcd_cir.c b/drivers/hid/hid-picolcd_cir.c index d6faa0e00f95..6d4c636e1c9f 100644 --- a/drivers/hid/hid-picolcd_cir.c +++ b/drivers/hid/hid-picolcd_cir.c @@ -134,5 +134,6 @@ void picolcd_exit_cir(struct picolcd_data *data) data->rc_dev = NULL; rc_unregister_device(rdev); + rc_free_device(rdev); } diff --git a/drivers/media/cec/core/cec-core.c b/drivers/media/cec/core/cec-core.c index 1953ce559eca..0fcd3b5e60c8 100644 --- a/drivers/media/cec/core/cec-core.c +++ b/drivers/media/cec/core/cec-core.c @@ -338,8 +338,8 @@ int cec_register_adapter(struct 
cec_adapter *adap, res = cec_devnode_register(&adap->devnode, adap->owner); if (res) { #ifdef CONFIG_MEDIA_CEC_RC - /* Note: rc_unregister also calls rc_free */ rc_unregister_device(adap->rc); + rc_free_device(adap->rc); adap->rc = NULL; #endif return res; diff --git a/drivers/media/common/siano/smsir.c b/drivers/media/common/siano/smsir.c index af07fed21ae1..283770d583d5 100644 --- a/drivers/media/common/siano/smsir.c +++ b/drivers/media/common/siano/smsir.c @@ -92,6 +92,7 @@ int sms_ir_init(struct smscore_device_t *coredev) void sms_ir_exit(struct smscore_device_t *coredev) { rc_unregister_device(coredev->ir.dev); + rc_free_device(coredev->ir.dev); pr_debug("\n"); } diff --git a/drivers/media/i2c/ir-kbd-i2c.c b/drivers/media/i2c/ir-kbd-i2c.c index 5588cdd7ec20..604745317004 100644 --- a/drivers/media/i2c/ir-kbd-i2c.c +++ b/drivers/media/i2c/ir-kbd-i2c.c @@ -355,6 +355,7 @@ static void ir_work(struct work_struct *work) mutex_unlock(&ir->lock); if (rc == -ENODEV) { rc_unregister_device(ir->rc); + rc_free_device(ir->rc); ir->rc = NULL; return; } @@ -972,6 +973,7 @@ static void ir_remove(struct i2c_client *client) i2c_unregister_device(ir->tx_c); rc_unregister_device(ir->rc); + rc_free_device(ir->rc); } static const struct i2c_device_id ir_kbd_id[] = { diff --git a/drivers/media/pci/bt8xx/bttv-input.c b/drivers/media/pci/bt8xx/bttv-input.c index 373b6c6817d7..f704476506e0 100644 --- a/drivers/media/pci/bt8xx/bttv-input.c +++ b/drivers/media/pci/bt8xx/bttv-input.c @@ -572,8 +572,9 @@ void bttv_input_fini(struct bttv *btv) if (btv->remote == NULL) return; - bttv_ir_stop(btv); rc_unregister_device(btv->remote->dev); + bttv_ir_stop(btv); + rc_free_device(btv->remote->dev); kfree(btv->remote); btv->remote = NULL; } diff --git a/drivers/media/pci/cx23885/cx23885-input.c b/drivers/media/pci/cx23885/cx23885-input.c index ffbbeca8a8e5..554767b8ef2b 100644 --- a/drivers/media/pci/cx23885/cx23885-input.c +++ b/drivers/media/pci/cx23885/cx23885-input.c @@ -402,6 +402,7 @@ void 
cx23885_input_fini(struct cx23885_dev *dev) if (dev->kernel_ir == NULL) return; rc_unregister_device(dev->kernel_ir->rc); + rc_free_device(dev->kernel_ir->rc); kfree(dev->kernel_ir->phys); kfree(dev->kernel_ir->name); kfree(dev->kernel_ir); diff --git a/drivers/media/pci/cx88/cx88-input.c b/drivers/media/pci/cx88/cx88-input.c index e958eecb29c5..5d9ce4f9af01 100644 --- a/drivers/media/pci/cx88/cx88-input.c +++ b/drivers/media/pci/cx88/cx88-input.c @@ -509,8 +509,9 @@ int cx88_ir_fini(struct cx88_core *core) if (!ir) return 0; - cx88_ir_stop(core); rc_unregister_device(ir->dev); + cx88_ir_stop(core); + rc_free_device(ir->dev); kfree(ir); /* done */ diff --git a/drivers/media/pci/dm1105/dm1105.c b/drivers/media/pci/dm1105/dm1105.c index de05d8b0f9dc..bbd24769ae56 100644 --- a/drivers/media/pci/dm1105/dm1105.c +++ b/drivers/media/pci/dm1105/dm1105.c @@ -763,6 +763,7 @@ static int dm1105_ir_init(struct dm1105_dev *dm1105) static void dm1105_ir_exit(struct dm1105_dev *dm1105) { rc_unregister_device(dm1105->ir.dev); + rc_free_device(dm1105->ir.dev); } static int dm1105_hw_init(struct dm1105_dev *dev) diff --git a/drivers/media/pci/mantis/mantis_input.c b/drivers/media/pci/mantis/mantis_input.c index 34c0d979240f..edb4cacf55d2 100644 --- a/drivers/media/pci/mantis/mantis_input.c +++ b/drivers/media/pci/mantis/mantis_input.c @@ -72,5 +72,6 @@ EXPORT_SYMBOL_GPL(mantis_input_init); void mantis_input_exit(struct mantis_pci *mantis) { rc_unregister_device(mantis->rc); + rc_free_device(mantis->rc); } EXPORT_SYMBOL_GPL(mantis_input_exit); diff --git a/drivers/media/pci/saa7134/saa7134-input.c b/drivers/media/pci/saa7134/saa7134-input.c index 5b7101415780..7f6680de3156 100644 --- a/drivers/media/pci/saa7134/saa7134-input.c +++ b/drivers/media/pci/saa7134/saa7134-input.c @@ -834,6 +834,7 @@ void saa7134_input_fini(struct saa7134_dev *dev) return; rc_unregister_device(dev->remote->dev); + rc_free_device(dev->remote->dev); kfree(dev->remote); dev->remote = NULL; } diff --git 
a/drivers/media/pci/smipcie/smipcie-ir.c b/drivers/media/pci/smipcie/smipcie-ir.c index c0604d9c7011..0bbe4fa2d5a8 100644 --- a/drivers/media/pci/smipcie/smipcie-ir.c +++ b/drivers/media/pci/smipcie/smipcie-ir.c @@ -181,5 +181,6 @@ void smi_ir_exit(struct smi_dev *dev) rc_unregister_device(rc_dev); smi_ir_stop(ir); + rc_free_device(rc_dev); ir->rc_dev = NULL; } diff --git a/drivers/media/pci/ttpci/budget-ci.c b/drivers/media/pci/ttpci/budget-ci.c index 3709c0fb23b0..8b496b959d7e 100644 --- a/drivers/media/pci/ttpci/budget-ci.c +++ b/drivers/media/pci/ttpci/budget-ci.c @@ -249,6 +249,7 @@ static void msp430_ir_deinit(struct budget_ci *budget_ci) cancel_work_sync(&budget_ci->ir.msp430_irq_bh_work); rc_unregister_device(budget_ci->ir.dev); + rc_free_device(budget_ci->ir.dev); } static int ciintf_read_attribute_mem(struct dvb_ca_en50221 *ca, int slot, int address) diff --git a/drivers/media/rc/ati_remote.c b/drivers/media/rc/ati_remote.c index 78abe810a88e..51d85de24fae 100644 --- a/drivers/media/rc/ati_remote.c +++ b/drivers/media/rc/ati_remote.c @@ -921,7 +921,6 @@ static int ati_remote_probe(struct usb_interface *interface, input_free_device(input_dev); exit_unregister_device: rc_unregister_device(rc_dev); - rc_dev = NULL; exit_kill_urbs: usb_kill_urb(ati_remote->irq_urb); usb_kill_urb(ati_remote->out_urb); @@ -941,18 +940,19 @@ static void ati_remote_disconnect(struct usb_interface *interface) struct ati_remote *ati_remote; ati_remote = usb_get_intfdata(interface); - usb_set_intfdata(interface, NULL); if (!ati_remote) { dev_warn(&interface->dev, "%s - null device?\n", __func__); return; } + rc_unregister_device(ati_remote->rdev); + usb_set_intfdata(interface, NULL); usb_kill_urb(ati_remote->irq_urb); usb_kill_urb(ati_remote->out_urb); if (ati_remote->idev) input_unregister_device(ati_remote->idev); - rc_unregister_device(ati_remote->rdev); ati_remote_free_buffers(ati_remote); + rc_free_device(ati_remote->rdev); kfree(ati_remote); } diff --git 
a/drivers/media/rc/ene_ir.c b/drivers/media/rc/ene_ir.c index f8120605501a..6f7dccc965e7 100644 --- a/drivers/media/rc/ene_ir.c +++ b/drivers/media/rc/ene_ir.c @@ -1090,7 +1090,6 @@ exit_release_hw_io: release_region(dev->hw_io, ENE_IO_SIZE); exit_unregister_device: rc_unregister_device(rdev); - rdev = NULL; exit_free_dev_rdev: rc_free_device(rdev); kfree(dev); @@ -1110,6 +1109,7 @@ static void ene_remove(struct pnp_dev *pnp_dev) ene_rx_restore_hw_buffer(dev); spin_unlock_irqrestore(&dev->hw_lock, flags); + rc_free_device(dev->rdev); free_irq(dev->irq, dev); release_region(dev->hw_io, ENE_IO_SIZE); kfree(dev); diff --git a/drivers/media/rc/fintek-cir.c b/drivers/media/rc/fintek-cir.c index f7cfa8a073eb..5055dfc3f465 100644 --- a/drivers/media/rc/fintek-cir.c +++ b/drivers/media/rc/fintek-cir.c @@ -568,6 +568,7 @@ static void fintek_remove(struct pnp_dev *pdev) struct fintek_dev *fintek = pnp_get_drvdata(pdev); unsigned long flags; + rc_unregister_device(fintek->rdev); spin_lock_irqsave(&fintek->fintek_lock, flags); /* disable CIR */ fintek_disable_cir(fintek); @@ -580,7 +581,7 @@ static void fintek_remove(struct pnp_dev *pdev) free_irq(fintek->cir_irq, fintek); release_region(fintek->cir_addr, fintek->cir_port_len); - rc_unregister_device(fintek->rdev); + rc_free_device(fintek->rdev); kfree(fintek); } diff --git a/drivers/media/rc/igorplugusb.c b/drivers/media/rc/igorplugusb.c index e034c93d57cf..5ceb5ca44e23 100644 --- a/drivers/media/rc/igorplugusb.c +++ b/drivers/media/rc/igorplugusb.c @@ -242,6 +242,7 @@ static void igorplugusb_disconnect(struct usb_interface *intf) usb_set_intfdata(intf, NULL); usb_unpoison_urb(ir->urb); usb_free_urb(ir->urb); + rc_free_device(ir->rc); kfree(ir->buf_in); } diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c index c508f2536243..0c5b8befb0af 100644 --- a/drivers/media/rc/iguanair.c +++ b/drivers/media/rc/iguanair.c @@ -500,6 +500,7 @@ static void iguanair_disconnect(struct usb_interface *intf) 
usb_set_intfdata(intf, NULL); usb_kill_urb(ir->urb_in); usb_kill_urb(ir->urb_out); + rc_free_device(ir->rc); usb_free_urb(ir->urb_in); usb_free_urb(ir->urb_out); usb_free_coherent(ir->udev, MAX_IN_PACKET, ir->buf_in, ir->dma_in); diff --git a/drivers/media/rc/img-ir/img-ir-hw.c b/drivers/media/rc/img-ir/img-ir-hw.c index 63f6f5b36838..f30adf4d8444 100644 --- a/drivers/media/rc/img-ir/img-ir-hw.c +++ b/drivers/media/rc/img-ir/img-ir-hw.c @@ -1118,9 +1118,10 @@ void img_ir_remove_hw(struct img_ir_priv *priv) struct rc_dev *rdev = hw->rdev; if (!rdev) return; + rc_unregister_device(rdev); img_ir_set_decoder(priv, NULL, 0); hw->rdev = NULL; - rc_unregister_device(rdev); + rc_free_device(rdev); #ifdef CONFIG_COMMON_CLK if (!IS_ERR(priv->clk)) clk_notifier_unregister(priv->clk, &hw->clk_nb); diff --git a/drivers/media/rc/img-ir/img-ir-raw.c b/drivers/media/rc/img-ir/img-ir-raw.c index 92fb7b555a0f..f1460d4acf3e 100644 --- a/drivers/media/rc/img-ir/img-ir-raw.c +++ b/drivers/media/rc/img-ir/img-ir-raw.c @@ -136,6 +136,7 @@ void img_ir_remove_raw(struct img_ir_priv *priv) if (!rdev) return; + rc_unregister_device(rdev); /* switch off and disable raw (edge) interrupts */ spin_lock_irq(&priv->lock); raw->rdev = NULL; @@ -145,7 +146,7 @@ void img_ir_remove_raw(struct img_ir_priv *priv) img_ir_write(priv, IMG_IR_IRQ_CLEAR, IMG_IR_IRQ_EDGE); spin_unlock_irq(&priv->lock); - rc_unregister_device(rdev); + rc_free_device(rdev); timer_delete_sync(&raw->timer); } diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 7e92161105d5..310c9fc9ae91 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -2541,9 +2541,10 @@ static void imon_disconnect(struct usb_interface *interface) if (ifnum == 0) { ictx->dev_present_intf0 = false; + rc_unregister_device(ictx->rdev); usb_kill_urb(ictx->rx_urb_intf0); input_unregister_device(ictx->idev); - rc_unregister_device(ictx->rdev); + rc_free_device(ictx->rdev); if (ictx->display_supported) { if (ictx->display_type == 
IMON_DISPLAY_TYPE_LCD) usb_deregister_dev(interface, &imon_lcd_class); diff --git a/drivers/media/rc/ir-hix5hd2.c b/drivers/media/rc/ir-hix5hd2.c index edc46828509c..1b061e4a3dcf 100644 --- a/drivers/media/rc/ir-hix5hd2.c +++ b/drivers/media/rc/ir-hix5hd2.c @@ -331,7 +331,6 @@ static int hix5hd2_ir_probe(struct platform_device *pdev) regerr: rc_unregister_device(rdev); - rdev = NULL; clkerr: clk_disable_unprepare(priv->clock); err: @@ -346,6 +345,7 @@ static void hix5hd2_ir_remove(struct platform_device *pdev) clk_disable_unprepare(priv->clock); rc_unregister_device(priv->rdev); + rc_free_device(priv->rdev); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/media/rc/ir_toy.c b/drivers/media/rc/ir_toy.c index d6472de5da87..089833e41178 100644 --- a/drivers/media/rc/ir_toy.c +++ b/drivers/media/rc/ir_toy.c @@ -536,6 +536,7 @@ static void irtoy_disconnect(struct usb_interface *intf) usb_free_urb(ir->urb_out); usb_kill_urb(ir->urb_in); usb_free_urb(ir->urb_in); + rc_free_device(ir->rc); kfree(ir->in); kfree(ir->out); kfree(ir); diff --git a/drivers/media/rc/ite-cir.c b/drivers/media/rc/ite-cir.c index bf544517c67a..bde2a7051231 100644 --- a/drivers/media/rc/ite-cir.c +++ b/drivers/media/rc/ite-cir.c @@ -1414,7 +1414,6 @@ exit_release_cir_addr: release_region(itdev->cir_addr, itdev->params->io_region_size); exit_unregister_device: rc_unregister_device(rdev); - rdev = NULL; exit_free_dev_rdev: rc_free_device(rdev); kfree(itdev); @@ -1439,6 +1438,7 @@ static void ite_remove(struct pnp_dev *pdev) release_region(dev->cir_addr, dev->params->io_region_size); rc_unregister_device(dev->rdev); + rc_free_device(dev->rdev); kfree(dev); } diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index ed55e9ec3c57..06222eee1754 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -1850,6 +1850,7 @@ static void mceusb_dev_disconnect(struct usb_interface *intf) usb_free_urb(ir->urb_in); usb_free_coherent(dev, ir->len_in, ir->buf_in, ir->dma_in); 
usb_put_dev(dev); + rc_free_device(ir->rc); kfree(ir); } diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 2e269ef5e26b..ba24c2f22d39 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -648,9 +648,6 @@ int ir_raw_event_register(struct rc_dev *dev) void ir_raw_event_free(struct rc_dev *dev) { - if (!dev) - return; - kfree(dev->raw); dev->raw = NULL; } @@ -674,8 +671,6 @@ void ir_raw_event_unregister(struct rc_dev *dev) lirc_bpf_free(dev); - ir_raw_event_free(dev); - /* * A user can be calling bpf(BPF_PROG_{QUERY|ATTACH|DETACH}), so * ensure that the raw member is null on unlock; this is how diff --git a/drivers/media/rc/rc-loopback.c b/drivers/media/rc/rc-loopback.c index 78ac09b3cbd3..53d0540717b3 100644 --- a/drivers/media/rc/rc-loopback.c +++ b/drivers/media/rc/rc-loopback.c @@ -263,6 +263,7 @@ static int __init loop_init(void) static void __exit loop_exit(void) { rc_unregister_device(loopdev.dev); + rc_free_device(loopdev.dev); } module_init(loop_init); diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 821607504008..dda3479ea3ad 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1611,6 +1611,7 @@ static void rc_dev_release(struct device *device) { struct rc_dev *dev = to_rc_dev(device); + ir_raw_event_free(dev); kfree(dev); } @@ -1773,7 +1774,6 @@ struct rc_dev *devm_rc_allocate_device(struct device *dev, } rc->dev.parent = dev; - rc->managed_alloc = true; *dr = rc; devres_add(dev, dr); @@ -2042,11 +2042,7 @@ void rc_unregister_device(struct rc_dev *dev) device_del(&dev->dev); ida_free(&rc_ida, dev->minor); - - if (!dev->managed_alloc) - rc_free_device(dev); } - EXPORT_SYMBOL_GPL(rc_unregister_device); /* diff --git a/drivers/media/rc/redrat3.c b/drivers/media/rc/redrat3.c index 3b917a2a8918..3f828a564e19 100644 --- a/drivers/media/rc/redrat3.c +++ b/drivers/media/rc/redrat3.c @@ -1131,11 +1131,13 @@ static void redrat3_dev_disconnect(struct usb_interface 
*intf) { struct usb_device *udev = interface_to_usbdev(intf); struct redrat3_dev *rr3 = usb_get_intfdata(intf); + struct rc_dev *rc = rr3->rc; usb_set_intfdata(intf, NULL); - rc_unregister_device(rr3->rc); + rc_unregister_device(rc); led_classdev_unregister(&rr3->led); redrat3_delete(rr3, udev); + rc_free_device(rc); } static int redrat3_dev_suspend(struct usb_interface *intf, pm_message_t message) diff --git a/drivers/media/rc/st_rc.c b/drivers/media/rc/st_rc.c index 6b70bac5f45d..0ba06bfc9e14 100644 --- a/drivers/media/rc/st_rc.c +++ b/drivers/media/rc/st_rc.c @@ -203,6 +203,7 @@ static void st_rc_remove(struct platform_device *pdev) device_init_wakeup(&pdev->dev, false); clk_disable_unprepare(rc_dev->sys_clock); rc_unregister_device(rc_dev->rdev); + rc_free_device(rc_dev->rdev); } static int st_rc_open(struct rc_dev *rdev) @@ -334,7 +335,6 @@ static int st_rc_probe(struct platform_device *pdev) return ret; rcerr: rc_unregister_device(rdev); - rdev = NULL; clkerr: clk_disable_unprepare(rc_dev->sys_clock); err: diff --git a/drivers/media/rc/streamzap.c b/drivers/media/rc/streamzap.c index 5a18603f9a95..7103da57c19f 100644 --- a/drivers/media/rc/streamzap.c +++ b/drivers/media/rc/streamzap.c @@ -388,15 +388,16 @@ static void streamzap_disconnect(struct usb_interface *interface) struct streamzap_ir *sz = usb_get_intfdata(interface); struct usb_device *usbdev = interface_to_usbdev(interface); - usb_set_intfdata(interface, NULL); - if (!sz) return; - usb_kill_urb(sz->urb_in); rc_unregister_device(sz->rdev); + usb_set_intfdata(interface, NULL); + + usb_kill_urb(sz->urb_in); usb_free_urb(sz->urb_in); usb_free_coherent(usbdev, sz->buf_in_len, sz->buf_in, sz->dma_in); + rc_free_device(sz->rdev); kfree(sz); } diff --git a/drivers/media/rc/sunxi-cir.c b/drivers/media/rc/sunxi-cir.c index 92ef4e7c6f69..cb4c56bf0752 100644 --- a/drivers/media/rc/sunxi-cir.c +++ b/drivers/media/rc/sunxi-cir.c @@ -371,6 +371,7 @@ static void sunxi_ir_remove(struct platform_device *pdev) struct 
sunxi_ir *ir = platform_get_drvdata(pdev); rc_unregister_device(ir->rc); + rc_free_device(ir->rc); sunxi_ir_hw_exit(&pdev->dev); } diff --git a/drivers/media/rc/ttusbir.c b/drivers/media/rc/ttusbir.c index 110a46900114..a2a64a860264 100644 --- a/drivers/media/rc/ttusbir.c +++ b/drivers/media/rc/ttusbir.c @@ -333,7 +333,6 @@ static int ttusbir_probe(struct usb_interface *intf, return 0; out3: rc_unregister_device(rc); - rc = NULL; out2: led_classdev_unregister(&tt->led); out: @@ -373,6 +372,7 @@ static void ttusbir_disconnect(struct usb_interface *intf) } usb_kill_urb(tt->bulk_urb); usb_free_urb(tt->bulk_urb); + rc_free_device(tt->rc); usb_set_intfdata(intf, NULL); kfree(tt); } diff --git a/drivers/media/rc/winbond-cir.c b/drivers/media/rc/winbond-cir.c index 515469dd82d4..8e804661a621 100644 --- a/drivers/media/rc/winbond-cir.c +++ b/drivers/media/rc/winbond-cir.c @@ -1132,7 +1132,6 @@ exit_release_wbase: release_region(data->wbase, WAKEUP_IOMEM_LEN); exit_unregister_device: rc_unregister_device(data->dev); - data->dev = NULL; exit_free_rc: rc_free_device(data->dev); exit_unregister_led: @@ -1163,6 +1162,7 @@ wbcir_remove(struct pnp_dev *device) wbcir_set_bits(data->wbase + WBCIR_REG_WCEIR_EV_EN, 0x00, 0x07); rc_unregister_device(data->dev); + rc_free_device(data->dev); led_classdev_unregister(&data->led); diff --git a/drivers/media/rc/xbox_remote.c b/drivers/media/rc/xbox_remote.c index 3e3da70cf8da..c64123e9d16a 100644 --- a/drivers/media/rc/xbox_remote.c +++ b/drivers/media/rc/xbox_remote.c @@ -277,14 +277,15 @@ static void xbox_remote_disconnect(struct usb_interface *interface) struct xbox_remote *xbox_remote; xbox_remote = usb_get_intfdata(interface); - usb_set_intfdata(interface, NULL); if (!xbox_remote) { dev_warn(&interface->dev, "%s - null device?\n", __func__); return; } - usb_kill_urb(xbox_remote->irq_urb); rc_unregister_device(xbox_remote->rdev); + usb_set_intfdata(interface, NULL); + usb_kill_urb(xbox_remote->irq_urb); + 
rc_free_device(xbox_remote->rdev); usb_free_urb(xbox_remote->irq_urb); kfree(xbox_remote); } diff --git a/drivers/media/usb/au0828/au0828-input.c b/drivers/media/usb/au0828/au0828-input.c index 319be7509c82..b156afb1a0ae 100644 --- a/drivers/media/usb/au0828/au0828-input.c +++ b/drivers/media/usb/au0828/au0828-input.c @@ -358,6 +358,7 @@ void au0828_rc_unregister(struct au0828_dev *dev) return; rc_unregister_device(ir->rc); + rc_free_device(ir->rc); /* done */ kfree(ir); diff --git a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c index 600cff8a4abd..bd86d250433d 100644 --- a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c +++ b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c @@ -187,6 +187,7 @@ static int dvb_usbv2_remote_exit(struct dvb_usb_device *d) if (d->rc_dev) { cancel_delayed_work_sync(&d->rc_query_work); rc_unregister_device(d->rc_dev); + rc_free_device(d->rc_dev); d->rc_dev = NULL; } diff --git a/drivers/media/usb/dvb-usb/dvb-usb-remote.c b/drivers/media/usb/dvb-usb/dvb-usb-remote.c index 65e2c9e2cdc9..6dc11718dfb9 100644 --- a/drivers/media/usb/dvb-usb/dvb-usb-remote.c +++ b/drivers/media/usb/dvb-usb/dvb-usb-remote.c @@ -347,10 +347,12 @@ int dvb_usb_remote_exit(struct dvb_usb_device *d) { if (d->state & DVB_USB_STATE_REMOTE) { cancel_delayed_work_sync(&d->rc_query_work); - if (d->props.rc.mode == DVB_RC_LEGACY) + if (d->props.rc.mode == DVB_RC_LEGACY) { input_unregister_device(d->input_dev); - else + } else { rc_unregister_device(d->rc_dev); + rc_free_device(d->rc_dev); + } } d->state &= ~DVB_USB_STATE_REMOTE; return 0; diff --git a/drivers/media/usb/em28xx/em28xx-input.c b/drivers/media/usb/em28xx/em28xx-input.c index 20fdd59b5518..ab61d9a29b10 100644 --- a/drivers/media/usb/em28xx/em28xx-input.c +++ b/drivers/media/usb/em28xx/em28xx-input.c @@ -853,6 +853,7 @@ static int em28xx_ir_fini(struct em28xx *dev) goto ref_put; rc_unregister_device(ir->rc); + rc_free_device(ir->rc); kfree(ir->i2c_client); diff --git 
a/drivers/staging/media/av7110/av7110_ir.c b/drivers/staging/media/av7110/av7110_ir.c index 68b3979ba5f2..fdae467fd7ab 100644 --- a/drivers/staging/media/av7110/av7110_ir.c +++ b/drivers/staging/media/av7110/av7110_ir.c @@ -151,6 +151,7 @@ int av7110_ir_init(struct av7110 *av7110) void av7110_ir_exit(struct av7110 *av7110) { rc_unregister_device(av7110->ir.rcdev); + rc_free_device(av7110->ir.rcdev); } //MODULE_AUTHOR("Holger Waechtler , Oliver Endriss "); diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 35c7a0546f02..7c964b5ad792 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -81,7 +81,6 @@ struct lirc_fh { /** * struct rc_dev - represents a remote control device * @dev: driver model's view of this device - * @managed_alloc: devm_rc_allocate_device was used to create rc_dev * @registered: set to true by rc_register_device(), false by * rc_unregister_device * @idle: used to keep track of RX state @@ -156,7 +155,6 @@ struct lirc_fh { */ struct rc_dev { struct device dev; - bool managed_alloc; bool registered; bool idle; bool encode_wakeup; -- cgit v1.2.3 From a93a51f42ac354425a252210183c4151d991f75d Mon Sep 17 00:00:00 2001 From: LiPeng Huang Date: Thu, 5 Feb 2026 09:48:52 +0800 Subject: media: lirc: increase IR_MAX_DURATION to send extended code sequences sensor: increase IR_MAX_DURATION to 1000ms to fix long IR remote timeouts Certain infrared remotes (e.g., brand-specific smart home remotes, custom consumer electronics) send extended code sequences that exceed the default 500ms IR_MAX_DURATION threshold. This causes the kernel's raw IR driver to discard incomplete code, resulting in unrecognized commands. 
Increase IR_MAX_DURATION to 1000ms: - Aligns with common extended IR protocol specs (most long sequences use 700-900ms) - No impact on standard remotes (all mainstream IR codes are <500ms) - Validated on vivo X200 and vivo X300, resolves timeout issues without regressions Signed-off-by: LiPeng Huang Signed-off-by: Sean Young Signed-off-by: Hans Verkuil --- drivers/media/rc/gpio-ir-tx.c | 4 ++-- include/media/rc-core.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/media/rc/gpio-ir-tx.c b/drivers/media/rc/gpio-ir-tx.c index e185ead40464..d15fb17fa0fa 100644 --- a/drivers/media/rc/gpio-ir-tx.c +++ b/drivers/media/rc/gpio-ir-tx.c @@ -51,7 +51,7 @@ static int gpio_ir_tx_set_carrier(struct rc_dev *dev, u32 carrier) static void delay_until(ktime_t until) { /* - * delta should never exceed 0.5 seconds (IR_MAX_DURATION) and on + * delta should never exceed 1 second (IR_MAX_DURATION) and on * m68k ndelay(s64) does not compile; so use s32 rather than s64. */ s32 delta; @@ -95,7 +95,7 @@ static void gpio_ir_tx_modulated(struct gpio_ir *gpio_ir, uint *txbuf, { ktime_t edge; /* - * delta should never exceed 0.5 seconds (IR_MAX_DURATION) and on + * delta should never exceed 1 second (IR_MAX_DURATION) and on * m68k ndelay(s64) does not compile; so use s32 rather than s64. 
*/ s32 delta; diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 7c964b5ad792..d37fffc5dc3c 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -301,7 +301,7 @@ struct ir_raw_event { #define US_TO_NS(usec) ((usec) * 1000) #define MS_TO_US(msec) ((msec) * 1000) -#define IR_MAX_DURATION MS_TO_US(500) +#define IR_MAX_DURATION MS_TO_US(1000) #define IR_DEFAULT_TIMEOUT MS_TO_US(125) #define IR_MAX_TIMEOUT LIRC_VALUE_MASK -- cgit v1.2.3 From 2727d44f5d5bc3f8e55a6a0ccf24d8105a5a400e Mon Sep 17 00:00:00 2001 From: Kit Dallege Date: Sun, 15 Mar 2026 18:09:31 +0100 Subject: writeback: fix kernel-doc function name mismatch for wb_put_many() The kernel-doc comment says wb_put but the actual function is wb_put_many. Fix the name to match. Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Kit Dallege Link: https://patch.msgid.link/20260315170931.65852-1-xaum.io@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/backing-dev-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c88fd4d37d1f..a06b93446d10 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -237,7 +237,7 @@ static inline void wb_get(struct bdi_writeback *wb) } /** - * wb_put - decrement a wb's refcount + * wb_put_many - decrement a wb's refcount * @wb: bdi_writeback to put * @nr: number of references to put */ -- cgit v1.2.3 From 3ccc8a922906703cd0efdf1bdd6186f18f7e23ec Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 16 Mar 2026 14:15:02 +0200 Subject: drm/intel: add shared step.h and switch i915 to use it As the first step towards using shared definitions for step name enumerations, add shared include/drm/intel/step.h and switch i915 to use it. 
Reviewed-by: Luca Coelho Link: https://patch.msgid.link/e76412a316ddff44dc46633d80e9caa5df54ed6b.1773663208.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_step.h | 57 ++--------------------------------- include/drm/intel/step.h | 62 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 55 deletions(-) create mode 100644 include/drm/intel/step.h (limited to 'include') diff --git a/drivers/gpu/drm/i915/intel_step.h b/drivers/gpu/drm/i915/intel_step.h index 22f1d6905160..2ca36eae4b5a 100644 --- a/drivers/gpu/drm/i915/intel_step.h +++ b/drivers/gpu/drm/i915/intel_step.h @@ -8,6 +8,8 @@ #include +#include + struct drm_i915_private; struct intel_step_info { @@ -19,61 +21,6 @@ struct intel_step_info { u8 media_step; }; -#define STEP_ENUM_VAL(name) STEP_##name, - -#define STEP_NAME_LIST(func) \ - func(A0) \ - func(A1) \ - func(A2) \ - func(A3) \ - func(B0) \ - func(B1) \ - func(B2) \ - func(B3) \ - func(C0) \ - func(C1) \ - func(C2) \ - func(C3) \ - func(D0) \ - func(D1) \ - func(D2) \ - func(D3) \ - func(E0) \ - func(E1) \ - func(E2) \ - func(E3) \ - func(F0) \ - func(F1) \ - func(F2) \ - func(F3) \ - func(G0) \ - func(G1) \ - func(G2) \ - func(G3) \ - func(H0) \ - func(H1) \ - func(H2) \ - func(H3) \ - func(I0) \ - func(I1) \ - func(I2) \ - func(I3) \ - func(J0) \ - func(J1) \ - func(J2) \ - func(J3) - -/* - * Symbolic steppings that do not match the hardware. These are valid both as gt - * and display steppings as symbolic names. 
- */ -enum intel_step { - STEP_NONE = 0, - STEP_NAME_LIST(STEP_ENUM_VAL) - STEP_FUTURE, - STEP_FOREVER, -}; - void intel_step_init(struct drm_i915_private *i915); const char *intel_step_name(enum intel_step step); diff --git a/include/drm/intel/step.h b/include/drm/intel/step.h new file mode 100644 index 000000000000..4de7520109bc --- /dev/null +++ b/include/drm/intel/step.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2026 Intel Corporation */ + +#ifndef __STEP_H__ +#define __STEP_H__ + +#define STEP_ENUM_VAL(name) STEP_##name, + +#define STEP_NAME_LIST(func) \ + func(A0) \ + func(A1) \ + func(A2) \ + func(A3) \ + func(B0) \ + func(B1) \ + func(B2) \ + func(B3) \ + func(C0) \ + func(C1) \ + func(C2) \ + func(C3) \ + func(D0) \ + func(D1) \ + func(D2) \ + func(D3) \ + func(E0) \ + func(E1) \ + func(E2) \ + func(E3) \ + func(F0) \ + func(F1) \ + func(F2) \ + func(F3) \ + func(G0) \ + func(G1) \ + func(G2) \ + func(G3) \ + func(H0) \ + func(H1) \ + func(H2) \ + func(H3) \ + func(I0) \ + func(I1) \ + func(I2) \ + func(I3) \ + func(J0) \ + func(J1) \ + func(J2) \ + func(J3) + +/* + * Symbolic steppings that do not match the hardware. These are valid both as gt + * and display steppings as symbolic names. + */ +enum intel_step { + STEP_NONE = 0, + STEP_NAME_LIST(STEP_ENUM_VAL) + STEP_FUTURE, + STEP_FOREVER, +}; + +#endif /* __STEP_H__ */ -- cgit v1.2.3 From 9577c74c96f88d807d1ba005adbf5952e7127e55 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:41 -0700 Subject: platform/x86/intel/vsec: Make driver_data info const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Treat PCI id->driver_data (intel_vsec_platform_info) as read-only by making vsec_priv->info a const pointer and updating all function signatures to accept const intel_vsec_platform_info *. 
This improves const-correctness and clarifies that the platform info data from the driver_data table is not meant to be modified at runtime. No functional changes intended. Signed-off-by: David E. Box Reviewed-by: Michael J. Ruhl Link: https://patch.msgid.link/20260313015202.3660072-3-david.e.box@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec.c | 20 ++++++++++---------- include/linux/intel_vsec.h | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 46966edca03b..e0096be605d9 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -42,7 +42,7 @@ enum vsec_device_state { }; struct vsec_priv { - struct intel_vsec_platform_info *info; + const struct intel_vsec_platform_info *info; struct device *suppliers[VSEC_FEATURE_COUNT]; struct oobmsm_plat_info plat_info; enum vsec_device_state state[VSEC_FEATURE_COUNT]; @@ -270,7 +270,7 @@ cleanup_aux: EXPORT_SYMBOL_NS_GPL(intel_vsec_add_aux, "INTEL_VSEC"); static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *header, - struct intel_vsec_platform_info *info, + const struct intel_vsec_platform_info *info, unsigned long cap_id, u64 base_addr) { struct intel_vsec_device __free(kfree) *intel_vsec_dev = NULL; @@ -406,7 +406,7 @@ static int get_cap_id(u32 header_id, unsigned long *cap_id) static int intel_vsec_register_device(struct pci_dev *pdev, struct intel_vsec_header *header, - struct intel_vsec_platform_info *info, + const struct intel_vsec_platform_info *info, u64 base_addr) { const struct vsec_feature_dependency *consumer_deps; @@ -452,7 +452,7 @@ static int intel_vsec_register_device(struct pci_dev *pdev, } static bool intel_vsec_walk_header(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { struct intel_vsec_header **header = 
info->headers; bool have_devices = false; @@ -468,7 +468,7 @@ static bool intel_vsec_walk_header(struct pci_dev *pdev, } static bool intel_vsec_walk_dvsec(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { bool have_devices = false; int pos = 0; @@ -519,7 +519,7 @@ static bool intel_vsec_walk_dvsec(struct pci_dev *pdev, } static bool intel_vsec_walk_vsec(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { bool have_devices = false; int pos = 0; @@ -565,7 +565,7 @@ static bool intel_vsec_walk_vsec(struct pci_dev *pdev, } int intel_vsec_register(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { if (!pdev || !info || !info->headers) return -EINVAL; @@ -578,7 +578,7 @@ int intel_vsec_register(struct pci_dev *pdev, EXPORT_SYMBOL_NS_GPL(intel_vsec_register, "INTEL_VSEC"); static bool intel_vsec_get_features(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { bool found = false; @@ -622,7 +622,7 @@ static void intel_vsec_skip_missing_dependencies(struct pci_dev *pdev) static int intel_vsec_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - struct intel_vsec_platform_info *info; + const struct intel_vsec_platform_info *info; struct vsec_priv *priv; int num_caps, ret; int run_once = 0; @@ -633,7 +633,7 @@ static int intel_vsec_pci_probe(struct pci_dev *pdev, const struct pci_device_id return ret; pci_save_state(pdev); - info = (struct intel_vsec_platform_info *)id->driver_data; + info = (const struct intel_vsec_platform_info *)id->driver_data; if (!info) return -EINVAL; diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 1a0f357c2427..d551174b0049 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -200,13 +200,13 @@ static inline struct intel_vsec_device 
*auxdev_to_ivdev(struct auxiliary_device #if IS_ENABLED(CONFIG_INTEL_VSEC) int intel_vsec_register(struct pci_dev *pdev, - struct intel_vsec_platform_info *info); + const struct intel_vsec_platform_info *info); int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info, struct intel_vsec_device *vsec_dev); struct oobmsm_plat_info *intel_vsec_get_mapping(struct pci_dev *pdev); #else static inline int intel_vsec_register(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { return -ENODEV; } -- cgit v1.2.3 From c62fd96a04e4a7b847448f97ecfe9f3fe706e7b3 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:42 -0700 Subject: platform/x86/intel/vsec: Decouple add/link helpers from PCI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This refactor prepares for adding ACPI-enumerated PMT endpoints. While intel_vsec is bound to PCI today, some helpers are used by code that will also register PMT endpoints from non-PCI (ACPI) paths. Clean up PCI-specific plumbing where it isn’t strictly required and rely on generic struct device where possible. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Reviewed-by: Michael J. 
Ruhl Link: https://patch.msgid.link/20260313015202.3660072-4-david.e.box@linux.intel.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec.c | 13 +++++++++---- drivers/platform/x86/intel/vsec_tpmi.c | 2 +- include/linux/intel_vsec.h | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index e0096be605d9..938648b9ef09 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -158,18 +158,23 @@ static bool vsec_driver_present(int cap_id) */ static const struct pci_device_id intel_vsec_pci_ids[]; -static int intel_vsec_link_devices(struct pci_dev *pdev, struct device *dev, +static int intel_vsec_link_devices(struct device *parent, struct device *dev, int consumer_id) { const struct vsec_feature_dependency *deps; enum vsec_device_state *state; struct device **suppliers; struct vsec_priv *priv; + struct pci_dev *pdev; int supplier_id; if (!consumer_id) return 0; + if (!dev_is_pci(parent)) + return 0; + + pdev = to_pci_dev(parent); if (!pci_match_id(intel_vsec_pci_ids, pdev)) return 0; @@ -204,7 +209,7 @@ static int intel_vsec_link_devices(struct pci_dev *pdev, struct device *dev, return 0; } -int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, +int intel_vsec_add_aux(struct device *parent, struct intel_vsec_device *intel_vsec_dev, const char *name) { @@ -252,7 +257,7 @@ int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, if (ret) goto cleanup_aux; - ret = intel_vsec_link_devices(pdev, &auxdev->dev, intel_vsec_dev->cap_id); + ret = intel_vsec_link_devices(parent, &auxdev->dev, intel_vsec_dev->cap_id); if (ret) goto cleanup_aux; @@ -343,7 +348,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he * Pass the ownership of intel_vsec_dev and resource within it to * intel_vsec_add_aux() */ - return intel_vsec_add_aux(pdev, parent, no_free_ptr(intel_vsec_dev), 
+ return intel_vsec_add_aux(parent, no_free_ptr(intel_vsec_dev), intel_vsec_name(header->id)); } diff --git a/drivers/platform/x86/intel/vsec_tpmi.c b/drivers/platform/x86/intel/vsec_tpmi.c index 98846e88d3d0..2298b6361094 100644 --- a/drivers/platform/x86/intel/vsec_tpmi.c +++ b/drivers/platform/x86/intel/vsec_tpmi.c @@ -655,7 +655,7 @@ static int tpmi_create_device(struct intel_tpmi_info *tpmi_info, * feature_vsec_dev and res memory are also freed as part of * device deletion. */ - return intel_vsec_add_aux(vsec_dev->pcidev, &vsec_dev->auxdev.dev, + return intel_vsec_add_aux(&vsec_dev->auxdev.dev, feature_vsec_dev, feature_id_name); } diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index d551174b0049..49a746ec0128 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -184,7 +184,7 @@ struct pmt_feature_group { struct telemetry_region regions[]; }; -int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, +int intel_vsec_add_aux(struct device *parent, struct intel_vsec_device *intel_vsec_dev, const char *name); -- cgit v1.2.3 From 353042d54d82f6c46449f0ee38c244b5a13c1fe4 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:43 -0700 Subject: platform/x86/intel/vsec: Switch exported helpers from pci_dev to device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preparatory refactor for ACPI-enumerated PMT endpoints. Several exported PMT/VSEC interfaces and structs carried struct pci_dev * even though callers only need a generic struct device. Move those to struct device * so the same APIs work for PCI and ACPI parents. Acked-by: Rodrigo Vivi Signed-off-by: David E. 
Box Link: https://patch.msgid.link/20260313015202.3660072-5-david.e.box@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/gpu/drm/xe/xe_debugfs.c | 2 +- drivers/gpu/drm/xe/xe_hwmon.c | 2 +- drivers/gpu/drm/xe/xe_vsec.c | 7 ++-- drivers/gpu/drm/xe/xe_vsec.h | 4 +-- drivers/platform/x86/intel/pmc/core.c | 4 +-- drivers/platform/x86/intel/pmc/ssram_telemetry.c | 2 +- drivers/platform/x86/intel/pmt/class.c | 8 ++--- drivers/platform/x86/intel/pmt/class.h | 5 +-- drivers/platform/x86/intel/pmt/discovery.c | 4 +-- drivers/platform/x86/intel/pmt/telemetry.c | 13 +++---- drivers/platform/x86/intel/pmt/telemetry.h | 12 +++---- drivers/platform/x86/intel/sdsi.c | 5 +-- drivers/platform/x86/intel/vsec.c | 44 ++++++++++++++---------- drivers/platform/x86/intel/vsec_tpmi.c | 6 ++-- include/linux/intel_vsec.h | 13 +++---- 15 files changed, 71 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c index 844cfafe1ec7..ad2d8f179eb6 100644 --- a/drivers/gpu/drm/xe/xe_debugfs.c +++ b/drivers/gpu/drm/xe/xe_debugfs.c @@ -45,7 +45,7 @@ static void read_residency_counter(struct xe_device *xe, struct xe_mmio *mmio, u64 residency = 0; int ret; - ret = xe_pmt_telem_read(to_pci_dev(xe->drm.dev), + ret = xe_pmt_telem_read(xe->drm.dev, xe_mmio_read32(mmio, PUNIT_TELEMETRY_GUID), &residency, offset, sizeof(residency)); if (ret != sizeof(residency)) { diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index 0fd4d4f1014a..92e423a339f1 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -506,7 +506,7 @@ xe_hwmon_energy_get(struct xe_hwmon *hwmon, int channel, long *energy) if (hwmon->xe->info.platform == XE_BATTLEMAGE) { u64 pmt_val; - ret = xe_pmt_telem_read(to_pci_dev(hwmon->xe->drm.dev), + ret = xe_pmt_telem_read(hwmon->xe->drm.dev, xe_mmio_read32(mmio, PUNIT_TELEMETRY_GUID), &pmt_val, BMG_ENERGY_STATUS_PMT_OFFSET, 
sizeof(pmt_val)); if (ret != sizeof(pmt_val)) { diff --git a/drivers/gpu/drm/xe/xe_vsec.c b/drivers/gpu/drm/xe/xe_vsec.c index 4ebb4dbe1c9b..a9baf0bfe572 100644 --- a/drivers/gpu/drm/xe/xe_vsec.c +++ b/drivers/gpu/drm/xe/xe_vsec.c @@ -140,10 +140,10 @@ static int xe_guid_decode(u32 guid, int *index, u32 *offset) return 0; } -int xe_pmt_telem_read(struct pci_dev *pdev, u32 guid, u64 *data, loff_t user_offset, +int xe_pmt_telem_read(struct device *dev, u32 guid, u64 *data, loff_t user_offset, u32 count) { - struct xe_device *xe = pdev_to_xe_device(pdev); + struct xe_device *xe = kdev_to_xe_device(dev); void __iomem *telem_addr = xe->mmio.regs + BMG_TELEMETRY_OFFSET; u32 mem_region; u32 offset; @@ -198,7 +198,6 @@ void xe_vsec_init(struct xe_device *xe) { struct intel_vsec_platform_info *info; struct device *dev = xe->drm.dev; - struct pci_dev *pdev = to_pci_dev(dev); enum xe_vsec platform; platform = get_platform_info(xe); @@ -221,6 +220,6 @@ void xe_vsec_init(struct xe_device *xe) * Register a VSEC. Cleanup is handled using device managed * resources. 
*/ - intel_vsec_register(pdev, info); + intel_vsec_register(dev, info); } MODULE_IMPORT_NS("INTEL_VSEC"); diff --git a/drivers/gpu/drm/xe/xe_vsec.h b/drivers/gpu/drm/xe/xe_vsec.h index dabfb4e02d70..a25b4e6e681b 100644 --- a/drivers/gpu/drm/xe/xe_vsec.h +++ b/drivers/gpu/drm/xe/xe_vsec.h @@ -6,10 +6,10 @@ #include -struct pci_dev; +struct device; struct xe_device; void xe_vsec_init(struct xe_device *xe); -int xe_pmt_telem_read(struct pci_dev *pdev, u32 guid, u64 *data, loff_t user_offset, u32 count); +int xe_pmt_telem_read(struct device *dev, u32 guid, u64 *data, loff_t user_offset, u32 count); #endif diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 02b303418d18..d91e1ab842d6 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -1315,7 +1315,7 @@ static struct telem_endpoint *pmc_core_register_endpoint(struct pci_dev *pcidev, unsigned int i; for (i = 0; guids[i]; i++) { - ep = pmt_telem_find_and_register_endpoint(pcidev, guids[i], 0); + ep = pmt_telem_find_and_register_endpoint(&pcidev->dev, guids[i], 0); if (!IS_ERR(ep)) return ep; } @@ -1600,7 +1600,7 @@ static int pmc_core_get_telem_info(struct pmc_dev *pmcdev, struct pmc_dev_info * if (!pmc->map->lpm_req_guid) return -ENXIO; - ep = pmt_telem_find_and_register_endpoint(pcidev, pmc->map->lpm_req_guid, 0); + ep = pmt_telem_find_and_register_endpoint(&pcidev->dev, pmc->map->lpm_req_guid, 0); if (IS_ERR(ep)) { dev_dbg(&pmcdev->pdev->dev, "couldn't get telem endpoint %pe", ep); return -EPROBE_DEFER; diff --git a/drivers/platform/x86/intel/pmc/ssram_telemetry.c b/drivers/platform/x86/intel/pmc/ssram_telemetry.c index 03fad9331fc0..6f6e83e70fc5 100644 --- a/drivers/platform/x86/intel/pmc/ssram_telemetry.c +++ b/drivers/platform/x86/intel/pmc/ssram_telemetry.c @@ -60,7 +60,7 @@ pmc_ssram_telemetry_add_pmt(struct pci_dev *pcidev, u64 ssram_base, void __iomem info.base_addr = ssram_base; info.parent = &pcidev->dev; - return 
intel_vsec_register(pcidev, &info); + return intel_vsec_register(&pcidev->dev, &info); } static inline u64 get_base(void __iomem *addr, u32 offset) diff --git a/drivers/platform/x86/intel/pmt/class.c b/drivers/platform/x86/intel/pmt/class.c index be3c8d9e4fff..b4c9964df807 100644 --- a/drivers/platform/x86/intel/pmt/class.c +++ b/drivers/platform/x86/intel/pmt/class.c @@ -60,11 +60,11 @@ pmt_memcpy64_fromio(void *to, const u64 __iomem *from, size_t count) return count; } -int pmt_telem_read_mmio(struct pci_dev *pdev, struct pmt_callbacks *cb, u32 guid, void *buf, +int pmt_telem_read_mmio(struct device *dev, struct pmt_callbacks *cb, u32 guid, void *buf, void __iomem *addr, loff_t off, u32 count) { if (cb && cb->read_telem) - return cb->read_telem(pdev, guid, buf, off, count); + return cb->read_telem(dev, guid, buf, off, count); addr += off; @@ -99,7 +99,7 @@ intel_pmt_read(struct file *filp, struct kobject *kobj, if (count > entry->size - off) count = entry->size - off; - count = pmt_telem_read_mmio(entry->pcidev, entry->cb, entry->header.guid, buf, + count = pmt_telem_read_mmio(entry->ep->dev, entry->cb, entry->header.guid, buf, entry->base, off, count); return count; @@ -208,7 +208,7 @@ static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, struct intel_vsec_device *ivdev, struct resource *disc_res) { - struct pci_dev *pci_dev = ivdev->pcidev; + struct pci_dev *pci_dev = to_pci_dev(ivdev->dev); struct device *dev = &ivdev->auxdev.dev; struct intel_pmt_header *header = &entry->header; u8 bir; diff --git a/drivers/platform/x86/intel/pmt/class.h b/drivers/platform/x86/intel/pmt/class.h index 3c5ad5f52bca..1ae56a5baad2 100644 --- a/drivers/platform/x86/intel/pmt/class.h +++ b/drivers/platform/x86/intel/pmt/class.h @@ -19,11 +19,12 @@ #define GET_BIR(v) ((v) & GENMASK(2, 0)) #define GET_ADDRESS(v) ((v) & GENMASK(31, 3)) +struct device; struct pci_dev; extern struct class intel_pmt_class; struct telem_endpoint { - struct pci_dev *pcidev; + struct device 
*dev; struct telem_header header; struct pmt_callbacks *cb; void __iomem *base; @@ -65,7 +66,7 @@ struct intel_pmt_namespace { struct intel_pmt_entry *entry); }; -int pmt_telem_read_mmio(struct pci_dev *pdev, struct pmt_callbacks *cb, u32 guid, void *buf, +int pmt_telem_read_mmio(struct device *dev, struct pmt_callbacks *cb, u32 guid, void *buf, void __iomem *addr, loff_t off, u32 count); bool intel_pmt_is_early_client_hw(struct device *dev); int intel_pmt_dev_create(struct intel_pmt_entry *entry, diff --git a/drivers/platform/x86/intel/pmt/discovery.c b/drivers/platform/x86/intel/pmt/discovery.c index e500aa327d23..c482368bfaae 100644 --- a/drivers/platform/x86/intel/pmt/discovery.c +++ b/drivers/platform/x86/intel/pmt/discovery.c @@ -542,7 +542,7 @@ static int pmt_features_probe(struct auxiliary_device *auxdev, const struct auxi if (!priv) return -ENOMEM; - priv->parent = &ivdev->pcidev->dev; + priv->parent = ivdev->dev; auxiliary_set_drvdata(auxdev, priv); priv->dev = device_create(&intel_pmt_class, &auxdev->dev, MKDEV(0, 0), priv, @@ -609,7 +609,7 @@ void intel_pmt_get_features(struct intel_pmt_entry *entry) mutex_lock(&feature_list_lock); list_for_each_entry(feature, &pmt_feature_list, list) { - if (feature->priv->parent != &entry->ep->pcidev->dev) + if (feature->priv->parent != entry->ep->dev) continue; pmt_get_features(entry, feature); diff --git a/drivers/platform/x86/intel/pmt/telemetry.c b/drivers/platform/x86/intel/pmt/telemetry.c index a52803bfe124..bdc7c24a3678 100644 --- a/drivers/platform/x86/intel/pmt/telemetry.c +++ b/drivers/platform/x86/intel/pmt/telemetry.c @@ -112,7 +112,7 @@ static int pmt_telem_add_endpoint(struct intel_vsec_device *ivdev, return -ENOMEM; ep = entry->ep; - ep->pcidev = ivdev->pcidev; + ep->dev = ivdev->dev; ep->header.access_type = entry->header.access_type; ep->header.guid = entry->header.guid; ep->header.base_offset = entry->header.base_offset; @@ -204,7 +204,7 @@ int pmt_telem_get_endpoint_info(int devid, struct 
telem_endpoint_info *info) goto unlock; } - info->pdev = entry->ep->pcidev; + info->dev = entry->ep->dev; info->header = entry->ep->header; unlock: @@ -218,9 +218,10 @@ static int pmt_copy_region(struct telemetry_region *region, struct intel_pmt_entry *entry) { + struct pci_dev *pdev = to_pci_dev(entry->ep->dev); struct oobmsm_plat_info *plat_info; - plat_info = intel_vsec_get_mapping(entry->ep->pcidev); + plat_info = intel_vsec_get_mapping(pdev); if (IS_ERR(plat_info)) return PTR_ERR(plat_info); @@ -308,7 +309,7 @@ int pmt_telem_read(struct telem_endpoint *ep, u32 id, u64 *data, u32 count) if (offset + NUM_BYTES_QWORD(count) > size) return -EINVAL; - pmt_telem_read_mmio(ep->pcidev, ep->cb, ep->header.guid, data, ep->base, offset, + pmt_telem_read_mmio(ep->dev, ep->cb, ep->header.guid, data, ep->base, offset, NUM_BYTES_QWORD(count)); return ep->present ? 0 : -EPIPE; @@ -335,7 +336,7 @@ int pmt_telem_read32(struct telem_endpoint *ep, u32 id, u32 *data, u32 count) EXPORT_SYMBOL_NS_GPL(pmt_telem_read32, "INTEL_PMT_TELEMETRY"); struct telem_endpoint * -pmt_telem_find_and_register_endpoint(struct pci_dev *pcidev, u32 guid, u16 pos) +pmt_telem_find_and_register_endpoint(struct device *dev, u32 guid, u16 pos) { int devid = 0; int inst = 0; @@ -348,7 +349,7 @@ pmt_telem_find_and_register_endpoint(struct pci_dev *pcidev, u32 guid, u16 pos) if (err) return ERR_PTR(err); - if (ep_info.header.guid == guid && ep_info.pdev == pcidev) { + if (ep_info.header.guid == guid && ep_info.dev == dev) { if (inst == pos) return pmt_telem_register_endpoint(devid); ++inst; diff --git a/drivers/platform/x86/intel/pmt/telemetry.h b/drivers/platform/x86/intel/pmt/telemetry.h index d45af5512b4e..0f88c5e7d90e 100644 --- a/drivers/platform/x86/intel/pmt/telemetry.h +++ b/drivers/platform/x86/intel/pmt/telemetry.h @@ -6,8 +6,8 @@ #define PMT_TELEM_TELEMETRY 0 #define PMT_TELEM_CRASHLOG 1 +struct device; struct telem_endpoint; -struct pci_dev; struct telem_header { u8 access_type; @@ -17,7 +17,7 @@ 
struct telem_header { }; struct telem_endpoint_info { - struct pci_dev *pdev; + struct device *dev; struct telem_header header; }; @@ -71,8 +71,8 @@ int pmt_telem_get_endpoint_info(int devid, struct telem_endpoint_info *info); /** * pmt_telem_find_and_register_endpoint() - Get a telemetry endpoint from - * pci_dev device, guid and pos - * @pdev: PCI device inside the Intel vsec + * device, guid and pos + * @dev: device inside the Intel vsec * @guid: GUID of the telemetry space * @pos: Instance of the guid * @@ -80,8 +80,8 @@ int pmt_telem_get_endpoint_info(int devid, struct telem_endpoint_info *info); * * endpoint - On success returns pointer to the telemetry endpoint * * -ENXIO - telemetry endpoint not found */ -struct telem_endpoint *pmt_telem_find_and_register_endpoint(struct pci_dev *pcidev, - u32 guid, u16 pos); +struct telem_endpoint * +pmt_telem_find_and_register_endpoint(struct device *dev, u32 guid, u16 pos); /** * pmt_telem_read() - Read qwords from counter sram using sample id diff --git a/drivers/platform/x86/intel/sdsi.c b/drivers/platform/x86/intel/sdsi.c index da75f53d0bcc..d7e37d4ace23 100644 --- a/drivers/platform/x86/intel/sdsi.c +++ b/drivers/platform/x86/intel/sdsi.c @@ -599,13 +599,14 @@ static int sdsi_get_layout(struct sdsi_priv *priv, struct disc_table *table) return 0; } -static int sdsi_map_mbox_registers(struct sdsi_priv *priv, struct pci_dev *parent, +static int sdsi_map_mbox_registers(struct sdsi_priv *priv, struct device *dev, struct disc_table *disc_table, struct resource *disc_res) { u32 access_type = FIELD_GET(DT_ACCESS_TYPE, disc_table->access_info); u32 size = FIELD_GET(DT_SIZE, disc_table->access_info); u32 tbir = FIELD_GET(DT_TBIR, disc_table->offset); u32 offset = DT_OFFSET(disc_table->offset); + struct pci_dev *parent = to_pci_dev(dev); struct resource res = {}; /* Starting location of SDSi MMIO region based on access type */ @@ -681,7 +682,7 @@ static int sdsi_probe(struct auxiliary_device *auxdev, const struct auxiliary_de 
return ret; /* Map the SDSi mailbox registers */ - ret = sdsi_map_mbox_registers(priv, intel_cap_dev->pcidev, &disc_table, disc_res); + ret = sdsi_map_mbox_registers(priv, intel_cap_dev->dev, &disc_table, disc_res); if (ret) return ret; diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 938648b9ef09..a547e4b98245 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -274,7 +274,7 @@ cleanup_aux: } EXPORT_SYMBOL_NS_GPL(intel_vsec_add_aux, "INTEL_VSEC"); -static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *header, +static int intel_vsec_add_dev(struct device *dev, struct intel_vsec_header *header, const struct intel_vsec_platform_info *info, unsigned long cap_id, u64 base_addr) { @@ -288,18 +288,18 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he if (info->parent) parent = info->parent; else - parent = &pdev->dev; + parent = dev; if (!intel_vsec_supported(header->id, info->caps)) return -EINVAL; if (!header->num_entries) { - dev_dbg(&pdev->dev, "Invalid 0 entry count for header id %d\n", header->id); + dev_dbg(dev, "Invalid 0 entry count for header id %d\n", header->id); return -EINVAL; } if (!header->entry_size) { - dev_dbg(&pdev->dev, "Invalid 0 entry size for header id %d\n", header->id); + dev_dbg(dev, "Invalid 0 entry size for header id %d\n", header->id); return -EINVAL; } @@ -331,7 +331,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he release_mem_region(tmp->start, resource_size(tmp)); } - intel_vsec_dev->pcidev = pdev; + intel_vsec_dev->dev = dev; intel_vsec_dev->resource = no_free_ptr(res); intel_vsec_dev->num_resources = header->num_entries; intel_vsec_dev->quirks = info->quirks; @@ -409,13 +409,14 @@ static int get_cap_id(u32 header_id, unsigned long *cap_id) return 0; } -static int intel_vsec_register_device(struct pci_dev *pdev, +static int intel_vsec_register_device(struct device 
*dev, struct intel_vsec_header *header, const struct intel_vsec_platform_info *info, u64 base_addr) { const struct vsec_feature_dependency *consumer_deps; struct vsec_priv *priv; + struct pci_dev *pdev; unsigned long cap_id; int ret; @@ -427,8 +428,12 @@ static int intel_vsec_register_device(struct pci_dev *pdev, * Only track dependencies for devices probed by the VSEC driver. * For others using the exported APIs, add the device directly. */ + if (!dev_is_pci(dev)) + return intel_vsec_add_dev(dev, header, info, cap_id, base_addr); + + pdev = to_pci_dev(dev); if (!pci_match_id(intel_vsec_pci_ids, pdev)) - return intel_vsec_add_dev(pdev, header, info, cap_id, base_addr); + return intel_vsec_add_dev(dev, header, info, cap_id, base_addr); priv = pci_get_drvdata(pdev); if (priv->state[cap_id] == STATE_REGISTERED || @@ -444,7 +449,7 @@ static int intel_vsec_register_device(struct pci_dev *pdev, consumer_deps = get_consumer_dependencies(priv, cap_id); if (!consumer_deps || suppliers_ready(priv, consumer_deps, cap_id)) { - ret = intel_vsec_add_dev(pdev, header, info, cap_id, base_addr); + ret = intel_vsec_add_dev(dev, header, info, cap_id, base_addr); if (ret) priv->state[cap_id] = STATE_SKIP; else @@ -456,7 +461,7 @@ static int intel_vsec_register_device(struct pci_dev *pdev, return -EAGAIN; } -static bool intel_vsec_walk_header(struct pci_dev *pdev, +static bool intel_vsec_walk_header(struct device *dev, const struct intel_vsec_platform_info *info) { struct intel_vsec_header **header = info->headers; @@ -464,7 +469,7 @@ static bool intel_vsec_walk_header(struct pci_dev *pdev, int ret; for ( ; *header; header++) { - ret = intel_vsec_register_device(pdev, *header, info, info->base_addr); + ret = intel_vsec_register_device(dev, *header, info, info->base_addr); if (!ret) have_devices = true; } @@ -512,7 +517,7 @@ static bool intel_vsec_walk_dvsec(struct pci_dev *pdev, pci_read_config_dword(pdev, pos + PCI_DVSEC_HEADER2, &hdr); header.id = PCI_DVSEC_HEADER2_ID(hdr); - ret = 
intel_vsec_register_device(pdev, &header, info, + ret = intel_vsec_register_device(&pdev->dev, &header, info, pci_resource_start(pdev, header.tbir)); if (ret) continue; @@ -558,7 +563,7 @@ static bool intel_vsec_walk_vsec(struct pci_dev *pdev, header.tbir = INTEL_DVSEC_TABLE_BAR(table); header.offset = INTEL_DVSEC_TABLE_OFFSET(table); - ret = intel_vsec_register_device(pdev, &header, info, + ret = intel_vsec_register_device(&pdev->dev, &header, info, pci_resource_start(pdev, header.tbir)); if (ret) continue; @@ -569,13 +574,13 @@ static bool intel_vsec_walk_vsec(struct pci_dev *pdev, return have_devices; } -int intel_vsec_register(struct pci_dev *pdev, +int intel_vsec_register(struct device *dev, const struct intel_vsec_platform_info *info) { - if (!pdev || !info || !info->headers) + if (!dev || !info || !info->headers) return -EINVAL; - if (!intel_vsec_walk_header(pdev, info)) + if (!intel_vsec_walk_header(dev, info)) return -ENODEV; else return 0; @@ -601,7 +606,7 @@ static bool intel_vsec_get_features(struct pci_dev *pdev, found = true; if (info && (info->quirks & VSEC_QUIRK_NO_DVSEC) && - intel_vsec_walk_header(pdev, info)) + intel_vsec_walk_header(&pdev->dev, info)) found = true; return found; @@ -673,7 +678,10 @@ int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info, { struct vsec_priv *priv; - priv = pci_get_drvdata(vsec_dev->pcidev); + if (!dev_is_pci(vsec_dev->dev)) + return -ENODEV; + + priv = pci_get_drvdata(to_pci_dev(vsec_dev->dev)); if (!priv) return -EINVAL; @@ -821,7 +829,7 @@ static pci_ers_result_t intel_vsec_pci_slot_reset(struct pci_dev *pdev) xa_for_each(&auxdev_array, index, intel_vsec_dev) { /* check if pdev doesn't match */ - if (pdev != intel_vsec_dev->pcidev) + if (&pdev->dev != intel_vsec_dev->dev) continue; devm_release_action(&pdev->dev, intel_vsec_remove_aux, &intel_vsec_dev->auxdev); diff --git a/drivers/platform/x86/intel/vsec_tpmi.c b/drivers/platform/x86/intel/vsec_tpmi.c index 2298b6361094..9dddf4e5863e 100644 --- 
a/drivers/platform/x86/intel/vsec_tpmi.c +++ b/drivers/platform/x86/intel/vsec_tpmi.c @@ -530,7 +530,7 @@ static const struct file_operations mem_write_ops = { .release = single_release, }; -#define tpmi_to_dev(info) (&info->vsec_dev->pcidev->dev) +#define tpmi_to_dev(info) ((info)->vsec_dev->dev) static void tpmi_dbgfs_register(struct intel_tpmi_info *tpmi_info) { @@ -642,7 +642,7 @@ static int tpmi_create_device(struct intel_tpmi_info *tpmi_info, tmp->flags = IORESOURCE_MEM; } - feature_vsec_dev->pcidev = vsec_dev->pcidev; + feature_vsec_dev->dev = vsec_dev->dev; feature_vsec_dev->resource = res; feature_vsec_dev->num_resources = pfs->pfs_header.num_entries; feature_vsec_dev->priv_data = &tpmi_info->plat_info; @@ -742,7 +742,7 @@ static int tpmi_fetch_pfs_header(struct intel_tpmi_pm_feature *pfs, u64 start, i static int intel_vsec_tpmi_init(struct auxiliary_device *auxdev) { struct intel_vsec_device *vsec_dev = auxdev_to_ivdev(auxdev); - struct pci_dev *pci_dev = vsec_dev->pcidev; + struct pci_dev *pci_dev = to_pci_dev(vsec_dev->dev); struct intel_tpmi_info *tpmi_info; u64 pfs_start = 0; int ret, i; diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 49a746ec0128..4eecb2a6bac4 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -29,6 +29,7 @@ #define INTEL_DVSEC_TABLE_OFFSET(x) ((x) & GENMASK(31, 3)) #define TABLE_OFFSET_SHIFT 3 +struct device; struct pci_dev; struct resource; @@ -82,14 +83,14 @@ enum intel_vsec_quirks { * struct pmt_callbacks - Callback infrastructure for PMT devices * @read_telem: when specified, called by client driver to access PMT * data (instead of direct copy). 
- * * pdev: PCI device reference for the callback's use + * * dev: device reference for the callback's use * * guid: ID of data to acccss * * data: buffer for the data to be copied * * off: offset into the requested buffer * * count: size of buffer */ struct pmt_callbacks { - int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, loff_t off, u32 count); + int (*read_telem)(struct device *dev, u32 guid, u64 *data, loff_t off, u32 count); }; struct vsec_feature_dependency { @@ -122,7 +123,7 @@ struct intel_vsec_platform_info { /** * struct intel_vsec_device - Auxbus specific device information * @auxdev: auxbus device struct for auxbus access - * @pcidev: pci device associated with the device + * @dev: struct device associated with the device * @resource: any resources shared by the parent * @ida: id reference * @num_resources: number of resources @@ -135,7 +136,7 @@ struct intel_vsec_platform_info { */ struct intel_vsec_device { struct auxiliary_device auxdev; - struct pci_dev *pcidev; + struct device *dev; struct resource *resource; struct ida *ida; int num_resources; @@ -199,13 +200,13 @@ static inline struct intel_vsec_device *auxdev_to_ivdev(struct auxiliary_device } #if IS_ENABLED(CONFIG_INTEL_VSEC) -int intel_vsec_register(struct pci_dev *pdev, +int intel_vsec_register(struct device *dev, const struct intel_vsec_platform_info *info); int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info, struct intel_vsec_device *vsec_dev); struct oobmsm_plat_info *intel_vsec_get_mapping(struct pci_dev *pdev); #else -static inline int intel_vsec_register(struct pci_dev *pdev, +static inline int intel_vsec_register(struct device *dev, const struct intel_vsec_platform_info *info) { return -ENODEV; -- cgit v1.2.3 From 22fa2ebc11a164e1ea529da6c356e3e01aef8ac8 Mon Sep 17 00:00:00 2001 From: "David E. 
Box" Date: Thu, 12 Mar 2026 18:51:45 -0700 Subject: platform/x86/intel/vsec: Plumb ACPI PMT discovery tables through vsec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some platforms expose PMT discovery via ACPI instead of PCI BARs. Add a generic discovery source flag and carry ACPI discovery entries alongside the existing PCI resource path so PMT clients can consume either. Changes: - Add enum intel_vsec_disc_source { _PCI, _ACPI }. - Extend intel_vsec_platform_info and intel_vsec_device with source enum and ACPI discovery table pointer/ - When src==ACPI, skip BAR resource setup and copy the ACPI discovery entries into the aux device. No user-visible behavior change yet; this only wires ACPI data through vsec in preparation for ACPI-enumerated PMT clients. Signed-off-by: David E. Box Link: https://patch.msgid.link/20260313015202.3660072-7-david.e.box@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec.c | 23 +++++++++++++++++++++++ include/linux/intel_vsec.h | 20 +++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 34b2c19ecff0..7d5dbc1c1d05 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -24,7 +24,9 @@ #include #include #include +#include #include +#include #include #define PMT_XA_START 0 @@ -109,6 +111,7 @@ static void intel_vsec_dev_release(struct device *dev) ida_free(intel_vsec_dev->ida, intel_vsec_dev->auxdev.id); + kfree(intel_vsec_dev->acpi_disc); kfree(intel_vsec_dev->resource); kfree(intel_vsec_dev); } @@ -320,6 +323,13 @@ static int intel_vsec_add_dev(struct device *dev, struct intel_vsec_header *head * auxiliary device driver. 
*/ for (i = 0, tmp = res; i < header->num_entries; i++, tmp++) { + /* + * Skip resource mapping check for ACPI-based discovery + * since those tables are read from _DSD, not MMIO. + */ + if (info->src == INTEL_VSEC_DISC_ACPI) + break; + tmp->start = base_addr + header->offset + i * (header->entry_size * sizeof(u32)); tmp->end = tmp->start + (header->entry_size * sizeof(u32)) - 1; tmp->flags = IORESOURCE_MEM; @@ -338,6 +348,19 @@ static int intel_vsec_add_dev(struct device *dev, struct intel_vsec_header *head intel_vsec_dev->base_addr = info->base_addr; intel_vsec_dev->priv_data = info->priv_data; intel_vsec_dev->cap_id = cap_id; + intel_vsec_dev->src = info->src; + + if (info->src == INTEL_VSEC_DISC_ACPI) { + size_t bytes; + + if (check_mul_overflow(intel_vsec_dev->num_resources, + sizeof(*info->acpi_disc), &bytes)) + return -EOVERFLOW; + + intel_vsec_dev->acpi_disc = kmemdup(info->acpi_disc, bytes, GFP_KERNEL); + if (!intel_vsec_dev->acpi_disc) + return -ENOMEM; + } if (header->id == VSEC_ID_SDSI) intel_vsec_dev->ida = &intel_vsec_sdsi_ida; diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 4eecb2a6bac4..1fe5665a9d02 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -33,6 +33,11 @@ struct device; struct pci_dev; struct resource; +enum intel_vsec_disc_source { + INTEL_VSEC_DISC_PCI, /* PCI, default */ + INTEL_VSEC_DISC_ACPI, /* ACPI */ +}; + enum intel_vsec_id { VSEC_ID_TELEMETRY = 2, VSEC_ID_WATCHER = 3, @@ -103,6 +108,10 @@ struct vsec_feature_dependency { * @parent: parent device in the auxbus chain * @headers: list of headers to define the PMT client devices to create * @deps: array of feature dependencies + * @acpi_disc: ACPI discovery tables, each entry is two QWORDs + * in little-endian format as defined by the PMT ACPI spec. + * Valid only when @provider == INTEL_VSEC_DISC_ACPI. 
+ * @src: source of discovery table data * @priv_data: private data, usable by parent devices, currently a callback * @caps: bitmask of PMT capabilities for the given headers * @quirks: bitmask of VSEC device quirks @@ -113,6 +122,8 @@ struct intel_vsec_platform_info { struct device *parent; struct intel_vsec_header **headers; const struct vsec_feature_dependency *deps; + u32 (*acpi_disc)[4]; + enum intel_vsec_disc_source src; void *priv_data; unsigned long caps; unsigned long quirks; @@ -124,7 +135,12 @@ struct intel_vsec_platform_info { * struct intel_vsec_device - Auxbus specific device information * @auxdev: auxbus device struct for auxbus access * @dev: struct device associated with the device - * @resource: any resources shared by the parent + * @resource: PCI discovery resources (BAR windows), one per discovery + * instance. Valid only when @src == INTEL_VSEC_DISC_PCI + * @acpi_disc: ACPI discovery tables, each entry is two QWORDs + * in little-endian format as defined by the PMT ACPI spec. + * Valid only when @src == INTEL_VSEC_DISC_ACPI. + * @src: source of discovery table data * @ida: id reference * @num_resources: number of resources * @id: xarray id @@ -138,6 +154,8 @@ struct intel_vsec_device { struct auxiliary_device auxdev; struct device *dev; struct resource *resource; + u32 (*acpi_disc)[4]; + enum intel_vsec_disc_source src; struct ida *ida; int num_resources; int id; /* xa */ -- cgit v1.2.3 From 4afc71bba8b7d7841681e7647ae02f5079aaf28f Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Wed, 4 Mar 2026 20:52:37 -0500 Subject: module.lds,codetag: force 0 sh_addr for sections Commit 1ba9f8979426 ("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros") added .text and made .data, .bss, and .rodata sections unconditional in the module linker script, but without an explicit address like the other sections in the same file. 
When linking modules with ld.bfd -r, sections defined without an address inherit the location counter, resulting in non-zero sh_addr values in the .ko. Relocatable objects are expected to have sh_addr=0 for these sections and these non-zero addresses confuse elfutils and have been reported to cause segmentation faults in SystemTap [1]. Add the 0 address specifier to all sections in module.lds, including the .codetag.* sections via MOD_SEPARATE_CODETAG_SECTIONS macro. Link: https://sourceware.org/bugzilla/show_bug.cgi?id=33958 Fixes: 1ba9f8979426 ("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros") Signed-off-by: Joe Lawrence Reviewed-by: Petr Pavlu Acked-by: Josh Poimboeuf Signed-off-by: Sami Tolvanen --- include/asm-generic/codetag.lds.h | 2 +- scripts/module.lds.S | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h index a14f4bdafdda..4948e5d4e9d9 100644 --- a/include/asm-generic/codetag.lds.h +++ b/include/asm-generic/codetag.lds.h @@ -18,7 +18,7 @@ IF_MEM_ALLOC_PROFILING(SECTION_WITH_BOUNDARIES(alloc_tags)) #define MOD_SEPARATE_CODETAG_SECTION(_name) \ - .codetag.##_name : { \ + .codetag.##_name 0 : { \ SECTION_WITH_BOUNDARIES(_name) \ } diff --git a/scripts/module.lds.S b/scripts/module.lds.S index 054ef99e8288..e1cab3cee3f7 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -32,30 +32,30 @@ SECTIONS { __jump_table 0 : ALIGN(8) { KEEP(*(__jump_table)) } __ex_table 0 : ALIGN(4) { KEEP(*(__ex_table)) } - __patchable_function_entries : { *(__patchable_function_entries) } + __patchable_function_entries 0 : { *(__patchable_function_entries) } .init.klp_funcs 0 : ALIGN(8) { KEEP(*(.init.klp_funcs)) } .init.klp_objects 0 : ALIGN(8) { KEEP(*(.init.klp_objects)) } #ifdef CONFIG_ARCH_USES_CFI_TRAPS - __kcfi_traps : { KEEP(*(.kcfi_traps)) } + __kcfi_traps 0 : { KEEP(*(.kcfi_traps)) } #endif - .text : { + .text 0 : { *(.text 
.text.[0-9a-zA-Z_]*) } - .bss : { + .bss 0 : { *(.bss .bss.[0-9a-zA-Z_]*) *(.bss..L*) } - .data : { + .data 0 : { *(.data .data.[0-9a-zA-Z_]*) *(.data..L*) } - .rodata : { + .rodata 0 : { *(.rodata .rodata.[0-9a-zA-Z_]*) *(.rodata..L*) } -- cgit v1.2.3 From bb8539e0e60916ef3ed4a92eb2f3cfd8e34061ef Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Mon, 16 Mar 2026 17:28:23 +0800 Subject: ppp: require callers of ppp_dev_name() to hold RCU ppp_dev_name() holds the RCU read lock internally to protect pch->ppp. However, as it returns netdev->name to the caller, the caller should also hold either RCU or RTNL lock to prevent the netdev from being freed. The only two references of the function is in the L2TP driver, both of which already hold RCU. So remove the internal RCU lock and document that callers must hold RCU. Signed-off-by: Qingfang Deng Reviewed-by: Breno Leitao Link: https://patch.msgid.link/20260316092824.479149-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ppp/ppp_generic.c | 3 +-- include/linux/ppp_channel.h | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index a036ddfe327b..cb29a6968c63 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2969,6 +2969,7 @@ int ppp_unit_number(struct ppp_channel *chan) /* * Return the PPP device interface name of a channel. + * Caller must hold RCU read lock. 
*/ char *ppp_dev_name(struct ppp_channel *chan) { @@ -2977,11 +2978,9 @@ char *ppp_dev_name(struct ppp_channel *chan) struct ppp *ppp; if (pch) { - rcu_read_lock(); ppp = rcu_dereference(pch->ppp); if (ppp && ppp->dev) name = ppp->dev->name; - rcu_read_unlock(); } return name; } diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index ca8ad03eeef0..2f63e9a6cc88 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -72,7 +72,9 @@ extern int ppp_channel_index(struct ppp_channel *); /* Get the unit number associated with a channel, or -1 if none */ extern int ppp_unit_number(struct ppp_channel *); -/* Get the device name associated with a channel, or NULL if none */ +/* Get the device name associated with a channel, or NULL if none. + * Caller must hold RCU read lock. + */ extern char *ppp_dev_name(struct ppp_channel *); /* -- cgit v1.2.3 From b520c4eef83dd406591431f936de0908c3ed7fb9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2026 17:11:30 +0100 Subject: block: split bio_alloc_bioset more clearly into a fast and slowpath bio_alloc_bioset tries non-waiting slab allocations first for the bio and bvec array, but does so in a somewhat convoluted way. Restructure the function so that it first open codes these slab allocations, and then falls back to the mempools with the original gfp mask. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni -ck Reviewed-by: Martin K. 
Petersen Link: https://patch.msgid.link/20260316161144.1607877-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 180 +++++++++++++++++++++------------------------------- include/linux/bio.h | 3 +- 2 files changed, 74 insertions(+), 109 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index 6131ccb7284a..5982bf069cef 100644 --- a/block/bio.c +++ b/block/bio.c @@ -176,43 +176,12 @@ static void bvec_free(struct mempool *pool, struct bio_vec *bv, * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. */ -static inline gfp_t bvec_alloc_gfp(gfp_t gfp) +static inline gfp_t try_alloc_gfp(gfp_t gfp) { return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -static struct bio_vec *bvec_alloc(struct mempool *pool, unsigned short *nr_vecs, - gfp_t gfp_mask) -{ - struct biovec_slab *bvs = biovec_slab(*nr_vecs); - - if (WARN_ON_ONCE(!bvs)) - return NULL; - - /* - * Upgrade the nr_vecs request to take full advantage of the allocation. - * We also rely on this in the bvec_free path. - */ - *nr_vecs = bvs->nr_vecs; - - /* - * Try a slab allocation first for all smaller allocations. If that - * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. - * The mempool is sized to handle up to BIO_MAX_VECS entries. 
- */ - if (*nr_vecs < BIO_MAX_VECS) { - struct bio_vec *bvl; - - bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); - if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) - return bvl; - *nr_vecs = BIO_MAX_VECS; - } - - return mempool_alloc(pool, gfp_mask); -} - void bio_uninit(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP @@ -433,13 +402,31 @@ static void bio_alloc_rescue(struct work_struct *work) } } +/* + * submit_bio_noacct() converts recursion to iteration; this means if we're + * running beneath it, any bios we allocate and submit will not be submitted + * (and thus freed) until after we return. + * + * This exposes us to a potential deadlock if we allocate multiple bios from the + * same bio_set while running underneath submit_bio_noacct(). If we were to + * allocate multiple bios (say a stacking block driver that was splitting bios), + * we would deadlock if we exhausted the mempool's reserve. + * + * We solve this, and guarantee forward progress by punting the bios on + * current->bio_list to a per bio_set rescuer workqueue before blocking to wait + * for elements being returned to the mempool. 
+ */ static void punt_bios_to_rescuer(struct bio_set *bs) { struct bio_list punt, nopunt; struct bio *bio; - if (WARN_ON_ONCE(!bs->rescue_workqueue)) + if (!current->bio_list || !bs->rescue_workqueue) return; + if (bio_list_empty(&current->bio_list[0]) && + bio_list_empty(&current->bio_list[1])) + return; + /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on @@ -486,9 +473,7 @@ static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) local_irq_restore(flags); } -static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, - unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, - struct bio_set *bs) +static struct bio *bio_alloc_percpu_cache(struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; @@ -506,11 +491,6 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache->free_list = bio->bi_next; cache->nr--; put_cpu(); - - if (nr_vecs) - bio_init_inline(bio, bdev, nr_vecs, opf); - else - bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } @@ -520,7 +500,7 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of bvecs to pre-allocate * @opf: operation and flags for bio - * @gfp_mask: the GFP_* mask given to the slab allocator + * @gfp: the GFP_* mask given to the slab allocator * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. @@ -550,91 +530,77 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, * Returns: Pointer to new bio on success, NULL on failure. 
*/ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, - blk_opf_t opf, gfp_t gfp_mask, - struct bio_set *bs) + blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { - gfp_t saved_gfp = gfp_mask; - struct bio *bio; + struct bio_vec *bvecs = NULL; + struct bio *bio = NULL; + gfp_t saved_gfp = gfp; void *p; /* should not use nobvec bioset for nr_vecs > 0 */ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; + gfp = try_alloc_gfp(gfp); if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { - opf |= REQ_ALLOC_CACHE; - bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, - gfp_mask, bs); - if (bio) - return bio; /* - * No cached bio available, bio returned below marked with - * REQ_ALLOC_CACHE to participate in per-cpu alloc cache. + * Set REQ_ALLOC_CACHE even if no cached bio is available to + * return the allocated bio to the percpu cache when done. */ - } else + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_percpu_cache(bs); + } else { opf &= ~REQ_ALLOC_CACHE; - - /* - * submit_bio_noacct() converts recursion to iteration; this means if - * we're running beneath it, any bios we allocate and submit will not be - * submitted (and thus freed) until after we return. - * - * This exposes us to a potential deadlock if we allocate multiple bios - * from the same bio_set() while running underneath submit_bio_noacct(). - * If we were to allocate multiple bios (say a stacking block driver - * that was splitting bios), we would deadlock if we exhausted the - * mempool's reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are bios on - * current->bio_list, we first try the allocation without - * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be - * blocking to the rescuer workqueue before we retry with the original - * gfp_flags. 
- */ - if (current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; - - p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); + p = kmem_cache_alloc(bs->bio_slab, gfp); + if (p) + bio = p + bs->front_pad; } - if (unlikely(!p)) - return NULL; - if (!mempool_is_saturated(&bs->bio_pool)) - opf &= ~REQ_ALLOC_CACHE; - bio = p + bs->front_pad; - if (nr_vecs > BIO_INLINE_VECS) { - struct bio_vec *bvl = NULL; + if (bio && nr_vecs > BIO_INLINE_VECS) { + struct biovec_slab *bvs = biovec_slab(nr_vecs); - bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); - if (!bvl && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); + /* + * Upgrade nr_vecs to take full advantage of the allocation. + * We also rely on this in bvec_free(). + */ + nr_vecs = bvs->nr_vecs; + bvecs = kmem_cache_alloc(bvs->slab, gfp); + if (unlikely(!bvecs)) { + kmem_cache_free(bs->bio_slab, p); + bio = NULL; } - if (unlikely(!bvl)) - goto err_free; + } - bio_init(bio, bdev, bvl, nr_vecs, opf); - } else if (nr_vecs) { - bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); - } else { - bio_init(bio, bdev, NULL, 0, opf); + if (unlikely(!bio)) { + /* + * Give up if we are not allow to sleep as non-blocking mempool + * allocations just go back to the slab allocation. + */ + if (!(saved_gfp & __GFP_DIRECT_RECLAIM)) + return NULL; + + punt_bios_to_rescuer(bs); + + /* + * Don't rob the mempools by returning to the per-CPU cache if + * we're tight on memory. 
+ */ + opf &= ~REQ_ALLOC_CACHE; + + p = mempool_alloc(&bs->bio_pool, gfp); + bio = p + bs->front_pad; + if (nr_vecs > BIO_INLINE_VECS) { + nr_vecs = BIO_MAX_VECS; + bvecs = mempool_alloc(&bs->bvec_pool, gfp); + } } + if (nr_vecs && nr_vecs <= BIO_INLINE_VECS) + bio_init_inline(bio, bdev, nr_vecs, opf); + else + bio_init(bio, bdev, bvecs, nr_vecs, opf); bio->bi_pool = bs; return bio; - -err_free: - mempool_free(p, &bs->bio_pool); - return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); diff --git a/include/linux/bio.h b/include/linux/bio.h index 9693a0d6fefe..984844d2870b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -350,8 +350,7 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, - blk_opf_t opf, gfp_t gfp_mask, - struct bio_set *bs); + blk_opf_t opf, gfp_t gfp, struct bio_set *bs); struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask); extern void bio_put(struct bio *); -- cgit v1.2.3 From 10febd397591d93f42adb743c2c664041e7f1bcb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:30 +0000 Subject: sched/topology: Remove sched_domain_shared allocation with sd_data Now that "sd->shared" assignments are using the sched_domain_shared objects allocated with s_data, remove the sd_data based allocations. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-6-kprateek.nayak@amd.com --- include/linux/sched/topology.h | 1 - kernel/sched/topology.c | 19 ------------------- 2 files changed, 20 deletions(-) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a1e1032426dc..51c29581f15e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -172,7 +172,6 @@ typedef int (*sched_domain_flags_f)(void); struct sd_data { struct sched_domain *__percpu *sd; - struct sched_domain_shared *__percpu *sds; struct sched_group *__percpu *sg; struct sched_group_capacity *__percpu *sgc; }; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b19d84f44669..43150591914b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1609,9 +1609,6 @@ static void claim_allocations(int cpu, struct s_data *d) WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) - *per_cpu_ptr(sdd->sds, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; @@ -2390,10 +2387,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sd) return -ENOMEM; - sdd->sds = alloc_percpu(struct sched_domain_shared *); - if (!sdd->sds) - return -ENOMEM; - sdd->sg = alloc_percpu(struct sched_group *); if (!sdd->sg) return -ENOMEM; @@ -2404,7 +2397,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { struct sched_domain *sd; - struct sched_domain_shared *sds; struct sched_group *sg; struct sched_group_capacity *sgc; @@ -2415,13 +2407,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sd, j) = sd; - sds = kzalloc_node(sizeof(struct sched_domain_shared), - GFP_KERNEL, 
cpu_to_node(j)); - if (!sds) - return -ENOMEM; - - *per_cpu_ptr(sdd->sds, j) = sds; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sg) @@ -2463,8 +2448,6 @@ static void __sdt_free(const struct cpumask *cpu_map) kfree(*per_cpu_ptr(sdd->sd, j)); } - if (sdd->sds) - kfree(*per_cpu_ptr(sdd->sds, j)); if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); if (sdd->sgc) @@ -2472,8 +2455,6 @@ static void __sdt_free(const struct cpumask *cpu_map) } free_percpu(sdd->sd); sdd->sd = NULL; - free_percpu(sdd->sds); - sdd->sds = NULL; free_percpu(sdd->sg); sdd->sg = NULL; free_percpu(sdd->sgc); -- cgit v1.2.3 From 8ca12326f592f7554acf2788ecb1c5c954dcf31c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 16 Mar 2026 00:36:22 +0100 Subject: PM: EM: Switch to rcu_dereference_all() in wakeup path em_cpu_energy() is part of the EAS (Fair) task wakeup path. Now that rcu_read_{,un}lock() have been removed from find_energy_efficient_cpu() switch to rcu_dereference_all() and check for rcu_read_lock_any_held() in em_cpu_energy() as well. In EAS (Fair) task wakeup path is a preempt/IRQ disabled region, so rcu_read_{,un}lock() can be removed. 
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/5b1228b7-5949-4a45-9f62-e8ce936de694@arm.com --- include/linux/energy_model.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index e7497f804644..c909a8ba22e8 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -248,7 +248,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, struct em_perf_state *ps; int i; - WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); + lockdep_assert(rcu_read_lock_any_held()); if (!sum_util) return 0; @@ -267,7 +267,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested performance. */ - em_table = rcu_dereference(pd->em_table); + em_table = rcu_dereference_all(pd->em_table); i = em_pd_get_efficient_state(em_table->state, pd, max_util); ps = &em_table->state[i]; -- cgit v1.2.3 From b7560798466a07d9c3fb011698e92c335ab28baf Mon Sep 17 00:00:00 2001 From: Devendra K Verma Date: Wed, 18 Mar 2026 12:34:03 +0530 Subject: dmaengine: dw-edma: Add non-LL mode AMD MDB IP supports Linked List (LL) mode as well as non-LL mode. The current code does not have the mechanisms to enable the DMA transactions using the non-LL mode. The following two cases are added with this patch: - For the AMD (Xilinx) only, when a valid physical base address of the device side DDR is not configured, then the IP can still be used in non-LL mode. For all the channels DMA transactions will be using the non-LL mode only. This, the default non-LL mode, is not applicable for Synopsys IP with the current code addition. 
- If the default mode is LL-mode, for both AMD (Xilinx) and Synosys, and if user wants to use non-LL mode then user can do so via configuring the peripheral_config param of dma_slave_config. Signed-off-by: Devendra K Verma Reviewed-by: Frank Li Link: https://patch.msgid.link/20260318070403.1634706-3-devendra.verma@amd.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 47 ++++++++++++++++++++++++- drivers/dma/dw-edma/dw-edma-core.h | 1 + drivers/dma/dw-edma/dw-edma-pcie.c | 44 +++++++++++++++++------- drivers/dma/dw-edma/dw-hdma-v0-core.c | 64 ++++++++++++++++++++++++++++++++++- drivers/dma/dw-edma/dw-hdma-v0-regs.h | 1 + include/linux/dma/edma.h | 1 + 6 files changed, 143 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index 94bcbd560143..c2feb3adc79f 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -223,6 +223,43 @@ static int dw_edma_device_config(struct dma_chan *dchan, struct dma_slave_config *config) { struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + bool cfg_non_ll; + int non_ll = 0; + + chan->non_ll = false; + if (chan->dw->chip->mf == EDMA_MF_HDMA_NATIVE) { + if (config->peripheral_config && + config->peripheral_size != sizeof(int)) { + dev_err(dchan->device->dev, + "config param peripheral size mismatch\n"); + return -EINVAL; + } + + /* + * When there is no valid LLP base address available then the + * default DMA ops will use the non-LL mode. + * + * Cases where LL mode is enabled and client wants to use the + * non-LL mode then also client can do so via providing the + * peripheral_config param. 
+ */ + cfg_non_ll = chan->dw->chip->cfg_non_ll; + if (config->peripheral_config) { + non_ll = *(int *)config->peripheral_config; + + if (cfg_non_ll && !non_ll) { + dev_err(dchan->device->dev, "invalid configuration\n"); + return -EINVAL; + } + } + + if (cfg_non_ll || non_ll) + chan->non_ll = true; + } else if (config->peripheral_config) { + dev_err(dchan->device->dev, + "peripheral config param applicable only for HDMA\n"); + return -EINVAL; + } memcpy(&chan->config, config, sizeof(*config)); chan->configured = true; @@ -358,6 +395,7 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) struct dw_edma_desc *desc; u64 src_addr, dst_addr; size_t fsz = 0; + u32 bursts_max; u32 cnt = 0; int i; @@ -415,6 +453,13 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) return NULL; } + /* + * For non-LL mode, only a single burst can be handled + * in a single chunk unlike LL mode where multiple bursts + * can be configured in a single chunk. + */ + bursts_max = chan->non_ll ? 1 : chan->ll_max; + desc = dw_edma_alloc_desc(chan); if (unlikely(!desc)) goto err_alloc; @@ -450,7 +495,7 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) if (xfer->type == EDMA_XFER_SCATTER_GATHER && !sg) break; - if (chunk->bursts_alloc == chan->ll_max) { + if (chunk->bursts_alloc == bursts_max) { chunk = dw_edma_alloc_chunk(desc); if (unlikely(!chunk)) goto err_alloc; diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h index 59b24973fa7d..902574b1ba86 100644 --- a/drivers/dma/dw-edma/dw-edma-core.h +++ b/drivers/dma/dw-edma/dw-edma-core.h @@ -86,6 +86,7 @@ struct dw_edma_chan { u8 configured; struct dma_slave_config config; + bool non_ll; }; struct dw_edma_irq { diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c index 0cb5850ca411..0b30ce138503 100644 --- a/drivers/dma/dw-edma/dw-edma-pcie.c +++ b/drivers/dma/dw-edma/dw-edma-pcie.c @@ -295,6 +295,15 @@ static void dw_edma_pcie_get_xilinx_dma_data(struct pci_dev 
*pdev, pdata->devmem_phys_off = off; } +static u64 dw_edma_get_phys_addr(struct pci_dev *pdev, + struct dw_edma_pcie_data *pdata, + enum pci_barno bar) +{ + if (pdev->vendor == PCI_VENDOR_ID_XILINX) + return pdata->devmem_phys_off; + return pci_bus_address(pdev, bar); +} + static int dw_edma_pcie_probe(struct pci_dev *pdev, const struct pci_device_id *pid) { @@ -303,6 +312,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, struct dw_edma_chip *chip; int err, nr_irqs; int i, mask; + bool non_ll = false; struct dw_edma_pcie_data *vsec_data __free(kfree) = kmalloc_obj(*vsec_data); @@ -329,21 +339,24 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, /* * There is no valid address found for the LL memory - * space on the device side. + * space on the device side. In the absence of LL base + * address use the non-LL mode or simple mode supported by + * the HDMA IP. */ if (vsec_data->devmem_phys_off == DW_PCIE_XILINX_MDB_INVALID_ADDR) - return -ENOMEM; + non_ll = true; /* * Configure the channel LL and data blocks if number of * channels enabled in VSEC capability are more than the * channels configured in xilinx_mdb_data. 
*/ - dw_edma_set_chan_region_offset(vsec_data, BAR_2, 0, - DW_PCIE_XILINX_MDB_LL_OFF_GAP, - DW_PCIE_XILINX_MDB_LL_SIZE, - DW_PCIE_XILINX_MDB_DT_OFF_GAP, - DW_PCIE_XILINX_MDB_DT_SIZE); + if (!non_ll) + dw_edma_set_chan_region_offset(vsec_data, BAR_2, 0, + DW_PCIE_XILINX_MDB_LL_OFF_GAP, + DW_PCIE_XILINX_MDB_LL_SIZE, + DW_PCIE_XILINX_MDB_DT_OFF_GAP, + DW_PCIE_XILINX_MDB_DT_SIZE); } /* Mapping PCI BAR regions */ @@ -391,6 +404,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, chip->mf = vsec_data->mf; chip->nr_irqs = nr_irqs; chip->ops = &dw_edma_pcie_plat_ops; + chip->cfg_non_ll = non_ll; chip->ll_wr_cnt = vsec_data->wr_ch_cnt; chip->ll_rd_cnt = vsec_data->rd_ch_cnt; @@ -399,7 +413,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, if (!chip->reg_base) return -ENOMEM; - for (i = 0; i < chip->ll_wr_cnt; i++) { + for (i = 0; i < chip->ll_wr_cnt && !non_ll; i++) { struct dw_edma_region *ll_region = &chip->ll_region_wr[i]; struct dw_edma_region *dt_region = &chip->dt_region_wr[i]; struct dw_edma_block *ll_block = &vsec_data->ll_wr[i]; @@ -410,7 +424,8 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, return -ENOMEM; ll_region->vaddr.io += ll_block->off; - ll_region->paddr = pci_bus_address(pdev, ll_block->bar); + ll_region->paddr = dw_edma_get_phys_addr(pdev, vsec_data, + ll_block->bar); ll_region->paddr += ll_block->off; ll_region->sz = ll_block->sz; @@ -419,12 +434,13 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, return -ENOMEM; dt_region->vaddr.io += dt_block->off; - dt_region->paddr = pci_bus_address(pdev, dt_block->bar); + dt_region->paddr = dw_edma_get_phys_addr(pdev, vsec_data, + dt_block->bar); dt_region->paddr += dt_block->off; dt_region->sz = dt_block->sz; } - for (i = 0; i < chip->ll_rd_cnt; i++) { + for (i = 0; i < chip->ll_rd_cnt && !non_ll; i++) { struct dw_edma_region *ll_region = &chip->ll_region_rd[i]; struct dw_edma_region *dt_region = &chip->dt_region_rd[i]; struct dw_edma_block *ll_block = &vsec_data->ll_rd[i]; @@ -435,7 
+451,8 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, return -ENOMEM; ll_region->vaddr.io += ll_block->off; - ll_region->paddr = pci_bus_address(pdev, ll_block->bar); + ll_region->paddr = dw_edma_get_phys_addr(pdev, vsec_data, + ll_block->bar); ll_region->paddr += ll_block->off; ll_region->sz = ll_block->sz; @@ -444,7 +461,8 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, return -ENOMEM; dt_region->vaddr.io += dt_block->off; - dt_region->paddr = pci_bus_address(pdev, dt_block->bar); + dt_region->paddr = dw_edma_get_phys_addr(pdev, vsec_data, + dt_block->bar); dt_region->paddr += dt_block->off; dt_region->sz = dt_block->sz; } diff --git a/drivers/dma/dw-edma/dw-hdma-v0-core.c b/drivers/dma/dw-edma/dw-hdma-v0-core.c index 8f75934e09d0..632abb8b481c 100644 --- a/drivers/dma/dw-edma/dw-hdma-v0-core.c +++ b/drivers/dma/dw-edma/dw-hdma-v0-core.c @@ -225,7 +225,7 @@ static void dw_hdma_v0_sync_ll_data(struct dw_edma_chunk *chunk) readl(chunk->ll_region.vaddr.io); } -static void dw_hdma_v0_core_start(struct dw_edma_chunk *chunk, bool first) +static void dw_hdma_v0_core_ll_start(struct dw_edma_chunk *chunk, bool first) { struct dw_edma_chan *chan = chunk->chan; struct dw_edma *dw = chan->dw; @@ -263,6 +263,68 @@ static void dw_hdma_v0_core_start(struct dw_edma_chunk *chunk, bool first) SET_CH_32(dw, chan->dir, chan->id, doorbell, HDMA_V0_DOORBELL_START); } +static void dw_hdma_v0_core_non_ll_start(struct dw_edma_chunk *chunk) +{ + struct dw_edma_chan *chan = chunk->chan; + struct dw_edma *dw = chan->dw; + struct dw_edma_burst *child; + u32 val; + + child = list_first_entry_or_null(&chunk->burst->list, + struct dw_edma_burst, list); + if (!child) + return; + + SET_CH_32(dw, chan->dir, chan->id, ch_en, HDMA_V0_CH_EN); + + /* Source address */ + SET_CH_32(dw, chan->dir, chan->id, sar.lsb, + lower_32_bits(child->sar)); + SET_CH_32(dw, chan->dir, chan->id, sar.msb, + upper_32_bits(child->sar)); + + /* Destination address */ + SET_CH_32(dw, chan->dir, chan->id, 
dar.lsb, + lower_32_bits(child->dar)); + SET_CH_32(dw, chan->dir, chan->id, dar.msb, + upper_32_bits(child->dar)); + + /* Transfer size */ + SET_CH_32(dw, chan->dir, chan->id, transfer_size, child->sz); + + /* Interrupt setup */ + val = GET_CH_32(dw, chan->dir, chan->id, int_setup) | + HDMA_V0_STOP_INT_MASK | + HDMA_V0_ABORT_INT_MASK | + HDMA_V0_LOCAL_STOP_INT_EN | + HDMA_V0_LOCAL_ABORT_INT_EN; + + if (!(dw->chip->flags & DW_EDMA_CHIP_LOCAL)) { + val |= HDMA_V0_REMOTE_STOP_INT_EN | + HDMA_V0_REMOTE_ABORT_INT_EN; + } + + SET_CH_32(dw, chan->dir, chan->id, int_setup, val); + + /* Channel control setup */ + val = GET_CH_32(dw, chan->dir, chan->id, control1); + val &= ~HDMA_V0_LINKLIST_EN; + SET_CH_32(dw, chan->dir, chan->id, control1, val); + + SET_CH_32(dw, chan->dir, chan->id, doorbell, + HDMA_V0_DOORBELL_START); +} + +static void dw_hdma_v0_core_start(struct dw_edma_chunk *chunk, bool first) +{ + struct dw_edma_chan *chan = chunk->chan; + + if (chan->non_ll) + dw_hdma_v0_core_non_ll_start(chunk); + else + dw_hdma_v0_core_ll_start(chunk, first); +} + static void dw_hdma_v0_core_ch_config(struct dw_edma_chan *chan) { struct dw_edma *dw = chan->dw; diff --git a/drivers/dma/dw-edma/dw-hdma-v0-regs.h b/drivers/dma/dw-edma/dw-hdma-v0-regs.h index eab5fd7177e5..7759ba9b4850 100644 --- a/drivers/dma/dw-edma/dw-hdma-v0-regs.h +++ b/drivers/dma/dw-edma/dw-hdma-v0-regs.h @@ -12,6 +12,7 @@ #include #define HDMA_V0_MAX_NR_CH 8 +#define HDMA_V0_CH_EN BIT(0) #define HDMA_V0_LOCAL_ABORT_INT_EN BIT(6) #define HDMA_V0_REMOTE_ABORT_INT_EN BIT(5) #define HDMA_V0_LOCAL_STOP_INT_EN BIT(4) diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h index 9da53c75e49b..1fafd5b0e315 100644 --- a/include/linux/dma/edma.h +++ b/include/linux/dma/edma.h @@ -103,6 +103,7 @@ struct dw_edma_chip { enum dw_edma_map_format mf; struct dw_edma *dw; + bool cfg_non_ll; }; /* Export to the platform drivers */ -- cgit v1.2.3 From de9e2b3d88af36411301c049a1b049f3e4fe0757 Mon Sep 17 00:00:00 2001 
From: Cristian Ciocaltea Date: Tue, 3 Mar 2026 21:24:17 +0200 Subject: uapi: Provide DIV_ROUND_CLOSEST() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently DIV_ROUND_CLOSEST() is only available for the kernel via include/linux/math.h. Expose it to userland as well by adding __KERNEL_DIV_ROUND_CLOSEST() as a common definition in uapi. Additionally, ensure it allows building ISO C applications by switching from the 'typeof' GNU extension to the ISO-friendly __typeof__. Reviewed-by: Nícolas F. R. A. Prado Tested-by: Diederik de Haas Acked-by: Andy Shevchenko Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Cristian Ciocaltea Link: https://patch.msgid.link/20260303-rk3588-bgcolor-v8-1-fee377037ad1@collabora.com Signed-off-by: Daniel Stone --- include/linux/math.h | 18 +----------------- include/uapi/linux/const.h | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/math.h b/include/linux/math.h index 6dc1d1d32fbc..1e8fb3efbc8c 100644 --- a/include/linux/math.h +++ b/include/linux/math.h @@ -89,23 +89,7 @@ } \ ) -/* - * Divide positive or negative dividend by positive or negative divisor - * and round to closest integer. Result is undefined for negative - * divisors if the dividend variable type is unsigned and for negative - * dividends if the divisor variable type is unsigned. - */ -#define DIV_ROUND_CLOSEST(x, divisor)( \ -{ \ - typeof(x) __x = x; \ - typeof(divisor) __d = divisor; \ - (((typeof(x))-1) > 0 || \ - ((typeof(divisor))-1) > 0 || \ - (((__x) > 0) == ((__d) > 0))) ? \ - (((__x) + ((__d) / 2)) / (__d)) : \ - (((__x) - ((__d) / 2)) / (__d)); \ -} \ -) +#define DIV_ROUND_CLOSEST __KERNEL_DIV_ROUND_CLOSEST /* * Same as above but for u64 dividends. divisor must be a 32-bit * number. 
diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index b8f629ef135f..565f309b9df8 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -50,4 +50,22 @@ #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +/* + * Divide positive or negative dividend by positive or negative divisor + * and round to closest integer. Result is undefined for negative + * divisors if the dividend variable type is unsigned and for negative + * dividends if the divisor variable type is unsigned. + */ +#define __KERNEL_DIV_ROUND_CLOSEST(x, divisor) \ +({ \ + __typeof__(x) __x = x; \ + __typeof__(divisor) __d = divisor; \ + \ + (((__typeof__(x))-1) > 0 || \ + ((__typeof__(divisor))-1) > 0 || \ + (((__x) > 0) == ((__d) > 0))) ? \ + (((__x) + ((__d) / 2)) / (__d)) : \ + (((__x) - ((__d) / 2)) / (__d)); \ +}) + #endif /* _UAPI_LINUX_CONST_H */ -- cgit v1.2.3 From 4c684596cde44d03dfd9624c86e1de4db0dcf121 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Tue, 3 Mar 2026 21:24:18 +0200 Subject: drm: Add CRTC background color property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some display controllers can be hardware programmed to show non-black colors for pixels that are either not covered by any plane or are exposed through transparent regions of higher planes. This feature can help reduce memory bandwidth usage, e.g. in compositors managing a UI with a solid background color while using smaller planes to render the remaining content. To support this capability, introduce the BACKGROUND_COLOR standard DRM mode property, which can be attached to a CRTC through the drm_crtc_attach_background_color_property() helper function. Additionally, define a 64-bit ARGB format value to be built with the help of a couple of dedicated DRM_ARGB64_PREP*() helpers. Individual color components can be extracted with desired precision using the corresponding DRM_ARGB64_GET*() macros. 
Co-developed-by: Matt Roper Signed-off-by: Matt Roper Reviewed-by: Nícolas F. R. A. Prado Tested-by: Diederik de Haas Signed-off-by: Cristian Ciocaltea Link: https://patch.msgid.link/20260303-rk3588-bgcolor-v8-2-fee377037ad1@collabora.com Signed-off-by: Daniel Stone --- drivers/gpu/drm/drm_atomic.c | 1 + drivers/gpu/drm/drm_atomic_state_helper.c | 1 + drivers/gpu/drm/drm_atomic_uapi.c | 4 ++ drivers/gpu/drm/drm_blend.c | 39 +++++++++++++-- drivers/gpu/drm/drm_mode_config.c | 6 +++ include/drm/drm_blend.h | 4 +- include/drm/drm_crtc.h | 12 +++++ include/drm/drm_mode_config.h | 5 ++ include/uapi/drm/drm_mode.h | 80 +++++++++++++++++++++++++++++++ 9 files changed, 147 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index dd9f27cfe991..6a395e5e3885 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -475,6 +475,7 @@ static void drm_atomic_crtc_print_state(struct drm_printer *p, drm_printf(p, "\tconnector_mask=%x\n", state->connector_mask); drm_printf(p, "\tencoder_mask=%x\n", state->encoder_mask); drm_printf(p, "\tmode: " DRM_MODE_FMT "\n", DRM_MODE_ARG(&state->mode)); + drm_printf(p, "\tbackground_color=%llx\n", state->background_color); if (crtc->funcs->atomic_print_state) crtc->funcs->atomic_print_state(p, state); diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c index bd6faa09f83b..76746ad4a1bb 100644 --- a/drivers/gpu/drm/drm_atomic_state_helper.c +++ b/drivers/gpu/drm/drm_atomic_state_helper.c @@ -75,6 +75,7 @@ __drm_atomic_helper_crtc_state_reset(struct drm_crtc_state *crtc_state, struct drm_crtc *crtc) { crtc_state->crtc = crtc; + crtc_state->background_color = DRM_ARGB64_PREP(0xffff, 0, 0, 0); } EXPORT_SYMBOL(__drm_atomic_helper_crtc_state_reset); diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c index 87de41fb4459..5bd5bf6661df 100644 --- a/drivers/gpu/drm/drm_atomic_uapi.c 
+++ b/drivers/gpu/drm/drm_atomic_uapi.c @@ -454,6 +454,8 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, &replaced); state->color_mgmt_changed |= replaced; return ret; + } else if (property == config->background_color_property) { + state->background_color = val; } else if (property == config->prop_out_fence_ptr) { s32 __user *fence_ptr = u64_to_user_ptr(val); @@ -501,6 +503,8 @@ drm_atomic_crtc_get_property(struct drm_crtc *crtc, *val = (state->ctm) ? state->ctm->base.id : 0; else if (property == config->gamma_lut_property) *val = (state->gamma_lut) ? state->gamma_lut->base.id : 0; + else if (property == config->background_color_property) + *val = state->background_color; else if (property == config->prop_out_fence_ptr) *val = 0; else if (property == crtc->scaling_filter_property) diff --git a/drivers/gpu/drm/drm_blend.c b/drivers/gpu/drm/drm_blend.c index 3b1f5f72885e..1f3af27d2418 100644 --- a/drivers/gpu/drm/drm_blend.c +++ b/drivers/gpu/drm/drm_blend.c @@ -191,10 +191,6 @@ * plane does not expose the "alpha" property, then this is * assumed to be 1.0 * - * Note that all the property extensions described here apply either to the - * plane or the CRTC (e.g. for the background color, which currently is not - * exposed and assumed to be black). - * * SCALING_FILTER: * Indicates scaling filter to be used for plane scaler * @@ -207,6 +203,25 @@ * * Drivers can set up this property for a plane by calling * drm_plane_create_scaling_filter_property + * + * The property extensions described above all apply to the plane. Drivers + * may also expose the following crtc property extension: + * + * BACKGROUND_COLOR: + * Background color is set up with drm_crtc_attach_background_color_property(), + * and expects a 64-bit ARGB value following DRM_FORMAT_ARGB16161616, as + * generated by the DRM_ARGB64_PREP*() helpers. It controls the color of a + * full-screen layer that exists below all planes. 
This color will be used + for pixels not covered by any plane and may also be blended with plane + contents as allowed by a plane's alpha values. + The background color defaults to black, and is assumed to be black for + drivers that do not expose this property. Although background color + isn't a plane, it is assumed that the color provided here undergoes the + CRTC degamma/CSC/gamma transformations applied after the planes blending. + Note that the color value includes an alpha channel, hence non-opaque + background color values are allowed, but since physically transparent + monitors do not (yet) exist, the final alpha value may not reach the + video sink or it may simply ignore it. */ /** @@ -621,3 +636,19 @@ int drm_plane_create_blend_mode_property(struct drm_plane *plane, return 0; } EXPORT_SYMBOL(drm_plane_create_blend_mode_property); + +/** + * drm_crtc_attach_background_color_property - attach background color property + * @crtc: drm crtc + * + * Attaches the background color property to @crtc. The property defaults to + * solid black and will accept 64-bit ARGB values in the format generated by + * DRM_ARGB64_PREP*() helpers. 
+ */ +void drm_crtc_attach_background_color_property(struct drm_crtc *crtc) +{ + drm_object_attach_property(&crtc->base, + crtc->dev->mode_config.background_color_property, + DRM_ARGB64_PREP(0xffff, 0, 0, 0)); +} +EXPORT_SYMBOL(drm_crtc_attach_background_color_property); diff --git a/drivers/gpu/drm/drm_mode_config.c b/drivers/gpu/drm/drm_mode_config.c index 84ae8a23a367..66f7dc37b597 100644 --- a/drivers/gpu/drm/drm_mode_config.c +++ b/drivers/gpu/drm/drm_mode_config.c @@ -380,6 +380,12 @@ static int drm_mode_create_standard_properties(struct drm_device *dev) return -ENOMEM; dev->mode_config.gamma_lut_size_property = prop; + prop = drm_property_create_range(dev, 0, + "BACKGROUND_COLOR", 0, U64_MAX); + if (!prop) + return -ENOMEM; + dev->mode_config.background_color_property = prop; + prop = drm_property_create(dev, DRM_MODE_PROP_IMMUTABLE | DRM_MODE_PROP_BLOB, "IN_FORMATS", 0); diff --git a/include/drm/drm_blend.h b/include/drm/drm_blend.h index 88bdfec3bd88..c7e888767c81 100644 --- a/include/drm/drm_blend.h +++ b/include/drm/drm_blend.h @@ -31,8 +31,9 @@ #define DRM_MODE_BLEND_COVERAGE 1 #define DRM_MODE_BLEND_PIXEL_NONE 2 -struct drm_device; struct drm_atomic_state; +struct drm_crtc; +struct drm_device; struct drm_plane; static inline bool drm_rotation_90_or_270(unsigned int rotation) @@ -58,4 +59,5 @@ int drm_atomic_normalize_zpos(struct drm_device *dev, struct drm_atomic_state *state); int drm_plane_create_blend_mode_property(struct drm_plane *plane, unsigned int supported_modes); +void drm_crtc_attach_background_color_property(struct drm_crtc *crtc); #endif diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index 66278ffeebd6..312fc1e745d2 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -274,6 +274,18 @@ struct drm_crtc_state { */ struct drm_property_blob *gamma_lut; + /** + * @background_color: + * + * RGB value representing the CRTC's background color. 
The background + * color (aka "canvas color") of a CRTC is the color that will be used + * for pixels not covered by a plane, or covered by transparent pixels + * of a plane. The value here should be built using DRM_ARGB64_PREP*() + * helpers, while the individual color components can be extracted with + * desired precision via the DRM_ARGB64_GET*() macros. + */ + u64 background_color; + /** * @target_vblank: * diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h index 5e1dd0cfccde..687c0ee163d2 100644 --- a/include/drm/drm_mode_config.h +++ b/include/drm/drm_mode_config.h @@ -836,6 +836,11 @@ struct drm_mode_config { * gamma LUT as supported by the driver (read-only). */ struct drm_property *gamma_lut_size_property; + /** + * @background_color_property: Optional CRTC property to set the + * background color. + */ + struct drm_property *background_color_property; /** * @suggested_x_property: Optional connector property with a hint for diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index 3693d82b5279..a4bdc4bd11bc 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -27,6 +27,9 @@ #ifndef _DRM_MODE_H #define _DRM_MODE_H +#include <linux/bits.h> +#include <linux/const.h> + #include "drm.h" #if defined(__cplusplus) @@ -1549,6 +1552,83 @@ struct drm_mode_closefb { __u32 pad; }; +/* + * Put 16-bit ARGB values into a standard 64-bit representation that can be + * used for ioctl parameters, inter-driver communication, etc. + * + * If the component values being provided contain less than 16 bits of + * precision, use a conversion ratio to get a better color approximation. + * The ratio is computed as (2^16 - 1) / (2^bpc - 1), where bpc and 16 are + * the input and output precision, respectively. + * Also note bpc must be greater than 0. 
+ */ +#define __DRM_ARGB64_PREP(c, shift) \ + (((__u64)(c) & __GENMASK(15, 0)) << (shift)) + +#define __DRM_ARGB64_PREP_BPC(c, shift, bpc) \ +({ \ + __u16 mask = __GENMASK((bpc) - 1, 0); \ + __u16 conv = __KERNEL_DIV_ROUND_CLOSEST((mask & (c)) * \ + __GENMASK(15, 0), mask);\ + __DRM_ARGB64_PREP(conv, shift); \ +}) + +#define DRM_ARGB64_PREP(alpha, red, green, blue) \ +( \ + __DRM_ARGB64_PREP(alpha, 48) | \ + __DRM_ARGB64_PREP(red, 32) | \ + __DRM_ARGB64_PREP(green, 16) | \ + __DRM_ARGB64_PREP(blue, 0) \ +) + +#define DRM_ARGB64_PREP_BPC(alpha, red, green, blue, bpc) \ +({ \ + __typeof__(bpc) __bpc = bpc; \ + __DRM_ARGB64_PREP_BPC(alpha, 48, __bpc) | \ + __DRM_ARGB64_PREP_BPC(red, 32, __bpc) | \ + __DRM_ARGB64_PREP_BPC(green, 16, __bpc) | \ + __DRM_ARGB64_PREP_BPC(blue, 0, __bpc); \ +}) + +/* + * Extract the specified color component from a standard 64-bit ARGB value. + * + * If the requested precision is less than 16 bits, make use of a conversion + * ratio calculated as (2^bpc - 1) / (2^16 - 1), where bpc and 16 are the + * output and input precision, respectively. + * + * If speed is more important than accuracy, use DRM_ARGB64_GET*_BPCS() + * instead of DRM_ARGB64_GET*_BPC() in order to replace the expensive + * division with a simple bit right-shift operation. 
+ */ +#define __DRM_ARGB64_GET(c, shift) \ + ((__u16)(((__u64)(c) >> (shift)) & __GENMASK(15, 0))) + +#define __DRM_ARGB64_GET_BPC(c, shift, bpc) \ +({ \ + __u16 comp = __DRM_ARGB64_GET(c, shift); \ + __KERNEL_DIV_ROUND_CLOSEST(comp * __GENMASK((bpc) - 1, 0), \ + __GENMASK(15, 0)); \ +}) + +#define __DRM_ARGB64_GET_BPCS(c, shift, bpc) \ + (__DRM_ARGB64_GET(c, shift) >> (16 - (bpc))) + +#define DRM_ARGB64_GETA(c) __DRM_ARGB64_GET(c, 48) +#define DRM_ARGB64_GETR(c) __DRM_ARGB64_GET(c, 32) +#define DRM_ARGB64_GETG(c) __DRM_ARGB64_GET(c, 16) +#define DRM_ARGB64_GETB(c) __DRM_ARGB64_GET(c, 0) + +#define DRM_ARGB64_GETA_BPC(c, bpc) __DRM_ARGB64_GET_BPC(c, 48, bpc) +#define DRM_ARGB64_GETR_BPC(c, bpc) __DRM_ARGB64_GET_BPC(c, 32, bpc) +#define DRM_ARGB64_GETG_BPC(c, bpc) __DRM_ARGB64_GET_BPC(c, 16, bpc) +#define DRM_ARGB64_GETB_BPC(c, bpc) __DRM_ARGB64_GET_BPC(c, 0, bpc) + +#define DRM_ARGB64_GETA_BPCS(c, bpc) __DRM_ARGB64_GET_BPCS(c, 48, bpc) +#define DRM_ARGB64_GETR_BPCS(c, bpc) __DRM_ARGB64_GET_BPCS(c, 32, bpc) +#define DRM_ARGB64_GETG_BPCS(c, bpc) __DRM_ARGB64_GET_BPCS(c, 16, bpc) +#define DRM_ARGB64_GETB_BPCS(c, bpc) __DRM_ARGB64_GET_BPCS(c, 0, bpc) + #if defined(__cplusplus) } #endif -- cgit v1.2.3 From f8a5f6934f30b9ee334256347dd70a7ba0f8be7f Mon Sep 17 00:00:00 2001 From: Aldo Conte Date: Wed, 11 Mar 2026 17:33:20 +0100 Subject: usb: typec: Document priority and mode_selection fields in struct typec_altmode The fields 'priority' and 'mode_selection' in struct typec_altmode are missing from the kernel-doc comment, which results in warnings when building the documentation with 'make htmldocs'. WARNING: ./include/linux/usb/typec_altmode.h:44 struct member 'priority' not described in 'typec_altmode' WARNING: ./include/linux/usb/typec_altmode.h:44 struct member 'mode_selection' not described in 'typec_altmode' Document both fields to keep the kernel-doc comment aligned with the structure definition. 
Signed-off-by: Aldo Conte Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20260311163320.61534-1-aldocontelk@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_altmode.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h index 0513d333b797..b90cc5cfff8d 100644 --- a/include/linux/usb/typec_altmode.h +++ b/include/linux/usb/typec_altmode.h @@ -26,6 +26,9 @@ struct typec_altmode_ops; * @mode: Index of the Mode * @vdo: VDO returned by Discover Modes USB PD command * @active: Tells has the mode been entered or not + * @priority: Priority used by the automatic alternate mode selection process + * @mode_selection: Whether entry to this alternate mode is managed by the + * automatic alternate mode selection process or by the specific driver * @desc: Optional human readable description of the mode * @ops: Operations vector from the driver * @cable_ops: Cable operations vector from the driver. -- cgit v1.2.3 From 9270102a00aabbe4d1bbb6890d514b01f1c42989 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 16 Mar 2026 15:02:59 +0000 Subject: dt-bindings: connector: Add SPR AVS Sink APDO definitions USB Power Delivery 3.2 introduces a new power supply type SPR AVS. Add macro definitions for the USB Power Delivery (PD) Standard Power Range (SPR) Adjustable Voltage Supply (AVS) as a Sink Augmented Power Data Object (APDO) in the device tree bindings. 
Signed-off-by: Badhri Jagan Sridharan Acked-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260316150301.3892223-2-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/connector/usb-connector.yaml | 5 +++-- include/dt-bindings/usb/pd.h | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/connector/usb-connector.yaml b/Documentation/devicetree/bindings/connector/usb-connector.yaml index 901986de3e2b..a00b239960a3 100644 --- a/Documentation/devicetree/bindings/connector/usb-connector.yaml +++ b/Documentation/devicetree/bindings/connector/usb-connector.yaml @@ -364,8 +364,9 @@ $defs: "Universal Serial Bus Power Delivery Specification" chapter 6.4.1.3 Sink Capabilities Message, the order of each entry(PDO) should follow the PD spec chapter 6.4.1. Required for power sink and power dual role. User - can specify the sink PDO array via PDO_FIXED/BATT/VAR/PPS_APDO() defined - in dt-bindings/usb/pd.h. + can specify the sink PDO array via + PDO_FIXED/BATT/VAR/PPS_APDO/SPR_AVS_SNK_APDO() defined in + dt-bindings/usb/pd.h. 
minItems: 1 maxItems: 7 $ref: /schemas/types.yaml#/definitions/uint32-array diff --git a/include/dt-bindings/usb/pd.h b/include/dt-bindings/usb/pd.h index 6cff2339bda3..1e64a1f563f9 100644 --- a/include/dt-bindings/usb/pd.h +++ b/include/dt-bindings/usb/pd.h @@ -60,6 +60,7 @@ PDO_VAR_MAX_VOLT(max_mv) | PDO_VAR_MAX_CURR(max_ma)) #define APDO_TYPE_PPS 0 +#define APDO_TYPE_SPR_AVS 2 #define PDO_APDO_TYPE_SHIFT 28 /* Only valid value currently is 0x0 - PPS */ #define PDO_APDO_TYPE_MASK 0x3 @@ -85,6 +86,23 @@ PDO_PPS_APDO_MIN_VOLT(min_mv) | PDO_PPS_APDO_MAX_VOLT(max_mv) | \ PDO_PPS_APDO_MAX_CURR(max_ma)) +#define PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR_SHIFT 10 /* 10mA units */ +#define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR_SHIFT 0 /* 10mA units */ +#define PDO_SPR_AVS_APDO_MAX_CURR_MASK 0x3ff + +#define PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR(max_cur_9v_to_15v_ma) \ + ((((max_cur_9v_to_15v_ma) / 10) & PDO_SPR_AVS_APDO_MAX_CURR_MASK) << \ + PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR_SHIFT) + +#define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR(max_cur_15v_to_20v_ma) \ + ((((max_cur_15v_to_20v_ma) / 10) & PDO_SPR_AVS_APDO_MAX_CURR_MASK) << \ + PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR_SHIFT) + +#define PDO_SPR_AVS_SNK_APDO(max_cur_9v_to_15v_ma, max_cur_15v_to_20v_ma) \ + (PDO_TYPE(PDO_TYPE_APDO) | PDO_APDO_TYPE(APDO_TYPE_SPR_AVS) | \ + PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR(max_cur_9v_to_15v_ma) | \ + PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR(max_cur_15v_to_20v_ma)) + /* * Based on "Table 6-14 Fixed Supply PDO - Sink" of "USB Power Delivery Specification Revision 3.0, * Version 1.2" -- cgit v1.2.3 From a43dd4f6f91ed1a1d16595cb0c550b283e9b2298 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 16 Mar 2026 15:03:00 +0000 Subject: power: supply: Add PD SPR AVS support to USB type enum Add two new members to the power_supply_usb_type to represent the USB Power Delivery (PD) Standard Power Range (SPR) Adjustable Voltage Supply (AVS) charging types: POWER_SUPPLY_USB_TYPE_PD_SPR_AVS: For devices 
supporting only the PD SPR AVS type. POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS: For devices that support both PD Programmable Power Supply (PPS) and PD SPR AVS. Signed-off-by: Badhri Jagan Sridharan Link: https://patch.msgid.link/20260316150301.3892223-3-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-class-power | 3 ++- drivers/power/supply/power_supply_sysfs.c | 2 ++ include/linux/power_supply.h | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index 4b21d5d23251..32697b926cc8 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -675,7 +675,8 @@ Description: Valid values: "Unknown", "SDP", "DCP", "CDP", "ACA", "C", "PD", - "PD_DRP", "PD_PPS", "BrickID" + "PD_DRP", "PD_PPS", "BrickID", "PD_SPR_AVS", + "PD_PPS_SPR_AVS" **Device Specific Properties** diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index dd3a48d72d2b..f30a7b9ccd5e 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -70,6 +70,8 @@ static const char * const POWER_SUPPLY_USB_TYPE_TEXT[] = { [POWER_SUPPLY_USB_TYPE_PD] = "PD", [POWER_SUPPLY_USB_TYPE_PD_DRP] = "PD_DRP", [POWER_SUPPLY_USB_TYPE_PD_PPS] = "PD_PPS", + [POWER_SUPPLY_USB_TYPE_PD_SPR_AVS] = "PD_SPR_AVS", + [POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS] = "PD_PPS_SPR_AVS", [POWER_SUPPLY_USB_TYPE_APPLE_BRICK_ID] = "BrickID", }; diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 360ffdf272da..7a5e4c3242a0 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -210,6 +210,9 @@ enum power_supply_usb_type { POWER_SUPPLY_USB_TYPE_PD, /* Power Delivery Port */ POWER_SUPPLY_USB_TYPE_PD_DRP, /* PD Dual Role Port */ POWER_SUPPLY_USB_TYPE_PD_PPS, /* PD Programmable Power Supply */ + /* PD 
Standard Power Range Adjustable Voltage Supply */ + POWER_SUPPLY_USB_TYPE_PD_SPR_AVS, + POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS, /* Supports both PD PPS + SPR AVS */ POWER_SUPPLY_USB_TYPE_APPLE_BRICK_ID, /* Apple Charging Method */ }; -- cgit v1.2.3 From d3d959404e6c72e09db8de8893a970edf0ac565d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 16 Mar 2026 15:03:01 +0000 Subject: tcpm: Implement sink support for PD SPR AVS negotiation Add support to enable TCPM to negotiate with USB PD Standard Power Range Adjustable Voltage Supply (SPR AVS) when acting as a power sink. * Added support to the tcpm power supply properties, allowing userspace to enable and control the dynamic limits (voltage and current) specific to the SPR AVS contract. * Implemented tcpm_pd_select_spr_avs_apdo() to select the appropriate APDO and validate the requested voltage/current against both the Source and Sink capabilities. * Implemented tcpm_pd_build_spr_avs_request() to construct the Request Data Object (RDO) for SPR AVS. * Added SNK_NEGOTIATE_SPR_AVS_CAPABILITIES state to the state machine to handle negotiation for SPR AVS. * Updated the SNK_TRANSITION_SINK state to implement the SPR AVS-specific VBUS transition rules, including reducing current draw to PD_I_SNK_STBY_MA for large voltage changes, as required by USB PD spec. 
Log stub captured when enabling AVS: $ echo 3 > /sys/class/power_supply/tcpm-source-psy-1-0025/online $ cat /d/usb/tcpm-1-0025/log [ 358.895775] request to set AVS online [ 358.895792] AMS POWER_NEGOTIATION start [ 358.895806] state change SNK_READY -> AMS_START [rev3 POWER_NEGOTIATION] [ 358.895850] state change AMS_START -> SNK_NEGOTIATE_SPR_AVS_CAPABILITIES [rev3 POWER_NEGOTIATION] [ 358.895866] SPR AVS src_pdo_index:4 snk_pdo_index:2 req_op_curr_ma roundup:2200 req_out_volt_mv roundup:9000 [ 358.895880] Requesting APDO SPR AVS 4: 9000 mV, 2200 mA [ 358.896405] set_auto_vbus_discharge_threshold mode:0 pps_active:n vbus:0 pps_apdo_min_volt:0 ret:0 [ 358.896422] PD TX, header: 0x1a82 [ 358.900158] PD TX complete, status: 0 [ 358.900205] pending state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> HARD_RESET_SEND @ 60 ms [rev3 POWER_NEGOTIATION] [ 358.904832] PD RX, header: 0x1a3 [1] [ 358.904854] state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> SNK_TRANSITION_SINK [rev3 POWER_NEGOTIATION] [ 358.904888] pending state change SNK_TRANSITION_SINK -> HARD_RESET_SEND @ 700 ms [rev3 POWER_NEGOTIATION] [ 359.021530] PD RX, header: 0x3a6 [1] [ 359.021546] Setting voltage/current limit 9000 mV 2200 mA [ 359.023035] set_auto_vbus_discharge_threshold mode:3 pps_active:n vbus:9000 pps_apdo_min_volt:0 ret:0 [ 359.023053] state change SNK_TRANSITION_SINK -> SNK_READY [rev3 POWER_NEGOTIATION] [ 359.023090] AMS POWER_NEGOTIATION finished $ cat /sys/class/power_supply/tcpm-source-psy-1-0025/online 3 Log stub captured when increasing voltage: $ echo 9100000 > /sys/class/power_supply/tcpm-source-psy-1-0025/voltage_now $ cat /d/usb/tcpm-1-0025/log [ 632.116714] AMS POWER_NEGOTIATION start [ 632.116728] state change SNK_READY -> AMS_START [rev3 POWER_NEGOTIATION] [ 632.116779] state change AMS_START -> SNK_NEGOTIATE_SPR_AVS_CAPABILITIES [rev3 POWER_NEGOTIATION] [ 632.116798] SPR AVS src_pdo_index:4 snk_pdo_index:2 req_op_curr_ma roundup:2200 req_out_volt_mv roundup:9100 [ 632.116811] 
Requesting APDO SPR AVS 4: 9100 mV, 2200 mA [ 632.117315] set_auto_vbus_discharge_threshold mode:0 pps_active:n vbus:0 pps_apdo_min_volt:0 ret:0 [ 632.117328] PD TX, header: 0x1c82 [ 632.121007] PD TX complete, status: 0 [ 632.121052] pending state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> HARD_RESET_SEND @ 60 ms [rev3 POWER_NEGOTIATION] [ 632.124572] PD RX, header: 0x5a3 [1] [ 632.124594] state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> SNK_TRANSITION_SINK [rev3 POWER_NEGOTIATION] [ 632.124623] pending state change SNK_TRANSITION_SINK -> HARD_RESET_SEND @ 700 ms [rev3 POWER_NEGOTIATION] [ 632.149256] PD RX, header: 0x7a6 [1] [ 632.149271] Setting voltage/current limit 9100 mV 2200 mA [ 632.150770] set_auto_vbus_discharge_threshold mode:3 pps_active:n vbus:9100 pps_apdo_min_volt:0 ret:0 [ 632.150787] state change SNK_TRANSITION_SINK -> SNK_READY [rev3 POWER_NEGOTIATION] [ 632.150823] AMS POWER_NEGOTIATION finished $ cat /sys/class/power_supply/tcpm-source-psy-1-0025/voltage_now 9100000 Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Amit Sunil Dhamne Acked-by: Heikki Krogerus Link: https://patch.msgid.link/20260316150301.3892223-4-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 611 +++++++++++++++++++++++++++++++++++------- include/linux/usb/pd.h | 32 ++- include/linux/usb/tcpm.h | 2 +- 3 files changed, 537 insertions(+), 108 deletions(-) (limited to 'include') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 63a75b94743d..dfbb94ddc98a 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -62,6 +62,7 @@ S(SNK_WAIT_CAPABILITIES_TIMEOUT), \ S(SNK_NEGOTIATE_CAPABILITIES), \ S(SNK_NEGOTIATE_PPS_CAPABILITIES), \ + S(SNK_NEGOTIATE_SPR_AVS_CAPABILITIES), \ S(SNK_TRANSITION_SINK), \ S(SNK_TRANSITION_SINK_VBUS), \ S(SNK_READY), \ @@ -308,6 +309,51 @@ struct pd_pps_data { bool active; }; +enum spr_avs_status { + SPR_AVS_UNKNOWN, + SPR_AVS_NOT_SUPPORTED, + 
SPR_AVS_SUPPORTED +}; + +static const char * const spr_avs_status_strings[] = { + [SPR_AVS_UNKNOWN] = "Unknown", + [SPR_AVS_SUPPORTED] = "Supported", + [SPR_AVS_NOT_SUPPORTED] = "Not Supported", +}; + +/* + * Standard Power Range Adjustable Voltage Supply (SPR - AVS) data + * @max_current_ma_9v_to_15v: Max current for 9V to 15V range derived from + * source cap & sink cap + * @max_current_ma_15v_to_20v: Max current for 15V to 20V range derived from + * source cap & sink cap + * @req_op_curr_ma: Requested operating current to the port partner acting as source + * @req_out_volt_mv: Requested output voltage to the port partner acting as source + * @max_out_volt_mv: Max SPR voltage supported by the port and the port partner + * @max_current_ma: Max SPR current supported by the port and the port partner + * @port_partner_src_status: SPR AVS status of port partner acting as source + * @port_partner_src_pdo_index: PDO index of SPR AVS cap of the port partner + * acting as source. Valid only when + * port_partner_src_status is SPR_AVS_SUPPORTED. + * @port_snk_status: SPR AVS status of the local port acting as sink. + * @port_snk_pdo_index: PDO index of SPR AVS cap of local port acting as sink + * @active: True when the local port acting as the sink has negotiated SPR AVS + * with the partner acting as source. 
+ */ +struct pd_spr_avs_data { + u32 max_current_ma_9v_to_15v; + u32 max_current_ma_15v_to_20v; + u32 req_op_curr_ma; + u32 req_out_volt_mv; + u32 max_out_volt_mv; + u32 max_current_ma; + enum spr_avs_status port_partner_src_status; + unsigned int port_partner_src_pdo_index; + enum spr_avs_status port_snk_status; + unsigned int port_snk_pdo_index; + bool active; +}; + struct pd_data { struct usb_power_delivery *pd; struct usb_power_delivery_capabilities *source_cap; @@ -376,6 +422,11 @@ struct sink_caps_ext_data { u8 spr_max_pdp; }; +enum aug_req_type { + PD_PPS, + PD_SPR_AVS, +}; + struct tcpm_port { struct device *dev; @@ -538,9 +589,14 @@ struct tcpm_port { /* PPS */ struct pd_pps_data pps_data; - struct completion pps_complete; - bool pps_pending; - int pps_status; + + /* SPR AVS */ + struct pd_spr_avs_data spr_avs_data; + + /* Augmented supply request - PPS; SPR_AVS */ + struct completion aug_supply_req_complete; + bool aug_supply_req_pending; + int aug_supply_req_status; /* Alternate mode data */ struct pd_mode_data mode_data; @@ -3285,6 +3341,7 @@ static void tcpm_pd_data_request(struct tcpm_port *port, switch (type) { case PD_DATA_SOURCE_CAP: + port->spr_avs_data.port_partner_src_status = SPR_AVS_UNKNOWN; for (i = 0; i < cnt; i++) port->source_caps[i] = le32_to_cpu(msg->payload[i]); @@ -3456,12 +3513,12 @@ static void tcpm_pd_data_request(struct tcpm_port *port, } } -static void tcpm_pps_complete(struct tcpm_port *port, int result) +static void tcpm_aug_supply_req_complete(struct tcpm_port *port, int result) { - if (port->pps_pending) { - port->pps_status = result; - port->pps_pending = false; - complete(&port->pps_complete); + if (port->aug_supply_req_pending) { + port->aug_supply_req_status = result; + port->aug_supply_req_pending = false; + complete(&port->aug_supply_req_complete); } } @@ -3559,7 +3616,7 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, /* Revert data back from any requested PPS updates */ port->pps_data.req_out_volt = 
port->supply_voltage; port->pps_data.req_op_curr = port->current_limit; - port->pps_status = (type == PD_CTRL_WAIT ? + port->aug_supply_req_status = (type == PD_CTRL_WAIT ? -EAGAIN : -EOPNOTSUPP); /* Threshold was relaxed before sending Request. Restore it back. */ @@ -3567,6 +3624,20 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, port->pps_data.active, port->supply_voltage); + tcpm_set_state(port, SNK_READY, 0); + break; + case SNK_NEGOTIATE_SPR_AVS_CAPABILITIES: + /* Revert data back from any requested SPR AVS updates */ + port->spr_avs_data.req_out_volt_mv = port->supply_voltage; + port->spr_avs_data.req_op_curr_ma = port->current_limit; + port->aug_supply_req_status = (type == PD_CTRL_WAIT ? + -EAGAIN : -EOPNOTSUPP); + + /* Threshold was relaxed before sending Request. Restore it back. */ + tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_PD, + port->spr_avs_data.active, + port->supply_voltage); + tcpm_set_state(port, SNK_READY, 0); break; case DR_SWAP_SEND: @@ -3621,6 +3692,7 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, switch (port->state) { case SNK_NEGOTIATE_CAPABILITIES: port->pps_data.active = false; + port->spr_avs_data.active = false; tcpm_set_state(port, SNK_TRANSITION_SINK, 0); break; case SNK_NEGOTIATE_PPS_CAPABILITIES: @@ -3633,6 +3705,13 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, power_supply_changed(port->psy); tcpm_set_state(port, SNK_TRANSITION_SINK, 0); break; + case SNK_NEGOTIATE_SPR_AVS_CAPABILITIES: + port->spr_avs_data.active = true; + port->req_supply_voltage = port->spr_avs_data.req_out_volt_mv; + port->req_current_limit = port->spr_avs_data.req_op_curr_ma; + power_supply_changed(port->psy); + tcpm_set_state(port, SNK_TRANSITION_SINK, 0); + break; case SOFT_RESET_SEND: if (port->ams == SOFT_RESET_AMS) tcpm_ams_finish(port); @@ -4130,9 +4209,9 @@ static int tcpm_pd_select_pdo(struct tcpm_port *port, int *sink_pdo, case PDO_TYPE_APDO: if (pdo_apdo_type(pdo) == APDO_TYPE_PPS) { 
port->pps_data.supported = true; - port->usb_type = - POWER_SUPPLY_USB_TYPE_PD_PPS; - power_supply_changed(port->psy); + } else if (pdo_apdo_type(pdo) == APDO_TYPE_SPR_AVS) { + port->spr_avs_data.port_partner_src_status = SPR_AVS_SUPPORTED; + port->spr_avs_data.port_partner_src_pdo_index = i; } continue; default: @@ -4170,6 +4249,10 @@ static int tcpm_pd_select_pdo(struct tcpm_port *port, int *sink_pdo, min_snk_mv = pdo_min_voltage(pdo); break; case PDO_TYPE_APDO: + if (pdo_apdo_type(pdo) == APDO_TYPE_SPR_AVS) { + port->spr_avs_data.port_snk_status = SPR_AVS_SUPPORTED; + port->spr_avs_data.port_snk_pdo_index = j; + } continue; default: tcpm_log(port, "Invalid sink PDO type, ignoring"); @@ -4191,6 +4274,23 @@ static int tcpm_pd_select_pdo(struct tcpm_port *port, int *sink_pdo, } } + if (port->spr_avs_data.port_snk_status == SPR_AVS_UNKNOWN) + port->spr_avs_data.port_snk_status = SPR_AVS_NOT_SUPPORTED; + + if (port->spr_avs_data.port_partner_src_status == SPR_AVS_UNKNOWN) + port->spr_avs_data.port_partner_src_status = SPR_AVS_NOT_SUPPORTED; + + if (port->pps_data.supported && + port->spr_avs_data.port_partner_src_status == SPR_AVS_SUPPORTED) + port->usb_type = POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS; + else if (port->pps_data.supported) + port->usb_type = POWER_SUPPLY_USB_TYPE_PD_PPS; + else if (port->spr_avs_data.port_partner_src_status == SPR_AVS_SUPPORTED) + port->usb_type = POWER_SUPPLY_USB_TYPE_PD_SPR_AVS; + + if (port->usb_type != POWER_SUPPLY_USB_TYPE_PD) + power_supply_changed(port->psy); + return ret; } @@ -4241,6 +4341,88 @@ static unsigned int tcpm_pd_select_pps_apdo(struct tcpm_port *port) return src_pdo; } +static int tcpm_pd_select_spr_avs_apdo(struct tcpm_port *port) +{ + u32 req_out_volt_mv, req_op_curr_ma, src_max_curr_ma = 0, source_cap; + u32 snk_max_curr_ma = 0, src_pdo_index, snk_pdo_index, snk_pdo; + + if (port->spr_avs_data.port_snk_status != SPR_AVS_SUPPORTED || + port->spr_avs_data.port_partner_src_status != + SPR_AVS_SUPPORTED) { + 
tcpm_log(port, "SPR AVS not supported. port:%s partner:%s", + spr_avs_status_strings[port->spr_avs_data.port_snk_status], + spr_avs_status_strings[port->spr_avs_data.port_partner_src_status]); + return -EOPNOTSUPP; + } + + /* Round up to SPR_AVS_VOLT_MV_STEP */ + req_out_volt_mv = port->spr_avs_data.req_out_volt_mv; + if (req_out_volt_mv % SPR_AVS_VOLT_MV_STEP) { + req_out_volt_mv += SPR_AVS_VOLT_MV_STEP - + (req_out_volt_mv % SPR_AVS_VOLT_MV_STEP); + port->spr_avs_data.req_out_volt_mv = req_out_volt_mv; + } + + /* Round up to RDO_SPR_AVS_CURR_MA_STEP */ + req_op_curr_ma = port->spr_avs_data.req_op_curr_ma; + if (req_op_curr_ma % RDO_SPR_AVS_CURR_MA_STEP) { + req_op_curr_ma += RDO_SPR_AVS_CURR_MA_STEP - + (req_op_curr_ma % RDO_SPR_AVS_CURR_MA_STEP); + port->spr_avs_data.req_op_curr_ma = req_op_curr_ma; + } + + src_pdo_index = port->spr_avs_data.port_partner_src_pdo_index; + snk_pdo_index = port->spr_avs_data.port_snk_pdo_index; + source_cap = port->source_caps[src_pdo_index]; + snk_pdo = port->snk_pdo[snk_pdo_index]; + tcpm_log(port, + "SPR AVS src_pdo_index:%d snk_pdo_index:%d req_op_curr_ma roundup:%u req_out_volt_mv roundup:%u", + src_pdo_index, snk_pdo_index, req_op_curr_ma, req_out_volt_mv); + + if (req_out_volt_mv >= SPR_AVS_TIER1_MIN_VOLT_MV && + req_out_volt_mv <= SPR_AVS_TIER1_MAX_VOLT_MV) { + src_max_curr_ma = + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(source_cap); + snk_max_curr_ma = + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(snk_pdo); + } else if (req_out_volt_mv > SPR_AVS_TIER1_MAX_VOLT_MV && + req_out_volt_mv <= SPR_AVS_TIER2_MAX_VOLT_MV) { + src_max_curr_ma = + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(source_cap); + snk_max_curr_ma = + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(snk_pdo); + } else { + tcpm_log(port, "Invalid SPR AVS req_volt:%umV", req_out_volt_mv); + return -EINVAL; + } + + if (req_op_curr_ma > src_max_curr_ma || + req_op_curr_ma > snk_max_curr_ma) { + tcpm_log(port, + "Invalid SPR AVS request. 
req_volt:%umV req_curr:%umA src_max_cur:%umA snk_max_cur:%umA", + req_out_volt_mv, req_op_curr_ma, src_max_curr_ma, + snk_max_curr_ma); + return -EINVAL; + } + + /* Max SPR voltage based on both the port and the partner caps */ + if (pdo_spr_avs_apdo_15v_to_20v_max_current_ma(snk_pdo) && + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(source_cap)) + port->spr_avs_data.max_out_volt_mv = SPR_AVS_TIER2_MAX_VOLT_MV; + else + port->spr_avs_data.max_out_volt_mv = SPR_AVS_TIER1_MAX_VOLT_MV; + + /* + * Max SPR AVS curr based on 9V to 15V. This should be higher than or + * equal to 15V to 20V range. + */ + port->spr_avs_data.max_current_ma = + min(pdo_spr_avs_apdo_9v_to_15v_max_current_ma(source_cap), + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(snk_pdo)); + + return src_pdo_index; +} + static int tcpm_pd_build_request(struct tcpm_port *port, u32 *rdo) { unsigned int mv, ma, mw, flags; @@ -4408,13 +4590,74 @@ static int tcpm_pd_build_pps_request(struct tcpm_port *port, u32 *rdo) return 0; } -static int tcpm_pd_send_pps_request(struct tcpm_port *port) +static int tcpm_pd_build_spr_avs_request(struct tcpm_port *port, u32 *rdo) +{ + u32 out_mv, op_ma, flags, snk_pdo_index, source_cap; + unsigned int src_power_mw, snk_power_mw; + int src_pdo_index; + u32 snk_pdo; + + src_pdo_index = tcpm_pd_select_spr_avs_apdo(port); + if (src_pdo_index < 0) + return src_pdo_index; + snk_pdo_index = port->spr_avs_data.port_snk_pdo_index; + source_cap = port->source_caps[src_pdo_index]; + snk_pdo = port->snk_pdo[snk_pdo_index]; + out_mv = port->spr_avs_data.req_out_volt_mv; + op_ma = port->spr_avs_data.req_op_curr_ma; + + flags = RDO_USB_COMM | RDO_NO_SUSPEND; + + /* + * Set capability mismatch when the maximum power needs in the current + * requested AVS voltage tier range is greater than + * port->operating_snk_mw, however, the maximum power offered by the + * source at the current requested AVS voltage tier is less than + * port->operating_sink_mw. 
+ */ + if (out_mv > SPR_AVS_TIER1_MAX_VOLT_MV) { + src_power_mw = + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(source_cap) * + SPR_AVS_TIER2_MAX_VOLT_MV / 1000; + snk_power_mw = + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(snk_pdo) * + SPR_AVS_TIER2_MAX_VOLT_MV / 1000; + } else { + src_power_mw = + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(source_cap) * + SPR_AVS_TIER1_MAX_VOLT_MV / 1000; + snk_power_mw = + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(snk_pdo) * + SPR_AVS_TIER1_MAX_VOLT_MV / 1000; + } + + if (snk_power_mw >= port->operating_snk_mw && + src_power_mw < port->operating_snk_mw) + flags |= RDO_CAP_MISMATCH; + + *rdo = RDO_AVS(src_pdo_index + 1, out_mv, op_ma, flags); + + tcpm_log(port, "Requesting APDO SPR AVS %d: %u mV, %u mA", + src_pdo_index, out_mv, op_ma); + + return 0; +} + +static int tcpm_pd_send_aug_supply_request(struct tcpm_port *port, + enum aug_req_type type) { struct pd_message msg; int ret; u32 rdo; - ret = tcpm_pd_build_pps_request(port, &rdo); + if (type == PD_PPS) { + ret = tcpm_pd_build_pps_request(port, &rdo); + } else if (type == PD_SPR_AVS) { + ret = tcpm_pd_build_spr_avs_request(port, &rdo); + } else { + tcpm_log(port, "Invalid aug_req_type %d", type); + ret = -EOPNOTSUPP; + } if (ret < 0) return ret; @@ -4637,6 +4880,14 @@ static void tcpm_set_partner_usb_comm_capable(struct tcpm_port *port, bool capab port->tcpc->set_partner_usb_comm_capable(port->tcpc, capable); } +static void tcpm_partner_source_caps_reset(struct tcpm_port *port) +{ + usb_power_delivery_unregister_capabilities(port->partner_source_caps); + port->partner_source_caps = NULL; + port->spr_avs_data.port_partner_src_status = SPR_AVS_UNKNOWN; + port->spr_avs_data.active = false; +} + static void tcpm_reset_port(struct tcpm_port *port) { tcpm_enable_auto_vbus_discharge(port, false); @@ -4676,8 +4927,7 @@ static void tcpm_reset_port(struct tcpm_port *port) usb_power_delivery_unregister_capabilities(port->partner_sink_caps); port->partner_sink_caps = NULL; - 
usb_power_delivery_unregister_capabilities(port->partner_source_caps); - port->partner_source_caps = NULL; + tcpm_partner_source_caps_reset(port); usb_power_delivery_unregister(port->partner_pd); port->partner_pd = NULL; } @@ -5169,7 +5419,7 @@ static void run_state_machine(struct tcpm_port *port) case SNK_UNATTACHED: if (!port->non_pd_role_swap) tcpm_swap_complete(port, -ENOTCONN); - tcpm_pps_complete(port, -ENOTCONN); + tcpm_aug_supply_req_complete(port, -ENOTCONN); tcpm_snk_detach(port); if (port->potential_contaminant) { tcpm_set_state(port, CHECK_CONTAMINANT, 0); @@ -5400,13 +5650,16 @@ static void run_state_machine(struct tcpm_port *port) } break; case SNK_NEGOTIATE_PPS_CAPABILITIES: - ret = tcpm_pd_send_pps_request(port); + case SNK_NEGOTIATE_SPR_AVS_CAPABILITIES: + ret = tcpm_pd_send_aug_supply_request(port, port->state == + SNK_NEGOTIATE_PPS_CAPABILITIES ? + PD_PPS : PD_SPR_AVS); if (ret < 0) { /* Restore back to the original state */ tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_PD, port->pps_data.active, port->supply_voltage); - port->pps_status = ret; + port->aug_supply_req_status = ret; /* * If this was called due to updates to sink * capabilities, and pps is no longer valid, we should @@ -5422,23 +5675,58 @@ static void run_state_machine(struct tcpm_port *port) } break; case SNK_TRANSITION_SINK: - /* From the USB PD spec: - * "The Sink Shall transition to Sink Standby before a positive or - * negative voltage transition of VBUS. During Sink Standby - * the Sink Shall reduce its power draw to pSnkStdby." - * - * This is not applicable to PPS though as the port can continue - * to draw negotiated power without switching to standby. 
- */ - if (port->supply_voltage != port->req_supply_voltage && !port->pps_data.active && - port->current_limit * port->supply_voltage / 1000 > PD_P_SNK_STDBY_MW) { - u32 stdby_ma = PD_P_SNK_STDBY_MW * 1000 / port->supply_voltage; + if (port->spr_avs_data.active) { + if (abs(port->req_supply_voltage - port->supply_voltage) > + SPR_AVS_AVS_SMALL_STEP_V * 1000) { + /* + * The Sink Shall reduce its current draw to + * iSnkStdby within tSnkStdby. The reduction to + * iSnkStdby is not required if the voltage + * increase is less than or equal to + * vAvsSmallStep. + */ + tcpm_log(port, + "SPR AVS Setting iSnkstandby. Req vol: %u mV Curr vol: %u mV", + port->req_supply_voltage, + port->supply_voltage); + tcpm_set_current_limit(port, PD_I_SNK_STBY_MA, + port->supply_voltage); + } + /* + * Although tAvsSrcTransSmall is expected to be used + * for voltage transistions smaller than 1V, using + * tAvsSrcTransLarge to be resilient against chargers + * which strictly cannot honor tAvsSrcTransSmall to + * improve interoperability. + */ + tcpm_set_state(port, hard_reset_state(port), + PD_T_AVS_SRC_TRANS_LARGE); + /* + * From the USB PD spec: + * "The Sink Shall transition to Sink Standby before a + * positive ornegative voltage transition of VBUS. + * During Sink Standby the Sink Shall reduce its power + * draw to pSnkStdby." + * + * This is not applicable to PPS though as the port can + * continue to draw negotiated power without switching + * to standby. 
+ */ + } else if (port->supply_voltage != port->req_supply_voltage && + !port->pps_data.active && + (port->current_limit * port->supply_voltage / 1000 > + PD_P_SNK_STDBY_MW)) { + u32 stdby_ma = PD_P_SNK_STDBY_MW * 1000 / + port->supply_voltage; tcpm_log(port, "Setting standby current %u mV @ %u mA", port->supply_voltage, stdby_ma); - tcpm_set_current_limit(port, stdby_ma, port->supply_voltage); + tcpm_set_current_limit(port, stdby_ma, + port->supply_voltage); + tcpm_set_state(port, hard_reset_state(port), + PD_T_PS_TRANSITION); } - fallthrough; + break; case SNK_TRANSITION_SINK_VBUS: tcpm_set_state(port, hard_reset_state(port), PD_T_PS_TRANSITION); @@ -5458,7 +5746,7 @@ static void run_state_machine(struct tcpm_port *port) tcpm_typec_connect(port); if (port->pd_capable && port->source_caps[0] & PDO_FIXED_DUAL_ROLE) mod_enable_frs_delayed_work(port, 0); - tcpm_pps_complete(port, port->pps_status); + tcpm_aug_supply_req_complete(port, port->aug_supply_req_status); if (port->ams != NONE_AMS) tcpm_ams_finish(port); @@ -5645,8 +5933,7 @@ static void run_state_machine(struct tcpm_port *port) port->message_id = 0; port->rx_msgid = -1; /* remove existing capabilities */ - usb_power_delivery_unregister_capabilities(port->partner_source_caps); - port->partner_source_caps = NULL; + tcpm_partner_source_caps_reset(port); tcpm_pd_send_control(port, PD_CTRL_ACCEPT, TCPC_TX_SOP); tcpm_ams_finish(port); if (port->pwr_role == TYPEC_SOURCE) { @@ -5679,8 +5966,7 @@ static void run_state_machine(struct tcpm_port *port) port->message_id = 0; port->rx_msgid = -1; /* remove existing capabilities */ - usb_power_delivery_unregister_capabilities(port->partner_source_caps); - port->partner_source_caps = NULL; + tcpm_partner_source_caps_reset(port); if (tcpm_pd_send_control(port, PD_CTRL_SOFT_RESET, TCPC_TX_SOP)) tcpm_set_state_cond(port, hard_reset_state(port), 0); else @@ -5817,8 +6103,7 @@ static void run_state_machine(struct tcpm_port *port) break; case PR_SWAP_SNK_SRC_SINK_OFF: /* will be 
source, remove existing capabilities */ - usb_power_delivery_unregister_capabilities(port->partner_source_caps); - port->partner_source_caps = NULL; + tcpm_partner_source_caps_reset(port); /* * Prevent vbus discharge circuit from turning on during PR_SWAP * as this is not a disconnect. @@ -5966,7 +6251,7 @@ static void run_state_machine(struct tcpm_port *port) break; case ERROR_RECOVERY: tcpm_swap_complete(port, -EPROTO); - tcpm_pps_complete(port, -EPROTO); + tcpm_aug_supply_req_complete(port, -EPROTO); tcpm_set_state(port, PORT_RESET, 0); break; case PORT_RESET: @@ -6940,7 +7225,7 @@ static int tcpm_try_role(struct typec_port *p, int role) return ret; } -static int tcpm_pps_set_op_curr(struct tcpm_port *port, u16 req_op_curr) +static int tcpm_aug_set_op_curr(struct tcpm_port *port, u16 req_op_curr_ma) { unsigned int target_mw; int ret; @@ -6948,7 +7233,19 @@ static int tcpm_pps_set_op_curr(struct tcpm_port *port, u16 req_op_curr) mutex_lock(&port->swap_lock); mutex_lock(&port->lock); - if (!port->pps_data.active) { + if (port->pps_data.active) { + req_op_curr_ma = req_op_curr_ma - + (req_op_curr_ma % RDO_PROG_CURR_MA_STEP); + if (req_op_curr_ma > port->pps_data.max_curr) { + ret = -EINVAL; + goto port_unlock; + } + target_mw = (req_op_curr_ma * port->supply_voltage) / 1000; + if (target_mw < port->operating_snk_mw) { + ret = -EINVAL; + goto port_unlock; + } + } else if (!port->spr_avs_data.active) { ret = -EOPNOTSUPP; goto port_unlock; } @@ -6958,38 +7255,31 @@ static int tcpm_pps_set_op_curr(struct tcpm_port *port, u16 req_op_curr) goto port_unlock; } - if (req_op_curr > port->pps_data.max_curr) { - ret = -EINVAL; - goto port_unlock; - } - - target_mw = (req_op_curr * port->supply_voltage) / 1000; - if (target_mw < port->operating_snk_mw) { - ret = -EINVAL; - goto port_unlock; - } + if (port->pps_data.active) + port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES; + else + port->upcoming_state = SNK_NEGOTIATE_SPR_AVS_CAPABILITIES; - port->upcoming_state = 
SNK_NEGOTIATE_PPS_CAPABILITIES; ret = tcpm_ams_start(port, POWER_NEGOTIATION); if (ret == -EAGAIN) { port->upcoming_state = INVALID_STATE; goto port_unlock; } - /* Round down operating current to align with PPS valid steps */ - req_op_curr = req_op_curr - (req_op_curr % RDO_PROG_CURR_MA_STEP); - - reinit_completion(&port->pps_complete); - port->pps_data.req_op_curr = req_op_curr; - port->pps_status = 0; - port->pps_pending = true; + reinit_completion(&port->aug_supply_req_complete); + if (port->pps_data.active) + port->pps_data.req_op_curr = req_op_curr_ma; + else + port->spr_avs_data.req_op_curr_ma = req_op_curr_ma; + port->aug_supply_req_status = 0; + port->aug_supply_req_pending = true; mutex_unlock(&port->lock); - if (!wait_for_completion_timeout(&port->pps_complete, - msecs_to_jiffies(PD_PPS_CTRL_TIMEOUT))) + if (!wait_for_completion_timeout(&port->aug_supply_req_complete, + msecs_to_jiffies(PD_AUG_PSY_CTRL_TIMEOUT))) ret = -ETIMEDOUT; else - ret = port->pps_status; + ret = port->aug_supply_req_status; goto swap_unlock; @@ -7001,7 +7291,7 @@ swap_unlock: return ret; } -static int tcpm_pps_set_out_volt(struct tcpm_port *port, u16 req_out_volt) +static int tcpm_aug_set_out_volt(struct tcpm_port *port, u16 req_out_volt_mv) { unsigned int target_mw; int ret; @@ -7009,7 +7299,16 @@ static int tcpm_pps_set_out_volt(struct tcpm_port *port, u16 req_out_volt) mutex_lock(&port->swap_lock); mutex_lock(&port->lock); - if (!port->pps_data.active) { + if (port->pps_data.active) { + req_out_volt_mv = req_out_volt_mv - (req_out_volt_mv % + RDO_PROG_VOLT_MV_STEP); + /* Round down output voltage to align with PPS valid steps */ + target_mw = (port->current_limit * req_out_volt_mv) / 1000; + if (target_mw < port->operating_snk_mw) { + ret = -EINVAL; + goto port_unlock; + } + } else if (!port->spr_avs_data.active) { ret = -EOPNOTSUPP; goto port_unlock; } @@ -7019,33 +7318,31 @@ static int tcpm_pps_set_out_volt(struct tcpm_port *port, u16 req_out_volt) goto port_unlock; } - 
target_mw = (port->current_limit * req_out_volt) / 1000; - if (target_mw < port->operating_snk_mw) { - ret = -EINVAL; - goto port_unlock; - } + if (port->pps_data.active) + port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES; + else + port->upcoming_state = SNK_NEGOTIATE_SPR_AVS_CAPABILITIES; - port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES; ret = tcpm_ams_start(port, POWER_NEGOTIATION); if (ret == -EAGAIN) { port->upcoming_state = INVALID_STATE; goto port_unlock; } - /* Round down output voltage to align with PPS valid steps */ - req_out_volt = req_out_volt - (req_out_volt % RDO_PROG_VOLT_MV_STEP); - - reinit_completion(&port->pps_complete); - port->pps_data.req_out_volt = req_out_volt; - port->pps_status = 0; - port->pps_pending = true; + reinit_completion(&port->aug_supply_req_complete); + if (port->pps_data.active) + port->pps_data.req_out_volt = req_out_volt_mv; + else + port->spr_avs_data.req_out_volt_mv = req_out_volt_mv; + port->aug_supply_req_status = 0; + port->aug_supply_req_pending = true; mutex_unlock(&port->lock); - if (!wait_for_completion_timeout(&port->pps_complete, - msecs_to_jiffies(PD_PPS_CTRL_TIMEOUT))) + if (!wait_for_completion_timeout(&port->aug_supply_req_complete, + msecs_to_jiffies(PD_AUG_PSY_CTRL_TIMEOUT))) ret = -ETIMEDOUT; else - ret = port->pps_status; + ret = port->aug_supply_req_status; goto swap_unlock; @@ -7088,9 +7385,9 @@ static int tcpm_pps_activate(struct tcpm_port *port, bool activate) goto port_unlock; } - reinit_completion(&port->pps_complete); - port->pps_status = 0; - port->pps_pending = true; + reinit_completion(&port->aug_supply_req_complete); + port->aug_supply_req_status = 0; + port->aug_supply_req_pending = true; /* Trigger PPS request or move back to standard PDO contract */ if (activate) { @@ -7099,11 +7396,75 @@ static int tcpm_pps_activate(struct tcpm_port *port, bool activate) } mutex_unlock(&port->lock); - if (!wait_for_completion_timeout(&port->pps_complete, - msecs_to_jiffies(PD_PPS_CTRL_TIMEOUT))) + 
if (!wait_for_completion_timeout(&port->aug_supply_req_complete, + msecs_to_jiffies(PD_AUG_PSY_CTRL_TIMEOUT))) + ret = -ETIMEDOUT; + else + ret = port->aug_supply_req_status; + + goto swap_unlock; + +port_unlock: + mutex_unlock(&port->lock); +swap_unlock: + mutex_unlock(&port->swap_lock); + + return ret; +} + +static int tcpm_spr_avs_activate(struct tcpm_port *port, bool activate) +{ + int ret = 0; + + mutex_lock(&port->swap_lock); + mutex_lock(&port->lock); + + if (port->spr_avs_data.port_snk_status == SPR_AVS_NOT_SUPPORTED || + port->spr_avs_data.port_partner_src_status == SPR_AVS_NOT_SUPPORTED) { + tcpm_log(port, "SPR_AVS not supported"); + ret = -EOPNOTSUPP; + goto port_unlock; + } + + /* Trying to deactivate SPR AVS when already deactivated so just bail */ + if (!port->spr_avs_data.active && !activate) + goto port_unlock; + + if (port->state != SNK_READY) { + tcpm_log(port, + "SPR_AVS cannot be activated. Port not in SNK_READY"); + ret = -EAGAIN; + goto port_unlock; + } + + if (activate) + port->upcoming_state = SNK_NEGOTIATE_SPR_AVS_CAPABILITIES; + else + port->upcoming_state = SNK_NEGOTIATE_CAPABILITIES; + ret = tcpm_ams_start(port, POWER_NEGOTIATION); + if (ret == -EAGAIN) { + tcpm_log(port, "SPR_AVS cannot be %s. AMS start failed", + activate ? 
"activated" : "deactivated"); + port->upcoming_state = INVALID_STATE; + goto port_unlock; + } + + reinit_completion(&port->aug_supply_req_complete); + port->aug_supply_req_status = 0; + port->aug_supply_req_pending = true; + + /* Trigger AVS request or move back to standard PDO contract */ + if (activate) { + port->spr_avs_data.req_out_volt_mv = port->supply_voltage; + port->spr_avs_data.req_op_curr_ma = port->current_limit; + } + mutex_unlock(&port->lock); + + if (!wait_for_completion_timeout(&port->aug_supply_req_complete, + msecs_to_jiffies(PD_AUG_PSY_CTRL_TIMEOUT))) ret = -ETIMEDOUT; else - ret = port->pps_status; + ret = port->aug_supply_req_status; goto swap_unlock; @@ -7259,16 +7620,26 @@ static int tcpm_pd_set(struct typec_port *p, struct usb_power_delivery *pd) break; case SNK_NEGOTIATE_CAPABILITIES: case SNK_NEGOTIATE_PPS_CAPABILITIES: + case SNK_NEGOTIATE_SPR_AVS_CAPABILITIES: case SNK_READY: case SNK_TRANSITION_SINK: case SNK_TRANSITION_SINK_VBUS: - if (port->pps_data.active) + if (port->pps_data.active) { port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES; - else if (port->pd_capable) + } else if (port->pd_capable) { port->upcoming_state = SNK_NEGOTIATE_CAPABILITIES; - else + if (port->spr_avs_data.active) { + /* + * De-activate AVS and fallback to PD to + * re-evaluate whether AVS is supported in the + * current sink cap set. 
+ */ + port->spr_avs_data.active = false; + port->spr_avs_data.port_snk_status = SPR_AVS_UNKNOWN; + } + } else { break; - + } port->update_sink_caps = true; ret = tcpm_ams_start(port, POWER_NEGOTIATION); @@ -7778,7 +8149,8 @@ static void tcpm_fw_get_pd_revision(struct tcpm_port *port, struct fwnode_handle enum tcpm_psy_online_states { TCPM_PSY_OFFLINE = 0, TCPM_PSY_FIXED_ONLINE, - TCPM_PSY_PROG_ONLINE, + TCPM_PSY_PPS_ONLINE, + TCPM_PSY_SPR_AVS_ONLINE, }; static enum power_supply_property tcpm_psy_props[] = { @@ -7796,7 +8168,9 @@ static int tcpm_psy_get_online(struct tcpm_port *port, { if (port->vbus_charge) { if (port->pps_data.active) - val->intval = TCPM_PSY_PROG_ONLINE; + val->intval = TCPM_PSY_PPS_ONLINE; + else if (port->spr_avs_data.active) + val->intval = TCPM_PSY_SPR_AVS_ONLINE; else val->intval = TCPM_PSY_FIXED_ONLINE; } else { @@ -7811,6 +8185,8 @@ static int tcpm_psy_get_voltage_min(struct tcpm_port *port, { if (port->pps_data.active) val->intval = port->pps_data.min_volt * 1000; + else if (port->spr_avs_data.active) + val->intval = SPR_AVS_TIER1_MIN_VOLT_MV * 1000; else val->intval = port->supply_voltage * 1000; @@ -7822,6 +8198,8 @@ static int tcpm_psy_get_voltage_max(struct tcpm_port *port, { if (port->pps_data.active) val->intval = port->pps_data.max_volt * 1000; + else if (port->spr_avs_data.active) + val->intval = port->spr_avs_data.max_out_volt_mv * 1000; else val->intval = port->supply_voltage * 1000; @@ -7841,6 +8219,8 @@ static int tcpm_psy_get_current_max(struct tcpm_port *port, { if (port->pps_data.active) val->intval = port->pps_data.max_curr * 1000; + else if (port->spr_avs_data.active) + val->intval = port->spr_avs_data.max_current_ma * 1000; else val->intval = port->current_limit * 1000; @@ -7916,17 +8296,41 @@ static int tcpm_psy_get_prop(struct power_supply *psy, return ret; } +static int tcpm_disable_pps_avs(struct tcpm_port *port) +{ + int ret = 0; + + if (port->pps_data.active) + ret = tcpm_pps_activate(port, false); + else if 
(port->spr_avs_data.active) + ret = tcpm_spr_avs_activate(port, false); + + return ret; +} + static int tcpm_psy_set_online(struct tcpm_port *port, const union power_supply_propval *val) { - int ret; + int ret = 0; switch (val->intval) { case TCPM_PSY_FIXED_ONLINE: - ret = tcpm_pps_activate(port, false); + ret = tcpm_disable_pps_avs(port); + break; + case TCPM_PSY_PPS_ONLINE: + if (port->spr_avs_data.active) + ret = tcpm_spr_avs_activate(port, false); + if (!ret) + ret = tcpm_pps_activate(port, true); break; - case TCPM_PSY_PROG_ONLINE: - ret = tcpm_pps_activate(port, true); + case TCPM_PSY_SPR_AVS_ONLINE: + tcpm_log(port, "request to set AVS online"); + if (port->spr_avs_data.active) + return 0; + ret = tcpm_disable_pps_avs(port); + if (ret) + break; + ret = tcpm_spr_avs_activate(port, true); break; default: ret = -EINVAL; @@ -7955,13 +8359,10 @@ static int tcpm_psy_set_prop(struct power_supply *psy, ret = tcpm_psy_set_online(port, val); break; case POWER_SUPPLY_PROP_VOLTAGE_NOW: - ret = tcpm_pps_set_out_volt(port, val->intval / 1000); + ret = tcpm_aug_set_out_volt(port, val->intval / 1000); break; case POWER_SUPPLY_PROP_CURRENT_NOW: - if (val->intval > port->pps_data.max_curr * 1000) - ret = -EINVAL; - else - ret = tcpm_pps_set_op_curr(port, val->intval / 1000); + ret = tcpm_aug_set_op_curr(port, val->intval / 1000); break; default: ret = -EINVAL; @@ -8006,7 +8407,9 @@ static int devm_tcpm_psy_register(struct tcpm_port *port) port->psy_desc.type = POWER_SUPPLY_TYPE_USB; port->psy_desc.usb_types = BIT(POWER_SUPPLY_USB_TYPE_C) | BIT(POWER_SUPPLY_USB_TYPE_PD) | - BIT(POWER_SUPPLY_USB_TYPE_PD_PPS); + BIT(POWER_SUPPLY_USB_TYPE_PD_PPS) | + BIT(POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS) | + BIT(POWER_SUPPLY_USB_TYPE_PD_SPR_AVS); port->psy_desc.properties = tcpm_psy_props; port->psy_desc.num_properties = ARRAY_SIZE(tcpm_psy_props); port->psy_desc.get_property = tcpm_psy_get_prop; @@ -8101,7 +8504,7 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev 
*tcpc) init_completion(&port->tx_complete); init_completion(&port->swap_complete); - init_completion(&port->pps_complete); + init_completion(&port->aug_supply_req_complete); tcpm_debugfs_init(port); err = tcpm_fw_get_caps(port, tcpc->fwnode); diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 5a98983195cb..337a5485af7c 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -398,9 +398,30 @@ enum pd_apdo_type { #define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR GENMASK(9, 0) /* 10mA unit */ /* SPR AVS has two different current ranges 9V - 15V, 15V - 20V */ -#define SPR_AVS_TIER1_MIN_VOLT_MV 9000 -#define SPR_AVS_TIER1_MAX_VOLT_MV 15000 -#define SPR_AVS_TIER2_MAX_VOLT_MV 20000 +#define SPR_AVS_TIER1_MIN_VOLT_MV 9000 +#define SPR_AVS_TIER1_MAX_VOLT_MV 15000 +#define SPR_AVS_TIER2_MAX_VOLT_MV 20000 + +#define SPR_AVS_AVS_SMALL_STEP_V 1 +/* vAvsStep - 100mv */ +#define SPR_AVS_VOLT_MV_STEP 100 +/* SPR AVS RDO Operating Current is in 50mA step */ +#define RDO_SPR_AVS_CURR_MA_STEP 50 +/* SPR AVS RDO Output voltage is in 25mV step */ +#define RDO_SPR_AVS_OUT_VOLT_MV_STEP 25 + +#define RDO_SPR_AVS_VOLT GENMASK(20, 9) +#define RDO_SPR_AVS_CURR GENMASK(6, 0) + +#define RDO_SPR_AVS_OUT_VOLT(mv) \ + FIELD_PREP(RDO_SPR_AVS_VOLT, ((mv) / RDO_SPR_AVS_OUT_VOLT_MV_STEP)) + +#define RDO_SPR_AVS_OP_CURR(ma) \ + FIELD_PREP(RDO_SPR_AVS_CURR, ((ma) / RDO_SPR_AVS_CURR_MA_STEP)) + +#define RDO_AVS(idx, out_mv, op_ma, flags) \ + (RDO_OBJ(idx) | (flags) | \ + RDO_SPR_AVS_OUT_VOLT(out_mv) | RDO_SPR_AVS_OP_CURR(op_ma)) static inline enum pd_pdo_type pdo_type(u32 pdo) { @@ -660,6 +681,11 @@ static inline unsigned int rdo_max_power(u32 rdo) #define PD_P_SNK_STDBY_MW 2500 /* 2500 mW */ +#define PD_I_SNK_STBY_MA 500 /* 500 mA */ + +#define PD_T_AVS_SRC_TRANS_SMALL 50 /* 50 ms */ +#define PD_T_AVS_SRC_TRANS_LARGE 700 /* 700 ms */ + #if IS_ENABLED(CONFIG_TYPEC) struct usb_power_delivery; diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index 
b22e659f81ba..93079450bba0 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -31,7 +31,7 @@ enum typec_cc_polarity { /* Time to wait for TCPC to complete transmit */ #define PD_T_TCPC_TX_TIMEOUT 100 /* in ms */ #define PD_ROLE_SWAP_TIMEOUT (MSEC_PER_SEC * 10) -#define PD_PPS_CTRL_TIMEOUT (MSEC_PER_SEC * 10) +#define PD_AUG_PSY_CTRL_TIMEOUT (MSEC_PER_SEC * 10) enum tcpm_transmit_status { TCPC_TX_SUCCESS = 0, -- cgit v1.2.3 From f287826fd7e406caa56692ecc39742bdb312b2e8 Mon Sep 17 00:00:00 2001 From: Yuanshen Cao Date: Thu, 5 Mar 2026 03:34:07 +0000 Subject: dt-bindings: power: Add Support for Allwinner A733 PCK600 Power Domain Controller The A733 PCK600, similar to A523 PCK600, is likely a customized version of ARM PCK-600 power controller. They share the same BSP drivers in the package provided by Radxa, with the only difference being the lack of resets. Therefore, document A733 compatible and make resets required only for the other models, as well as prepare the PD definitions for future device trees. 
Reviewed-by: Krzysztof Kozlowski Signed-off-by: Yuanshen Cao Signed-off-by: Ulf Hansson --- .../bindings/power/allwinner,sun20i-d1-ppu.yaml | 17 ++++++++++++++++- .../dt-bindings/power/allwinner,sun60i-a733-pck-600.h | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 include/dt-bindings/power/allwinner,sun60i-a733-pck-600.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/power/allwinner,sun20i-d1-ppu.yaml b/Documentation/devicetree/bindings/power/allwinner,sun20i-d1-ppu.yaml index a28e75a9cb6a..b97361ce2a00 100644 --- a/Documentation/devicetree/bindings/power/allwinner,sun20i-d1-ppu.yaml +++ b/Documentation/devicetree/bindings/power/allwinner,sun20i-d1-ppu.yaml @@ -20,6 +20,7 @@ properties: - allwinner,sun20i-d1-ppu - allwinner,sun55i-a523-pck-600 - allwinner,sun55i-a523-ppu + - allwinner,sun60i-a733-pck-600 reg: maxItems: 1 @@ -38,9 +39,23 @@ required: - compatible - reg - clocks - - resets - '#power-domain-cells' +allOf: + - if: + properties: + compatible: + contains: + enum: + - allwinner,sun8i-v853-ppu + - allwinner,sun20i-d1-ppu + - allwinner,sun55i-a523-pck-600 + - allwinner,sun55i-a523-ppu + + then: + required: + - resets + additionalProperties: false examples: diff --git a/include/dt-bindings/power/allwinner,sun60i-a733-pck-600.h b/include/dt-bindings/power/allwinner,sun60i-a733-pck-600.h new file mode 100644 index 000000000000..cf476a005b55 --- /dev/null +++ b/include/dt-bindings/power/allwinner,sun60i-a733-pck-600.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef _DT_BINDINGS_POWER_SUN60I_A733_PCK600_H_ +#define _DT_BINDINGS_POWER_SUN60I_A733_PCK600_H_ + +#define PD_VI 0 +#define PD_DE_SYS 1 +#define PD_VE_DEC 2 +#define PD_VE_ENC 3 +#define PD_NPU 4 +#define PD_GPU_TOP 5 +#define PD_GPU_CORE 6 +#define PD_PCIE 7 +#define PD_USB2 8 +#define PD_VO 9 +#define PD_VO1 10 + +#endif /* _DT_BINDINGS_POWER_SUN60I_A733_PCK600_H_ */ -- cgit v1.2.3 From 
16c1e8385b3bb65d412d7a60107f8894587c63fa Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Mar 2026 15:25:44 -0400 Subject: cpufreq: optimize policy_is_shared() The switch to cpumask_nth() over cpumask_weight(), as it may return earlier - as soon as the function counts the required number of CPUs. Signed-off-by: Yury Norov Acked-by: Viresh Kumar Reviewed-by: Zhongqiu Han Link: https://patch.msgid.link/20260314192544.605914-1-ynorov@nvidia.com Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..8ca2bcb3d7ae 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -232,7 +232,7 @@ static inline bool policy_is_inactive(struct cpufreq_policy *policy) static inline bool policy_is_shared(struct cpufreq_policy *policy) { - return cpumask_weight(policy->cpus) > 1; + return cpumask_nth(1, policy->cpus) < nr_cpumask_bits; } #ifdef CONFIG_CPU_FREQ -- cgit v1.2.3 From deffe1edba626d474fef38007c03646ca5876a0e Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 13 Mar 2026 14:48:02 +0100 Subject: module: Fix freeing of charp module parameters when CONFIG_SYSFS=n When setting a charp module parameter, the param_set_charp() function allocates memory to store a copy of the input value. Later, when the module is potentially unloaded, the destroy_params() function is called to free this allocated memory. However, destroy_params() is available only when CONFIG_SYSFS=y, otherwise only a dummy variant is present. In the unlikely case that the kernel is configured with CONFIG_MODULES=y and CONFIG_SYSFS=n, this results in a memory leak of charp values when a module is unloaded. Fix this issue by making destroy_params() always available when CONFIG_MODULES=y. Rename the function to module_destroy_params() to clarify that it is intended for use by the module loader. 
Fixes: e180a6b7759a ("param: fix charp parameters set via sysfs") Signed-off-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/moduleparam.h | 11 +++-------- kernel/module/main.c | 4 ++-- kernel/params.c | 27 ++++++++++++++++++--------- 3 files changed, 23 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 7d22d4c4ea2e..8667f72503d9 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -426,14 +426,9 @@ extern char *parse_args(const char *name, void *arg, parse_unknown_fn unknown); /* Called by module remove. */ -#ifdef CONFIG_SYSFS -extern void destroy_params(const struct kernel_param *params, unsigned num); -#else -static inline void destroy_params(const struct kernel_param *params, - unsigned num) -{ -} -#endif /* !CONFIG_SYSFS */ +#ifdef CONFIG_MODULES +void module_destroy_params(const struct kernel_param *params, unsigned int num); +#endif /* All the helper functions */ /* The macros to do compile-time type checking stolen from Jakub diff --git a/kernel/module/main.c b/kernel/module/main.c index c3ce106c70af..ef2e2130972f 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1408,7 +1408,7 @@ static void free_module(struct module *mod) module_unload_free(mod); /* Free any allocated parameters. 
*/ - destroy_params(mod->kp, mod->num_kp); + module_destroy_params(mod->kp, mod->num_kp); if (is_livepatch_module(mod)) free_module_elf(mod); @@ -3519,7 +3519,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod_sysfs_teardown(mod); coming_cleanup: mod->state = MODULE_STATE_GOING; - destroy_params(mod->kp, mod->num_kp); + module_destroy_params(mod->kp, mod->num_kp); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); diff --git a/kernel/params.c b/kernel/params.c index 7188a12dbe86..c6a354d54213 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -745,15 +745,6 @@ void module_param_sysfs_remove(struct module *mod) } #endif -void destroy_params(const struct kernel_param *params, unsigned num) -{ - unsigned int i; - - for (i = 0; i < num; i++) - if (params[i].ops->free) - params[i].ops->free(params[i].arg); -} - struct module_kobject * __init_or_module lookup_or_create_module_kobject(const char *name) { @@ -985,3 +976,21 @@ static int __init param_sysfs_builtin_init(void) late_initcall(param_sysfs_builtin_init); #endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MODULES + +/* + * module_destroy_params - free all parameters for one module + * @params: module parameters (array) + * @num: number of module parameters + */ +void module_destroy_params(const struct kernel_param *params, unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (params[i].ops->free) + params[i].ops->free(params[i].arg); +} + +#endif /* CONFIG_MODULES */ -- cgit v1.2.3 From 65f535501e2a3378629b8650eca553920de5e5a2 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 13 Mar 2026 14:48:03 +0100 Subject: module: Clean up parse_args() arguments * Use the preferred `unsigned int` over plain `unsigned` for the `num` parameter. * Synchronize the parameter names in moduleparam.h with the ones used by the implementation in params.c. 
Signed-off-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/moduleparam.h | 8 ++++---- kernel/params.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 8667f72503d9..604bc6e9f3a1 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -417,12 +417,12 @@ extern bool parameqn(const char *name1, const char *name2, size_t n); typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg); /* Called on module insert or kernel boot */ -extern char *parse_args(const char *name, +extern char *parse_args(const char *doing, char *args, const struct kernel_param *params, - unsigned num, - s16 level_min, - s16 level_max, + unsigned int num, + s16 min_level, + s16 max_level, void *arg, parse_unknown_fn unknown); /* Called by module remove. */ diff --git a/kernel/params.c b/kernel/params.c index c6a354d54213..74d620bc2521 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -161,7 +161,7 @@ static int parse_one(char *param, char *parse_args(const char *doing, char *args, const struct kernel_param *params, - unsigned num, + unsigned int num, s16 min_level, s16 max_level, void *arg, parse_unknown_fn unknown) -- cgit v1.2.3 From 44a063c00fb13cf1f2e8a53a2ab10b232a44954b Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 13 Mar 2026 14:48:04 +0100 Subject: module: Remove extern keyword from param prototypes The external function declarations do not need the "extern" keyword. Remove it to align with the Linux kernel coding style and to silence the associated checkpatch warnings. 
Signed-off-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/moduleparam.h | 89 ++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 604bc6e9f3a1..075f28585074 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -317,8 +317,8 @@ struct kparam_array name, &__param_ops_##name, arg, perm, -1, 0) #ifdef CONFIG_SYSFS -extern void kernel_param_lock(struct module *mod); -extern void kernel_param_unlock(struct module *mod); +void kernel_param_lock(struct module *mod); +void kernel_param_unlock(struct module *mod); #else static inline void kernel_param_lock(struct module *mod) { @@ -398,7 +398,7 @@ static inline void kernel_param_unlock(struct module *mod) * Returns: true if the two parameter names are equal. * Dashes (-) are considered equal to underscores (_). */ -extern bool parameq(const char *name1, const char *name2); +bool parameq(const char *name1, const char *name2); /** * parameqn - checks if two parameter names match @@ -412,18 +412,18 @@ extern bool parameq(const char *name1, const char *name2); * are equal. * Dashes (-) are considered equal to underscores (_). */ -extern bool parameqn(const char *name1, const char *name2, size_t n); +bool parameqn(const char *name1, const char *name2, size_t n); typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg); /* Called on module insert or kernel boot */ -extern char *parse_args(const char *doing, - char *args, - const struct kernel_param *params, - unsigned int num, - s16 min_level, - s16 max_level, - void *arg, parse_unknown_fn unknown); +char *parse_args(const char *doing, + char *args, + const struct kernel_param *params, + unsigned int num, + s16 min_level, + s16 max_level, + void *arg, parse_unknown_fn unknown); /* Called by module remove. 
*/ #ifdef CONFIG_MODULES @@ -437,78 +437,77 @@ void module_destroy_params(const struct kernel_param *params, unsigned int num); static inline type __always_unused *__check_##name(void) { return(p); } extern const struct kernel_param_ops param_ops_byte; -extern int param_set_byte(const char *val, const struct kernel_param *kp); -extern int param_get_byte(char *buffer, const struct kernel_param *kp); +int param_set_byte(const char *val, const struct kernel_param *kp); +int param_get_byte(char *buffer, const struct kernel_param *kp); #define param_check_byte(name, p) __param_check(name, p, unsigned char) extern const struct kernel_param_ops param_ops_short; -extern int param_set_short(const char *val, const struct kernel_param *kp); -extern int param_get_short(char *buffer, const struct kernel_param *kp); +int param_set_short(const char *val, const struct kernel_param *kp); +int param_get_short(char *buffer, const struct kernel_param *kp); #define param_check_short(name, p) __param_check(name, p, short) extern const struct kernel_param_ops param_ops_ushort; -extern int param_set_ushort(const char *val, const struct kernel_param *kp); -extern int param_get_ushort(char *buffer, const struct kernel_param *kp); +int param_set_ushort(const char *val, const struct kernel_param *kp); +int param_get_ushort(char *buffer, const struct kernel_param *kp); #define param_check_ushort(name, p) __param_check(name, p, unsigned short) extern const struct kernel_param_ops param_ops_int; -extern int param_set_int(const char *val, const struct kernel_param *kp); -extern int param_get_int(char *buffer, const struct kernel_param *kp); +int param_set_int(const char *val, const struct kernel_param *kp); +int param_get_int(char *buffer, const struct kernel_param *kp); #define param_check_int(name, p) __param_check(name, p, int) extern const struct kernel_param_ops param_ops_uint; -extern int param_set_uint(const char *val, const struct kernel_param *kp); -extern int param_get_uint(char 
*buffer, const struct kernel_param *kp); +int param_set_uint(const char *val, const struct kernel_param *kp); +int param_get_uint(char *buffer, const struct kernel_param *kp); int param_set_uint_minmax(const char *val, const struct kernel_param *kp, unsigned int min, unsigned int max); #define param_check_uint(name, p) __param_check(name, p, unsigned int) extern const struct kernel_param_ops param_ops_long; -extern int param_set_long(const char *val, const struct kernel_param *kp); -extern int param_get_long(char *buffer, const struct kernel_param *kp); +int param_set_long(const char *val, const struct kernel_param *kp); +int param_get_long(char *buffer, const struct kernel_param *kp); #define param_check_long(name, p) __param_check(name, p, long) extern const struct kernel_param_ops param_ops_ulong; -extern int param_set_ulong(const char *val, const struct kernel_param *kp); -extern int param_get_ulong(char *buffer, const struct kernel_param *kp); +int param_set_ulong(const char *val, const struct kernel_param *kp); +int param_get_ulong(char *buffer, const struct kernel_param *kp); #define param_check_ulong(name, p) __param_check(name, p, unsigned long) extern const struct kernel_param_ops param_ops_ullong; -extern int param_set_ullong(const char *val, const struct kernel_param *kp); -extern int param_get_ullong(char *buffer, const struct kernel_param *kp); +int param_set_ullong(const char *val, const struct kernel_param *kp); +int param_get_ullong(char *buffer, const struct kernel_param *kp); #define param_check_ullong(name, p) __param_check(name, p, unsigned long long) extern const struct kernel_param_ops param_ops_hexint; -extern int param_set_hexint(const char *val, const struct kernel_param *kp); -extern int param_get_hexint(char *buffer, const struct kernel_param *kp); +int param_set_hexint(const char *val, const struct kernel_param *kp); +int param_get_hexint(char *buffer, const struct kernel_param *kp); #define param_check_hexint(name, p) 
param_check_uint(name, p) extern const struct kernel_param_ops param_ops_charp; -extern int param_set_charp(const char *val, const struct kernel_param *kp); -extern int param_get_charp(char *buffer, const struct kernel_param *kp); -extern void param_free_charp(void *arg); +int param_set_charp(const char *val, const struct kernel_param *kp); +int param_get_charp(char *buffer, const struct kernel_param *kp); +void param_free_charp(void *arg); #define param_check_charp(name, p) __param_check(name, p, char *) /* We used to allow int as well as bool. We're taking that away! */ extern const struct kernel_param_ops param_ops_bool; -extern int param_set_bool(const char *val, const struct kernel_param *kp); -extern int param_get_bool(char *buffer, const struct kernel_param *kp); +int param_set_bool(const char *val, const struct kernel_param *kp); +int param_get_bool(char *buffer, const struct kernel_param *kp); #define param_check_bool(name, p) __param_check(name, p, bool) extern const struct kernel_param_ops param_ops_bool_enable_only; -extern int param_set_bool_enable_only(const char *val, - const struct kernel_param *kp); +int param_set_bool_enable_only(const char *val, const struct kernel_param *kp); /* getter is the same as for the regular bool */ #define param_check_bool_enable_only param_check_bool extern const struct kernel_param_ops param_ops_invbool; -extern int param_set_invbool(const char *val, const struct kernel_param *kp); -extern int param_get_invbool(char *buffer, const struct kernel_param *kp); +int param_set_invbool(const char *val, const struct kernel_param *kp); +int param_get_invbool(char *buffer, const struct kernel_param *kp); #define param_check_invbool(name, p) __param_check(name, p, bool) /* An int, which can only be set like a bool (though it shows as an int). 
*/ extern const struct kernel_param_ops param_ops_bint; -extern int param_set_bint(const char *val, const struct kernel_param *kp); +int param_set_bint(const char *val, const struct kernel_param *kp); #define param_get_bint param_get_int #define param_check_bint param_check_int @@ -615,19 +614,19 @@ enum hwparam_type { extern const struct kernel_param_ops param_array_ops; extern const struct kernel_param_ops param_ops_string; -extern int param_set_copystring(const char *val, const struct kernel_param *); -extern int param_get_string(char *buffer, const struct kernel_param *kp); +int param_set_copystring(const char *val, const struct kernel_param *kp); +int param_get_string(char *buffer, const struct kernel_param *kp); /* for exporting parameters in /sys/module/.../parameters */ struct module; #if defined(CONFIG_SYSFS) && defined(CONFIG_MODULES) -extern int module_param_sysfs_setup(struct module *mod, - const struct kernel_param *kparam, - unsigned int num_params); +int module_param_sysfs_setup(struct module *mod, + const struct kernel_param *kparam, + unsigned int num_params); -extern void module_param_sysfs_remove(struct module *mod); +void module_param_sysfs_remove(struct module *mod); #else static inline int module_param_sysfs_setup(struct module *mod, const struct kernel_param *kparam, -- cgit v1.2.3 From 306c36a76da2d6d2b5e91db925d41a9a8d77dbfd Mon Sep 17 00:00:00 2001 From: Josh Law Date: Wed, 18 Mar 2026 15:59:12 +0000 Subject: bootconfig: constify xbc_calc_checksum() data parameter xbc_calc_checksum() only reads the data buffer, so mark the parameter as const void * and the internal pointer as const unsigned char *. 
Link: https://lore.kernel.org/all/20260318155919.78168-7-objecting@objecting.org/ Signed-off-by: Josh Law Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 25df9260d206..23a96c5edcf3 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -36,9 +36,9 @@ bool __init cmdline_has_extra_options(void); * The checksum will be used with the BOOTCONFIG_MAGIC and the size for * embedding the bootconfig in the initrd image. */ -static inline __init uint32_t xbc_calc_checksum(void *data, uint32_t size) +static inline __init uint32_t xbc_calc_checksum(const void *data, uint32_t size) { - unsigned char *p = data; + const unsigned char *p = data; uint32_t ret = 0; while (size--) -- cgit v1.2.3 From 6eb255d019b810614c5cbd99b9ef281b7b9361e3 Mon Sep 17 00:00:00 2001 From: Josh Law Date: Wed, 18 Mar 2026 15:59:19 +0000 Subject: lib/bootconfig: change xbc_node_index() return type to uint16_t lib/bootconfig.c:136:21: warning: conversion from 'long int' to 'int' may change value [-Wconversion] lib/bootconfig.c:308:33: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:467:37: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:469:40: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:472:54: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:476:45: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] xbc_node_index() returns the position of a node in the xbc_nodes array, which has at most XBC_NODE_MAX (8192) entries, well within uint16_t range. 
Every caller stores the result in a uint16_t field (node->parent, node->child, node->next, or the keys[] array in compose_key_after), so the int return type causes narrowing warnings at all six call sites. Change the return type to uint16_t and add an explicit cast on the pointer subtraction to match the storage width and eliminate the warnings. Link: https://lore.kernel.org/all/20260318155919.78168-14-objecting@objecting.org/ Signed-off-by: Josh Law Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 2 +- lib/bootconfig.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 23a96c5edcf3..692a5acc2ffc 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -66,7 +66,7 @@ struct xbc_node { /* Node tree access raw APIs */ struct xbc_node * __init xbc_root_node(void); -int __init xbc_node_index(struct xbc_node *node); +uint16_t __init xbc_node_index(struct xbc_node *node); struct xbc_node * __init xbc_node_get_parent(struct xbc_node *node); struct xbc_node * __init xbc_node_get_child(struct xbc_node *node); struct xbc_node * __init xbc_node_get_next(struct xbc_node *node); diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 343aa9629464..01966ab9a8b5 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -131,9 +131,9 @@ struct xbc_node * __init xbc_root_node(void) * * Return the index number of @node in XBC node list. */ -int __init xbc_node_index(struct xbc_node *node) +uint16_t __init xbc_node_index(struct xbc_node *node) { - return node - &xbc_nodes[0]; + return (uint16_t)(node - &xbc_nodes[0]); } /** -- cgit v1.2.3 From 9c6b4009da5991db6bf02a47a578885a04a5e3f6 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 16 Mar 2026 11:04:03 +0100 Subject: net: mdio-gpio: remove linux/mdio-gpio.h The three defines from the linux/mdio-gpio.h header are only used in the mdio-gpio module. 
There's no reason to have them in a public header. Move them into the driver and remove mdio-gpio.h. Signed-off-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260316-gpio-mdio-hdr-cleanup-v1-1-2df696f74728@oss.qualcomm.com Signed-off-by: Jakub Kicinski --- drivers/net/mdio/mdio-gpio.c | 5 ++++- include/linux/mdio-gpio.h | 9 --------- 2 files changed, 4 insertions(+), 10 deletions(-) delete mode 100644 include/linux/mdio-gpio.h (limited to 'include') diff --git a/drivers/net/mdio/mdio-gpio.c b/drivers/net/mdio/mdio-gpio.c index 1cfd538b5105..c99310889896 100644 --- a/drivers/net/mdio/mdio-gpio.c +++ b/drivers/net/mdio/mdio-gpio.c @@ -20,13 +20,16 @@ #include #include #include -#include #include #include #include #include #include +#define MDIO_GPIO_MDC 0 +#define MDIO_GPIO_MDIO 1 +#define MDIO_GPIO_MDO 2 + struct mdio_gpio_info { struct mdiobb_ctrl ctrl; struct gpio_desc *mdc, *mdio, *mdo; diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h deleted file mode 100644 index cea443a672cb..000000000000 --- a/include/linux/mdio-gpio.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_MDIO_GPIO_H -#define __LINUX_MDIO_GPIO_H - -#define MDIO_GPIO_MDC 0 -#define MDIO_GPIO_MDIO 1 -#define MDIO_GPIO_MDO 2 - -#endif -- cgit v1.2.3 From 356d4fbcf3defaff0f98d2b6b54f3b26f0ff189d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 16 Mar 2026 11:04:04 +0100 Subject: net: mdio-gpio: remove linux/platform_data/mdio-gpio.h Nobody defines struct mdio_gpio_platform_data. Remove platform data support from mdio-gpio and drop the header. 
Signed-off-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260316-gpio-mdio-hdr-cleanup-v1-2-2df696f74728@oss.qualcomm.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - drivers/net/mdio/mdio-gpio.c | 7 ------- include/linux/platform_data/mdio-gpio.h | 14 -------------- 3 files changed, 22 deletions(-) delete mode 100644 include/linux/platform_data/mdio-gpio.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 5d477fd592db..7d65f9435950 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9547,7 +9547,6 @@ F: include/linux/phy_fixed.h F: include/linux/phy_link_topology.h F: include/linux/phylib_stubs.h F: include/linux/platform_data/mdio-bcm-unimac.h -F: include/linux/platform_data/mdio-gpio.h F: include/net/phy/ F: include/trace/events/mdio.h F: include/uapi/linux/mdio.h diff --git a/drivers/net/mdio/mdio-gpio.c b/drivers/net/mdio/mdio-gpio.c index c99310889896..958d1c6608ab 100644 --- a/drivers/net/mdio/mdio-gpio.c +++ b/drivers/net/mdio/mdio-gpio.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -113,7 +112,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, struct mdio_gpio_info *bitbang, int bus_id) { - struct mdio_gpio_platform_data *pdata = dev_get_platdata(dev); struct mii_bus *new_bus; bitbang->ctrl.ops = &mdio_gpio_ops; @@ -130,11 +128,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, else strscpy(new_bus->id, "gpio", sizeof(new_bus->id)); - if (pdata) { - new_bus->phy_mask = pdata->phy_mask; - new_bus->phy_ignore_ta_mask = pdata->phy_ignore_ta_mask; - } - if (device_is_compatible(dev, "microchip,mdio-smi0")) { bitbang->ctrl.op_c22_read = 0; bitbang->ctrl.op_c22_write = 0; diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h deleted file mode 100644 index 13874fa6e767..000000000000 --- a/include/linux/platform_data/mdio-gpio.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * MDIO-GPIO bus platform 
data structure - */ - -#ifndef __LINUX_MDIO_GPIO_PDATA_H -#define __LINUX_MDIO_GPIO_PDATA_H - -struct mdio_gpio_platform_data { - u32 phy_mask; - u32 phy_ignore_ta_mask; -}; - -#endif /* __LINUX_MDIO_GPIO_PDATA_H */ -- cgit v1.2.3 From 20a107bca2a16c8aa20d62631d4fc45828607664 Mon Sep 17 00:00:00 2001 From: Kathiravan Thirumoorthy Date: Wed, 18 Mar 2026 14:09:43 +0530 Subject: dt-bindings: clock: add Qualcomm IPQ5210 GCC Add binding for the Qualcomm IPQ5210 Global Clock Controller. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Kathiravan Thirumoorthy Link: https://lore.kernel.org/r/20260318-ipq5210_boot_to_shell-v2-1-a87e27c37070@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- .../bindings/clock/qcom,ipq5210-gcc.yaml | 62 ++++++++++ include/dt-bindings/clock/qcom,ipq5210-gcc.h | 126 ++++++++++++++++++++ include/dt-bindings/reset/qcom,ipq5210-gcc.h | 127 +++++++++++++++++++++ 3 files changed, 315 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/qcom,ipq5210-gcc.yaml create mode 100644 include/dt-bindings/clock/qcom,ipq5210-gcc.h create mode 100644 include/dt-bindings/reset/qcom,ipq5210-gcc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/qcom,ipq5210-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,ipq5210-gcc.yaml new file mode 100644 index 000000000000..f1cc3fc19085 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/qcom,ipq5210-gcc.yaml @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/qcom,ipq5210-gcc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm Global Clock & Reset Controller on IPQ5210 + +maintainers: + - Bjorn Andersson + - Kathiravan Thirumoorthy + +description: | + Qualcomm global clock control module provides the clocks, resets and power + domains on IPQ5210 + + See also: + include/dt-bindings/clock/qcom,ipq5210-gcc.h + include/dt-bindings/reset/qcom,ipq5210-gcc.h + 
+properties: + compatible: + const: qcom,ipq5210-gcc + + clocks: + items: + - description: Board XO source + - description: Sleep clock source + - description: PCIE30 PHY0 pipe clock source + - description: PCIE30 PHY1 pipe clock source + - description: USB3 PHY pipe clock source + - description: NSS common clock source + + '#power-domain-cells': false + + '#interconnect-cells': + const: 1 + +required: + - compatible + - clocks + +allOf: + - $ref: qcom,gcc.yaml# + +unevaluatedProperties: false + +examples: + - | + clock-controller@1800000 { + compatible = "qcom,ipq5210-gcc"; + reg = <0x01800000 0x40000>; + clocks = <&xo_board_clk>, + <&sleep_clk>, + <&pcie30_phy0_pipe_clk>, + <&pcie30_phy1_pipe_clk>, + <&usb3phy_0_cc_pipe_clk>, + <&nss_cmn_clk>; + #clock-cells = <1>; + #reset-cells = <1>; + }; +... diff --git a/include/dt-bindings/clock/qcom,ipq5210-gcc.h b/include/dt-bindings/clock/qcom,ipq5210-gcc.h new file mode 100644 index 000000000000..84116f34ee4d --- /dev/null +++ b/include/dt-bindings/clock/qcom,ipq5210-gcc.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+ */ + +#ifndef _DT_BINDINGS_CLOCK_IPQ_GCC_IPQ5210_H +#define _DT_BINDINGS_CLOCK_IPQ_GCC_IPQ5210_H + +#define GCC_ADSS_PWM_CLK 0 +#define GCC_ADSS_PWM_CLK_SRC 1 +#define GCC_CMN_12GPLL_AHB_CLK 2 +#define GCC_CMN_12GPLL_SYS_CLK 3 +#define GCC_CNOC_LPASS_CFG_CLK 4 +#define GCC_CNOC_PCIE0_1LANE_S_CLK 5 +#define GCC_CNOC_PCIE1_2LANE_S_CLK 6 +#define GCC_CNOC_USB_CLK 7 +#define GCC_GEPHY_SYS_CLK 8 +#define GCC_LPASS_AXIM_CLK_SRC 9 +#define GCC_LPASS_CORE_AXIM_CLK 10 +#define GCC_LPASS_SWAY_CLK 11 +#define GCC_LPASS_SWAY_CLK_SRC 12 +#define GCC_MDIO_AHB_CLK 13 +#define GCC_MDIO_GEPHY_AHB_CLK 14 +#define GCC_NSS_TS_CLK 15 +#define GCC_NSS_TS_CLK_SRC 16 +#define GCC_NSSCC_CLK 17 +#define GCC_NSSCFG_CLK 18 +#define GCC_NSSNOC_ATB_CLK 19 +#define GCC_NSSNOC_MEMNOC_1_CLK 20 +#define GCC_NSSNOC_MEMNOC_BFDCD_CLK_SRC 21 +#define GCC_NSSNOC_MEMNOC_CLK 22 +#define GCC_NSSNOC_MEMNOC_DIV_CLK_SRC 23 +#define GCC_NSSNOC_NSSCC_CLK 24 +#define GCC_NSSNOC_PCNOC_1_CLK 25 +#define GCC_NSSNOC_QOSGEN_REF_CLK 26 +#define GCC_NSSNOC_SNOC_1_CLK 27 +#define GCC_NSSNOC_SNOC_CLK 28 +#define GCC_NSSNOC_TIMEOUT_REF_CLK 29 +#define GCC_NSSNOC_XO_DCD_CLK 30 +#define GCC_PCIE0_AHB_CLK 31 +#define GCC_PCIE0_AUX_CLK 32 +#define GCC_PCIE0_AXI_M_CLK 33 +#define GCC_PCIE0_AXI_M_CLK_SRC 34 +#define GCC_PCIE0_AXI_S_BRIDGE_CLK 35 +#define GCC_PCIE0_AXI_S_CLK 36 +#define GCC_PCIE0_AXI_S_CLK_SRC 37 +#define GCC_PCIE0_PIPE_CLK 38 +#define GCC_PCIE0_PIPE_CLK_SRC 39 +#define GCC_PCIE0_RCHNG_CLK 40 +#define GCC_PCIE0_RCHNG_CLK_SRC 41 +#define GCC_PCIE1_AHB_CLK 42 +#define GCC_PCIE1_AUX_CLK 43 +#define GCC_PCIE1_AXI_M_CLK 44 +#define GCC_PCIE1_AXI_M_CLK_SRC 45 +#define GCC_PCIE1_AXI_S_BRIDGE_CLK 46 +#define GCC_PCIE1_AXI_S_CLK 47 +#define GCC_PCIE1_AXI_S_CLK_SRC 48 +#define GCC_PCIE1_PIPE_CLK 49 +#define GCC_PCIE1_PIPE_CLK_SRC 50 +#define GCC_PCIE1_RCHNG_CLK 51 +#define GCC_PCIE1_RCHNG_CLK_SRC 52 +#define GCC_PCIE_AUX_CLK_SRC 53 +#define GCC_PCNOC_BFDCD_CLK_SRC 54 +#define GCC_PON_APB_CLK 55 +#define GCC_PON_TM_CLK 
56 +#define GCC_PON_TM2X_CLK 57 +#define GCC_PON_TM2X_CLK_SRC 58 +#define GCC_QDSS_AT_CLK 59 +#define GCC_QDSS_AT_CLK_SRC 60 +#define GCC_QDSS_DAP_CLK 61 +#define GCC_QDSS_TSCTR_CLK_SRC 62 +#define GCC_QPIC_AHB_CLK 63 +#define GCC_QPIC_CLK 64 +#define GCC_QPIC_CLK_SRC 65 +#define GCC_QPIC_IO_MACRO_CLK 66 +#define GCC_QPIC_IO_MACRO_CLK_SRC 67 +#define GCC_QRNG_AHB_CLK 68 +#define GCC_QUPV3_AHB_MST_CLK 69 +#define GCC_QUPV3_AHB_SLV_CLK 70 +#define GCC_QUPV3_WRAP_SE0_CLK 71 +#define GCC_QUPV3_WRAP_SE0_CLK_SRC 72 +#define GCC_QUPV3_WRAP_SE1_CLK 73 +#define GCC_QUPV3_WRAP_SE1_CLK_SRC 74 +#define GCC_QUPV3_WRAP_SE2_CLK 75 +#define GCC_QUPV3_WRAP_SE2_CLK_SRC 76 +#define GCC_QUPV3_WRAP_SE3_CLK 77 +#define GCC_QUPV3_WRAP_SE3_CLK_SRC 78 +#define GCC_QUPV3_WRAP_SE4_CLK 79 +#define GCC_QUPV3_WRAP_SE4_CLK_SRC 80 +#define GCC_QUPV3_WRAP_SE5_CLK 81 +#define GCC_QUPV3_WRAP_SE5_CLK_SRC 82 +#define GCC_SDCC1_AHB_CLK 83 +#define GCC_SDCC1_APPS_CLK 84 +#define GCC_SDCC1_APPS_CLK_SRC 85 +#define GCC_SDCC1_ICE_CORE_CLK 86 +#define GCC_SDCC1_ICE_CORE_CLK_SRC 87 +#define GCC_SLEEP_CLK_SRC 88 +#define GCC_SNOC_LPASS_CLK 89 +#define GCC_SNOC_PCIE0_AXI_M_CLK 90 +#define GCC_SNOC_PCIE1_AXI_M_CLK 91 +#define GCC_SYSTEM_NOC_BFDCD_CLK_SRC 92 +#define GCC_UNIPHY0_AHB_CLK 93 +#define GCC_UNIPHY0_SYS_CLK 94 +#define GCC_UNIPHY1_AHB_CLK 95 +#define GCC_UNIPHY1_SYS_CLK 96 +#define GCC_UNIPHY2_AHB_CLK 97 +#define GCC_UNIPHY2_SYS_CLK 98 +#define GCC_UNIPHY_SYS_CLK_SRC 99 +#define GCC_USB0_AUX_CLK 100 +#define GCC_USB0_AUX_CLK_SRC 101 +#define GCC_USB0_MASTER_CLK 102 +#define GCC_USB0_MASTER_CLK_SRC 103 +#define GCC_USB0_MOCK_UTMI_CLK 104 +#define GCC_USB0_MOCK_UTMI_CLK_SRC 105 +#define GCC_USB0_MOCK_UTMI_DIV_CLK_SRC 106 +#define GCC_USB0_PHY_CFG_AHB_CLK 107 +#define GCC_USB0_PIPE_CLK 108 +#define GCC_USB0_PIPE_CLK_SRC 109 +#define GCC_USB0_SLEEP_CLK 110 +#define GCC_XO_CLK_SRC 111 +#define GPLL0_MAIN 112 +#define GPLL0 113 +#define GPLL2_MAIN 114 +#define GPLL2 115 +#define GPLL4_MAIN 116 +#endif diff 
--git a/include/dt-bindings/reset/qcom,ipq5210-gcc.h b/include/dt-bindings/reset/qcom,ipq5210-gcc.h new file mode 100644 index 000000000000..09890a09087c --- /dev/null +++ b/include/dt-bindings/reset/qcom,ipq5210-gcc.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef _DT_BINDINGS_RESET_IPQ_GCC_IPQ5210_H +#define _DT_BINDINGS_RESET_IPQ_GCC_IPQ5210_H + +#define GCC_ADSS_BCR 0 +#define GCC_ADSS_PWM_ARES 1 +#define GCC_APC0_VOLTAGE_DROOP_DETECTOR_BCR 2 +#define GCC_APC0_VOLTAGE_DROOP_DETECTOR_GPLL0_ARES 3 +#define GCC_APSS_AHB_ARES 4 +#define GCC_APSS_ATB_ARES 5 +#define GCC_APSS_AXI_ARES 6 +#define GCC_APSS_TS_ARES 7 +#define GCC_BOOT_ROM_AHB_ARES 8 +#define GCC_BOOT_ROM_BCR 9 +#define GCC_GEPHY_BCR 10 +#define GCC_GEPHY_SYS_ARES 11 +#define GCC_GP1_ARES 12 +#define GCC_GP2_ARES 13 +#define GCC_GP3_ARES 14 +#define GCC_MDIO_AHB_ARES 15 +#define GCC_MDIO_BCR 16 +#define GCC_MDIO_GEPHY_AHB_ARES 17 +#define GCC_NSS_BCR 18 +#define GCC_NSS_TS_ARES 19 +#define GCC_NSSCC_ARES 20 +#define GCC_NSSCFG_ARES 21 +#define GCC_NSSNOC_ATB_ARES 22 +#define GCC_NSSNOC_MEMNOC_1_ARES 23 +#define GCC_NSSNOC_MEMNOC_ARES 24 +#define GCC_NSSNOC_NSSCC_ARES 25 +#define GCC_NSSNOC_PCNOC_1_ARES 26 +#define GCC_NSSNOC_QOSGEN_REF_ARES 27 +#define GCC_NSSNOC_SNOC_1_ARES 28 +#define GCC_NSSNOC_SNOC_ARES 29 +#define GCC_NSSNOC_TIMEOUT_REF_ARES 30 +#define GCC_NSSNOC_XO_DCD_ARES 31 +#define GCC_PCIE0_AHB_ARES 32 +#define GCC_PCIE0_AUX_ARES 33 +#define GCC_PCIE0_AXI_M_ARES 34 +#define GCC_PCIE0_AXI_S_BRIDGE_ARES 35 +#define GCC_PCIE0_AXI_S_ARES 36 +#define GCC_PCIE0_BCR 37 +#define GCC_PCIE0_LINK_DOWN_BCR 38 +#define GCC_PCIE0_PHY_BCR 39 +#define GCC_PCIE0_PIPE_ARES 40 +#define GCC_PCIE0PHY_PHY_BCR 41 +#define GCC_PCIE1_AHB_ARES 42 +#define GCC_PCIE1_AUX_ARES 43 +#define GCC_PCIE1_AXI_M_ARES 44 +#define GCC_PCIE1_AXI_S_BRIDGE_ARES 45 +#define GCC_PCIE1_AXI_S_ARES 46 +#define 
GCC_PCIE1_BCR 47 +#define GCC_PCIE1_LINK_DOWN_BCR 48 +#define GCC_PCIE1_PHY_BCR 49 +#define GCC_PCIE1_PIPE_ARES 50 +#define GCC_PCIE1PHY_PHY_BCR 51 +#define GCC_QRNG_AHB_ARES 52 +#define GCC_QRNG_BCR 53 +#define GCC_QUPV3_2X_CORE_ARES 54 +#define GCC_QUPV3_AHB_MST_ARES 55 +#define GCC_QUPV3_AHB_SLV_ARES 56 +#define GCC_QUPV3_BCR 57 +#define GCC_QUPV3_CORE_ARES 58 +#define GCC_QUPV3_WRAP_SE0_ARES 59 +#define GCC_QUPV3_WRAP_SE0_BCR 60 +#define GCC_QUPV3_WRAP_SE1_ARES 61 +#define GCC_QUPV3_WRAP_SE1_BCR 62 +#define GCC_QUPV3_WRAP_SE2_ARES 63 +#define GCC_QUPV3_WRAP_SE2_BCR 64 +#define GCC_QUPV3_WRAP_SE3_ARES 65 +#define GCC_QUPV3_WRAP_SE3_BCR 66 +#define GCC_QUPV3_WRAP_SE4_ARES 67 +#define GCC_QUPV3_WRAP_SE4_BCR 68 +#define GCC_QUPV3_WRAP_SE5_ARES 69 +#define GCC_QUPV3_WRAP_SE5_BCR 70 +#define GCC_QUSB2_0_PHY_BCR 71 +#define GCC_SDCC1_AHB_ARES 72 +#define GCC_SDCC1_APPS_ARES 73 +#define GCC_SDCC1_ICE_CORE_ARES 74 +#define GCC_SDCC_BCR 75 +#define GCC_TLMM_AHB_ARES 76 +#define GCC_TLMM_ARES 77 +#define GCC_TLMM_BCR 78 +#define GCC_UNIPHY0_AHB_ARES 79 +#define GCC_UNIPHY0_BCR 80 +#define GCC_UNIPHY0_SYS_ARES 81 +#define GCC_UNIPHY1_AHB_ARES 82 +#define GCC_UNIPHY1_BCR 83 +#define GCC_UNIPHY1_SYS_ARES 84 +#define GCC_UNIPHY2_AHB_ARES 85 +#define GCC_UNIPHY2_BCR 86 +#define GCC_UNIPHY2_SYS_ARES 87 +#define GCC_USB0_AUX_ARES 88 +#define GCC_USB0_MASTER_ARES 89 +#define GCC_USB0_MOCK_UTMI_ARES 90 +#define GCC_USB0_PHY_BCR 91 +#define GCC_USB0_PHY_CFG_AHB_ARES 92 +#define GCC_USB0_PIPE_ARES 93 +#define GCC_USB0_SLEEP_ARES 94 +#define GCC_USB3PHY_0_PHY_BCR 95 +#define GCC_USB_BCR 96 +#define GCC_PCIE0_PIPE_RESET 97 +#define GCC_PCIE0_CORE_STICKY_RESET 98 +#define GCC_PCIE0_AXI_S_STICKY_RESET 99 +#define GCC_PCIE0_AXI_S_RESET 100 +#define GCC_PCIE0_AXI_M_STICKY_RESET 101 +#define GCC_PCIE0_AXI_M_RESET 102 +#define GCC_PCIE0_AUX_RESET 103 +#define GCC_PCIE0_AHB_RESET 104 +#define GCC_PCIE1_PIPE_RESET 105 +#define GCC_PCIE1_CORE_STICKY_RESET 106 +#define 
GCC_PCIE1_AXI_S_STICKY_RESET 107 +#define GCC_PCIE1_AXI_S_RESET 108 +#define GCC_PCIE1_AXI_M_STICKY_RESET 109 +#define GCC_PCIE1_AXI_M_RESET 110 +#define GCC_PCIE1_AUX_RESET 111 +#define GCC_PCIE1_AHB_RESET 112 +#define GCC_UNIPHY0_XPCS_ARES 113 +#define GCC_UNIPHY1_XPCS_ARES 114 +#define GCC_UNIPHY2_XPCS_ARES 115 +#define GCC_QDSS_BCR 116 + +#endif -- cgit v1.2.3 From 76404ffbf07f28a5ec04748e18fce3dac2e78ef6 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Thu, 12 Mar 2026 08:12:06 -0300 Subject: dt-bindings: clock: qcom,gcc-sc8180x: Add missing GDSCs There are 5 more GDSCs that we were ignoring and not putting to sleep, which are listed in downstream DTS. Add them. Signed-off-by: Val Packett Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20260312112321.370983-2-val@packett.cool Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,gcc-sc8180x.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,gcc-sc8180x.h b/include/dt-bindings/clock/qcom,gcc-sc8180x.h index b9d8438a15ff..9ed7b794aacc 100644 --- a/include/dt-bindings/clock/qcom,gcc-sc8180x.h +++ b/include/dt-bindings/clock/qcom,gcc-sc8180x.h @@ -322,5 +322,10 @@ #define USB30_MP_GDSC 8 #define USB30_PRIM_GDSC 9 #define USB30_SEC_GDSC 10 +#define HLOS1_VOTE_MMNOC_MMU_TBU_HF0_GDSC 11 +#define HLOS1_VOTE_MMNOC_MMU_TBU_HF1_GDSC 12 +#define HLOS1_VOTE_MMNOC_MMU_TBU_SF_GDSC 13 +#define HLOS1_VOTE_TURING_MMU_TBU0_GDSC 14 +#define HLOS1_VOTE_TURING_MMU_TBU1_GDSC 15 #endif -- cgit v1.2.3 From dc3d720e12f602059490c1ab2bfee84a7465998f Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 17 Mar 2026 12:18:05 -0700 Subject: net: ethtool: add ethtool COALESCE_RX_CQE_FRAMES/NSECS Add two parameters for drivers supporting Rx CQE coalescing / descriptor writeback. ETHTOOL_A_COALESCE_RX_CQE_FRAMES: Maximum number of frames that can be coalesced into a CQE or writeback. 
ETHTOOL_A_COALESCE_RX_CQE_NSECS: Max time in nanoseconds after the first packet arrival in a coalesced CQE or writeback to be sent. Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/20260317191826.1346111-2-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 8 ++++++++ Documentation/networking/ethtool-netlink.rst | 11 +++++++++++ include/linux/ethtool.h | 6 +++++- include/uapi/linux/ethtool_netlink_generated.h | 2 ++ net/ethtool/coalesce.c | 14 +++++++++++++- 5 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index a05b5425b76a..5dd4d1b5d94b 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -857,6 +857,12 @@ attribute-sets: name: tx-profile type: nest nested-attributes: profile + - + name: rx-cqe-frames + type: u32 + - + name: rx-cqe-nsecs + type: u32 - name: pause-stat @@ -2253,6 +2259,8 @@ operations: - tx-aggr-time-usecs - rx-profile - tx-profile + - rx-cqe-frames + - rx-cqe-nsecs dump: *coalesce-get-op - name: coalesce-set diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 32179168eb73..e92abf45faf5 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1076,6 +1076,8 @@ Kernel response contents: ``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS`` u32 time (us), aggr, Tx ``ETHTOOL_A_COALESCE_RX_PROFILE`` nested profile of DIM, Rx ``ETHTOOL_A_COALESCE_TX_PROFILE`` nested profile of DIM, Tx + ``ETHTOOL_A_COALESCE_RX_CQE_FRAMES`` u32 max packets, Rx CQE + ``ETHTOOL_A_COALESCE_RX_CQE_NSECS`` u32 delay (ns), Rx CQE =========================================== ====== ======================= Attributes are only included in reply if their value is not zero or the @@ -1109,6 +1111,13 @@ well with frequent small-sized URBs 
transmissions. to DIM parameters, see `Generic Network Dynamic Interrupt Moderation (Net DIM) `_. +Rx CQE coalescing allows multiple received packets to be coalesced into a +single Completion Queue Entry (CQE) or descriptor writeback. +``ETHTOOL_A_COALESCE_RX_CQE_FRAMES`` describes the maximum number of +frames that can be coalesced into a CQE or writeback. +``ETHTOOL_A_COALESCE_RX_CQE_NSECS`` describes max time in nanoseconds after +the first packet arrival in a coalesced CQE or writeback to be sent. + COALESCE_SET ============ @@ -1147,6 +1156,8 @@ Request contents: ``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS`` u32 time (us), aggr, Tx ``ETHTOOL_A_COALESCE_RX_PROFILE`` nested profile of DIM, Rx ``ETHTOOL_A_COALESCE_TX_PROFILE`` nested profile of DIM, Tx + ``ETHTOOL_A_COALESCE_RX_CQE_FRAMES`` u32 max packets, Rx CQE + ``ETHTOOL_A_COALESCE_RX_CQE_NSECS`` u32 delay (ns), Rx CQE =========================================== ====== ======================= Request is rejected if it attributes declared as unsupported by driver (i.e. 
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 83c375840835..656d465bcd06 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -332,6 +332,8 @@ struct kernel_ethtool_coalesce { u32 tx_aggr_max_bytes; u32 tx_aggr_max_frames; u32 tx_aggr_time_usecs; + u32 rx_cqe_frames; + u32 rx_cqe_nsecs; }; /** @@ -380,7 +382,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, #define ETHTOOL_COALESCE_TX_AGGR_TIME_USECS BIT(26) #define ETHTOOL_COALESCE_RX_PROFILE BIT(27) #define ETHTOOL_COALESCE_TX_PROFILE BIT(28) -#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(28, 0) +#define ETHTOOL_COALESCE_RX_CQE_FRAMES BIT(29) +#define ETHTOOL_COALESCE_RX_CQE_NSECS BIT(30) +#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(30, 0) #define ETHTOOL_COALESCE_USECS \ (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 114b83017297..8134baf7860f 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -371,6 +371,8 @@ enum { ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, ETHTOOL_A_COALESCE_RX_PROFILE, ETHTOOL_A_COALESCE_TX_PROFILE, + ETHTOOL_A_COALESCE_RX_CQE_FRAMES, + ETHTOOL_A_COALESCE_RX_CQE_NSECS, __ETHTOOL_A_COALESCE_CNT, ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 3e18ca1ccc5e..349bb02c517a 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -118,6 +118,8 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_TIME_USECS */ + nla_total_size(sizeof(u32)) + /* _RX_CQE_FRAMES */ + nla_total_size(sizeof(u32)) + /* _RX_CQE_NSECS */ total_modersz * 2; /* _{R,T}X_PROFILE */ } @@ -269,7 +271,11 @@ static int 
coalesce_fill_reply(struct sk_buff *skb, coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, kcoal->tx_aggr_max_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, - kcoal->tx_aggr_time_usecs, supported)) + kcoal->tx_aggr_time_usecs, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_CQE_FRAMES, + kcoal->rx_cqe_frames, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_CQE_NSECS, + kcoal->rx_cqe_nsecs, supported)) return -EMSGSIZE; if (!req_base->dev || !req_base->dev->irq_moder) @@ -338,6 +344,8 @@ const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_CQE_FRAMES] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_CQE_NSECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_PROFILE] = NLA_POLICY_NESTED(coalesce_profile_policy), [ETHTOOL_A_COALESCE_TX_PROFILE] = @@ -570,6 +578,10 @@ __ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info, tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs, tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod); + ethnl_update_u32(&kernel_coalesce.rx_cqe_frames, + tb[ETHTOOL_A_COALESCE_RX_CQE_FRAMES], &mod); + ethnl_update_u32(&kernel_coalesce.rx_cqe_nsecs, + tb[ETHTOOL_A_COALESCE_RX_CQE_NSECS], &mod); if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_RX) { ret = ethnl_update_profile(dev, &dev->irq_moder->rx_profile, -- cgit v1.2.3 From c2fe3ff3d66d6f53ec5857c277fae5b3ff9881c1 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 17 Mar 2026 12:18:06 -0700 Subject: net: mana: Add support for RX CQE Coalescing Our NIC can have up to 4 RX packets on 1 CQE. To support this feature, check and process the type CQE_RX_COALESCED_4. 
The default setting is disabled, to avoid possible regression on latency. And, add ethtool handler to switch this feature. To turn it on, run: ethtool -C rx-cqe-frames 4 To turn it off: ethtool -C rx-cqe-frames 1 The rx-cqe-nsecs is the timeout value in nanoseconds after the first packet arrival in a coalesced CQE to be sent. It's read-only for this NIC. Reviewed-by: Long Li Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/20260317191826.1346111-3-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/mana_en.c | 74 ++++++++++++++-------- drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 60 +++++++++++++++++- include/net/mana/mana.h | 8 ++- 3 files changed, 113 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index ea71de39f996..fa30046dcd3d 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1365,6 +1365,7 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc, sizeof(resp)); req->hdr.req.msg_version = GDMA_MESSAGE_V2; + req->hdr.resp.msg_version = GDMA_MESSAGE_V2; req->vport = apc->port_handle; req->num_indir_entries = apc->indir_table_sz; @@ -1376,7 +1377,9 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc, req->update_hashkey = update_key; req->update_indir_tab = update_tab; req->default_rxobj = apc->default_rxobj; - req->cqe_coalescing_enable = 0; + + if (rx != TRI_STATE_FALSE) + req->cqe_coalescing_enable = apc->cqe_coalescing_enable; if (update_key) memcpy(&req->hashkey, apc->hashkey, MANA_HASH_KEY_SIZE); @@ -1405,8 +1408,13 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc, netdev_err(ndev, "vPort RX configuration failed: 0x%x\n", resp.hdr.status); err = -EPROTO; + goto out; } + if (resp.hdr.response.msg_version >= GDMA_MESSAGE_V2) + apc->cqe_coalescing_timeout_ns = + 
resp.cqe_coalescing_timeout_ns; + netdev_info(ndev, "Configured steering vPort %llu entries %u\n", apc->port_handle, apc->indir_table_sz); out: @@ -1915,11 +1923,12 @@ static struct sk_buff *mana_build_skb(struct mana_rxq *rxq, void *buf_va, } static void mana_rx_skb(void *buf_va, bool from_pool, - struct mana_rxcomp_oob *cqe, struct mana_rxq *rxq) + struct mana_rxcomp_oob *cqe, struct mana_rxq *rxq, + int i) { struct mana_stats_rx *rx_stats = &rxq->stats; struct net_device *ndev = rxq->ndev; - uint pkt_len = cqe->ppi[0].pkt_len; + uint pkt_len = cqe->ppi[i].pkt_len; u16 rxq_idx = rxq->rxq_idx; struct napi_struct *napi; struct xdp_buff xdp = {}; @@ -1963,7 +1972,7 @@ static void mana_rx_skb(void *buf_va, bool from_pool, } if (cqe->rx_hashtype != 0 && (ndev->features & NETIF_F_RXHASH)) { - hash_value = cqe->ppi[0].pkt_hash; + hash_value = cqe->ppi[i].pkt_hash; if (cqe->rx_hashtype & MANA_HASH_L4) skb_set_hash(skb, hash_value, PKT_HASH_TYPE_L4); @@ -2098,9 +2107,11 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq, struct mana_recv_buf_oob *rxbuf_oob; struct mana_port_context *apc; struct device *dev = gc->dev; + bool coalesced = false; void *old_buf = NULL; u32 curr, pktlen; bool old_fp; + int i; apc = netdev_priv(ndev); @@ -2112,13 +2123,16 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq, ++ndev->stats.rx_dropped; rxbuf_oob = &rxq->rx_oobs[rxq->buf_index]; netdev_warn_once(ndev, "Dropped a truncated packet\n"); - goto drop; - case CQE_RX_COALESCED_4: - netdev_err(ndev, "RX coalescing is unsupported\n"); - apc->eth_stats.rx_coalesced_err++; + mana_move_wq_tail(rxq->gdma_rq, + rxbuf_oob->wqe_inf.wqe_size_in_bu); + mana_post_pkt_rxq(rxq); return; + case CQE_RX_COALESCED_4: + coalesced = true; + break; + case CQE_RX_OBJECT_FENCE: complete(&rxq->fence_event); return; @@ -2130,30 +2144,37 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq, return; } - pktlen = oob->ppi[0].pkt_len; + for (i = 
0; i < MANA_RXCOMP_OOB_NUM_PPI; i++) { + old_buf = NULL; + pktlen = oob->ppi[i].pkt_len; + if (pktlen == 0) { + if (i == 0) + netdev_err_once( + ndev, + "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n", + rxq->gdma_id, cq->gdma_id, rxq->rxobj); + break; + } - if (pktlen == 0) { - /* data packets should never have packetlength of zero */ - netdev_err(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n", - rxq->gdma_id, cq->gdma_id, rxq->rxobj); - return; - } + curr = rxq->buf_index; + rxbuf_oob = &rxq->rx_oobs[curr]; + WARN_ON_ONCE(rxbuf_oob->wqe_inf.wqe_size_in_bu != 1); - curr = rxq->buf_index; - rxbuf_oob = &rxq->rx_oobs[curr]; - WARN_ON_ONCE(rxbuf_oob->wqe_inf.wqe_size_in_bu != 1); + mana_refill_rx_oob(dev, rxq, rxbuf_oob, &old_buf, &old_fp); - mana_refill_rx_oob(dev, rxq, rxbuf_oob, &old_buf, &old_fp); + /* Unsuccessful refill will have old_buf == NULL. + * In this case, mana_rx_skb() will drop the packet. + */ + mana_rx_skb(old_buf, old_fp, oob, rxq, i); - /* Unsuccessful refill will have old_buf == NULL. - * In this case, mana_rx_skb() will drop the packet. 
- */ - mana_rx_skb(old_buf, old_fp, oob, rxq); + mana_move_wq_tail(rxq->gdma_rq, + rxbuf_oob->wqe_inf.wqe_size_in_bu); -drop: - mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu); + mana_post_pkt_rxq(rxq); - mana_post_pkt_rxq(rxq); + if (!coalesced) + break; + } } static void mana_poll_rx_cq(struct mana_cq *cq) @@ -3332,6 +3353,7 @@ static int mana_probe_port(struct mana_context *ac, int port_idx, apc->port_handle = INVALID_MANA_HANDLE; apc->pf_filter_handle = INVALID_MANA_HANDLE; apc->port_idx = port_idx; + apc->cqe_coalescing_enable = 0; mutex_init(&apc->vport_mutex); apc->vport_use_count = 0; diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index f2d220b371b5..4b234b16e57a 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -20,8 +20,6 @@ static const struct mana_stats_desc mana_eth_stats[] = { tx_cqe_unknown_type)}, {"tx_linear_pkt_cnt", offsetof(struct mana_ethtool_stats, tx_linear_pkt_cnt)}, - {"rx_coalesced_err", offsetof(struct mana_ethtool_stats, - rx_coalesced_err)}, {"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats, rx_cqe_unknown_type)}, }; @@ -390,6 +388,61 @@ static void mana_get_channels(struct net_device *ndev, channel->combined_count = apc->num_queues; } +#define MANA_RX_CQE_NSEC_DEF 2048 +static int mana_get_coalesce(struct net_device *ndev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct mana_port_context *apc = netdev_priv(ndev); + + kernel_coal->rx_cqe_frames = + apc->cqe_coalescing_enable ? MANA_RXCOMP_OOB_NUM_PPI : 1; + + kernel_coal->rx_cqe_nsecs = apc->cqe_coalescing_timeout_ns; + + /* Return the default timeout value for old FW not providing + * this value. 
+ */ + if (apc->port_is_up && apc->cqe_coalescing_enable && + !kernel_coal->rx_cqe_nsecs) + kernel_coal->rx_cqe_nsecs = MANA_RX_CQE_NSEC_DEF; + + return 0; +} + +static int mana_set_coalesce(struct net_device *ndev, + struct ethtool_coalesce *ec, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct mana_port_context *apc = netdev_priv(ndev); + u8 saved_cqe_coalescing_enable; + int err; + + if (kernel_coal->rx_cqe_frames != 1 && + kernel_coal->rx_cqe_frames != MANA_RXCOMP_OOB_NUM_PPI) { + NL_SET_ERR_MSG_FMT(extack, + "rx-frames must be 1 or %u, got %u", + MANA_RXCOMP_OOB_NUM_PPI, + kernel_coal->rx_cqe_frames); + return -EINVAL; + } + + saved_cqe_coalescing_enable = apc->cqe_coalescing_enable; + apc->cqe_coalescing_enable = + kernel_coal->rx_cqe_frames == MANA_RXCOMP_OOB_NUM_PPI; + + if (!apc->port_is_up) + return 0; + + err = mana_config_rss(apc, TRI_STATE_TRUE, false, false); + if (err) + apc->cqe_coalescing_enable = saved_cqe_coalescing_enable; + + return err; +} + static int mana_set_channels(struct net_device *ndev, struct ethtool_channels *channels) { @@ -510,6 +563,7 @@ static int mana_get_link_ksettings(struct net_device *ndev, } const struct ethtool_ops mana_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES, .get_ethtool_stats = mana_get_ethtool_stats, .get_sset_count = mana_get_sset_count, .get_strings = mana_get_strings, @@ -520,6 +574,8 @@ const struct ethtool_ops mana_ethtool_ops = { .set_rxfh = mana_set_rxfh, .get_channels = mana_get_channels, .set_channels = mana_set_channels, + .get_coalesce = mana_get_coalesce, + .set_coalesce = mana_set_coalesce, .get_ringparam = mana_get_ringparam, .set_ringparam = mana_set_ringparam, .get_link_ksettings = mana_get_link_ksettings, diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index a078af283bdd..a7f89e7ddc56 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -378,7 +378,6 @@ struct mana_ethtool_stats { u64 
tx_cqe_err; u64 tx_cqe_unknown_type; u64 tx_linear_pkt_cnt; - u64 rx_coalesced_err; u64 rx_cqe_unknown_type; }; @@ -557,6 +556,9 @@ struct mana_port_context { bool port_is_up; bool port_st_save; /* Saved port state */ + u8 cqe_coalescing_enable; + u32 cqe_coalescing_timeout_ns; + struct mana_ethtool_stats eth_stats; struct mana_ethtool_phy_stats phy_stats; @@ -902,6 +904,10 @@ struct mana_cfg_rx_steer_req_v2 { struct mana_cfg_rx_steer_resp { struct gdma_resp_hdr hdr; + + /* V2 */ + u32 cqe_coalescing_timeout_ns; + u32 reserved1; }; /* HW DATA */ /* Register HW vPort */ -- cgit v1.2.3 From d01440e10a82cae2c4a28c76e46e6a8b94b27a84 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 17 Mar 2026 12:18:07 -0700 Subject: net: mana: Add ethtool counters for RX CQEs in coalesced type For RX CQEs with type CQE_RX_COALESCED_4, to measure the coalescing efficiency, add counters to count how many contains 2, 3, 4 packets respectively. Also, add a counter for the error case of first packet with length == 0. 
Reviewed-by: Long Li Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/20260317191826.1346111-4-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/mana_en.c | 24 +++++++++++++++------- drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 15 ++++++++++++-- include/net/mana/mana.h | 9 +++++--- 3 files changed, 36 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index fa30046dcd3d..49c65cc1697c 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2147,14 +2147,8 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq, for (i = 0; i < MANA_RXCOMP_OOB_NUM_PPI; i++) { old_buf = NULL; pktlen = oob->ppi[i].pkt_len; - if (pktlen == 0) { - if (i == 0) - netdev_err_once( - ndev, - "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n", - rxq->gdma_id, cq->gdma_id, rxq->rxobj); + if (pktlen == 0) break; - } curr = rxq->buf_index; rxbuf_oob = &rxq->rx_oobs[curr]; @@ -2175,6 +2169,22 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq, if (!coalesced) break; } + + /* Collect coalesced CQE count based on packets processed. + * Coalesced CQEs have at least 2 packets, so index is i - 2. 
+ */ + if (i > 1) { + u64_stats_update_begin(&rxq->stats.syncp); + rxq->stats.coalesced_cqe[i - 2]++; + u64_stats_update_end(&rxq->stats.syncp); + } else if (!i && !pktlen) { + u64_stats_update_begin(&rxq->stats.syncp); + rxq->stats.pkt_len0_err++; + u64_stats_update_end(&rxq->stats.syncp); + netdev_err_once(ndev, + "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n", + rxq->gdma_id, cq->gdma_id, rxq->rxobj); + } } static void mana_poll_rx_cq(struct mana_cq *cq) diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index 4b234b16e57a..6a4b42fe0944 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -149,7 +149,7 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data) { struct mana_port_context *apc = netdev_priv(ndev); unsigned int num_queues = apc->num_queues; - int i; + int i, j; if (stringset != ETH_SS_STATS) return; @@ -168,6 +168,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data) ethtool_sprintf(&data, "rx_%d_xdp_drop", i); ethtool_sprintf(&data, "rx_%d_xdp_tx", i); ethtool_sprintf(&data, "rx_%d_xdp_redirect", i); + ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i); + for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++) + ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i, j + 2); } for (i = 0; i < num_queues; i++) { @@ -201,6 +204,8 @@ static void mana_get_ethtool_stats(struct net_device *ndev, u64 xdp_xmit; u64 xdp_drop; u64 xdp_tx; + u64 pkt_len0_err; + u64 coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 1]; u64 tso_packets; u64 tso_bytes; u64 tso_inner_packets; @@ -209,7 +214,7 @@ static void mana_get_ethtool_stats(struct net_device *ndev, u64 short_pkt_fmt; u64 csum_partial; u64 mana_map_err; - int q, i = 0; + int q, i = 0, j; if (!apc->port_is_up) return; @@ -239,6 +244,9 @@ static void mana_get_ethtool_stats(struct net_device *ndev, xdp_drop = rx_stats->xdp_drop; xdp_tx = 
rx_stats->xdp_tx; xdp_redirect = rx_stats->xdp_redirect; + pkt_len0_err = rx_stats->pkt_len0_err; + for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++) + coalesced_cqe[j] = rx_stats->coalesced_cqe[j]; } while (u64_stats_fetch_retry(&rx_stats->syncp, start)); data[i++] = packets; @@ -246,6 +254,9 @@ static void mana_get_ethtool_stats(struct net_device *ndev, data[i++] = xdp_drop; data[i++] = xdp_tx; data[i++] = xdp_redirect; + data[i++] = pkt_len0_err; + for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++) + data[i++] = coalesced_cqe[j]; } for (q = 0; q < num_queues; q++) { diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index a7f89e7ddc56..3336688fed5e 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -61,8 +61,11 @@ enum TRI_STATE { #define MAX_PORTS_IN_MANA_DEV 256 +/* Maximum number of packets per coalesced CQE */ +#define MANA_RXCOMP_OOB_NUM_PPI 4 + /* Update this count whenever the respective structures are changed */ -#define MANA_STATS_RX_COUNT 5 +#define MANA_STATS_RX_COUNT (6 + MANA_RXCOMP_OOB_NUM_PPI - 1) #define MANA_STATS_TX_COUNT 11 #define MANA_RX_FRAG_ALIGNMENT 64 @@ -73,6 +76,8 @@ struct mana_stats_rx { u64 xdp_drop; u64 xdp_tx; u64 xdp_redirect; + u64 pkt_len0_err; + u64 coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 1]; struct u64_stats_sync syncp; }; @@ -227,8 +232,6 @@ struct mana_rxcomp_perpkt_info { u32 pkt_hash; }; /* HW DATA */ -#define MANA_RXCOMP_OOB_NUM_PPI 4 - /* Receive completion OOB */ struct mana_rxcomp_oob { struct mana_cqe_header cqe_hdr; -- cgit v1.2.3 From a8eed0ba6a4b2f1803ecdfa9f11a4818cf87c474 Mon Sep 17 00:00:00 2001 From: Shardul Bankar Date: Wed, 18 Mar 2026 13:08:22 +0530 Subject: hfsplus: refactor b-tree map page access and add node-type validation In HFS+ b-trees, the node allocation bitmap is stored across multiple records. The first chunk resides in the b-tree Header Node at record index 2, while all subsequent chunks are stored in dedicated Map Nodes at record index 0. 
This structural quirk forces callers like hfs_bmap_alloc() and hfs_bmap_free() to duplicate boilerplate code to validate offsets, correct lengths, and map the underlying pages via kmap_local_page(). There is also currently no strict node-type validation before reading these records, leaving the allocator vulnerable if a corrupted image points a map linkage to an Index or Leaf node. Introduce a unified bit-level API to encapsulate the map record access: 1. A new `struct hfs_bmap_ctx` to cleanly pass state and safely handle page math across all architectures. 2. `hfs_bmap_get_map_page()`: Automatically validates node types (HFS_NODE_HEADER vs HFS_NODE_MAP), infers the correct record index, handles page-boundary math, and returns the unmapped `struct page *` directly to the caller to avoid asymmetric mappings. 3. `hfs_bmap_clear_bit()`: A clean wrapper that internally handles page mapping/unmapping for single-bit operations. Refactor hfs_bmap_alloc() and hfs_bmap_free() to utilize this new API. This deduplicates the allocator logic, hardens the map traversal against fuzzed images, and provides the exact abstractions needed for upcoming mount-time validation checks. 
Signed-off-by: Shardul Bankar Reviewed-by: Viacheslav Dubeyko Tested-by: Viacheslav Dubeyko Signed-off-by: Viacheslav Dubeyko Link: https://lore.kernel.org/r/20260318073823.3933718-2-shardul.b@mpiricsoftware.com Signed-off-by: Viacheslav Dubeyko --- fs/hfsplus/btree.c | 169 ++++++++++++++++++++++++++++++++------------- include/linux/hfs_common.h | 2 + 2 files changed, 124 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 1220a2f22737..64d168347b4b 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -129,6 +129,95 @@ u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, return clump_size; } +/* Context for iterating b-tree map pages + * @page_idx: The index of the page within the b-node's page array + * @off: The byte offset within the mapped page + * @len: The remaining length of the map record + */ +struct hfs_bmap_ctx { + unsigned int page_idx; + unsigned int off; + u16 len; +}; + +/* + * Finds the specific page containing the requested byte offset within the map + * record. Automatically handles the difference between header and map nodes. + * Returns the struct page pointer, or an ERR_PTR on failure. + * Note: The caller is responsible for mapping/unmapping the returned page. 
+ */ +static struct page *hfs_bmap_get_map_page(struct hfs_bnode *node, struct hfs_bmap_ctx *ctx, + u32 byte_offset) +{ + u16 rec_idx, off16; + unsigned int page_off; + + if (node->this == HFSPLUS_TREE_HEAD) { + if (node->type != HFS_NODE_HEADER) { + pr_err("hfsplus: invalid btree header node\n"); + return ERR_PTR(-EIO); + } + rec_idx = HFSPLUS_BTREE_HDR_MAP_REC_INDEX; + } else { + if (node->type != HFS_NODE_MAP) { + pr_err("hfsplus: invalid btree map node\n"); + return ERR_PTR(-EIO); + } + rec_idx = HFSPLUS_BTREE_MAP_NODE_REC_INDEX; + } + + ctx->len = hfs_brec_lenoff(node, rec_idx, &off16); + if (!ctx->len) + return ERR_PTR(-ENOENT); + + if (!is_bnode_offset_valid(node, off16)) + return ERR_PTR(-EIO); + + ctx->len = check_and_correct_requested_length(node, off16, ctx->len); + + if (byte_offset >= ctx->len) + return ERR_PTR(-EINVAL); + + page_off = (u32)off16 + node->page_offset + byte_offset; + ctx->page_idx = page_off >> PAGE_SHIFT; + ctx->off = page_off & ~PAGE_MASK; + + return node->page[ctx->page_idx]; +} + +/** + * hfs_bmap_clear_bit - clear a bit in the b-tree map + * @node: the b-tree node containing the map record + * @node_bit_idx: the relative bit index within the node's map record + * + * Returns 0 on success, -EINVAL if already clear, or negative error code. 
+ */ +static int hfs_bmap_clear_bit(struct hfs_bnode *node, u32 node_bit_idx) +{ + struct hfs_bmap_ctx ctx; + struct page *page; + u8 *bmap, mask; + + page = hfs_bmap_get_map_page(node, &ctx, node_bit_idx / BITS_PER_BYTE); + if (IS_ERR(page)) + return PTR_ERR(page); + + bmap = kmap_local_page(page); + + mask = 1 << (7 - (node_bit_idx % BITS_PER_BYTE)); + + if (!(bmap[ctx.off] & mask)) { + kunmap_local(bmap); + return -EINVAL; + } + + bmap[ctx.off] &= ~mask; + set_page_dirty(page); + kunmap_local(bmap); + + return 0; +} + /* Get a reference to a B*Tree and do some initial checks */ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) { @@ -374,11 +463,9 @@ int hfs_bmap_reserve(struct hfs_btree *tree, u32 rsvd_nodes) struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) { struct hfs_bnode *node, *next_node; - struct page **pagep; + struct hfs_bmap_ctx ctx; + struct page *page; u32 nidx, idx; - unsigned off; - u16 off16; - u16 len; u8 *data, byte, m; int i, res; @@ -390,30 +477,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) node = hfs_bnode_find(tree, nidx); if (IS_ERR(node)) return node; - len = hfs_brec_lenoff(node, 2, &off16); - off = off16; - if (!is_bnode_offset_valid(node, off)) { + page = hfs_bmap_get_map_page(node, &ctx, 0); + if (IS_ERR(page)) { + res = PTR_ERR(page); hfs_bnode_put(node); - return ERR_PTR(-EIO); + return ERR_PTR(res); } - len = check_and_correct_requested_length(node, off, len); - off += node->page_offset; - pagep = node->page + (off >> PAGE_SHIFT); - data = kmap_local_page(*pagep); - off &= ~PAGE_MASK; + data = kmap_local_page(page); idx = 0; for (;;) { - while (len) { - byte = data[off]; + while (ctx.len) { + byte = data[ctx.off]; if (byte != 0xff) { for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { if (!(byte & m)) { idx += i; - data[off] |= m; - set_page_dirty(*pagep); + data[ctx.off] |= m; + set_page_dirty(page); kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); @@ -423,13 +506,14 @@ struct 
hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } } - if (++off >= PAGE_SIZE) { + if (++ctx.off >= PAGE_SIZE) { kunmap_local(data); - data = kmap_local_page(*++pagep); - off = 0; + page = node->page[++ctx.page_idx]; + data = kmap_local_page(page); + ctx.off = 0; } idx += 8; - len--; + ctx.len--; } kunmap_local(data); nidx = node->next; @@ -443,22 +527,22 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) return next_node; node = next_node; - len = hfs_brec_lenoff(node, 0, &off16); - off = off16; - off += node->page_offset; - pagep = node->page + (off >> PAGE_SHIFT); - data = kmap_local_page(*pagep); - off &= ~PAGE_MASK; + page = hfs_bmap_get_map_page(node, &ctx, 0); + if (IS_ERR(page)) { + res = PTR_ERR(page); + hfs_bnode_put(node); + return ERR_PTR(res); + } + data = kmap_local_page(page); } } void hfs_bmap_free(struct hfs_bnode *node) { struct hfs_btree *tree; - struct page *page; u16 off, len; u32 nidx; - u8 *data, byte, m; + int res; hfs_dbg("node %u\n", node->this); BUG_ON(!node->this); @@ -495,24 +579,15 @@ void hfs_bmap_free(struct hfs_bnode *node) } len = hfs_brec_lenoff(node, 0, &off); } - off += node->page_offset + nidx / 8; - page = node->page[off >> PAGE_SHIFT]; - data = kmap_local_page(page); - off &= ~PAGE_MASK; - m = 1 << (~nidx & 7); - byte = data[off]; - if (!(byte & m)) { - pr_crit("trying to free free bnode " - "%u(%d)\n", - node->this, node->type); - kunmap_local(data); - hfs_bnode_put(node); - return; + + res = hfs_bmap_clear_bit(node, nidx); + if (res == -EINVAL) { + pr_crit("trying to free free bnode %u(%d)\n", + node->this, node->type); + } else if (!res) { + tree->free_nodes++; + mark_inode_dirty(tree->inode); } - data[off] = byte & ~m; - set_page_dirty(page); - kunmap_local(data); + hfs_bnode_put(node); - tree->free_nodes++; - mark_inode_dirty(tree->inode); } diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h index dadb5e0aa8a3..be24c687858e 100644 --- a/include/linux/hfs_common.h +++ 
b/include/linux/hfs_common.h @@ -510,6 +510,8 @@ struct hfs_btree_header_rec { #define HFSPLUS_NODE_MXSZ 32768 #define HFSPLUS_ATTR_TREE_NODE_SIZE 8192 #define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT 3 +#define HFSPLUS_BTREE_HDR_MAP_REC_INDEX 2 /* Map (bitmap) record in Header node */ +#define HFSPLUS_BTREE_MAP_NODE_REC_INDEX 0 /* Map record in Map Node */ #define HFSPLUS_BTREE_HDR_USER_BYTES 128 /* btree key type */ -- cgit v1.2.3 From c888c4c834c9afc18879fde91ad405be21b7425d Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Tue, 3 Mar 2026 10:42:28 +0200 Subject: gpu: host1x: convert MIPI to use operation function pointers Convert existing MIPI code to use operation function pointers, a necessary step for supporting Tegra20/Tegra30 SoCs. All common MIPI configuration that is SoC-independent remains in mipi.c, while all SoC-specific code is moved to tegra114-mipi.c (The naming matches the first SoC generation with a dedicated calibration block). Shared structures and function calls are placed into tegra-mipi-cal.h. Tested-by: Luca Ceresoli # tegra20, parallel camera Signed-off-by: Svyatoslav Ryhel Acked-by: Mikko Perttunen Signed-off-by: Hans Verkuil --- drivers/gpu/drm/tegra/dsi.c | 1 + drivers/gpu/host1x/Makefile | 1 + drivers/gpu/host1x/mipi.c | 592 +++++++------------------------- drivers/gpu/host1x/tegra114-mipi.c | 483 ++++++++++++++++++++++++++ drivers/staging/media/tegra-video/csi.c | 1 + include/linux/host1x.h | 10 - include/linux/tegra-mipi-cal.h | 57 +++ 7 files changed, 666 insertions(+), 479 deletions(-) create mode 100644 drivers/gpu/host1x/tegra114-mipi.c create mode 100644 include/linux/tegra-mipi-cal.h (limited to 'include') diff --git a/drivers/gpu/drm/tegra/dsi.c b/drivers/gpu/drm/tegra/dsi.c index 2c5aefe9621a..7f25c50621c9 100644 --- a/drivers/gpu/drm/tegra/dsi.c +++ b/drivers/gpu/drm/tegra/dsi.c @@ -14,6 +14,7 @@ #include #include #include +#include #include